[Bast-commits] r9285 - in ironman/trunk/plagger/lib/Plagger/Plugin: . Bundle Filter

castaway at dev.catalyst.perl.org castaway at dev.catalyst.perl.org
Sun May 2 13:05:33 GMT 2010


Author: castaway
Date: 2010-05-02 14:05:33 +0100 (Sun, 02 May 2010)
New Revision: 9285

Added:
   ironman/trunk/plagger/lib/Plagger/Plugin/Filter/
   ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLScrubber.pm
   ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLStripScripts.pm
   ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLTruncate.pm
Modified:
   ironman/trunk/plagger/lib/Plagger/Plugin/Bundle/PagedPlanet.pm
Log:
Committing ole plagger truncating/scrubbing code


Modified: ironman/trunk/plagger/lib/Plagger/Plugin/Bundle/PagedPlanet.pm
===================================================================
--- ironman/trunk/plagger/lib/Plagger/Plugin/Bundle/PagedPlanet.pm	2010-05-01 09:55:04 UTC (rev 9284)
+++ ironman/trunk/plagger/lib/Plagger/Plugin/Bundle/PagedPlanet.pm	2010-05-02 13:05:33 UTC (rev 9285)
@@ -14,6 +14,17 @@
         }
     }
 
+# Would like to dump raw data at this point, but whatever publish gets data from
+# has only a composite feed 
+#     $context->load_plugin({
+# 	module => 'Publish::Serializer',
+# 	config => {
+# 	    dir => '/var/www/ironboy.enlightenedperl.org/raw',
+# #	    serializer => 'Config::General',
+# 	    filename => '%i.conf',
+# 	}
+#     });
+
     $context->load_plugin({
         module => 'Filter::StripTagsFromTitle',
     });
@@ -25,10 +36,6 @@
         });
     }
 
-    $context->load_plugin({
-        module => 'Filter::HTMLScrubber',
-        config => $self->conf->{scrubber} || {},
-    });
 
     $context->load_plugin({
         module => 'Filter::GuessTimeZoneByDomain',
@@ -58,6 +65,43 @@
         },
     });
 
+    $context->load_plugin({
+        module => 'Filter::HTMLScrubber',
+        config => $self->conf->{scrubber} || {},
+    });
+
+    $context->load_plugin({
+ 	module => 'Publish::Serializer',
+ 	config => {
+ 	    dir => '/var/www/ironboy.enlightenedperl.org/dump',
+ #	    serializer => 'Config::General',
+ 	    filename => '%i.conf',
+ 	}
+     });
+
+#     config:
+#       dir: /var/www/ironboy.enlightenedperl.org/plagger/csv
+#       encoding: utf-8
+#       filename: my_%t.csv
+#       mode: append
+#       column:
+#        - author
+#        - title
+#        - permalink
+#        - date
+
+    $context->load_plugin({
+	module => 'Publish::CSV',
+	config => {
+	    dir => '/var/www/ironboy.enlightenedperl.org/plagger/csv',
+	    encoding => 'utf-8',
+	    filename => 'my_%t.csv',
+	    mode => 'append',
+	    column => [ qw/author title permalink date/
+		       ]
+	}
+    }); 
+
     my $rule = {
         expression => q{ $args->{feed}->id eq 'smartfeed:all' },
     };
@@ -155,7 +199,7 @@
 
 =item Filter::HTMLTidy (if HTML::Tidy is available)
 
-=item Filter::HTMLScrubber
+=item Filter::HTMLStripScripts
 
 =item Filter::GuessTimeZoneByDomain
 

Added: ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLScrubber.pm
===================================================================
--- ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLScrubber.pm	                        (rev 0)
+++ ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLScrubber.pm	2010-05-02 13:05:33 UTC (rev 9285)
@@ -0,0 +1,163 @@
+package Plagger::Plugin::Filter::HTMLScrubber;
+use strict;
+use base qw( Plagger::Plugin );
+
+use HTML::Scrubber;
+
+sub rules {    # Built-in per-tag rules; initialize() passes them to HTML::Scrubber->rules()
+    return(
+        img => {
+            src => qr{^http://},    # only URL with http://
+            alt => 1,               # alt attributes allowed
+            '*' => 0,               # deny all others
+        },
+        style  => 0,    # rule value 0 for the <style> tag
+        script => 0,    # rule value 0 for the <script> tag
+    );
+}
+
+sub default {    # Built-in attribute rules; initialize() merges them with conf 'default' for HTML::Scrubber->default()
+    return(
+        '*'    => 1,                        # default rule, allow all attributes
+        'href' => qr{^(?!(?:java)?script)}i,
+        'src'  => qr{^(?!(?:java)?script)}i,
+        'cite'     => '(?i-xsm:^(?!(?:java)?script))',
+        'language' => 0,
+        'name'        => 1,                 # could be sneaky, but hey ;)
+        'onblur'      => 0,
+        'onchange'    => 0,
+        'onclick'     => 0,
+        'ondblclick'  => 0,
+        'onerror'     => 0,
+        'onfocus'     => 0,
+        'onkeydown'   => 0,
+        'onkeypress'  => 0,
+        'onkeyup'     => 0,
+        'onload'      => 0,
+        'onmousedown' => 0,
+        'onmousemove' => 0,
+        'onmouseout'  => 0,
+        'onmouseover' => 0,
+        'onmouseup'   => 0,
+        'onreset'     => 0,
+        'onselect'    => 0,
+        'onsubmit'    => 0,
+        'onunload'    => 0,
+        'src'         => 0,    # NOTE(review): duplicate key - once assigned to a hash this 0 replaces the 'src' regex above; confirm intent
+        'type'        => 0,
+        'style'       => 0,
+    );
+}
+
+sub register {    # Registers this plugin's two callbacks with the Plagger context
+    my ( $self, $context ) = @_;
+
+    $context->register_hook(
+        $self,
+        'update.entry.fixup' => \&update,        # update() scrubs an entry's body/summary
+        'plugin.init'        => \&initialize,    # initialize() builds $self->{scrubber}
+    );
+}
+
+sub initialize {    # 'plugin.init' callback: builds the HTML::Scrubber and stores it in $self->{scrubber}
+    my($self, $context, $args) = @_;
+
+    $self->{scrubber} = do {
+        my $scrubber = HTML::Scrubber->new;
+        my $config   = $self->conf;    # NB: the delete calls below remove keys from this hash
+
+        my ( %rules, %default );
+        unless ( delete $config->{no_default_configs} ) {    # built-in rules()/default() are used unless this flag is true
+            %rules   = $self->rules;
+            %default = $self->default;
+        }
+        $scrubber->rules( %rules, %{ delete $config->{rules} || {} } );    # conf 'rules' are appended after the built-in ones
+        $scrubber->default(1, { %default, %{ delete $config->{default} || {} } });    # conf 'default' keys override built-in keys
+
+        while ( my ( $method, $arg ) = each %$config ) {    # every remaining conf key is invoked as a scrubber method
+            eval {
+                $scrubber->$method(
+                      ref $arg eq 'ARRAY' ? @$arg    # array ref: flattened into the argument list
+                    : ref $arg eq 'HASH'  ? %$arg    # hash ref: flattened into key/value pairs
+                    : $arg );
+            };
+            $context->error(qq/Invalid method call "$method": $@/) if $@;    # report a failed method call via the context
+        }
+
+        $scrubber;
+    };
+}
+
+sub update {    # 'update.entry.fixup' callback: scrubs the HTML body and summary of $args->{entry}
+    my ( $self, $context, $args ) = @_;
+    # '.' binds tighter than '||', so the '(no-link)' fallback must be parenthesised to take effect.
+    foreach my $attr (qw/body summary/) {
+        if (defined $args->{entry}->$attr && $args->{entry}->$attr->is_html) {
+            $context->log(debug => "Scrubbing $attr for " . ($args->{entry}->permalink || '(no-link)'));
+            my $content = $self->{scrubber}->scrub( $args->{entry}->$attr );
+            $args->{entry}->$attr($content);    # write the scrubbed text back onto the entry
+            $context->log(debug => "After: $attr ||$content||\n");
+        }
+    }
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Plagger::Plugin::Filter::HTMLScrubber - Scrub feed content
+
+=head1 SYNOPSIS
+
+  - module: Filter::HTMLScrubber
+    config:
+      rules:
+        style: 0
+        script: 0
+
+=head1 DESCRIPTION
+
+This plugin scrubs feed content using L<HTML::Scrubber>.
+
+All config parameters (except 'no_default_configs') are implemented as
+HTML::Scrubber's method: value.  For example, if you write:
+
+    method: value
+
+in the config: section, this plugin will automatically turn the config
+into the method call:
+
+    $scrubber->method('value');
+
+See L<HTML::Scrubber> document for details.
+
+=head1 CONFIG
+
+=over 4
+
+=item no_default_configs
+
+Some rules and default config parameters are set by default. See I<rules>
+and I<default> methods defined in this module code for details.
+
+If you don't need these settings, use C<no_default_configs>
+
+   no_default_configs: 1
+
+Defaults to 0, which means it uses the default (somewhat secure) config.
+
+=back
+
+=head1 AUTHOR
+
+Daisuke Murase <typester at cpan.org>
+
+Tatsuhiko Miyagawa
+
+=head1 SEE ALSO
+
+L<Plagger>, L<HTML::Scrubber>
+
+=cut

Added: ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLStripScripts.pm
===================================================================
--- ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLStripScripts.pm	                        (rev 0)
+++ ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLStripScripts.pm	2010-05-02 13:05:33 UTC (rev 9285)
@@ -0,0 +1,175 @@
+package Plagger::Plugin::Filter::HTMLStripScripts;
+use strict;
+use base qw( Plagger::Plugin );
+
+use HTML::StripScripts::Parser;
+
+sub rules {    # Built-in per-tag rules; initialize() passes them as 'Rules' to HTML::StripScripts::Parser->new
+    return(
+	    pre => 1,
+	    code => 1,
+#	    div => 1,
+	    span => 1,
+	    br => 1,
+	    p => 1,
+	    blockquote => 1,
+	    a => 1,
+	    img => {
+		src => qr{^http://},    # only URL with http://
+		alt => 1,               # alt attributes allowed
+		'*' => 0,               # deny all others
+	    },
+	    style  => 0,    # rule value 0 for the <style> tag
+	    script => 0,    # rule value 0 for the <script> tag
+    );
+}
+
+sub default {    # Built-in attribute rules; NOTE(review): initialize() copies these into %default but never reads it
+    return(
+        '*'    => 1,                        # default rule, allow all attributes
+        'href' => qr{^(?!(?:java)?script)}i,
+        'src'  => qr{^(?!(?:java)?script)}i,
+        'cite'     => '(?i-xsm:^(?!(?:java)?script))',
+        'language' => 0,
+        'name'        => 1,                 # could be sneaky, but hey ;)
+        'onblur'      => 0,
+        'onchange'    => 0,
+        'onclick'     => 0,
+        'ondblclick'  => 0,
+        'onerror'     => 0,
+        'onfocus'     => 0,
+        'onkeydown'   => 0,
+        'onkeypress'  => 0,
+        'onkeyup'     => 0,
+        'onload'      => 0,
+        'onmousedown' => 0,
+        'onmousemove' => 0,
+        'onmouseout'  => 0,
+        'onmouseover' => 0,
+        'onmouseup'   => 0,
+        'onreset'     => 0,
+        'onselect'    => 0,
+        'onsubmit'    => 0,
+        'onunload'    => 0,
+        'src'         => 0,    # NOTE(review): duplicate key - once assigned to a hash this 0 replaces the 'src' regex above; confirm intent
+        'type'        => 0,
+        'style'       => 0,
+    );
+}
+
+sub register {    # Registers this plugin's two callbacks with the Plagger context
+    my ( $self, $context ) = @_;
+
+    $context->register_hook(
+        $self,
+        'update.entry.fixup' => \&update,        # update() filters an entry's body
+        'plugin.init'        => \&initialize,    # initialize() builds $self->{scrubber}
+    );
+}
+
+sub initialize {    # 'plugin.init' callback: builds the HTML::StripScripts::Parser and stores it in $self->{scrubber}
+    my($self, $context, $args) = @_;
+
+    $self->{scrubber} = do {
+        my $config   = $self->conf;    # NB: the delete below removes a key from this hash
+
+        my ( %rules, %default );
+        unless ( delete $config->{no_default_configs} ) {    # built-in rules()/default() are loaded unless this flag is true
+            %rules   = $self->rules;
+            %default = $self->default;    # NOTE(review): %default is never read after this assignment
+        }
+        my $scrubber = HTML::StripScripts::Parser->new(
+						       {
+							   Context => 'Flow',
+							   AllowHref => 1,
+							   AllowMailTo => 1,
+							   BanList => ['link'],
+							   Rules => \%rules,    # only the built-in rules(); conf 'rules' is not merged in here
+						       });
+
+#        while ( my ( $method, $arg ) = each %$config ) {
+#            eval {
+#                $scrubber->$method(
+#                      ref $arg eq 'ARRAY' ? @$arg
+#                    : ref $arg eq 'HASH'  ? %$arg
+#                    : $arg );
+#            };
+#            $context->error(qq/Invalid method call "$method": $@/) if $@;
+#        }
+
+        $scrubber;
+    };
+}
+
+sub update {    # 'update.entry.fixup' callback: runs the HTML body of $args->{entry} through filter_html
+    my ( $self, $context, $args ) = @_;
+    # '.' binds tighter than '||', so the '(no-link)' fallback must be parenthesised to take effect.
+    if (defined $args->{entry}->body && $args->{entry}->body->is_html) {
+        $context->log(debug => "Stripping body for " . ($args->{entry}->permalink || '(no-link)'));
+        $context->log(debug => "Before: " . $args->{entry}->body);
+        my $body = $self->{scrubber}->filter_html( $args->{entry}->body );
+        $args->{entry}->body($body);    # write the filtered text back onto the entry
+        $context->log(debug => "After: " . $args->{entry}->body);
+    }
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Plagger::Plugin::Filter::HTMLStripScripts - Scrub feed content
+
+=head1 SYNOPSIS
+
+  - module: Filter::HTMLStripScripts
+    config:
+      rules:
+        style: 0
+        script: 0
+
+=head1 DESCRIPTION
+
+This plugin scrubs feed content using L<HTML::StripScripts::Parser>.
+
+All config parameters (except 'no_default_configs') are implemented as
+HTML::Scrubber's method: value.  For example, if you write:
+
+    method: value
+
+in the config: section, this plugin will automatically turn the config
+into the method call:
+
+    $scrubber->method('value');
+
+See L<HTML::Scrubber> document for details.
+
+=head1 CONFIG
+
+=over 4
+
+=item no_default_configs
+
+Some rules and default config parameters are set by default. See I<rules>
+and I<default> methods defined in this module code for details.
+
+If you don't need these settings, use C<no_default_configs>
+
+   no_default_configs: 1
+
+Defaults to 0, which means it uses the default (somewhat secure) config.
+
+=back
+
+=head1 AUTHOR
+
+Daisuke Murase <typester at cpan.org>
+
+Tatsuhiko Miyagawa
+
+=head1 SEE ALSO
+
+L<Plagger>, L<HTML::Scrubber>
+
+=cut

Added: ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLTruncate.pm
===================================================================
--- ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLTruncate.pm	                        (rev 0)
+++ ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLTruncate.pm	2010-05-02 13:05:33 UTC (rev 9285)
@@ -0,0 +1,163 @@
+package Plagger::Plugin::Filter::HTMLScrubber;
+use strict;
+use base qw( Plagger::Plugin );
+
+use HTML::Scrubber;
+
+sub rules {    # Built-in per-tag rules; initialize() passes them to HTML::Scrubber->rules()
+    return(
+        img => {
+            src => qr{^http://},    # only URL with http://
+            alt => 1,               # alt attributes allowed
+            '*' => 0,               # deny all others
+        },
+        style  => 0,    # rule value 0 for the <style> tag
+        script => 0,    # rule value 0 for the <script> tag
+    );
+}
+
+sub default {    # Built-in attribute rules; initialize() merges them with conf 'default' for HTML::Scrubber->default()
+    return(
+        '*'    => 1,                        # default rule, allow all attributes
+        'href' => qr{^(?!(?:java)?script)}i,
+        'src'  => qr{^(?!(?:java)?script)}i,
+        'cite'     => '(?i-xsm:^(?!(?:java)?script))',
+        'language' => 0,
+        'name'        => 1,                 # could be sneaky, but hey ;)
+        'onblur'      => 0,
+        'onchange'    => 0,
+        'onclick'     => 0,
+        'ondblclick'  => 0,
+        'onerror'     => 0,
+        'onfocus'     => 0,
+        'onkeydown'   => 0,
+        'onkeypress'  => 0,
+        'onkeyup'     => 0,
+        'onload'      => 0,
+        'onmousedown' => 0,
+        'onmousemove' => 0,
+        'onmouseout'  => 0,
+        'onmouseover' => 0,
+        'onmouseup'   => 0,
+        'onreset'     => 0,
+        'onselect'    => 0,
+        'onsubmit'    => 0,
+        'onunload'    => 0,
+        'src'         => 0,    # NOTE(review): duplicate key - once assigned to a hash this 0 replaces the 'src' regex above; confirm intent
+        'type'        => 0,
+        'style'       => 0,
+    );
+}
+
+sub register {    # Registers this plugin's two callbacks with the Plagger context
+    my ( $self, $context ) = @_;
+
+    $context->register_hook(
+        $self,
+        'update.entry.fixup' => \&update,        # update() scrubs an entry's body/summary
+        'plugin.init'        => \&initialize,    # initialize() builds $self->{scrubber}
+    );
+}
+
+sub initialize {    # 'plugin.init' callback: builds the HTML::Scrubber and stores it in $self->{scrubber}
+    my($self, $context, $args) = @_;
+
+    $self->{scrubber} = do {
+        my $scrubber = HTML::Scrubber->new;
+        my $config   = $self->conf;    # NB: the delete calls below remove keys from this hash
+
+        my ( %rules, %default );
+        unless ( delete $config->{no_default_configs} ) {    # built-in rules()/default() are used unless this flag is true
+            %rules   = $self->rules;
+            %default = $self->default;
+        }
+        $scrubber->rules( %rules, %{ delete $config->{rules} || {} } );    # conf 'rules' are appended after the built-in ones
+        $scrubber->default(1, { %default, %{ delete $config->{default} || {} } });    # conf 'default' keys override built-in keys
+
+        while ( my ( $method, $arg ) = each %$config ) {    # every remaining conf key is invoked as a scrubber method
+            eval {
+                $scrubber->$method(
+                      ref $arg eq 'ARRAY' ? @$arg    # array ref: flattened into the argument list
+                    : ref $arg eq 'HASH'  ? %$arg    # hash ref: flattened into key/value pairs
+                    : $arg );
+            };
+            $context->error(qq/Invalid method call "$method": $@/) if $@;    # report a failed method call via the context
+        }
+
+        $scrubber;
+    };
+}
+
+sub update {    # 'update.entry.fixup' callback: scrubs the HTML body and summary of $args->{entry}
+    my ( $self, $context, $args ) = @_;
+    # '.' binds tighter than '||', so the '(no-link)' fallback must be parenthesised to take effect.
+    foreach my $attr (qw/body summary/) {
+        if (defined $args->{entry}->$attr && $args->{entry}->$attr->is_html) {
+            $context->log(debug => "Scrubbing $attr for " . ($args->{entry}->permalink || '(no-link)'));
+            my $content = $self->{scrubber}->scrub( $args->{entry}->$attr );
+            $args->{entry}->$attr($content);    # write the scrubbed text back onto the entry
+            $context->log(debug => "After: $attr ||$content||\n");
+        }
+    }
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Plagger::Plugin::Filter::HTMLScrubber - Scrub feed content
+
+=head1 SYNOPSIS
+
+  - module: Filter::HTMLScrubber
+    config:
+      rules:
+        style: 0
+        script: 0
+
+=head1 DESCRIPTION
+
+This plugin scrubs feed content using L<HTML::Scrubber>.
+
+All config parameters (except 'no_default_configs') are implemented as
+HTML::Scrubber's method: value.  For example, if you write:
+
+    method: value
+
+in the config: section, this plugin will automatically turn the config
+into the method call:
+
+    $scrubber->method('value');
+
+See L<HTML::Scrubber> document for details.
+
+=head1 CONFIG
+
+=over 4
+
+=item no_default_configs
+
+Some rules and default config parameters are set by default. See I<rules>
+and I<default> methods defined in this module code for details.
+
+If you don't need these settings, use C<no_default_configs>
+
+   no_default_configs: 1
+
+Defaults to 0, which means it uses the default (somewhat secure) config.
+
+=back
+
+=head1 AUTHOR
+
+Daisuke Murase <typester at cpan.org>
+
+Tatsuhiko Miyagawa
+
+=head1 SEE ALSO
+
+L<Plagger>, L<HTML::Scrubber>
+
+=cut




More information about the Bast-commits mailing list