[Bast-commits] r9285 - in ironman/trunk/plagger/lib/Plagger/Plugin:
. Bundle Filter
castaway at dev.catalyst.perl.org
castaway at dev.catalyst.perl.org
Sun May 2 13:05:33 GMT 2010
Author: castaway
Date: 2010-05-02 14:05:33 +0100 (Sun, 02 May 2010)
New Revision: 9285
Added:
ironman/trunk/plagger/lib/Plagger/Plugin/Filter/
ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLScrubber.pm
ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLStripScripts.pm
ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLTruncate.pm
Modified:
ironman/trunk/plagger/lib/Plagger/Plugin/Bundle/PagedPlanet.pm
Log:
Committing ole plagger truncating/scrubbing code
Modified: ironman/trunk/plagger/lib/Plagger/Plugin/Bundle/PagedPlanet.pm
===================================================================
--- ironman/trunk/plagger/lib/Plagger/Plugin/Bundle/PagedPlanet.pm 2010-05-01 09:55:04 UTC (rev 9284)
+++ ironman/trunk/plagger/lib/Plagger/Plugin/Bundle/PagedPlanet.pm 2010-05-02 13:05:33 UTC (rev 9285)
@@ -14,6 +14,17 @@
}
}
+# Would like to dump raw data at this point, but whatever publish gets data from
+# has only a composite feed
+# $context->load_plugin({
+# module => 'Publish::Serializer',
+# config => {
+# dir => '/var/www/ironboy.enlightenedperl.org/raw',
+# # serializer => 'Config::General',
+# filename => '%i.conf',
+# }
+# });
+
$context->load_plugin({
module => 'Filter::StripTagsFromTitle',
});
@@ -25,10 +36,6 @@
});
}
- $context->load_plugin({
- module => 'Filter::HTMLScrubber',
- config => $self->conf->{scrubber} || {},
- });
$context->load_plugin({
module => 'Filter::GuessTimeZoneByDomain',
@@ -58,6 +65,43 @@
},
});
+ $context->load_plugin({
+ module => 'Filter::HTMLScrubber',
+ config => $self->conf->{scrubber} || {},
+ });
+
+ $context->load_plugin({
+ module => 'Publish::Serializer',
+ config => {
+ dir => '/var/www/ironboy.enlightenedperl.org/dump',
+ # serializer => 'Config::General',
+ filename => '%i.conf',
+ }
+ });
+
+# config:
+# dir: /var/www/ironboy.enlightenedperl.org/plagger/csv
+# encoding: utf-8
+# filename: my_%t.csv
+# mode: append
+# column:
+# - author
+# - title
+# - permalink
+# - date
+
+ $context->load_plugin({
+ module => 'Publish::CSV',
+ config => {
+ dir => '/var/www/ironboy.enlightenedperl.org/plagger/csv',
+ encoding => 'utf-8',
+ filename => 'my_%t.csv',
+ mode => 'append',
+ column => [ qw/author title permalink date/
+ ]
+ }
+ });
+
my $rule = {
expression => q{ $args->{feed}->id eq 'smartfeed:all' },
};
@@ -155,7 +199,7 @@
=item Filter::HTMLTidy (if HTML::Tidy is available)
-=item Filter::HTMLScrubber
+=item Filter::HTMLStripScripts
=item Filter::GuessTimeZoneByDomain
Added: ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLScrubber.pm
===================================================================
--- ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLScrubber.pm (rev 0)
+++ ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLScrubber.pm 2010-05-02 13:05:33 UTC (rev 9285)
@@ -0,0 +1,163 @@
+package Plagger::Plugin::Filter::HTMLScrubber;
+use strict;
+use base qw( Plagger::Plugin );
+
+use HTML::Scrubber;
+
+sub rules { # built-in per-tag rules; initialize() passes them to $scrubber->rules() unless no_default_configs is set
+ return(
+ img => { # attribute rules that apply to the img tag only
+ src => qr{^http://}, # only URL with http://
+ alt => 1, # alt attributes allowed
+ '*' => 0, # deny all others
+ },
+ style => 0, # style tag rule set to 0 (deny)
+ script => 0, # script tag rule set to 0 (deny)
+ );
+}
+
+sub default { # built-in attribute rules; initialize() merges them with config 'default' and hands them to $scrubber->default()
+ return(
+ '*' => 1, # default rule, allow all attributes
+ 'href' => qr{^(?!(?:java)?script)}i, # value must not begin with "script" or "javascript" (case-insensitive)
+ 'src' => qr{^(?!(?:java)?script)}i, # NOTE(review): superseded by the later 'src' => 0 once this list is stored in a hash
+ 'cite' => '(?i-xsm:^(?!(?:java)?script))', # same pattern, written as a plain string instead of qr//
+ 'language' => 0,
+ 'name' => 1, # could be sneaky, but hey ;)
+ 'onblur' => 0, # the on* event-handler attributes below are all set to 0 (deny)
+ 'onchange' => 0,
+ 'onclick' => 0,
+ 'ondblclick' => 0,
+ 'onerror' => 0,
+ 'onfocus' => 0,
+ 'onkeydown' => 0,
+ 'onkeypress' => 0,
+ 'onkeyup' => 0,
+ 'onload' => 0,
+ 'onmousedown' => 0,
+ 'onmousemove' => 0,
+ 'onmouseout' => 0,
+ 'onmouseover' => 0,
+ 'onmouseup' => 0,
+ 'onreset' => 0,
+ 'onselect' => 0,
+ 'onsubmit' => 0,
+ 'onunload' => 0,
+ 'src' => 0, # NOTE(review): duplicate key -- this value wins over the regex rule above; confirm that is intended
+ 'type' => 0,
+ 'style' => 0,
+ );
+}
+
+sub register { # hook registration for this plugin
+ my ( $self, $context ) = @_;
+
+ $context->register_hook(
+ $self,
+ 'update.entry.fixup' => \&update, # update() runs on the 'update.entry.fixup' hook
+ 'plugin.init' => \&initialize, # initialize() runs on the 'plugin.init' hook
+ );
+}
+
+sub initialize { # 'plugin.init' hook: build an HTML::Scrubber object from the plugin config and keep it in $self->{scrubber}
+ my($self, $context, $args) = @_;
+
+ $self->{scrubber} = do {
+ my $scrubber = HTML::Scrubber->new;
+ my $config = $self->conf;
+
+ my ( %rules, %default );
+ unless ( delete $config->{no_default_configs} ) { # the key is removed from the hash returned by $self->conf either way
+ %rules = $self->rules;
+ %default = $self->default;
+ }
+ $scrubber->rules( %rules, %{ delete $config->{rules} || {} } ); # built-in rules first, then config 'rules' (also removed from the config hash)
+ $scrubber->default(1, { %default, %{ delete $config->{default} || {} } }); # config 'default' keys come last in the hash, so they override built-ins
+
+ while ( my ( $method, $arg ) = each %$config ) { # every remaining config key is invoked as a method on the scrubber
+ eval {
+ $scrubber->$method(
+ ref $arg eq 'ARRAY' ? @$arg
+ : ref $arg eq 'HASH' ? %$arg
+ : $arg ); # array and hash refs are flattened into the argument list
+ };
+ $context->error(qq/Invalid method call "$method": $@/) if $@; # a failed call is reported through $context->error
+ }
+
+ $scrubber;
+ };
+}
+
+sub update { # 'update.entry.fixup' hook: scrub the HTML body and summary of $args->{entry} in place
+ my ( $self, $context, $args ) = @_;
+
+ foreach my $attr (qw/body summary/) {
+ if (defined $args->{entry}->$attr && $args->{entry}->$attr->is_html) { # skip fields that are missing or not HTML
+ $context->log(debug => "Scrubbing $attr for " . ($args->{entry}->permalink || '(no-link)')); # parens needed: '.' binds tighter than '||'
+ my $content = $self->{scrubber}->scrub( $args->{entry}->$attr );
+ $args->{entry}->$attr($content); # store the scrubbed text back on the entry
+ $context->log(debug => "After: $attr ||$content||\n");
+ }
+ }
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Plagger::Plugin::Filter::HTMLScrubber - Scrub feed content
+
+=head1 SYNOPSIS
+
+ - module: Filter::HTMLScrubber
+ config:
+ rules:
+ style: 0
+ script: 0
+
+=head1 DESCRIPTION
+
+This plugin scrubs feed content using L<HTML::Scrubber>.
+
+All config parameters (except 'no_default_configs') are implemented as
+HTML::Scrubber's method: value. For example, if you write:
+
+ method: value
+
+in the config: section, this plugin will automatically turn the config
+into the method call:
+
+ $scrubber->method('value');
+
+See L<HTML::Scrubber> document for details.
+
+=head1 CONFIG
+
+=over 4
+
+=item no_default_configs
+
+Some rules and default config parameters are set by default. See I<rules>
+and I<default> methods defined in this module code for details.
+
+If you don't need these settings, use C<no_default_configs>
+
+ no_default_configs: 1
+
+Defaults to 0, which means it uses the default (somewhat secure) config.
+
+=back
+
+=head1 AUTHOR
+
+Daisuke Murase <typester at cpan.org>
+
+Tatsuhiko Miyagawa
+
+=head1 SEE ALSO
+
+L<Plagger>, L<HTML::Scrubber>
+
+=cut
Added: ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLStripScripts.pm
===================================================================
--- ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLStripScripts.pm (rev 0)
+++ ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLStripScripts.pm 2010-05-02 13:05:33 UTC (rev 9285)
@@ -0,0 +1,175 @@
+package Plagger::Plugin::Filter::HTMLStripScripts;
+use strict;
+use base qw( Plagger::Plugin );
+
+use HTML::StripScripts::Parser;
+
+sub rules { # built-in rule set; initialize() passes it as the 'Rules' option of HTML::StripScripts::Parser
+ return(
+ pre => 1,
+ code => 1,
+# div => 1,
+ span => 1,
+ br => 1,
+ p => 1,
+ blockquote => 1,
+ a => 1,
+ img => { # attribute rules that apply to the img tag only
+ src => qr{^http://}, # only URL with http://
+ alt => 1, # alt attributes allowed
+ '*' => 0, # deny all others
+ },
+ style => 0,
+ script => 0,
+ );
+}
+
+sub default { # attribute rule list; NOTE(review): initialize() copies it into %default but nothing in this module reads that hash
+ return(
+ '*' => 1, # default rule, allow all attributes
+ 'href' => qr{^(?!(?:java)?script)}i, # value must not begin with "script" or "javascript" (case-insensitive)
+ 'src' => qr{^(?!(?:java)?script)}i, # NOTE(review): superseded by the later 'src' => 0 once this list is stored in a hash
+ 'cite' => '(?i-xsm:^(?!(?:java)?script))', # same pattern, written as a plain string instead of qr//
+ 'language' => 0,
+ 'name' => 1, # could be sneaky, but hey ;)
+ 'onblur' => 0, # the on* event-handler attributes below are all set to 0
+ 'onchange' => 0,
+ 'onclick' => 0,
+ 'ondblclick' => 0,
+ 'onerror' => 0,
+ 'onfocus' => 0,
+ 'onkeydown' => 0,
+ 'onkeypress' => 0,
+ 'onkeyup' => 0,
+ 'onload' => 0,
+ 'onmousedown' => 0,
+ 'onmousemove' => 0,
+ 'onmouseout' => 0,
+ 'onmouseover' => 0,
+ 'onmouseup' => 0,
+ 'onreset' => 0,
+ 'onselect' => 0,
+ 'onsubmit' => 0,
+ 'onunload' => 0,
+ 'src' => 0, # NOTE(review): duplicate key -- this value wins over the regex rule above; confirm that is intended
+ 'type' => 0,
+ 'style' => 0,
+ );
+}
+
+sub register { # hook registration for this plugin
+ my ( $self, $context ) = @_;
+
+ $context->register_hook(
+ $self,
+ 'update.entry.fixup' => \&update, # update() runs on the 'update.entry.fixup' hook
+ 'plugin.init' => \&initialize, # initialize() runs on the 'plugin.init' hook
+ );
+}
+
+sub initialize { # 'plugin.init' hook: build an HTML::StripScripts::Parser and keep it in $self->{scrubber}
+ my($self, $context, $args) = @_;
+
+ $self->{scrubber} = do {
+ my $config = $self->conf;
+
+ my ( %rules, %default );
+ unless ( delete $config->{no_default_configs} ) { # the key is removed from the hash returned by $self->conf either way
+ %rules = $self->rules;
+ %default = $self->default; # NOTE(review): %default is filled here but never used below
+ }
+ my $scrubber = HTML::StripScripts::Parser->new(
+ {
+ Context => 'Flow',
+ AllowHref => 1,
+ AllowMailTo => 1,
+ BanList => ['link'],
+ Rules => \%rules, # only the built-in rules; NOTE(review): config 'rules' are not merged in here
+ });
+
+# Disabled: per-key method dispatch from the config onto the parser object.
+# while ( my ( $method, $arg ) = each %$config ) {
+# eval {
+# $scrubber->$method(
+# ref $arg eq 'ARRAY' ? @$arg
+# : ref $arg eq 'HASH' ? %$arg
+# : $arg );
+# };
+# $context->error(qq/Invalid method call "$method": $@/) if $@;
+# }
+
+ $scrubber;
+ };
+}
+
+sub update { # 'update.entry.fixup' hook: run the HTML body of $args->{entry} through filter_html and store the result
+ my ( $self, $context, $args ) = @_;
+
+ if (defined $args->{entry}->body && $args->{entry}->body->is_html) { # only an existing HTML body is filtered
+ $context->log(debug => "Stripping body for " . ($args->{entry}->permalink || '(no-link)')); # parens needed: '.' binds tighter than '||'
+ $context->log(debug => "Before: " . $args->{entry}->body);
+ my $body = $self->{scrubber}->filter_html( $args->{entry}->body );
+ $args->{entry}->body($body); # store the filtered text back on the entry
+ $context->log(debug => "After: " . $args->{entry}->body);
+ }
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Plagger::Plugin::Filter::HTMLStripScripts - Scrub feed content
+
+=head1 SYNOPSIS
+
+ - module: Filter::HTMLScrubber
+ config:
+ rules:
+ style: 0
+ script: 0
+
+=head1 DESCRIPTION
+
+This plugin scrubs feed content using L<HTML::Scrubber>.
+
+All config parameters (except 'no_default_configs') are implemented as
+HTML::Scrubber's method: value. For example, if you write:
+
+ method: value
+
+in the config: section, this plugin will automatically turn the config
+into the method call:
+
+ $scrubber->method('value');
+
+See L<HTML::Scrubber> document for details.
+
+=head1 CONFIG
+
+=over 4
+
+=item no_default_configs
+
+Some rules and default config parameters are set by default. See I<rules>
+and I<default> methods defined in this module code for details.
+
+If you don't need these settings, use C<no_default_configs>
+
+ no_default_configs: 1
+
+Defaults to 0, which means it uses the default (somewhat secure) config.
+
+=back
+
+=head1 AUTHOR
+
+Daisuke Murase <typester at cpan.org>
+
+Tatsuhiko Miyagawa
+
+=head1 SEE ALSO
+
+L<Plagger>, L<HTML::Scrubber>
+
+=cut
Added: ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLTruncate.pm
===================================================================
--- ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLTruncate.pm (rev 0)
+++ ironman/trunk/plagger/lib/Plagger/Plugin/Filter/HTMLTruncate.pm 2010-05-02 13:05:33 UTC (rev 9285)
@@ -0,0 +1,163 @@
+package Plagger::Plugin::Filter::HTMLScrubber;
+use strict;
+use base qw( Plagger::Plugin );
+
+use HTML::Scrubber;
+
+sub rules { # built-in per-tag rules; initialize() passes them to $scrubber->rules() unless no_default_configs is set
+ return(
+ img => { # attribute rules that apply to the img tag only
+ src => qr{^http://}, # only URL with http://
+ alt => 1, # alt attributes allowed
+ '*' => 0, # deny all others
+ },
+ style => 0, # style tag rule set to 0 (deny)
+ script => 0, # script tag rule set to 0 (deny)
+ );
+}
+
+sub default { # built-in attribute rules; initialize() merges them with config 'default' and hands them to $scrubber->default()
+ return(
+ '*' => 1, # default rule, allow all attributes
+ 'href' => qr{^(?!(?:java)?script)}i, # value must not begin with "script" or "javascript" (case-insensitive)
+ 'src' => qr{^(?!(?:java)?script)}i, # NOTE(review): superseded by the later 'src' => 0 once this list is stored in a hash
+ 'cite' => '(?i-xsm:^(?!(?:java)?script))', # same pattern, written as a plain string instead of qr//
+ 'language' => 0,
+ 'name' => 1, # could be sneaky, but hey ;)
+ 'onblur' => 0, # the on* event-handler attributes below are all set to 0 (deny)
+ 'onchange' => 0,
+ 'onclick' => 0,
+ 'ondblclick' => 0,
+ 'onerror' => 0,
+ 'onfocus' => 0,
+ 'onkeydown' => 0,
+ 'onkeypress' => 0,
+ 'onkeyup' => 0,
+ 'onload' => 0,
+ 'onmousedown' => 0,
+ 'onmousemove' => 0,
+ 'onmouseout' => 0,
+ 'onmouseover' => 0,
+ 'onmouseup' => 0,
+ 'onreset' => 0,
+ 'onselect' => 0,
+ 'onsubmit' => 0,
+ 'onunload' => 0,
+ 'src' => 0, # NOTE(review): duplicate key -- this value wins over the regex rule above; confirm that is intended
+ 'type' => 0,
+ 'style' => 0,
+ );
+}
+
+sub register { # hook registration for this plugin
+ my ( $self, $context ) = @_;
+
+ $context->register_hook(
+ $self,
+ 'update.entry.fixup' => \&update, # update() runs on the 'update.entry.fixup' hook
+ 'plugin.init' => \&initialize, # initialize() runs on the 'plugin.init' hook
+ );
+}
+
+sub initialize { # 'plugin.init' hook: build an HTML::Scrubber object from the plugin config and keep it in $self->{scrubber}
+ my($self, $context, $args) = @_;
+
+ $self->{scrubber} = do {
+ my $scrubber = HTML::Scrubber->new;
+ my $config = $self->conf;
+
+ my ( %rules, %default );
+ unless ( delete $config->{no_default_configs} ) { # the key is removed from the hash returned by $self->conf either way
+ %rules = $self->rules;
+ %default = $self->default;
+ }
+ $scrubber->rules( %rules, %{ delete $config->{rules} || {} } ); # built-in rules first, then config 'rules' (also removed from the config hash)
+ $scrubber->default(1, { %default, %{ delete $config->{default} || {} } }); # config 'default' keys come last in the hash, so they override built-ins
+
+ while ( my ( $method, $arg ) = each %$config ) { # every remaining config key is invoked as a method on the scrubber
+ eval {
+ $scrubber->$method(
+ ref $arg eq 'ARRAY' ? @$arg
+ : ref $arg eq 'HASH' ? %$arg
+ : $arg ); # array and hash refs are flattened into the argument list
+ };
+ $context->error(qq/Invalid method call "$method": $@/) if $@; # a failed call is reported through $context->error
+ }
+
+ $scrubber;
+ };
+}
+
+sub update { # 'update.entry.fixup' hook: scrub the HTML body and summary of $args->{entry} in place
+ my ( $self, $context, $args ) = @_;
+
+ foreach my $attr (qw/body summary/) {
+ if (defined $args->{entry}->$attr && $args->{entry}->$attr->is_html) { # skip fields that are missing or not HTML
+ $context->log(debug => "Scrubbing $attr for " . ($args->{entry}->permalink || '(no-link)')); # parens needed: '.' binds tighter than '||'
+ my $content = $self->{scrubber}->scrub( $args->{entry}->$attr );
+ $args->{entry}->$attr($content); # store the scrubbed text back on the entry
+ $context->log(debug => "After: $attr ||$content||\n");
+ }
+ }
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Plagger::Plugin::Filter::HTMLScrubber - Scrub feed content
+
+=head1 SYNOPSIS
+
+ - module: Filter::HTMLScrubber
+ config:
+ rules:
+ style: 0
+ script: 0
+
+=head1 DESCRIPTION
+
+This plugin scrubs feed content using L<HTML::Scrubber>.
+
+All config parameters (except 'no_default_configs') are implemented as
+HTML::Scrubber's method: value. For example, if you write:
+
+ method: value
+
+in the config: section, this plugin will automatically turn the config
+into the method call:
+
+ $scrubber->method('value');
+
+See L<HTML::Scrubber> document for details.
+
+=head1 CONFIG
+
+=over 4
+
+=item no_default_configs
+
+Some rules and default config parameters are set by default. See I<rules>
+and I<default> methods defined in this module code for details.
+
+If you don't need these settings, use C<no_default_configs>
+
+ no_default_configs: 1
+
+Defaults to 0, which means it uses the default (somewhat secure) config.
+
+=back
+
+=head1 AUTHOR
+
+Daisuke Murase <typester at cpan.org>
+
+Tatsuhiko Miyagawa
+
+=head1 SEE ALSO
+
+L<Plagger>, L<HTML::Scrubber>
+
+=cut
More information about the Bast-commits
mailing list