[Bast-commits] r8832 - ironman/Perlanet-IronMan/lib/Perlanet

idn at dev.catalyst.perl.org idn at dev.catalyst.perl.org
Sun Feb 28 21:06:21 GMT 2010


Author: idn
Date: 2010-02-28 21:06:21 +0000 (Sun, 28 Feb 2010)
New Revision: 8832

Modified:
   ironman/Perlanet-IronMan/lib/Perlanet/IronMan.pm
Log:
Latest round of commits.  Large number of fixes.  Correct use of Perlanet::Entry object out of select_entries means that the feed object is correctly stored within the entry object (or a ref to it at any rate).  Comments added and boilerplate pod.  Lots of other stuff that I can't remember

Modified: ironman/Perlanet-IronMan/lib/Perlanet/IronMan.pm
===================================================================
--- ironman/Perlanet-IronMan/lib/Perlanet/IronMan.pm	2010-02-28 04:49:17 UTC (rev 8831)
+++ ironman/Perlanet-IronMan/lib/Perlanet/IronMan.pm	2010-02-28 21:06:21 UTC (rev 8832)
@@ -1,6 +1,10 @@
 package Perlanet::IronMan;
+
+use 5.8.0;
+use strict;
+use warnings;
+
 use Moose;
-
 use IronMan::Schema;
 use Perlanet::Feed;
 use Data::Dumper;
@@ -10,97 +14,107 @@
 extends 'Perlanet';
 use Perlanet::Entry;
 
+our $VERSION = '0.01_01';
+
+=head1 NAME
+
+Perlanet::IronMan
+
+This module extends Perlanet for the specific requirements of the Enlightened
+Perl Organisation IronMan project.
+
+=head1 SYNOPSIS
+
+
+
+=head1 DESCRIPTION
+
+This module uses an IronMan::Schema database to define feeds, collect the feeds
+and then store them back into the IronMan::Schema database.
+
+=cut
+
+# This is some kind of Moose magic that I don't understand....  I think this
+# means tha schema attribute of this object is built using the _build_schema
+# method when the schema attribute is first used
+
 has 'schema' => (
     is => 'rw',
     lazy_build => 1,
 );
 
-sub _build_schema {
-    my $self = shift;
-    return IronMan::Schema->connect(
-        $self->cfg->{db}{dsn},
-        $self->cfg->{db}{username},
-        $self->cfg->{db}{password},
-    );
-}
-
-override '_build_feeds' => sub {
-    my $self = shift;
-    return [ map {
-        Perlanet::Feed->new(
-            id => $_->id,
-            url => $_->url,
-            website => $_->link || $_->url,
-            title => $_->title,
-            author => $_->owner,
-        );
-    } $self->schema->resultset('Feed')->all ];
-};
-
 =head2 select_entries
 
-The select entries function takes an array of feed entries and filters it to
-remove duplicates.  The non-duplicated feed entries are then returned to the
-caller as an array of feed entries.
+The select entries function takes an array of Perlanet::Feed objects and
+filters it to remove duplicates.  The non-duplicated feed entries are then
+returned to the caller as an array of Perlanet::Entry objects.
 
-Okay, that's what I thought this function does, but it doesn't seem to quite be
-that straight forward.  The objects passed in and the objects returned are not
-the same.
+    my $perlanet_entries = select_entries( @{ $perlanet_feeds });
 
 =cut
 
 override 'select_entries' => sub {
     my ($self, @feeds) = @_;
 
-    my @entries;
+    # Perlanet::Feed objects to return
+    my @feed_entries;
+
+    # Iterate over the feeds working on them.
     for my $feed (@feeds) {
 
-        #print(Dumper($feed));
+        # Fetch the XML::Feed:Entry objects from the Perlanet::Feed object
+        my @entries = $feed->_xml_feed->entries;
 
-        my $unseen = $self->schema->resultset('Feed')->filter_unseen($feed->url, [ $feed->_xml_feed->entries ]);
+        # Iterate over the XML::Feed::Entry objects
+        foreach my $xml_entry (@entries) {
 
-        push @entries, @$unseen;
-    }
+            # Problem with XML::Feed's conversion of RSS to Atom
+            if ($xml_entry->issued && ! $xml_entry->modified) {
+              $xml_entry->modified($xml_entry->issued);
+            }
 
-    return @entries;
-};
+            #print(Dumper($xml_entry));
 
-override 'save' => sub { };
+            # Always set category to something
+            unless(defined($xml_entry->category)) {
+                $xml_entry->category('');
+            }
 
-override 'render' => sub {
-    my ($self, $feed, $post) = @_;
+            # Filter on keywords.  This fails for HTML encoded languages.
+            # See http://onperl.ru/onperl/atom.xml for examples
+            # specifically http://onperl.ru/onperl/2010/02/post.html
+            # FIXME
+            unless($self->_filter_entry_on_keywords($xml_entry)) {
+                next;
+            }
 
-#    my $new = [
-#        map {
-#
-#
-#{
-#            feed_id          => 'davorg',
-#            author           => 'Dave',
-#            tags             => '',
-#            url              => $_->link,
-#            title            => $_->title,
-#            posted_on        => $_->issued || DateTime->from_epoch(epoch => 0),
-#            summary          => $_->summary->body,
-#            summary_filtered => $self->clean($_->summary->body),
-#            body             => $_->content->body,
-#            body_filtered    => $self->clean($_->content->body),
-#        } 
+            # De-duplicate
+            unless($self->_filter_entry_for_duplicate($xml_entry)) {
+                next;
+            }
 
-    #map($self->_db_insert($_), @{ $post->entries });
+            # Create a Perlanet::Entry object from the XML data retrieved           
+            my $entry = Perlanet::Entry->new(
+                            _entry => $xml_entry,
+                            feed   => $feed
+            );
 
-    $self->_db_insert($feed, $post);
+            push @feed_entries, $entry;
+        }
+   }
 
-    #print(Dumper($new));
-
-    #$self->schema->resultset('Post')->populate($new);
+    return @feed_entries;
 };
 
+=head2 render
 
+Given a Perlanet::Entry object, store the entry as a post in the
+Schema::IronMan database
 
-sub _db_insert {
+=cut
+
+override 'render' => sub {
     my $self = shift;
-    my $feed = shift;
     my $post = shift;
 
     my $posts = $post->entries;
@@ -110,52 +124,183 @@
 
     foreach my $post (@{$posts}) {
 
-        # Check we've got tags on this post
-        unless(defined($post->category)) {
-            next;
-        }
+        # Set the summary text if not explicity specified (250 characters)
+        # This should probably be in config rather than hard coded.
+        my $summary = $post->_entry->summary->body || substr($post->_entry->content->body, 0, 250) . "...";
 
-	my @tags = $post->category;
+        #print(Dumper($post));
+        #exit;
 
-        # Check that ironman or perl is one of the tags or in the body
-        unless(grep(/ironman/i, @tags) ||
-               grep(/perl/i,    @tags) ||
-               $post->content->body =~ /ironman/i ||
-               $post->content->body =~ /perl/i
-            ) {
+        # Can't store a post if we can't work out the URL to link to it.
+        unless(defined($post->_entry->link)) {
+            print("ERROR.  Can't deal with lack of URL returned from XML::Feed::Entry for feed '" . $post->feed->url . "'\n");
             next;
         }
 
-        # Set the summary text if not explicity specified
-        my $summary = $post->summary->body || substr($post->content->body, 0, 250) . "...";
-
-        #print(Dumper($post));
-        #exit;
-
         try {
-
+            # Do that whole insert thing...
             $self->schema->resultset('Post')->populate( [ {
-                feed_id          => $feed->id,
-                author           => $post->author,
-                tags             => @tags,
-                url              => $post->link,
-                title            => $post->title,
-                posted_on        => $post->issued || DateTime->from_epoch(epoch => 0),
+                feed_id          => $post->feed->id,
+                author           => $post->_entry->author,
+                tags             => $post->_entry->category,
+                url              => $post->_entry->link,
+                title            => $post->_entry->title,
+                posted_on        => $post->_entry->issued || DateTime->from_epoch(epoch => 0),
                 summary          => $summary,
                 summary_filtered => $self->clean($summary),
-                body             => $post->content->body,
-                body_filtered    => $self->clean($post->content->body),
+                body             => $post->_entry->content->body,
+                body_filtered    => $self->clean($post->_entry->content->body),
             } ] );
         }
 
         catch {
-            Carp::cluck("ERROR: $!\n");
-            Carp::cluck("ERROR: Link URL is '" . $post->link . "'\n");
+            Carp::cluck("ERROR: $_\n");
             Carp::cluck("ERROR: Post is:\n" . Dumper($post) . "\n");
+            Carp::cluck("ERROR: Link URL is '" . $post->_entry->link . "'\n");
         };
+
+    #print(Dumper($post));
+
     } 
+};
+
+=head2 save
+
+Save is not required within the context of this module so we override it.
+
+=cut
+
+override 'save' => sub { };
+
+=head2 _build_feeds
+
+Feeds are built from the Schema::IronMan database overriding the internal
+defaults of utilising feeds specified in either the configuration file or as
+configuration options when creating the Perlanet object.
+
+=cut
+
+override '_build_feeds' => sub {
+    my $self = shift;
+    return [ map {
+        Perlanet::Feed->new(
+            id => $_->id,
+            url => $_->url || $_->link,
+            website => $_->link || $_->url,
+            title => $_->title,
+            author => $_->owner,
+        );
+    } $self->schema->resultset('Feed')->all ];
+};
+
+=head2 _build_schema
+
+Build and return a schema object the first time that the schema attribute of
+this object is accessed.
+
+=cut
+
+sub _build_schema {
+    my $self = shift;
+    return IronMan::Schema->connect(
+        $self->cfg->{db}{dsn},
+        $self->cfg->{db}{username},
+        $self->cfg->{db}{password},
+    );
 }
 
+=head2 _filter_entry_for_duplicate
+
+Test to see if the supplied XML::Feed::Entry passes the configured filters.
+
+Return 1 for a good entry and 0 for a bad entry.
+
+=cut
+
+sub _filter_entry_for_duplicate {
+    my $self = shift;
+    my $xml_entry = shift;
+
+    my $count = $self->schema->resultset('Post')->search(
+        { url => $xml_entry->link }
+    )->count;
+
+    if($count > 0) {
+        #print("Duplicate post found for url '" . $xml_entry->link . "'\n");
+        return 0;
+    }
+
+    #print("Post at url '" . $xml_entry->link . "' is new\n");
+
+    return 1;
+}
+
+=head2 _filter_entry_on_keywords
+
+Test to see if the supplied XML::Feed::Entry passes the configured filters.
+
+Return 1 for a good entry and 0 for a bad entry.
+
+THIS FUNCTION IS BROKEN.  SEE THE NOTES AT IT'S CALL.
+
+=cut
+
+sub _filter_entry_on_keywords {
+    my $self = shift;
+    my $xml_entry = shift;
+
+    # print("Called to keyword filter url '" . $xml_entry->link . "'\n");
+
+    # If no filter is defined, then we pass.
+    unless(defined($self->{cfg}->{filter}->{keywords})) {
+        print("No filter defined so skipping this check.\n");
+        return 1;
+    }
+
+    my $filters = $self->{cfg}->{filter}->{keywords};
+
+    # Iterate through the defined filters checking them
+    foreach my $filter (@ { $filters } ) {
+
+        #print("Checking for filter '$filter'\n");
+
+        # If tags have been defined, check them.
+        if(defined($xml_entry->tags)) {
+            if(grep(/$filter/i, $xml_entry->tags)) {
+                return 1;
+            }
+        }
+ 
+        # Check the body
+        if($xml_entry->content->body =~ m/$filter/i) {
+            return 1;
+        }
+    }
+
+    # We got to the end and we didn't get a match.  Fail.
+    return 0;
+}
+
+
+=head1 AUTHOR
+
+Oliver Charles (aCiD2) <oliver.g.charles at googlemail.com>
+Matt Troutt (mst), <mst at shadowcat.co.uk>
+Ian Norton (idn), <i.d.norton at gmail.com>
+
+=head1 SEE ALSO
+
+IronMan::Schema
+Perlanet
+
+=head1 COPYRIGHT AND LICENSE
+
+This library is free software; you can redistribute it and/or modify
+it under the same terms as Perl itself, either Perl version 5.10.0 or,
+at your option, any later version of Perl 5 you may have available.
+
+=cut
+
 no Moose;
 __PACKAGE__->meta->make_immutable;
 1;




More information about the Bast-commits mailing list