[Bast-commits] r8160 - in ironman: IronMan-Web/script plagger/lib/IronMan/Schema/Result

Wed Dec 23 21:10:38 GMT 2009

Author: iain
Date: 2009-12-23 21:10:38 +0000 (Wed, 23 Dec 2009)
New Revision: 8160

Modified:
   ironman/IronMan-Web/script/import_csv.pl
   ironman/plagger/lib/IronMan/Schema/Result/Feed.pm
Log:
Working nicely now: just put your own paths in and run it. Check problems.txt for any problems encountered during the import.

Modified: ironman/IronMan-Web/script/import_csv.pl
===================================================================

--- ironman/IronMan-Web/script/import_csv.pl	2009-12-23 06:16:29 UTC (rev 8159)
+++ ironman/IronMan-Web/script/import_csv.pl	2009-12-23 21:10:38 UTC (rev 8160)
@@ -8,107 +8,146 @@
 this script is a one hit import from the "old" csv file to the new feed db
 
 =cut
+use Data::Dumper;
 
 use IronMan::Schema;
 use Text::CSV_XS;
 use DateTime::Format::HTTP;
 
-my @files = </data/Projects/ironman/plagger/resource/csv/*>;
+my @files = </data/Projects/ironman/csv/*>;
+my $schema = IronMan::Schema->connect('dbi:SQLite:/data/Projects/ironman/subscriptions.db');
 
 print scalar(@files),"\n";
 
 binmode(STDOUT, ":utf8");
 
-my $schema = IronMan::Schema->connect('dbi:SQLite:/data/Projects/ironman/subscriptions.db');
+open(PROBLEMS, '>', 'problems.txt') || die "cant open problems file $!";
 
-foreach my $file (@files) {   
+FILES: foreach my $file (@files) {   
 
+    # make sure we have a valid filename and trim it to something we can feed into a DB query
     if($file =~ /my_(.*?)\.csv$/) {
         my $file_clean = $1;
+        print "Processing file $file\n";        
         
-        #print $file, "\n";
-        
-        my @file_parts = split('_', $file_clean);
-       
-        #print "looking through ".$#file_parts." parts @file_parts\n";
-        
-        my $search_str;
-        for(my $i=0;$i <= $#file_parts;$i++) {
-            next unless(length($file_parts[$i]));
-            $search_str .= $file_parts[$i].'_';
-        }
-        
-        chop $search_str;
-        
         my $poster;
         my $feeds_rs = $schema->resultset('Feed')->search({ title => { like => $file_clean } });
         
+        # some problem with count, cant remember what though
         my @rows = $feeds_rs->all;
-        #print "searching for $search_str\n";
+
         if(scalar(@rows) > 0) {
-            if(scalar(@rows) == 1) {
-                $poster = $feeds_rs->first();
-            } 
-            else {
-                $poster = $feeds_rs->first();
+            $poster = shift @rows;
+            
+            if(scalar(@rows) > 1) {
+                # I dont think this happens anymore but there is no harm leaving it in.
                 # if we have multiple posts it may be the same author on different blogs
                 # loop though and check they are the same
-                while(my $compare = $feeds_rs->next) {
+                while(my $compare = shift @rows) {
                     unless($compare->title eq $poster->title) {
-                        die "feck cant find poster for $file\n";        
+                        print PROBLEMS "Unable to determine author in file: $file\n";
+                        next FILES;
                     }
                 }
             }
         }
         else {
-            
-            #TODO - log these
-            print "$search_str nothing found for $file\n";
-            next;    
+            print PROBLEMS "No feed entry found for file: $file\n";
+            next;
         }
         
-        print "found: ".$poster->title,"\n";
+        print "\tfound author: ".$poster->title,"\n";
 
         open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
         my $csv = Text::CSV_XS->new ({ binary => 1 }) or die "Cannot use CSV: ".Text::CSV->error_diag ();
         
+        my %posts;
+        my $posts_rs = $schema->resultset('Post');
+        
         while (my $row = $csv->getline ($fh)) {
+
+            if($row->[1] eq 'We Are Iron Man') {
+                $schema->storage->debug(1);
+            }
             
-            my $new_post;
-            eval {
-                $new_post = $poster->create_related('posts', {
-                    title     => $row->[1],
-                    url       => $row->[2],
-                    posted_on => DateTime::Format::HTTP->parse_datetime($row->[3]),
-                    body      => '',
-                });
-                
-                print "new post added ".$new_post->id;                
-            };
+            unless($row->[3]) {
+                print PROBLEMS "no date found for post:$row->[1], file:$file\n";
+                next;       
+            } 
             
-            if($@ =~ /column url is not unique/) {
+            my $new_date = DateTime::Format::HTTP->parse_datetime($row->[3]);
+            
+            my $post = $posts_rs->find({url => $row->[2]});
+            
+            # initialise the most recent post date
+            if($post && !exists($posts{$row->[2]})) {
+                $posts{$row->[2]} = $post->posted_on();
+            }            
+            
+            if($post) {
+                print "\tDupe Found ".$post->id.",".$post->title."\n";
+
+                # if this is a newer post update the details
+                if($new_date > $posts{$row->[2]}) {
+                    #the post is more recent update all bar the date
+                    print "\t\tdupe more recent $new_date\n";
+                    $post->update({
+                        title     => $row->[1],
+                        url       => $row->[2],
+                    });
+                     
+                    # track the most recent post date
+                    $posts{$row->[2]} = $post->posted_on();
+                }
+                elsif($new_date < $post->posted_on()) {
+                    # post is older update the posted_on
+                    print "\t\tdupe older $new_date\n";
+                    $post->update({
+                        posted_on => $new_date,
+                    }); 
+                }
+                else {
+                    print "\t\tDate the same, dont do anything\n";    
+                }
+            }
+            else {
+                # this is a new post
+                eval {
+                    $post = $poster->create_related('posts',
+                        {
+                            title     => $row->[1],
+                            url       => $row->[2],
+                            author    => $row->[0],
+                            tags      => '',
+                            posted_on => $new_date,
+                            body      => '',
+                        }
+                    );
+                };
                 
-                my $post = $schema->resultset('Post')->find({url => $row->[2]});
-                if($post) {
-                    print "Dupe Found ".$post->id."\n";
-                       
-                    my $new_date =  DateTime::Format::HTTP->parse_datetime($row->[3]);
-                    
-                    # if the new post is more recent update all bar the date
-                    if($new_date > $post->posted_on) {
-                        $post->update({
+                if($@) {
+                    print PROBLEMS "Unable to create new post row, probably a dupe\n";
+                    print PROBLEMS Dumper({
                             title     => $row->[1],
                             url       => $row->[2],
+                            author    => $row->[0],
+                            tags      => '',
+                            posted_on => $new_date->datetime(),
+                            body      => '',
                         });
-                    }
+            if($row->[1] eq 'We Are Iron Man') {
+                die;    
+            }
+                        next;    
                 }
-                else {
-                    die "post not found $row->[2]\n";
-                }
+                
+                print "\tnew post added ".$post->id.",".$post->title."\n";
+                
+                # track the most recent post date
+                $posts{$row->[2]} = $post->posted_on();
+                    
             }
-            else {
-                die "unknown $@\n";    
-            }
+           
         }
         
         $csv->eof or $csv->error_diag ();

Modified: ironman/plagger/lib/IronMan/Schema/Result/Feed.pm
===================================================================
--- ironman/plagger/lib/IronMan/Schema/Result/Feed.pm	2009-12-23 06:16:29 UTC (rev 8159)
+++ ironman/plagger/lib/IronMan/Schema/Result/Feed.pm	2009-12-23 21:10:38 UTC (rev 8160)
@@ -31,6 +31,7 @@
     );
 __PACKAGE__->set_primary_key(qw/id/);
 __PACKAGE__->add_unique_constraint(url => ['url']);
+__PACKAGE__->has_many('posts' => 'IronMan::Schema::Result::Post', 'feed_id');
 
 __PACKAGE__->has_many('posts', 'IronMan::Schema::Result::Post', 'feed_id');