[Bast-commits] r8099 - ironman/IronMan-Web/script

Sat Dec 12 23:06:16 GMT 2009

Author: iain
Date: 2009-12-12 23:06:16 +0000 (Sat, 12 Dec 2009)
New Revision: 8099

Added:
   ironman/IronMan-Web/script/import_csv.pl
Log:
Basic import working; needs more work on dupe detection, specifically what to do when a newer entry is found (sort entries by date first).

Added: ironman/IronMan-Web/script/import_csv.pl
===================================================================

--- ironman/IronMan-Web/script/import_csv.pl	                        (rev 0)
+++ ironman/IronMan-Web/script/import_csv.pl	2009-12-12 23:06:16 UTC (rev 8099)
@@ -0,0 +1,118 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+use utf8 ();
+
+=head1 import_csv.pl
+
+this script is a one hit import from the "old" csv file to the new feed db
+
+For every my_<name>.csv file in the csv directory it looks up the feed whose
+title matches <name> and adds one post per csv row to that feed. Rows whose
+url already exists are treated as dupes: the stored title is refreshed when
+the csv row carries a more recent date.
+
+=cut
+
+use IronMan::Schema;
+use Text::CSV_XS;
+use DateTime::Format::HTTP;
+
+# every file in the old csv directory; the loop below filters them by name
+my @files = </data/Projects/ironman/plagger/resource/csv/*>;
+
+# report how many candidate files were found
+print scalar(@files),"\n";
+
+# feed titles and ids are printed below, so emit STDOUT as utf8
+binmode(STDOUT, ":utf8");
+
+my $schema = IronMan::Schema->connect('dbi:SQLite:/data/Projects/ironman/subscriptions.db');
+
+FILE:
+foreach my $file (@files) {
+
+    # only files named my_<name>.csv are imported
+    next FILE unless $file =~ /my_(.*?)\.csv$/;
+    my $file_clean = $1;
+
+    # the same name with empty parts dropped (a__b -> a_b); it is only used
+    # in the "nothing found" message below
+    my $search_str = join('_', grep { length } split('_', $file_clean));
+
+    # NOTE(review): the lookup uses $file_clean rather than $search_str, and
+    # '_' is a single-character wildcard in SQL LIKE -- confirm this is intended
+    my @feeds = $schema->resultset('Feed')->search({ title => { like => $file_clean } })->all;
+
+    unless (@feeds) {
+        #TODO - log these
+        print "$search_str nothing found for $file\n";
+        next FILE;
+    }
+
+    # if we have multiple feeds it may be the same author on different blogs;
+    # loop through and check they all carry the same title
+    my $poster = $feeds[0];
+    foreach my $compare (@feeds) {
+        unless ($compare->title eq $poster->title) {
+            die "feck cant find poster for $file\n";
+        }
+    }
+
+    print "found: ".$poster->title,"\n";
+
+    # the csv files are read as utf8 text
+    open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
+    my $csv = Text::CSV_XS->new ({ binary => 1 })
+        or die "Cannot use CSV: ".Text::CSV_XS->error_diag ();
+
+    # TODO (from the commit log): decide what to do when a newer entry is
+    # found; sort entries by date first
+    while (my $row = $csv->getline ($fh)) {
+
+        # columns 1..3 are used as title, url and date string;
+        # column 0 is not read by this script
+        my ($title, $url, $date) = @{$row}[1 .. 3];
+
+        my $new_post;
+        eval {
+            $new_post = $poster->create_related('posts', {
+                title     => $title,
+                url       => $url,
+                posted_on => DateTime::Format::HTTP->parse_datetime($date),
+                body      => '',
+            });
+
+            print "new post added ".$new_post->id."\n";
+        };
+        my $error = $@;    # copy straight away, later calls may clobber $@
+
+        # the insert worked: nothing more to do for this row
+        next unless $error;
+
+        # anything other than a duplicate url is fatal
+        die "unknown $error\n" unless $error =~ /column url is not unique/;
+
+        my $post = $schema->resultset('Post')->find({url => $url})
+            or die "post not found $url\n";
+
+        print "Dupe Found ".$post->id."\n";
+
+        my $new_date = DateTime::Format::HTTP->parse_datetime($date);
+
+        # if the new post is more recent update all bar the date; the url is
+        # what the dupe was found by, so only the title can differ
+        # NOTE(review): assumes posted_on comes back as a DateTime -- confirm
+        if ($new_date > $post->posted_on) {
+            $post->update({
+                title => $title,
+            });
+        }
+    }
+
+    # getline returns false at end of file and on a parse error; report the latter
+    $csv->eof or $csv->error_diag ();
+    close $fh;
+}
+