[Bast-commits] r8099 - ironman/IronMan-Web/script
iain at dev.catalyst.perl.org
iain at dev.catalyst.perl.org
Sat Dec 12 23:06:16 GMT 2009
Author: iain
Date: 2009-12-12 23:06:16 +0000 (Sat, 12 Dec 2009)
New Revision: 8099
Added:
ironman/IronMan-Web/script/import_csv.pl
Log:
Basic import working; needs more work on dupe detection, specifically what to do when a newer entry is found (sort entries by date first).
Added: ironman/IronMan-Web/script/import_csv.pl
===================================================================
--- ironman/IronMan-Web/script/import_csv.pl (rev 0)
+++ ironman/IronMan-Web/script/import_csv.pl 2009-12-12 23:06:16 UTC (rev 8099)
@@ -0,0 +1,118 @@
+#!/usr/bin/perl
+use strict;
+use warnings;
+
+use utf8 ();
+
+=head1 import_csv.pl
+
+This script is a one hit import from the "old" csv files to the new feed db.
+
+Every file named C<my_NAME.csv> is matched against the Feed table with
+C<title LIKE NAME>; in a LIKE pattern each C<_> matches any single character.
+A file with no matching feed is reported and skipped. Several matching feeds
+are accepted only when they all carry the same title, else the script dies.
+
+Each csv row becomes a post of the first matching feed: title from column 1,
+url from column 2, date from column 3 (counting from 0), empty body. A row
+whose url is already stored is a dupe; when the row is dated later than the
+stored post the stored title is refreshed and the stored date is kept.
+
+=cut
+
+use IronMan::Schema;
+use Text::CSV_XS;
+use DateTime::Format::HTTP;
+
+my @files = </data/Projects/ironman/plagger/resource/csv/*>;
+
+print scalar(@files),"\n";
+
+binmode(STDOUT, ":utf8");
+
+my $schema = IronMan::Schema->connect('dbi:SQLite:/data/Projects/ironman/subscriptions.db');
+
+foreach my $file (@files) {
+
+    # Anything that is not my_<name>.csv is not an import file.
+    next unless $file =~ /my_(.*?)\.csv$/;
+    my $file_clean = $1;
+
+    # The name with empty parts dropped, i.e. runs of "_" collapsed to one.
+    # NOTE(review): only the message below uses this; the query still uses
+    # $file_clean. Confirm which of the two the search was meant to use.
+    my $search_str = join '_', grep { length } split('_', $file_clean);
+
+    my $feeds_rs = $schema->resultset('Feed')->search({ title => { like => $file_clean } });
+    my @rows = $feeds_rs->all;
+
+    unless (@rows) {
+
+        #TODO - log these
+        print "$search_str nothing found for $file\n";
+        next;
+    }
+
+    # if we have multiple feeds it may be the same author on different blogs
+    # loop through and check they are the same
+    my $poster = $rows[0];
+    foreach my $compare (@rows) {
+        next if $compare->title eq $poster->title;
+        die "feck cant find poster for $file\n";
+    }
+
+    print "found: ".$poster->title,"\n";
+
+    open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
+
+    # error_diag is called on Text::CSV_XS, the class that is actually loaded.
+    my $csv = Text::CSV_XS->new ({ binary => 1 })
+        or die "Cannot use CSV: ".Text::CSV_XS->error_diag ();
+
+    while (my $row = $csv->getline ($fh)) {
+
+        eval {
+            my $new_post = $poster->create_related('posts', {
+                title => $row->[1],
+                url => $row->[2],
+                posted_on => DateTime::Format::HTTP->parse_datetime($row->[3]),
+                body => '',
+            });
+
+            print "new post added ".$new_post->id."\n";
+        };
+
+        # Copy $@ straight away: it is a global that later code can clobber.
+        my $error = $@;
+
+        # An empty error means the insert worked, so move on to the next row.
+        # (Falling through to the die below would stop after the first post.)
+        next unless $error;
+
+        # Only a clash on the unique url column is expected here. Older SQLite
+        # says "column url is not unique"; newer releases word the same error
+        # as "UNIQUE constraint failed: <table>.url".
+        die "unknown $error\n"
+            unless $error =~ /column url is not unique|UNIQUE constraint failed: .*\burl\b/;
+
+        my $post = $schema->resultset('Post')->find({url => $row->[2]})
+            or die "post not found $row->[2]\n";
+        print "Dupe Found ".$post->id."\n";
+
+        my $new_date = DateTime::Format::HTTP->parse_datetime($row->[3]);
+
+        # if the new post is more recent update all bar the date
+        if($new_date > $post->posted_on) {
+            $post->update({
+                title => $row->[1],
+                url => $row->[2],
+            });
+        }
+    }
+
+    # getline also returns false on a parse error, so when the loop ended
+    # before end of file report what the parser stopped on.
+    $csv->eof or $csv->error_diag ();
+    close $fh;
+}
+
More information about the Bast-commits
mailing list