[Bast-commits] r8160 - in ironman: IronMan-Web/script
plagger/lib/IronMan/Schema/Result
iain at dev.catalyst.perl.org
iain at dev.catalyst.perl.org
Wed Dec 23 21:10:38 GMT 2009
Author: iain
Date: 2009-12-23 21:10:38 +0000 (Wed, 23 Dec 2009)
New Revision: 8160
Modified:
ironman/IronMan-Web/script/import_csv.pl
ironman/plagger/lib/IronMan/Schema/Result/Feed.pm
Log:
Working nicely now: just put your paths in and run. Look at problems.txt for any problems.
Modified: ironman/IronMan-Web/script/import_csv.pl
===================================================================
--- ironman/IronMan-Web/script/import_csv.pl 2009-12-23 06:16:29 UTC (rev 8159)
+++ ironman/IronMan-Web/script/import_csv.pl 2009-12-23 21:10:38 UTC (rev 8160)
@@ -8,107 +8,146 @@
this script is a one hit import from the "old" csv file to the new feed db
=cut
+use Data::Dumper;
use IronMan::Schema;
use Text::CSV_XS;
use DateTime::Format::HTTP;
-my @files = </data/Projects/ironman/plagger/resource/csv/*>;
+my @files = </data/Projects/ironman/csv/*>;
+my $schema = IronMan::Schema->connect('dbi:SQLite:/data/Projects/ironman/subscriptions.db');
print scalar(@files),"\n";
binmode(STDOUT, ":utf8");
-my $schema = IronMan::Schema->connect('dbi:SQLite:/data/Projects/ironman/subscriptions.db');
+open(PROBLEMS, '>', 'problems.txt') || die "cant open problems file $!";
-foreach my $file (@files) {
+FILES: foreach my $file (@files) {
+ # make sure we have a valid filename and trim it to something we can feed into a DB query
if($file =~ /my_(.*?)\.csv$/) {
my $file_clean = $1;
+ print "Processing file $file\n";
- #print $file, "\n";
-
- my @file_parts = split('_', $file_clean);
-
- #print "looking through ".$#file_parts." parts @file_parts\n";
-
- my $search_str;
- for(my $i=0;$i <= $#file_parts;$i++) {
- next unless(length($file_parts[$i]));
- $search_str .= $file_parts[$i].'_';
- }
-
- chop $search_str;
-
my $poster;
my $feeds_rs = $schema->resultset('Feed')->search({ title => { like => $file_clean } });
+ # some problem with count, cant remember what though
my @rows = $feeds_rs->all;
- #print "searching for $search_str\n";
+
if(scalar(@rows) > 0) {
- if(scalar(@rows) == 1) {
- $poster = $feeds_rs->first();
- }
- else {
- $poster = $feeds_rs->first();
+ $poster = shift @rows;
+
+ if(scalar(@rows) > 1) {
+ # I dont think this happens anymore but there is no harm leaving it in.
# if we have multiple posts it may be the same author on different blogs
# loop though and check they are the same
- while(my $compare = $feeds_rs->next) {
+ while(my $compare = shift @rows) {
unless($compare->title eq $poster->title) {
- die "feck cant find poster for $file\n";
+ print PROBLEMS "Unable to determine author in file: $file\n";
+ next FILES;
}
}
}
}
else {
-
- #TODO - log these
- print "$search_str nothing found for $file\n";
- next;
+ print PROBLEMS "No feed entry found for file: $file\n";
+ next;
}
- print "found: ".$poster->title,"\n";
+ print "\tfound author: ".$poster->title,"\n";
open my $fh, "<:encoding(utf8)", $file or die "$file: $!";
my $csv = Text::CSV_XS->new ({ binary => 1 }) or die "Cannot use CSV: ".Text::CSV->error_diag ();
+ my %posts;
+ my $posts_rs = $schema->resultset('Post');
+
while (my $row = $csv->getline ($fh)) {
+
+ if($row->[1] eq 'We Are Iron Man') {
+ $schema->storage->debug(1);
+ }
- my $new_post;
- eval {
- $new_post = $poster->create_related('posts', {
- title => $row->[1],
- url => $row->[2],
- posted_on => DateTime::Format::HTTP->parse_datetime($row->[3]),
- body => '',
- });
-
- print "new post added ".$new_post->id;
- };
+ unless($row->[3]) {
+ print PROBLEMS "no date found for post:$row->[1], file:$file\n";
+ next;
+ }
- if($@ =~ /column url is not unique/) {
+ my $new_date = DateTime::Format::HTTP->parse_datetime($row->[3]);
+
+ my $post = $posts_rs->find({url => $row->[2]});
+
+ # initialise the most recent post date
+ if($post && !exists($posts{$row->[2]})) {
+ $posts{$row->[2]} = $post->posted_on();
+ }
+
+ if($post) {
+ print "\tDupe Found ".$post->id.",".$post->title."\n";
+
+ # if this is a newer post update the details
+ if($new_date > $posts{$row->[2]}) {
+ #the post is more recent update all bar the date
+ print "\t\tdupe more recent $new_date\n";
+ $post->update({
+ title => $row->[1],
+ url => $row->[2],
+ });
+
+ # track the most recent post date
+ $posts{$row->[2]} = $post->posted_on();
+ }
+ elsif($new_date < $post->posted_on()) {
+ # post is older update the posted_on
+ print "\t\tdupe older $new_date\n";
+ $post->update({
+ posted_on => $new_date,
+ });
+ }
+ else {
+ print "\t\tDate the same, dont do anything\n";
+ }
+ }
+ else {
+ # this is a new post
+ eval {
+ $post = $poster->create_related('posts',
+ {
+ title => $row->[1],
+ url => $row->[2],
+ author => $row->[0],
+ tags => '',
+ posted_on => $new_date,
+ body => '',
+ }
+ );
+ };
- my $post = $schema->resultset('Post')->find({url => $row->[2]});
- if($post) {
- print "Dupe Found ".$post->id."\n";
-
- my $new_date = DateTime::Format::HTTP->parse_datetime($row->[3]);
-
- # if the new post is more recent update all bar the date
- if($new_date > $post->posted_on) {
- $post->update({
+ if($@) {
+ print PROBLEMS "Unable to create new post row, probably a dupe\n";
+ print PROBLEMS Dumper({
title => $row->[1],
url => $row->[2],
+ author => $row->[0],
+ tags => '',
+ posted_on => $new_date->datetime(),
+ body => '',
});
- }
+ if($row->[1] eq 'We Are Iron Man') {
+ die;
+ }
+ next;
}
- else {
- die "post not found $row->[2]\n";
- }
+
+ print "\tnew post added ".$post->id.",".$post->title."\n";
+
+ # track the most recent post date
+ $posts{$row->[2]} = $post->posted_on();
+
}
- else {
- die "unknown $@\n";
- }
+
}
$csv->eof or $csv->error_diag ();
Modified: ironman/plagger/lib/IronMan/Schema/Result/Feed.pm
===================================================================
--- ironman/plagger/lib/IronMan/Schema/Result/Feed.pm 2009-12-23 06:16:29 UTC (rev 8159)
+++ ironman/plagger/lib/IronMan/Schema/Result/Feed.pm 2009-12-23 21:10:38 UTC (rev 8160)
@@ -31,6 +31,7 @@
);
__PACKAGE__->set_primary_key(qw/id/);
__PACKAGE__->add_unique_constraint(url => ['url']);
+__PACKAGE__->has_many('posts' => 'IronMan::Schema::Result::Post', 'feed_id');
__PACKAGE__->has_many('posts', 'IronMan::Schema::Result::Post', 'feed_id');
More information about the Bast-commits
mailing list