Stories
Slash Boxes
Comments
NOTE: use Perl; is on undef hiatus. You can read content, but you can't post it. More info will be forthcoming forthcomingly.

All the Perl that's Practical to Extract and Report

use Perl Log In

Log In

[ Create a new account ]

jdavidb (1361)

jdavidb
  (email not shown publicly)
http://voiceofjohn.blogspot.com/

J. David Blackstone has a Bachelor of Science in Computer Science and Engineering and nine years of experience at a wireless telecommunications company, where he learned Perl and never looked back. J. David has an advantage in that he works really hard, he has a passion for writing good software, and he knows many of the world's best Perl programmers.

Journal of jdavidb (1361)

Friday September 27, 2002
10:33 AM

use Perl; journal scrapers

[ #8055 ]

I posted an entry earlier with a little statistics gathering program I had that used WWW::UsePerl::Journal. That module's been updated, and I've cleaned up my changes slightly, so here are the patches again, plus two programs I have that use them. (I think I forgot one of the patches in my original post, anyway. You have to patch WWW::UsePerl::Journal::Entry as well.)

  • journalstat.perl

#!/usr/local/bin/perl5.8.0 -- # -*- perl -*-

# journalstat.perl -- for each use Perl; user named on the command line,
# print the average number of journal entries written per day, measured
# from the user's first entry up to the current time.
#
# Usage: journalstat.perl USER [USER ...]
#
# NOTE(review): assumes the WWW::UsePerl::Journal found via $ENV{HOME}
# returns entry dates as Time::Piece objects -- confirm against the
# installed module.

use warnings;
use strict;
use lib $ENV{HOME};
use Time::Piece;    # overrides localtime to return Time::Piece objects
use WWW::UsePerl::Journal;

foreach my $user (@ARGV)
{
    my $journal = WWW::UsePerl::Journal->new($user);

    my @entries = $journal->entryids();

    # Without any entries there is no first date to measure from, and
    # $entries[0] would be undef.
    unless (@entries)
    {
        warn "$user has no journal entries\n";
        next;
    }

    # Originally I took the date of the first and last entries, but
    # actually I want the current date as an endpoint.  (If you stop
    # posting, that means your average rate should gradually decrease
    # as time progresses.)
    my $firstdate = $journal->entry($entries[0])->date;
    my $numentries = scalar @entries;

    my $lastdate = localtime;
    my $interval = $lastdate - $firstdate;

    # A zero or negative span (entry dated "now" or in the future, e.g.
    # through clock or time zone skew) would mean dividing by zero or
    # reporting a negative rate.
    my $days = $interval->days;
    unless ($days > 0)
    {
        warn "$user: first entry is not in the past; cannot compute a rate\n";
        next;
    }

    my $per_day = $numentries / $days;

    print "$user has written $per_day entries per day\n";
}

  • journalmonths.perl

#!/usr/local/bin/perl5.8.0 -- # -*- perl -*-

# journalmonths.perl -- for each use Perl; user named on the command line,
# print how many journal entries were written in each month, one
# "YYYYMM:<TAB>count" line per month, in chronological order.
#
# Usage: journalmonths.perl USER [USER ...]
#
# NOTE(review): assumes each entry's date method returns an object with
# mon and year methods (e.g. Time::Piece) -- confirm against the
# installed WWW::UsePerl::Journal::Entry.

use warnings;
use strict;
use lib $ENV{HOME};
use WWW::UsePerl::Journal;

foreach my $user (@ARGV)
{
    my $journal = WWW::UsePerl::Journal->new($user);

    my %entries = $journal->entryhash;

    # Tally entries per month.  The order in which entries are visited
    # does not affect the totals, so no sort is needed here.
    my %count;
    foreach my $entry (values %entries)
    {
        my $date = $entry->date;

        # Zero-pad the month so the keys sort chronologically as strings.
        my $yearmonth = sprintf "%s%02d", $date->year, $date->mon;
        $count{$yearmonth}++;
    }

    foreach my $month (sort keys %count)
    {
        print "$month:\t$count{$month}\n";
    }
}

  • WWW::UsePerl::Journal patch

--- /usr/local/perl580/lib/site_perl/5.8.0/WWW/UsePerl/Journal.pm    2002-09-26 04:51:29.000000000 -0500
+++ WWW/UsePerl/Journal.pm    2002-09-27 10:24:17.000000000 -0500
@@ -1,4 +1,6 @@
-package WWW::UsePerl::Journal;
+package WWW::UsePerl::Journal;  # -*- perl -*-
+
+BEGIN {warn "Using local copy of WWW::UsePerl::Journal!"}

=head1 NAME

@@ -30,6 +32,7 @@
use HTTP::Request::Common;
use Data::Dumper;
use Carp;
+use Time::Piece;
use WWW::UsePerl::Journal::Entry;

@@ -171,19 +174,25 @@
         my $content = $self->{ua}->request(
             GET UP_URL . "/journal.pl?op=list&uid=$UID")->content;
         die "could not create entry list" unless $content;
-        my @lines = split /\n/, $content;

         my %entries;
-        foreach my $line (@lines){
-            next unless $line =~ m#~$user/journal/#ism;
-            $line =~ m#~$user/journal/(\d+)"><b>(.*?)</b></a>#ism;
-
+    my $count = 0;
+    while ( $content =~ m{~$user/journal/(\d+).><b>(.*?)</b></a></td>\s+<td valign="top"><em>([\d.\s:]+)</em>}ig)
+    {
             next unless defined $1;
-        $entries{$1} = WWW::UsePerl::Journal::Entry->new(
+        my($id, $subject, $datestr) = ($1, $2, $3);
+        $datestr =~ m/(\d+).(\d+).(\d+)\s+(\d+):(\d+)/;
+        my($year, $month, $dateofmonth, $hour, $minute) =
+        ($1, $2, $3, $4, $5);
+        my $formatteddate =
+        "$year-$month-$dateofmonth $hour:$minute:00";
+        my $date = Time::Piece->new(HTTP::Date::str2time($formatteddate));
+        $entries{$id} = WWW::UsePerl::Journal::Entry->new(
         j    => $self,
         user    => $user,
-        id    => $1,
-        subject    => $2,
+        id    => $id,
+        subject    => $subject,
+        date    => $date,
         );
         }

@@ -203,7 +212,7 @@
         my %entries = $self->entryhash;
         my @IDs;

-        foreach (sort keys %entries) {
+        foreach (sort {$a <=> $b} keys %entries) {
         $IDs[$#IDs+1] = $_;
         }
         return @IDs;

  • WWW::UsePerl::Journal::Entry

--- /usr/local/perl580/lib/site_perl/5.8.0/WWW/UsePerl/Journal/Entry.pm    2002-03-03 14:13:05.000000000 -0600
+++ WWW/UsePerl/Journal/Entry.pm    2002-09-27 10:00:49.000000000 -0500
@@ -1,4 +1,4 @@
-package WWW::UsePerl::Journal::Entry;
+package WWW::UsePerl::Journal::Entry;  # -*- perl -*-

=head1 NAME

@@ -15,6 +15,7 @@
use Data::Dumper;
use Carp;
use WWW::UsePerl::Journal;
+use Time::Piece;

our $VERSION = '0.03';
use constant UP_URL => 'http://use.perl.org';
@@ -58,6 +59,10 @@
sub date
{
     my $self = shift;
+    unless (defined $self->{date})
+    {
+    $self->get_content;
+    }
     return $self->{date};
}

@@ -132,6 +137,27 @@
     if $content =~
     m#Sorry, there are no journal entries
     found for this user.</TD></TR></TABLE><P>#ismx;
+    $content =~ m{
+    <H2>((Sunday|Monday|Tuesday|Wednesday|Thursday|Friday|Saturday)
+         [^<]*)</H2>
+    }x;
+    my $datestring = $1;
+    $datestring =~ m/(.*day)\s+(.*)\s+(\d+),\s+(\d+)/;
+    my($day, $month, $dateofmonth, $year) = ($1, $2, $3, $4);
+    $content =~ m{
+    <I>((\d+):(\d+)\s+[AP]M)</I>
+    }x;
+    my $timestring = $1;
+    $timestring =~ m/(\d+):(\d+)\s+([AP]M)/;
+    my($hour, $minute, $ampm) = ($1, $2, $3);
+    $hour += 12 if $ampm eq "PM";
+    $hour = 0 if $hour == 24;
+    $month = substr($month, 0, 3);
+    $day = substr($day, 0, 3);
+    my $formatteddate = "$day $month $dateofmonth $hour:$minute:00 $year";
+    my $dateseconds = HTTP::Date::str2time($formatteddate);
+    my $date = Time::Piece->new($dateseconds);
+    $self->{date} = $date;
     $content =~
     m#.*?$ID</a>\n]\n\s*</font>\n\s*<p>\n\s*(.*?)
     \n\s*<br><br></div>.*#ismx;

The Fine Print: The following comments are owned by whoever posted them. We are not responsible for them in any way.
 Full
 Abbreviated
 Hidden
More | Login | Reply
Loading... please wait.