Stories
Slash Boxes
Comments
NOTE: use Perl; is on undef hiatus. You can read content, but you can't post it. More info will be forthcoming forthcomingly.

All the Perl that's Practical to Extract and Report

The Fine Print: The following comments are owned by whoever posted them. We are not responsible for them in any way.
 Full
 Abbreviated
 Hidden
More | Login | Reply
Loading... please wait.
  • Any chance of seeing this perl code?
    I could learn a great amount from it, I'm sure...
    • Re:any chance... (Score:3, Insightful)

      by gnat (29) on 2003.04.25 20:09 (#19490) Journal
      Sure! I suppose I should have done more hackery to automatically determine the credentials() arguments from the URL, but I couldn't be buggered :-)

      #!/usr/bin/perl -w

      use LWP;
      use HTML::TableContentParser;
      use Getopt::Std;
      use strict;

      # username and password for ORA intranet
      my ($USERNAME, $PASSWORD) = ('CHANGE', 'ME');

      # where to store files.  change this!
      my $DIR = ($^O eq "darwin") ? '/Users/gnat/Ora/Paperwork/edcal'
                                  : '/home/gnat/ora';

      # mapping from HTML table heading to XML tag.  update this when the
      # report format changes
      my $Heading_to_Tag = {
                    "Title" => "title",
                    "Product" => "product",
                    "Program" => "program",
                    "Editor" => "editor",
                    "To Tech Review" => "totechreview",
                    "To production" => "toproduction",
                    "Contract Final" => "contractfinal",
                    "Est. Page Count" => "estpagecount",
                    "[Estimated Release Date]" => "estreleasedate",
                    "Estimated List Price" => "estlistprice",
                    "Projected Monthly Units" => "projmonthlyunits",
                    "Sell In" => "sellin",
                    "Projected" => "projected",
                    "Editorial Status" => "edstatus",
                    "Comments" => "comments"
      };

      # option handling:
      #   -f means "fetch HTML again"
      #   -t is the temporary filename
      #   -o is the output filename
      #   -c cleans up the temporary file
      my %opts;
      getopts("ftco:", \%opts);

      # defaults for filenames
      my $temp_filename = $opts{t} || "$DIR/calendar.html";
      my $output_filename = $opts{o} || "$DIR/calendar.xml";

      # fetch the calendar html again if we have to
      if ((! -e $temp_filename) or $opts{f}) {
        fetch_calendar_html();
      }

      convert_html_to_xml();

      unlink $temp_filename if $opts{c};

      exit;

      ###

      # this code fetches the calendar into the file named in $opts{t}
      sub fetch_calendar_html {
        local $^W = 0;  # turn off warnings about invalid cookies from intranet
        my ($URL,       # ORA editorial database URL
            $ua,        # user agent object we'll use to fetch the pages
            $page,      # HTTP::Request object
            $sid_html); # HTML for the ?sid=.... URL parameter the edcal uses

        $URL = "https://www.example.com/path/to/thingy";

        # retain cookies and
        $ua = LWP::UserAgent->new(cookie_jar => {});
        $ua->credentials("example.com:443", "REALM", $USERNAME, $PASSWORD);

        # fetch the front page--this gets us a session ID
        # needed before we can fetch the calendar page
        $page = $ua->get($URL)
          or die "Can't fetch front page";
        ($sid_html) = $page->content =~ m{(\?sid=\d+)} or die "Can't find sid";

        # now get the calendar
        # (URL for calendar is hardcoded rather than
        # figured out dynamically from the front page)
        $page = $ua->post("$URL/calendar$sid_html",
                     { "s_editor" => "Nathan Torkington" });

        # and save its content
        open my $f, ">", $temp_filename or die "Can't open $temp_filename for writing: $!\n";
        print $f $page->content;
        close $f;
      }

      # this code parses the HTML and emits the relevant table as XML
      sub convert_html_to_xml {
        my ($html,     # calendar report in HTML
            $tables,   # tables extracted via HTML::TableContentParser
            @data,     # data extracted from the table we're interested in
            @headers,  # headers of the table we're interested in
           );

        $html = slurp($temp_filename);
        $tables = HTML::TableContentParser->new->parse($html);

        foreach my $t (@$tables) {
          # the report table has width of 2000.  this distinguishes it from
          # the other tables in the page (used to make the page look pretty)
          next unless exists($t->{width}) &&
            $t->{width} == 2000;

          # extract the headers into @headers
          foreach my $header (@{$t->{headers}}) {
            my $text = clean($header->{data});
            push @headers, $text;
          }

          # extract each row's data
          foreach my $row (@{$t->{rows}}) {
            # skip non-data rows
            next unless exists $row->{cells};
            if (@{$row->{cells}} != 1) {
          my @row_data;
          foreach my $cell (@{$row->{cells}}) {
            push @row_data, clean($cell->{data});
          }
          push @data, \@row_data;
            }
          }

          # now break out of the foreach-table loop and print the report
          last;
        }

        # emit the table as XML

        open my $fh, "> $output_filename"
          or die "Can't open $output_filename for writing: $!\n";

        # XML header and surrounding tag
        print $fh <<EOHEADER;
      <xml version="1.0">
      <calendar>
      EOHEADER

        # convert rows to tagged data
        foreach my $row (@data) {
          print $fh "<book>\n";
          for (my $i=0; $i < @headers; $i++) {
            my $h = $headers[$i];
            my $tag = $Heading_to_Tag->{$h};
            print $fh "  <$tag>$row->[$i]</$tag>\n";
          }
          print $fh "</book>\n";
        }

        # xml trailer
        print $fh "</calendar>\n</xml>\n";
        close $fh;
      }

      sub clean {
          my $text = shift;
          $text =~ s{<.*?>}{}g;
          $text =~ s{&nbsp;}{ }g;
          $text =~ s{^\s+}{}g; $text =~ s{\s+$}{}g;
          $text =~ s{\b&\b}{&amp;}g;
          return $text;
      }

      sub slurp {
        my $filename = shift;
        local $/;
        open my $fh, "< $filename"
          or die "Can't open $filename for reading: $!\n";
        return <$fh>;
      }
      --Nat