#!/usr/bin/perl -w
use strict;
use WWW::Mechanize;
use YAML qw(DumpFile LoadFile);
use HTML::Template;
use Getopt::Long qw(GetOptions);

# Mailing-list thread collector.
#
# Loads the configuration and the already-known threads from "<file>.txt"
# (two YAML documents), walks the message ids from --first up to the latest
# id advertised on the list's main page, records for every new thread its
# url, title, posts and date, writes the data back to "<file>.txt" and
# finally regenerates the HTML pages.
#
# Command line: --file NAME --first NNN [--verbose]

my $verbose;
my $html_file;
my $first;
GetOptions("verbose" => \$verbose, "file=s" => \$html_file, "first=s" => \$first);

# The code below checks $w->success after every request. Current
# WWW::Mechanize releases enable autocheck by default, which would die on the
# first failed request before any of those checks run, so switch it off.
my $w = WWW::Mechanize->new(autocheck => 0);

#my $file = "users.txt";   # should come from the command line ?
#my $html_file = "users";
#my $html_file = shift
usage("Need file name on command line") if not $html_file;
usage("Need first value") if not $first;

my $file = "$html_file.txt";

#my $user_config = {
#    main_url   => "http://subversion.tigris.org/servlets/SummarizeList?listName=users",
#    msg_url    => "http://subversion.tigris.org/servlets/ReadMsg?list=users&msgNo",
#    thread_url => "http://subversion.tigris.org/servlets/BrowseList?list=users&by=thread&from",
#};

my ($config, $threads);
if (-e $file) {
    ($config, $threads) = LoadFile($file);
} else {
    usage("Could not find '$file'");
    # $config = $user_config;
}

# Maps every already-known message id to the id of the thread it belongs to.
my %seen;
foreach my $t (keys %$threads) {
    $seen{$_} = $t foreach keys %{$threads->{$t}{posts}};
}

#build_html();
#exit;

# Length of the message url prefix; used to recognise message links on a
# thread page and to cut the message id off their end.
my $mul = length $config->{msg_url};

my $latest = get_latest();
print "Latest: $latest\n" if $verbose;

foreach my $id ($first..$latest) {
    # A known message whose thread still has no date: try to fill it in.
    if ($seen{$id} and not $threads->{$seen{$id}}{date}) {
        get_date($seen{$id}, $id);
    }
    next if $seen{$id};

    sleep 2; # be nice to the tigris server
    print "msg: $id (latest: $latest)\n" if $verbose;
    $w->get("$config->{msg_url}=$id");
    if (not $w->success) {
        print "Could not fetch message $id\n";
        next;
    }

    my $thread_link = $w->find_link(text => "Browse this thread");
    my $thread_id;
    if (not $thread_link) {
        print "Could not find 'Browse this thread' for message $id\n";
        next;
    }
    if ($thread_link->url =~ /(\d+)$/) {
        $thread_id = $1;
    } else {
        print "Could not fetch thread_id for message $id\n";
        print "URL: " . $thread_link->url . "\n";
        next;
    }
    print "thread: $thread_id\n" if $verbose;

    $w->follow_link(text => "Browse this thread");
    if (not $w->success) {
        print "Could not fetch 'Browse this thread' for message $id\n";
        next;
    }

    my $full_thread = $w->find_link(text => "Show all");
    if (not $full_thread) {
        print "Could not find 'Show all' link\n";
        next;
    }
    $w->follow_link(text => "Show all");
    if (not $w->success) {
        print "Could not follow 'Show all' link\n";
        next;
    }
    $threads->{$thread_id}{url} = $full_thread->url;

    # Collect every message link of the thread; the link with the smallest
    # message id supplies the thread title.
    my $title_id;
    foreach my $link ($w->links) {
        next if substr($link->url, 0, $mul) ne $config->{msg_url};
        my $msg_id = substr($link->url, $mul+1);   # +1 skips the "=" after the prefix
        print "   $msg_id\n" if $verbose;
        $threads->{$thread_id}{posts}{$msg_id}++;
        $seen{$msg_id} = $thread_id;
        if (not defined $title_id or $title_id > $msg_id) {
            $title_id = $msg_id;
            $threads->{$thread_id}{title} = $link->text;
        }
    }

    # Without any message link there is no message to take the date from.
    get_date($thread_id, $title_id) if defined $title_id;
}

DumpFile($file, $config, $threads);
build_html();
exit;

# Fetch message $msg_id and store the value of its "Date:" header as the
# date of thread $thread_id (in the file-level $threads structure).
#
# Returns silently when the page cannot be fetched; warns when the header
# block or the Date: line cannot be located.
sub get_date {
    my ($thread_id, $msg_id) = @_;
    print "get_date for $thread_id $msg_id\n" if $verbose;
    $w->get("$config->{msg_url}=$msg_id");
    return if not $w->success;

    # NOTE(review): the delimiters of this pattern were lost in the copy of
    # the file under review (the capture was left with nothing after it and
    # therefore always matched the empty string). <pre>...</pre> is a
    # reconstruction -- confirm against the markup of a real message page.
    if ($w->content =~ m{<pre>(.*?)</pre>}s) {
        my $header = $1;
        if ($header =~ /Date: (.*)/) {
            $threads->{$thread_id}{date} = $1;
        } else {
            warn "Could not fetch Date: for $msg_id\n";
        }
    } else {
        warn "Could not fetch <pre> for $msg_id\n";
    }
}
# Render the collected threads as static HTML pages.
#
# Threads are ordered by ascending thread id and cut into archive pages of
# $PAGE threads each ("<file>_001.html", "<file>_002.html", ...); all
# remaining threads go onto the "most recent" page "<file>.html". Within a
# page the threads are listed in reverse order (highest thread id first).
#
# Takes no arguments; reads the file-level $threads and $html_file and writes
# the pages through create_html().
sub build_html {
    my @threads;
    my @pages = ({url => "$html_file.html", page => "most recent"});
    foreach my $t (sort {$a <=> $b} keys %$threads) {
        push @threads, {
            url   => $threads->{$t}{url},
            title => $threads->{$t}{title},
            date  => $threads->{$t}{date},
            count => scalar (keys %{$threads->{$t}{posts}}),
        };
    }
    #create_html("$html_file.all.html", \@threads, []);

    my $PAGE = 30;

    # Index of the last thread already placed on an archive page. It must
    # start at -1 (not 0): when no archive page is produced the "most recent"
    # page has to begin with thread index 0, otherwise the oldest thread is
    # silently dropped.
    my $last = -1;

    # The "- 7" keeps at least 7 threads back for the "most recent" page.
    foreach my $p (1..int((@threads-7) / $PAGE)) {
        #print "from:", ($p-1)*$PAGE, " to: ", $p*$PAGE-1, "\n";
        my @th = reverse @threads[($p-1)*$PAGE .. $p*$PAGE-1];
        $last = $p*$PAGE-1;
        my $html = sprintf ("%s_%03d.html" , $html_file, $p);
        # Each archive page is rendered with the links known so far: the
        # "most recent" page and the archive pages created before it.
        create_html($html, \@th, \@pages);
        push @pages, {url => $html, page => $p};
    }

    my @th = reverse @threads[$last+1..$#threads];
    create_html("$html_file.html", \@th, \@pages);
}
# Fill the "threads.tmpl" template and write the result to "html/$filename".
#
#   $filename - name of the output file inside the "html" directory
#   $threads  - array ref of thread records, passed to the template as "threads"
#   $pages    - array ref of page links, passed to the template as "pages"
#
# The page title is taken from the file-level $config. The "html" directory
# is created when missing. Dies when the directory cannot be created or the
# file cannot be opened, written or closed.
sub create_html {
    my ($filename, $threads, $pages) = @_;

    my $template = HTML::Template->new(filename => "threads.tmpl");
    $template->param(threads => $threads);
    $template->param(pages   => $pages);
    $template->param(title   => $config->{title});

    if (not -d "html") {
        mkdir "html" or die "Could not create directory 'html': $!\n";
    }

    my $path = "html/$filename";
    open my $fh, ">", $path or die "Could not open '$path' for writing: $!\n";
    print {$fh} $template->output or die "Could not write to '$path': $!\n";
    # Buffered write errors only surface at close, so check it as well.
    close $fh or die "Could not close '$path': $!\n";
}
# Look up the id of the newest message on the list.
#
# Fetches the main url from the file-level $config with the shared
# WWW::Mechanize object and returns the number at the end of the url of the
# link labelled "All". Dies when the page cannot be fetched or when no such
# link (with a trailing number) is present.
sub get_latest {
    $w->get($config->{main_url});
    die "Could not fetch main url\n" unless $w->success;

    my $link_to_all = $w->find_link(text => "All");
    if (defined $link_to_all) {
        my ($newest_id) = $link_to_all->url =~ /(\d+)$/;
        return $newest_id if defined $newest_id;
    }

    die "Could not find 'All' link\n";
}
# Abort the program: die with the given message followed by a short
# description of the command line options.
sub usage {
    my $msg = shift;
    my $synopsis = "Usage: $0 --file user --first NNN [--verbose]";
    die $msg . "\n\n" . $synopsis . "\n";
}
# TODO: add rss feed to the main page
# Deal with cases when the document cannot be retrieved (temporarily? at all?), so we won't try to
# get it again if it does not exist.