#!/usr/bin/perl -w
use strict;

use WWW::Mechanize;
use YAML qw(DumpFile LoadFile);
use HTML::Template;
use Getopt::Long qw(GetOptions);

# Incrementally indexes a web mailing-list archive.
#
# State lives in "<file>.txt", a YAML stream of two documents: the
# configuration hash and the hash of threads collected so far. Every message
# id from --first up to the latest id advertised on the archive's main page
# is visited; for each message not seen before, its thread page is fetched
# and the thread's url, posts, title and date are recorded. Finally the state
# is written back and the HTML index pages are regenerated.

my $verbose;
my $html_file;
my $first;
GetOptions(
	"verbose" => \$verbose,
	"file=s"  => \$html_file,
	"first=s" => \$first,
) or usage("Invalid command line options");

usage("Need file name on command line") if not $html_file;
usage("Need first value") if not $first;
# A non-numeric value would make the range operator below iterate over
# strings instead of message ids.
usage("--first must be a number") if $first !~ /^\d+$/;

# autocheck is switched off because every request below is followed by an
# explicit $w->success test; with autocheck on, a failed request would die
# before those tests run and before the collected data is saved.
my $w = WWW::Mechanize->new(autocheck => 0);

my $file = "$html_file.txt";

# Example of the configuration document expected in $file:
#my $user_config = {
#	main_url   => "http://subversion.tigris.org/servlets/SummarizeList?listName=users",
#	msg_url    => "http://subversion.tigris.org/servlets/ReadMsg?list=users&msgNo",
#	thread_url => "http://subversion.tigris.org/servlets/BrowseList?list=users&by=thread&from",
#};

my ($config, $threads);
if (-e $file) {
	($config, $threads) = LoadFile($file);
} else {
	usage("Could not find '$file'");
}
# A state file holding only the configuration document has no threads yet.
$threads ||= {};

# Maps every already recorded message id to the id of its thread.
my %seen;
foreach my $t (keys %$threads) {
	$seen{$_} = $t foreach keys %{$threads->{$t}{posts}};
}

# Length of the message url prefix; used to recognise message links and to
# cut the message id off their end.
my $mul = length $config->{msg_url};

my $latest = get_latest();
print "Latest: $latest\n" if $verbose;

foreach my $id ($first..$latest) {
	# Known message whose thread has no date yet: try to fetch the date.
	if ($seen{$id} and not $threads->{$seen{$id}}{date}) {
		get_date($seen{$id}, $id);
	}
	next if $seen{$id};

	sleep 2; # be nice to the tigris server
	print "msg: $id (latest: $latest)\n" if $verbose;
	$w->get("$config->{msg_url}=$id");
	if (not $w->success) {
		print "Could not fetch message $id\n";
		next;
	}

	my $thread_link = $w->find_link(text => "Browse this thread");
	if (not $thread_link) {
		print "Could not find 'Browse this thread' for message $id\n";
		next;
	}

	# The thread id is the trailing number of the thread link.
	my $thread_id;
	if ($thread_link->url =~ /(\d+)$/) {
		$thread_id = $1;
	} else {
		print "Could not fetch thread_id for message $id\n";
		print "URL: " . $thread_link->url . "\n";
		next;
	}
	print "thread: $thread_id\n" if $verbose;

	$w->follow_link(text => "Browse this thread");
	if (not $w->success) {
		print "Could not fetch 'Browse this thread' for message $id\n";
		next;
	}

	my $full_thread = $w->find_link(text => "Show all");
	if (not $full_thread) {
		print "Could not find 'Show all' link\n";
		next;
	}
	$w->follow_link(text => "Show all");
	if (not $w->success) {
		print "Could not follow 'Show all' link\n";
		next;
	}
	$threads->{$thread_id}{url} = $full_thread->url;

	# Record every message linked from the full thread page. The message
	# with the smallest id supplies the thread title and, below, its date.
	my $title_id;
	foreach my $link ($w->links) {
		next if substr($link->url, 0, $mul) ne $config->{msg_url};
		my $msg_id = substr($link->url, $mul+1);
		print " $msg_id\n" if $verbose;
		$threads->{$thread_id}{posts}{$msg_id}++;
		$seen{$msg_id} = $thread_id;
		if (not defined $title_id or $title_id > $msg_id) {
			$title_id = $msg_id;
			$threads->{$thread_id}{title} = $link->text;
		}
	}

	if (defined $title_id) {
		get_date($thread_id, $title_id);
	} else {
		# Without any message link there is no message to read a date from.
		print "Could not find any message links for thread $thread_id\n";
	}
}

DumpFile($file, $config, $threads);
build_html();
exit;

# Fetches message $msg_id and stores the value of its "Date:" header as the
# date of thread $thread_id. Returns without touching the thread when the
# message cannot be fetched; warns when no date can be extracted.
sub get_date {
	my ($thread_id, $msg_id) = @_;

	print "get_date for $thread_id $msg_id\n" if $verbose;
	$w->get("$config->{msg_url}=$msg_id");
	return if not $w->success;

	# NOTE(review): the pattern delimiters here are bare newlines, and the
	# warning below has a line break where a name seems to be missing. It
	# looks as if a markup tag was lost from both - confirm against the
	# original script before relying on this match.
	if ($w->content =~ m{\n(.*?)\n}s) {
		my $header = $1;
		if ($header =~ /Date: (.*)/) {
			$threads->{$thread_id}{date} = $1;
		} else {
			warn "Could not fetch Date: for $msg_id\n";
		}
	} else {
		warn "Could not fetch\n for $msg_id\n";
	}
}

	

# Renders all HTML index pages from the file-level $threads data.
#
# Threads are ordered by numeric thread id. Full pages of $PAGE threads are
# written to "<html_file>_NNN.html"; the remaining, newest threads go to
# "<html_file>.html". Within a page the threads are listed newest first.
# Takes no arguments; the return value is not meaningful.
sub build_html {
	my @rows;
	my @pages = ({url => "$html_file.html", page => "most recent"});

	foreach my $t (sort {$a <=> $b} keys %$threads) {
		push @rows, {
			url   => $threads->{$t}{url},
			title => $threads->{$t}{title},
			date  => $threads->{$t}{date},
			count => scalar (keys %{$threads->{$t}{posts}}),
		};
	}

	my $PAGE = 30;

	# Index of the last thread already placed on an archive page. It starts
	# at -1 so that, when no archive page is produced, the main page begins
	# at index 0 and the oldest thread is not dropped.
	my $last = -1;

	# The "- 7" delays closing an archive page until at least 7 threads
	# would remain for the main page.
	foreach my $p (1..int((@rows-7) / $PAGE)) {
		my @th = reverse @rows[($p-1)*$PAGE .. $p*$PAGE-1];
		$last = $p*$PAGE-1;
		my $html = sprintf ("%s_%03d.html" , $html_file, $p);
		# Each archive page is rendered with the page list as it stands at
		# this moment, i.e. without itself and without later pages.
		create_html($html,  \@th, \@pages);
		push @pages, {url => $html, page => $p};
	}

	my @th = reverse @rows[$last+1..$#rows];
	create_html("$html_file.html", \@th, \@pages);
}

# Fills the "threads.tmpl" template and writes the result to "html/$filename".
#
#   $filename - name of the output file inside the "html" directory
#   $threads  - array ref handed to the template as "threads"
#   $pages    - array ref handed to the template as "pages"
#
# The page title is taken from the file-level $config. The "html" directory
# is created when missing. Dies when the directory cannot be created or the
# file cannot be opened or completely written.
sub create_html {
	my ($filename, $threads, $pages) = @_;

	my $template = HTML::Template->new(filename => "threads.tmpl");
	$template->param(threads => $threads);
	$template->param(pages   => $pages);
	$template->param(title   => $config->{title});

	if (not -d "html") {
		mkdir "html" or die "Could not create directory 'html': $!\n";
	}

	my $path = "html/$filename";
	open my $fh, ">", $path or die "Could not open '$path' for writing: $!\n";
	print $fh $template->output;
	# Buffered write errors only surface when the handle is closed.
	close $fh or die "Could not close '$path': $!\n";
}



# Returns the number found at the end of the url of the "All" link on the
# configured main page. Dies when the main page cannot be fetched, or when
# there is no "All" link whose url ends in a number.
sub get_latest {
	$w->get($config->{main_url});
	die "Could not fetch main url\n" unless $w->success;

	my $link = $w->find_link(text => "All");
	die "Could not find 'All' link\n" unless defined $link;

	# In list context the match yields its capture, or nothing on failure.
	my ($number) = $link->url =~ /(\d+)$/;
	die "Could not find 'All' link\n" unless defined $number;

	return $number;
}

# Aborts the program with $msg followed by a blank line and the command
# line synopsis. Never returns.
sub usage {
	my $msg = shift;

	my $synopsis = "Usage: $0 --file user --first NNN [--verbose]";
	die join("\n\n", $msg, $synopsis) . "\n";
}

# TODO: add rss feed to the main page
# TODO: deal with cases where a document cannot be retrieved (temporarily? permanently?),
# so that we won't try to get it again if it does not exist.