#!/usr/bin/perl -w

=head1 NAME

linkchecker - check links on HTML pages for validity

=head1 SYNOPSIS

linkchecker starturl filter

=head1 DESCRIPTION

The C<linkchecker> recursively retrieves documents starting at
I<starturl> and generates a (lengthy) report about the links it finds.

The regular expression C<filter> is used to restrict the region which is
searched. Only THML documents with URLs matching the expression are
scanned for links. Note however, that all links which were already found
are followed, regardless of whether they match the filter expression or
not. Thus, calling
C<linkchecker http;//www.example.org/ '^http://[-.\w]+.example.org/'>
will check all links on all pages in the example.org domain, even if
they point outside this domain.

The output is very verbose and intended to be postprocessed by other
tools before being presented to the user. For example,

C<grep '^http:' linkchecker.out | grep -v ' 200 '>

produces a list of all broken http links with a list of pages where they
occur. 

=head1 AUTHOR

Peter J. Holzer <hjp@hjp.at>


=cut

use strict;
use LWP::UserAgent;
use HTML::LinkExtor;
use Time::HiRes qw ( time );

$| = 1;

if (@ARGV != 2) {
    print STDERR "Usage: $0 starturl filter\n";
    exit(1);
}
my $starturl = $ARGV[0];
my $filter = $ARGV[1];

my $startpage = { 
		    Url => $starturl,
		    Level => 1,
		};

my $pages = { $starturl => $startpage };
my @to_check = ( $startpage );
my @new_links;

sub lex {
    my ($tag, %links) = @_;
    for my $i (keys %links) {
	# print "lex:$i=$links{$i}\n";
	push @new_links, $links{$i}
    }
}

my $ua = new LWP::UserAgent;
$ua->env_proxy();

while (@to_check) {
    my $currentpage = shift (@to_check);
    my $url = $currentpage->{Url};

    print "checking $url (", scalar(@to_check), " left)\n";

    my $request = new HTTP::Request('GET', $url);
    if ($pages->{$url}{Referrer}) {
	my $referrer = (keys %{$pages->{$url}{Referrer}})[0];
	print "\tReferer: $referrer\n";
	$request->referer($referrer)
    }

    my $t0 = time();
    my $response = $ua->request($request);
    my $t1 = time();
    
    print "\tTime: ", $t1 - $t0, "\n";
    print "\tCode: ", $response->code, "\n";
    $currentpage->{Code} = $response->code;

    print "\tContent_type: ", join('; ', $response->content_type), "\n";
    $currentpage->{Content_type} = $response->content_type;

    if ($currentpage->{Content_type} eq "text/html" &&
	$currentpage->{Level} > 0
    ) {
	my $base = $response->base;
	my $parser = HTML::LinkExtor->new(\&lex, $base);
	@new_links = ();
	$parser->parse($response->content);
	for my $i (@new_links) {
	    $i =~ s/\#.*//;	# cut off local anchors.
	    print "\tLink to: ", $i, "\n";
	    if ($pages->{$i}) {
		print "\t\talready known\n";
		$pages->{$i}->{Referrer}->{$url} = 1;
	    } else {
		print "\t\tnew\n";
		$pages->{$i} = {
		    Url => $i, 
		    Referrer => {$url => 1}
		};
		if ($i =~ m/$filter/) {
		    print "\t\t\tlevel 1\n";
		    $pages->{$i}->{Level} = 1;
		} else {
		    print "\t\t\tlevel 0\n";
		    $pages->{$i}->{Level} = 0;
		}
		push @to_check, $pages->{$i};
	    }
	}
    }
}
for my $i (sort keys (%$pages)) {
    print $i, " ";
    print $pages->{$i}->{Code}, " ";
    if ($pages->{$i}->{Referrer}) {
	print join(" ", sort keys %{$pages->{$i}->{Referrer}});
    }
    print "\n";
}
