#!/usr/bin/perl -w

=head1 NAME

linkchecker - check links on HTML pages for validity

=head1 SYNOPSIS

linkchecker [--follow | --check | --ignore filter-regex ] starturl

=head1 DESCRIPTION

The C<linkchecker> recursively retrieves documents starting at
I<starturl> and generates a (lengthy) report about the links it finds.

Any number of filters can be specified with the options --follow,
--check, and --ignore. The filters are regular expressions, and each URL
is matched against each of them in order. The first match determines
what should be done with the URL:

=over 4

=item folloe

The resource is retrieved. If it is of type text/html, it is parsed and
all URLs found are added to the list or URLs to try.

=item check

The resource is retrieved.

=item ignore

The resource is not retrieved.

=back

If no filter matches, the URL is checked. If no filter is specified on
the command line, a default filter /^\Q$starturl\E.*/ is used.

Examples:

=over

=item C<linkchecker http://www.example.org/>

will parse all HTML pages on www.example.org and check all links it
finds, even those pointing to other domains. It will not follow any
links found on other sites.

=item C<linkchecker -c ^http://www.example.org/archive/.*/msg.*html -f
^http://www.example.org/ -c ^http:// -i '.*'
http://www.example.org/sitemap.html>

will start at http://www.example.org/sitemap.html. It will follow links
on all pages of www.example.org/, except those on /archive/.*/msg.*html
(presumably because broken links are to be expected in an archive of old
messages). Of the remaining URLs, it will check all http URLs and ignore
the rest.

=back

The output is very verbose and intended to be postprocessed by other
tools before being presented to the user. For example,

C<grep '^http:' linkchecker.out | grep -v ' 200 '>

produces a list of all broken http links with a list of pages where they
occur. 

=head1 AUTHOR

Peter J. Holzer <hjp@hjp.at>


=cut

use strict;
use LWP::UserAgent;
use HTML::LinkExtor;
use Time::HiRes qw ( time );
use Getopt::Long;
use Pod::Usage;

$| = 1;

my @filters;

sub addfilter {
    my ($opt, $re) = @_;
    push (@filters, [ $re, $opt ]);
}

my $help;
GetOptions('follow=s'	=> \&addfilter,
           'check=s' 	=> \&addfilter,
	   'ignore=s'	=> \&addfilter,
	   'help'   	=> \$help
          ) or pod2usage(2);
pod2usage(1) if ($help);
pod2usage(1) unless (@ARGV == 1);

my $starturl = $ARGV[0];

unless (@filters) {
    @filters = (
	[ "^\Q$starturl\E.*",	"follow" ],
	[ "",			"check"  ],
    );
}

my $startpage = { 
		    Url => $starturl,
		};

my $pages = { $starturl => $startpage };
my @to_check = ( $startpage );
my @new_links;

sub lex {
    my ($tag, %links) = @_;
    for my $i (keys %links) {
	# print "lex:$i=$links{$i}\n";
	push @new_links, $links{$i}
    }
}

my $ua = new LWP::UserAgent;
$ua->env_proxy();

while (@to_check) {
    my $currentpage = shift (@to_check);
    my $url = $currentpage->{Url};

    my $action = 'check';
    for my $f (@filters) {
	if ($url =~ $f->[0]) {
	    $action = $f->[1];
	    last;
	}
    }
    $currentpage->{action} = $action;
    print "$action $url (", scalar(@to_check), " left)\n";
    next if ($action eq 'ignore');


    my $request = new HTTP::Request('GET', $url);
    if ($pages->{$url}{Referrer}) {
	my $referrer = (keys %{$pages->{$url}{Referrer}})[0];
	print "\tReferer: $referrer\n";
	$request->referer($referrer)
    }

    my $t0 = time();
    my $response = $ua->request($request);
    my $t1 = time();
    
    print "\tTime: ", $t1 - $t0, "\n";
    print "\tCode: ", $response->code, "\n";
    $currentpage->{Code} = $response->code;

    print "\tContent_type: ", join('; ', $response->content_type), "\n";
    $currentpage->{Content_type} = $response->content_type;

    if ($currentpage->{Content_type} eq "text/html" &&
	$action eq 'follow'
    ) {
	my $base = $response->base;
	my $parser = HTML::LinkExtor->new(\&lex, $base);
	@new_links = ();
	$parser->parse($response->content);
	for my $i (@new_links) {
	    $i =~ s/\#.*//;	# cut off local anchors.
	    print "\tLink to: ", $i, "\n";
	    if ($pages->{$i}) {
		print "\t\talready known\n";
		$pages->{$i}->{Referrer}->{$url} = 1;
	    } else {
		print "\t\tnew\n";
		$pages->{$i} = {
		    Url => $i, 
		    Referrer => {$url => 1}
		};
		push @to_check, $pages->{$i};
	    }
	}
    }
}
for my $i (sort keys (%$pages)) {
    print $i, " ";
    print $pages->{$i}->{Code}, " ";
    if ($pages->{$i}->{Referrer}) {
	print join(" ", sort keys %{$pages->{$i}->{Referrer}});
    }
    print "\n";
}
