#!/usr/bin/perl -C0

=head1 NAME

linkchecker - check links on HTML pages for validity

=head1 SYNOPSIS

linkchecker [--follow | --check | --ignore filter-regex ]
[ --credentials file ] [ --cookies ] [ --html-summary file ]
[ --no-redirect ] starturl

=head1 DESCRIPTION

The C<linkchecker> recursively retrieves documents starting at
I<starturl> and generates a (lengthy) report about the links it finds.

Any number of filters can be specified with the options --follow,
--check, and --ignore. The filters are regular expressions, and each URL
is matched against each of them in order. The first match determines
what should be done with the URL:

=over 4

=item follow

The resource is retrieved. If it is of type text/html, it is parsed and
all URLs found are added to the list or URLs to try.

=item check

The resource is retrieved.

=item ignore

The resource is not retrieved.

=back

If no filter matches, the URL is checked. If no filter is specified on
the command line, a default filter /^\Q$starturl\E.*/ is used.

Examples:

=over

=item C<linkchecker http://www.example.org/>

will parse all HTML pages on www.example.org and check all links it
finds, even those pointing to other domains. It will not follow any
links found on other sites.

=item C<linkchecker -c ^http://www.example.org/archive/.*/msg.*html -f
^http://www.example.org/ -c ^http:// -i '.*'
http://www.example.org/sitemap.html>

will start at http://www.example.org/sitemap.html. It will follow links
on all pages of www.example.org/, except those on /archive/.*/msg.*html
(presumably because broken links are to be expected in an archive of old
messages). Of the remaining URLs, it will check all http URLs and ignore
the rest.

=back

The option --credentials can be used to specify a credential file. Each
line in the file contains 4 values separated by white space: a net
location (hostname and port separated by a colon - note that the
protocol is not specified and the port has to be specified even if it is
the default!), a realm, a username and a password.

If the option --cookies is given, cookies sent by the webserver will be
honored. This is especially useful if the webserver falls back to
session-ids in URLs if the client doesn't support cookies, because
otherwise you may end up with a lot of duplicate URLs.

The output is very verbose and intended to be postprocessed by other
tools before being presented to the user. For example,

C<grep '^http:' linkchecker.out | grep -v ' 200 '>

produces a list of all broken http links with a list of pages where they
occur. 

Alternatively, the option --html-summary may be used to print the
summary in HTML format to a separate file. The detailed information
about each page is still printed to stdout. 

The option --no-redirect turns off automatic processing of redirects.
When a page returns a 301 or 302 status code, it this status code is
logged and the target of the redirect is added as a new url to be
checked (with the orginal url as a fake referrer).

=head1 AUTHOR

Peter J. Holzer <hjp@hjp.at>

=head1 BUGS

The realm in the credential file cannot contain white space.

The --no-redirect option should probably be default.

=cut

use strict;
use warnings;
use LWP::UserAgent;
use HTML::LinkExtor;
use Time::HiRes qw ( time );
use Getopt::Long;
use Pod::Usage;

#use LWP::Debug qw(+);


$| = 1;

my @filters;

sub addfilter {
    my ($opt, $re) = @_;
    push (@filters, [ $re, $opt ]);
}

my $help;
my $credentials;
my $cookies;
my $html_summary_file;
my %lwp_options = ();
GetOptions('follow=s'	=> \&addfilter,
           'check=s' 	=> \&addfilter,
	   'ignore=s'	=> \&addfilter,
	   'credentials=s'	=> \$credentials,
	   'cookies'	=> \$cookies,
	   'help'   	=> \$help,
	   'html-summary=s'	=> \$html_summary_file,
	   'no-redirect'	=> sub { $lwp_options{requests_redirectable} = [] },
          ) or pod2usage(2);
pod2usage(1) if ($help);
pod2usage(1) unless (@ARGV == 1);

my $starturl = $ARGV[0];



unless (@filters) {
    @filters = (
	[ "^\Q$starturl\E.*",	"follow" ],
	[ "",			"check"  ],
    );
}

my $startpage = { 
		    Url => $starturl,
		};

my $pages = { $starturl => $startpage };
my @to_check = ( $startpage );
my @new_links;

sub lex {
    my ($tag, %links) = @_;
    for my $i (keys %links) {
	# print "lex:$i=$links{$i}\n";
	push @new_links, $links{$i}
    }
}

sub add_link {
    my ($cururl, $newurl) = @_;
    $newurl =~ s/\#.*//;	# cut off local anchors.
    print "\tLink to: ", $newurl, "\n";
    if ($pages->{$newurl}) {
	print "\t\talready known\n";
	$pages->{$newurl}->{Referrer}->{$cururl} = 1;
    } else {
	print "\t\tnew\n";
	$pages->{$newurl} = {
	    Url => $newurl, 
	    Referrer => {$cururl => 1}
	};
	push @to_check, $pages->{$newurl};
    }
}


my $ua = new LWP::UserAgent(%lwp_options);
$ua->env_proxy();
if ($credentials) {
    open(F, $credentials) or die "cannot open $credentials: $!";
    while (<F>) {
	chomp;
	my ($netloc, $realm, $uname, $pass) = split(/\t/);
	$ua->credentials($netloc, $realm, $uname, $pass);
    }
    close(F);
}
if ($cookies) {
    $ua->cookie_jar({});
}
while (@to_check) {
    my $currentpage = shift (@to_check);
    my $url = $currentpage->{Url};

    my $action = 'check';
    for my $f (@filters) {
	if ($url =~ $f->[0]) {
	    $action = $f->[1];
	    last;
	}
    }
    $currentpage->{action} = $action;
    print "$action $url (", scalar(@to_check), " left)\n";
    next if ($action eq 'ignore');


    my $request = new HTTP::Request('GET', $url);
    if ($pages->{$url}{Referrer}) {
	my $referrer = (keys %{$pages->{$url}{Referrer}})[0];
	print "\tReferer: $referrer\n";
	$request->referer($referrer)
    }

    my $t0 = time();
    my $response = $ua->request($request);
    my $t1 = time();
    
    print "\tTime: ", $t1 - $t0, "\n";
    print "\tCode: ", $response->code, "\n";
    $currentpage->{Code} = $response->code;

    print "\tContent_type: ", join('; ', $response->content_type), "\n";
    $currentpage->{Content_type} = $response->content_type;
    print "\tLength: ", length($response->content), "\n";
    print "\tSpeed: ", length($response->content) / ($t1 - $t0), "\n";
    print "\tLast-Modified: ", $response->header('Last-Modified'), "\n";

    if ($currentpage->{Code} == 200
        && $currentpage->{Content_type} eq "text/html"
	&& $action eq 'follow'
    ) {
	my $base = $response->base;
	my $parser = HTML::LinkExtor->new(\&lex, $base);
	@new_links = ();
	$parser->parse($response->content);
	for my $i (@new_links) {
	    add_link($url, $i);
	}
    } elsif (($currentpage->{Code} == 301 || $currentpage->{Code} == 302)
	     && $action eq 'follow'
    ) {
	add_link($url, $response->header('Location'));
    }
}
if ($html_summary_file) {
    open my $fh, '>', $html_summary_file
	or die "cannot open $html_summary_file: $!";
    print $fh qq{<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n};
    print $fh qq{<html>\n};
    print $fh qq{  <head>\n};
    print $fh qq{    <title>@{[ esc($starturl) ]}</title>\n};
    print $fh qq{    <style type="text/css">\n};
    print $fh qq{      td { vertical-align: baseline; }\n};
    print $fh qq{    </style>\n};
    print $fh qq{  </head>\n};
    print $fh qq{  <body>\n};
    print $fh qq{    <dl>\n};
    for my $i (sort keys (%$pages)) {
	print $fh qq{      <dt><a href="@{[ esc($i) ]}">@{[ esc($i) ]}</a></dt>\n};
	print $fh qq{      <dd>Result: $pages->{$i}->{Code}\n};
	if ($pages->{$i}->{Referrer}) {
	    print $fh qq{        <p>Referrer: \n};
	    for (sort keys %{$pages->{$i}->{Referrer}}) {
		print $fh qq{          <a href="@{[ esc($_) ]}">@{[ esc($_) ]}</a>\n};
	    }
	    print $fh qq{        </p>\n};
	}
	print $fh qq{      </dd>\n};
    }
    print $fh qq{    </dl>\n};
    print $fh qq{  </body>\n};
    print $fh qq{</html>\n};
} else {
    for my $i (sort keys (%$pages)) {
	print $i, " ";
	print $pages->{$i}->{Code}, " ";
	if ($pages->{$i}->{Referrer}) {
	    print join(" ", sort keys %{$pages->{$i}->{Referrer}});
	}
	print "\n";
    }
}

sub esc {
    my ($s) = @_;
    $s =~ s/([^ -%(-;?-~])/"&#" . ord($1) . ";"/eg;
    return $s;
}
