@{[ esc($starturl) ]}

#!/usr/bin/perl -C0 =head1 NAME linkchecker - check links on HTML pages for validity =head1 SYNOPSIS linkchecker [--follow | --check | --ignore filter-regex ] [ --credentials file ] [ --cookies ] [ --html-summary file ] [ --no-redirect ] starturl =head1 DESCRIPTION The C recursively retrieves documents starting at I and generates a (lengthy) report about the links it finds. Any number of filters can be specified with the options --follow, --check, and --ignore. The filters are regular expressions, and each URL is matched against each of them in order. The first match determines what should be done with the URL: =over 4 =item follow The resource is retrieved. If it is of type text/html, it is parsed and all URLs found are added to the list or URLs to try. =item check The resource is retrieved. =item ignore The resource is not retrieved. =back If no filter matches, the URL is checked. If no filter is specified on the command line, a default filter /^\Q$starturl\E.*/ is used. Examples: =over =item C will parse all HTML pages on www.example.org and check all links it finds, even those pointing to other domains. It will not follow any links found on other sites. =item C will start at http://www.example.org/sitemap.html. It will follow links on all pages of www.example.org/, except those on /archive/.*/msg.*html (presumably because broken links are to be expected in an archive of old messages). Of the remaining URLs, it will check all http URLs and ignore the rest. =back The option --credentials can be used to specify a credential file. Each line in the file contains 4 values separated by white space: a net location (hostname and port separated by a colon - note that the protocol is not specified and the port has to be specified even if it is the default!), a realm, a username and a password. If the option --cookies is given, cookies sent by the webserver will be honored. This is especially useful if the webserver falls back to session-ids in URLs if the client doesn't support cookies, because otherwise you may end up with a lot of duplicate URLs. The output is very verbose and intended to be postprocessed by other tools before being presented to the user. For example, C produces a list of all broken http links with a list of pages where they occur. Alternatively, the option --html-summary may be used to print the summary in HTML format to a separate file. The detailed information about each page is still printed to stdout. The option --no-redirect turns off automatic processing of redirects. When a page returns a 301 or 302 status code, it this status code is logged and the target of the redirect is added as a new url to be checked (with the orginal url as a fake referrer). =head1 AUTHOR Peter J. Holzer =head1 BUGS The realm in the credential file cannot contain white space. The --no-redirect option should probably be default. =cut use strict; use warnings; use LWP::UserAgent; use HTML::LinkExtor; use Time::HiRes qw ( time ); use Getopt::Long; use Pod::Usage; #use LWP::Debug qw(+); $| = 1; my @filters; sub addfilter { my ($opt, $re) = @_; push (@filters, [ $re, $opt ]); } my $help; my $credentials; my $cookies; my $html_summary_file; my %lwp_options = (); GetOptions('follow=s' => \&addfilter, 'check=s' => \&addfilter, 'ignore=s' => \&addfilter, 'credentials=s' => \$credentials, 'cookies' => \$cookies, 'help' => \$help, 'html-summary=s' => \$html_summary_file, 'no-redirect' => sub { $lwp_options{requests_redirectable} = [] }, ) or pod2usage(2); pod2usage(1) if ($help); pod2usage(1) unless (@ARGV == 1); my $starturl = $ARGV[0]; unless (@filters) { @filters = ( [ "^\Q$starturl\E.*", "follow" ], [ "", "check" ], ); } my $startpage = { Url => $starturl, }; my $pages = { $starturl => $startpage }; my @to_check = ( $startpage ); my @new_links; sub lex { my ($tag, %links) = @_; for my $i (keys %links) { # print "lex:$i=$links{$i}\n"; push @new_links, $links{$i} } } sub add_link { my ($cururl, $newurl) = @_; $newurl =~ s/\#.*//; # cut off local anchors. print "\tLink to: ", $newurl, "\n"; if ($pages->{$newurl}) { print "\t\talready known\n"; $pages->{$newurl}->{Referrer}->{$cururl} = 1; } else { print "\t\tnew\n"; $pages->{$newurl} = { Url => $newurl, Referrer => {$cururl => 1} }; push @to_check, $pages->{$newurl}; } } my $ua = new LWP::UserAgent(%lwp_options); $ua->env_proxy(); if ($credentials) { open(F, $credentials) or die "cannot open $credentials: $!"; while () { chomp; my ($netloc, $realm, $uname, $pass) = split(/\t/); $ua->credentials($netloc, $realm, $uname, $pass); } close(F); } if ($cookies) { $ua->cookie_jar({}); } while (@to_check) { my $currentpage = shift (@to_check); my $url = $currentpage->{Url}; my $action = 'check'; for my $f (@filters) { if ($url =~ $f->[0]) { $action = $f->[1]; last; } } $currentpage->{action} = $action; print "$action $url (", scalar(@to_check), " left)\n"; next if ($action eq 'ignore'); my $request = new HTTP::Request('GET', $url); if ($pages->{$url}{Referrer}) { my $referrer = (keys %{$pages->{$url}{Referrer}})[0]; print "\tReferer: $referrer\n"; $request->referer($referrer) } my $t0 = time(); my $response = $ua->request($request); my $t1 = time(); print "\tTime: ", $t1 - $t0, "\n"; print "\tCode: ", $response->code, "\n"; $currentpage->{Code} = $response->code; print "\tContent_type: ", join('; ', $response->content_type), "\n"; $currentpage->{Content_type} = $response->content_type; print "\tLength: ", length($response->content), "\n"; print "\tSpeed: ", length($response->content) / ($t1 - $t0), "\n"; print "\tLast-Modified: ", $response->header('Last-Modified'), "\n"; if ($currentpage->{Code} == 200 && $currentpage->{Content_type} eq "text/html" && $action eq 'follow' ) { my $base = $response->base; my $parser = HTML::LinkExtor->new(\&lex, $base); @new_links = (); $parser->parse($response->content); for my $i (@new_links) { add_link($url, $i); } } elsif (($currentpage->{Code} == 301 || $currentpage->{Code} == 302) && $action eq 'follow' ) { add_link($url, $response->header('Location')); } } if ($html_summary_file) { open my $fh, '>', $html_summary_file or die "cannot open $html_summary_file: $!"; print $fh qq{\n}; print $fh qq{\n}; print $fh qq{ \n}; print $fh qq{ @{[ esc($starturl) ]}\n}; print $fh qq{ \n}; print $fh qq{ \n}; print $fh qq{ \n}; print $fh qq{

@{[ esc($i) ]}: Result: $pages->{$i}->{Code}\n}; if ($pages->{$i}->{Referrer}) { print $fh qq{
Referrer: \n}; for (sort keys %{$pages->{$i}->{Referrer}}) { print $fh qq{ @{[ esc($_) ]}\n}; } print $fh qq{
\n}; } print $fh qq{

\n}; print $fh qq{ \n}; print $fh qq{\n}; } else { for my $i (sort keys (%$pages)) { print $i, " "; print $pages->{$i}->{Code}, " "; if ($pages->{$i}->{Referrer}) { print join(" ", sort keys %{$pages->{$i}->{Referrer}}); } print "\n"; } } sub esc { my ($s) = @_; $s =~ s/([^ -%(-;?-~])/"&#" . ord($1) . ";"/eg; return $s; }