#! /usr/bin/perl # # hrefchecker - a simple HyperREFerence checker # Version 0.2.4 [05/11/1998] # # by Satoru Takabayashi # Copyright (C) 1998 Satoru Takabayashi All rights reserved. # This is free software with ABSOLUTELY NO WARRANTY. # # It's a typical ad hoc program but useful enough I think :-) # # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . # # INSTALLATION: (of course you have known that things) # # 1. Check the first line of this file and modify the pathname of Perl. # NOTES: This program does NOT WORK with PERL4. It must be run # with Perl 5.003 or later version and I have confirmed it works with # JPerl but not tested 'jcode.pl' with JPerl. # 2. Give an execute permission, you know. # % chmod +x hrefchecker # 3. Copy this file to the directory you want to set like /usr/local/bin. # 4. Change some variable to set to your system. # These configurations are defined under the part of CHANGELOG. # 5. Enjoy! # # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . # # USAGE: # # usage: hrefchecker <-lqvQVL> Target_HTML_Files... # -l: check only local links # -q: quiet mode (not to output OK messages) # -Q: more quiet (not to output warning messages too) # -v: verbose mode (more information) # -V: more verbose (it is mainly intended for debug use) # -L: show list of all links # -A: show list of all anchors # -c: content negotiaton with Apache's Multiviews style # -p: talk http protocol via proxy server # # When you specify a remote HTML file as target, hrefchecker automatically # get the file via http protocol and treat well relative links in the file. # # For example: # # % hrefchecker index.html foo.html bar.html # Local files # % hrefchecker http://foo.bar.jp/%7Equux/ # Remote file # # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . # # RECOMMENDED USAGE: # # % find . -type f -name '*.html' |xargs hrefchecker # # It is very useful way to check all of HTML files which you have. # hrefchecker is tuned to check multiple HTML files at once. # Especially, searching lots of anchors is efficient because of hash method. # # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . # # KNOWN PROBLEMS: # # * do not consider element. # * do not check URL correct format or not. # * can handle only "http" protocol with URL. # * do not check correctly ahchors in Multiviews' files. # # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . # # TODO: # # * add timeout processing at network connection. # * add routine to check reachabilities of ftp, news, mail and etc. # # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . # # BUGS: # # * I believe many bugs live happily in this program. # * my horrible English. Please teach me mistakes secretly. # # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . # # CHANGELOG # # v0.2.4 [05/11/1998] # * added -A option to show all of anchors. # # v0.2.3 [05/06/1998] # * added -p option to handle proxy. # * added -c option to treat content negotiation. # * fixed bug in &Cwd. It effects only on Win32. # # v0.2.2 [05/02/1998] # * added feature to check a remote HTML file as target via http # protocol. # (Thanks to Ohzaki-san for advising) # * removed useless routine I misunderstanded. # # v0.2.1 [05/01/1998] # * stopped to use 'chat2.pl' and replaced it with 'Socket.pm'. # * added %Results hash to record &check's result, so performance # really improved. # * added Japanese character code conversion with 'jcode.pl'. # * added $BASE variable used as prefix of URL when link begins # '/' character. # * added 'Pragma: no-cache' header to http request. # (Thanks to Ohzaki-san for advising) # * fixed numbering link routine in &get_link_list. (bug fix) # (Thanks to Yano-san for telling a bug) # * removed %seen and %HttpResult hash. # # v0.2.0 [04/30/1998] # * changed the anchor search method to use %files hash. # (%files has slightly complicated data structure) # * added routine to clear %seen hash file by file. (bug fix) # * added %HttpResult hash to record &httpcheck's result. # * modified treatment of HTML element to handle unquoted value. # * modified regexp to treat scheme in URL as case insensitive. # (Thanks to Yano-san for advising) # # v0.1.1 [04/28/1998] # * fixed regexp which is to get link and anchor. # * added feature to handle proxy. # # v0.1.0 [04/27/1998] # * made first version. # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . # ################################################## # Configrations begin ################################################## # Pathname separate character # UNIX -> '/', Win32 -> '\' # but I have not tested on Win32. $PSC = '/'; ################################################## # name of the file to be used as a DirectoryIndex # e.g. $DirectoryIndex = "index.html"; ################################################## # prefix of URL when link begins '/' character. # e.g. $BASE = 'http://localhost'; ################################################## # for handling Japanese character code # $Japanese = 1; # require 'jcode.pl'; ################################################## # Proxy setting $Proxy = "proxy.foo.bar.jp"; $Port = "8080"; ################################################## # for content negotiation with Apache's MultiViews @Languages = ("en", "ja"); ################################################## # Configrations end require 5.003; use Cwd; use IO::File; use Socket; $USAGE = < HTML_Files... -l: check only local links -q: quiet mode (not to output OK messages) -Q: more quiet (not to output warning messages too) -v: verbose mode (more information) -V: more verbose (it is mainly intended for debug use) -L: show list of all links -A: show list of all anchors -c: content negotiaton with Apache's Multiviews style -p: talk http protocol via proxy server EOM STDOUT->autoflush(1); &main(); sub main () { my (%files); while ($ARGV[0] =~ '^-') { $Quiet = 1 if ($ARGV[0] =~ /q/); $LinkListing = 1 if ($ARGV[0] =~ /L/); $AnchorListing = 1 if ($ARGV[0] =~ /A/); $LocalOnly = 1 if ($ARGV[0] =~ /l/); $Verbose = 1 if ($ARGV[0] =~ /v/); $ViaProxy = 1 if ($ARGV[0] =~ /p/); $ContentNego = 1 if ($ARGV[0] =~ /c/); $MoreQuiet = 1, $Quiet = 1 if ($ARGV[0] =~ /Q/); $MoreVerbose = 1, $Verbose = 1 if ($ARGV[0] =~ /V/); shift(@ARGV); } if (@ARGV == 0) { print $USAGE; exit; } $FilenamePut = 1 if @ARGV > 1; foreach $file (@ARGV) { my ($cwd, $cwd2, $filename, $path); if ($PSC eq "\\") { # Perl for Win32 can probably handle '/' as # pathname separate character. $filename =~ s|\\|/|g; } $cwd = Cwd(); print "== $file ==\n" if $MoreVerbose; if ($file =~ m|^http://|i) { my ($rtmp); $filename = $file; if ($filename =~ m|^(http://.*)/.*|i) { $cwd2 = $1; if ("OK" ne ($rtmp = &check(\%files, $filename, $cwd2, 1))) { die "$filename is $rtmp\n"; } } else { die "$filename is undisireble URL.\n"; } } else { my ($rtmp); if ($file =~ m|^(.*)/(.*)|) { $tdir = $1; $filename = $2; } else { $tdir = "."; $filename = $file; } chdir $tdir if $tdir ne "."; $cwd2 = Cwd(); if ("OK" ne ($rtmp = &check(\%files, $filename, $cwd2, 1))) { die "$filename is $rtmp\n"; } } $path = &pathname($cwd2, $filename); if ($AnchorListing) { my ($name, @bufs, $i, $tmp); $i = 0; foreach $name (keys %{$files{$path}{anchors}}) { my ($line); foreach $line (@{$files{$path}{anchors}{$name}}) { $bufs[$i] = "$file:" if $FilenamePut; $bufs[$i] .= "$line:#$name"; if (1 < (@{$files{$path}{anchors}{$name}})) { my ($n); $n = @{$files{$path}{anchors}{$name}}; $bufs[$i] .= " is NOT UNIQUE! $n found."; } $bufs[$i] .= "\n"; $i++; } } foreach $tmp (sort bn @bufs) { print $tmp; } next; } foreach $link (@{$files{$path}{links}}) { my ($tmp, $orig); $link =~ /(\d+):(.*)/; $line = $1; $link = $2; if ($LinkListing) { print "$file:" if $FilenamePut; print "$line:$link\n"; next; } if ($cwd2 =~ m|^http://|i) { $orig = $link; $link = pathname($cwd2, $link); undef($orig) if ($orig eq $link); } elsif ($link =~ m|^/|) { $link = $BASE . $link; } if ($link =~ m|^(\w+)://| && $LocalOnly) { next; } $tmp = $link; $tmp =~ s/^#/$filename#/; $result = check(\%files, $tmp, $cwd2, 0); result_put($file, $link, $orig, $line, $result); } chdir $cwd; print "\n" if $MoreVerbose; } } # compare by number of begining of line. sub bn { my ($foo, $bar); $a =~ /(\d+)/; $foo = $1; $b =~ /(\d+)/; $bar = $1; $foo <=> $bar; } # load file sub file_load ($$\%$) { my ($filename, $files, $flag) = @_; my (@tmp, $content); unless (defined($$files{$filename})) { my ($fh) = new IO::File; $fh->open("$filename") || die "$!: $filename\n"; @tmp = <$fh>; $content = join('', @tmp); if ($Japanese && &is_japanese($content)) { $content = &jcode::euc($content, '', 'z'); print "** Japanese conversion occured\n" if $MoreVerbose; } &get_link_list($files, $filename, $content); } } # silly way to detect Japanese character sub is_japanese ($) { my ($str) = @_; return 1 if ($str =~ /[\x81-\xfe]+/); # EUC-JP or Shift_JIS return 1 if ($str =~ /\x1b$B/); # ISO-2022-JP 0; } # normalize pathname sub pathname($$) { my ($cwd, $path) = @_; if ($cwd =~ m|^http://|i) { unless ($path =~ m!^[a-zA-Z0-9\-\+\.]+://|mailto:!i) { my ($root); $cwd =~ m|^(http://[^/]+)|i; $root = $1; if ($path =~ m|^/|) { $path = $root . $path; } else { $path =~ s|/+|/|g; $path = $cwd . "/" . $path; 1 while ($path =~ s|/\./|/|g); 1 while ($path =~ s|/[^/]+/\.\./|/|g); } } } elsif ($path !~ m|^http://|i) { $path = $cwd . "/" . $path; $path =~ s|/+|/|g; 1 while ($path =~ s|/\./|/|g); 1 while ($path =~ s|/[^/]+/\.\./|/|g); # $path =~ s|/$||; } $path; } # get link and anchor list sub get_link_list(\%$) { my ($files, $filename, $content) = @_; my (@tmp, $i, $tmp, $safestr); $safechr = "[A-Za-z0-9\.\-]"; # characters not have to be quoted print "@@ Loaded: $filename\n" if $MoreVerbose; @tmp = split("\n", $content); $i = 1; $content = ""; foreach $tmp (@tmp) { $tmp .= "\n"; $tmp =~ s/<(\S+)\s/<$1:$i: /ig; # numbering, how ad hock! $content .= $tmp; $i++; } # , both of type are considered. # but what a dirty way! I have never seen quite like this before :-) $content =~ s/]*HREF\s*=\s*(([\"\'])([^>]+?)\3|$safechr+)[^>]*>/ PushL(\@{$$files{$filename}{links}}, "$1:$2", $3), $&/geis; $content =~ s/]*NAME\s*=\s*(([\"\'])(.+?)\3|$safechr+)[^>]*>/ PushA($files, $filename, $2, $1, $3), $&/iesg; $content =~ s/<\w+:(\d+):\s+[^>]*ID\s*=\s*(([\"\'])(.+?)\3|$safechr+)[^>]*>/ PushA($files, $filename, $2, $1, $3), $&/iesg; # normalize the scheme and host part of URL to small letter. # this processing does not consider user and password field. foreach $tmp (@{$$files{$filename}{links}}) { $tmp =~ s|([a-zA-Z0-9\-\+\.]+://[^/]+)|&uncapitalize($&)|e; } } sub uncapitalize($) { my ($str) = @_; $str =~ tr/A-Z/a-z/; $str; } # delete quatation marks before push. sub PushL (\@$$) { my ($list, $scalar, $quote) = @_; $scalar =~ s/$quote//g; push(@{$list}, $scalar); } # delete quatation marks before push. sub PushA (\%$$$) { my ($files, $link, $name, $line, $quote) = @_; $name =~ s/$quote//g; push(@{$$files{$link}{anchors}{$name}}, $line); } # horribly nested check routine sub check (\$$\$$) { my ($files, $link, $cwd, $get) = @_; my ($result, @tmp, $lang, $result, $exist, $path); $path = &pathname($cwd, $link); # return if link has been checked if (!$get && defined($Results{$path})) { print "// be Checked: " . $path . " Result: $Results{$path}\n" if $MoreVerbose; return $Results{$path}; } if ($link =~ /(.+)\#(.+)/) { my ($file, $anchor, $rtmp, $path2); $file = $1; $anchor = $2; $path2 = &pathname($cwd, $file); print "** file: $file\n** anchor: $anchor\n" if $MoreVerbose; unless (defined($$files{$path2})) { if (defined($Results{$path2}) && $Results{$path2} ne "OK") { $rtmp = $Results{$path2}; } else { $rtmp = check($files, $file, $cwd, 1); } } else { $rtmp = "OK"; } if ($rtmp eq "OK") { $n = @{$$files{$path2}{anchors}{$anchor}}; if ($n == 0) { $result = "file exist but anchor \"$anchor\" is NOT FOUND"; } elsif ($n == 1) { $result = "OK"; } else { $result = "file exist but anchor \"$anchor\" is NOT UNIQUE ($n found)"; } } else { $result = $rtmp; } } else { if ($link =~ m|^mailto:|i) { $result = "an email address (warning)"; } elsif ($link =~ m|^http://|i) { $result = httpcheck($files, $link, $get); } elsif ($link =~ m|^(.*?)://|) { $result = "hrefchecker's unsupported protocol '$1' (warning)"; } else { if ($link =~ /\/$/) { unless (-d $link) { $result = "NOT FOUND"; } else { print "** $link is a directory, then regard as $link$DirectoryIndex\n" if $Verbose; if ("OK" eq ($val = check($files, "$link$DirectoryIndex", $cwd, $get))) { $result = "OK"; if (defined($$files{"$path$DirectoryIndex"})) { $$files{$path} = $$files{"$path$DirectoryIndex"}; } } else { $result = "a directory (warning)"; } } } else { if (-c $link) { # such as /dev/null $result = "OK"; } elsif (-f $link) { $result = "OK"; &file_load($path, $files) if ($get); } elsif ($link =~ /\.html?$/i && $ContentNego && @Languages) { print "** $link doesn't exist, trying language pattern match...\n" if $Verbose; foreach $lang (@Languages) { if ("OK" eq ($val = check($files, "$link.$lang", $cwd, $get))) { if (defined($$files{"$path.$lang"})) { $$files{$path} = $$files{"$path.$lang"}; } print "** $link.$lang exist.\n" if $Verbose; $exist = 1; } } if ($exist) { $result = "OK"; } else { print "** language pattern match failed.\n" if $Verbose; $result = "NOT FOUND"; } } else { print "** $link doesn't exist\n" if $Verbose; $result = "NOT FOUND"; } } } } $Results{$path} = $result; $result; } # talk http protocol to get header or content specified by URL and check it sub httpcheck (\$$$) { my ($files, $link, $get) = @_; my ($server, $file, $result, $method, $buf, $header, $flag, $port, $content, $handle); if ($get) { $method = "GET"; $content = ""; } else { $method = "HEAD"; } if ($link =~ m|http://(.*?)(/.*)|i) { $server = $1; $file = $2; $port = 80; if ($ViaProxy && $Proxy && $Port && $link !~ m|^http://localhost/|) { $server = $Proxy; $file = $link; $port = $Port; } print "[server: $server][port: $port][file: $file]\n" if $MoreVerbose; if (&socket_connect($server, $port)) { $result = "NOT ABLE TO BE CONNECTED"; return $result; } print S "$method $file HTTP/1.0\n"; print S "Pragma: no-cache\n\n"; $status = 0; while (defined($buf = )) { $buf =~ s/\r\n/\n/; $content .= $buf if $get; $flag = 1 if $buf =~ /^$/; $header .= "\t$buf" unless $flag; if ($buf =~ m|^HTTP/\d.\d 200|) { $result = "OK"; } elsif ($buf =~ m|^HTTP/\d.\d (\d\d\d.*)|) { $result = "ERROR because of \"$1\""; } } close(S); if ($Japanese && &is_japanese($content)) { $content = &jcode::euc($content, '', 'z'); print "** Japanese conversion occured\n" if $MoreVerbose; } &get_link_list($files, $link, $content) if $get && $result eq "OK"; } else { # just like "http://foo.bar.jp" $result = "an undesirable URL (warning)"; return $result; } print "$header" if $MoreVerbose; return $result; } # to tell the truth I have not understood socket mechanism... and # I have no idea about timeout processing and how to use 'IO::Socket' library. sub socket_connect ($$) { my ($server, $port) = @_; my ($sockaddr, $proto, $hostaddr, $sock); $sockaddr = 'S n a4 x8'; return 1 if (!defined($proto = getprotobyname('tcp'))); return 1 if (!defined($hostaddr = gethostbyname($server))); $sock = pack($sockaddr, 2, $port, $hostaddr); socket(S, PF_INET, SOCK_STREAM, $proto) || return 1; connect(S, $sock) || return 1; S->autoflush(1); 0; } # put result message. sub result_put ($$$$$) { my ($file, $link, $orig, $line, $result) = @_; unless ($Quiet && $result =~ /OK/ || $MoreQuiet && $result =~ /warning/) { print "$file:" if $FilenamePut; if (defined($orig)) { print "$line:$link [$orig] is $result.\n"; } else { print "$line:$link is $result.\n"; } } } sub Cwd { my ($cwd); $cwd = cwd(); if ($PSC eq "\\") { # Perl for Win32 can probably handle '/' as # pathname separate character. $cwd =~ s|\\|/|g; } $cwd; }