#!/usr/bin/perl -w # # # Simple, crawl a given URL, extracting all links and check them for # redirection. # # Useful for detecting the use of mod_rewrite, mapping relationships between # sites, and determining how a given site is designed. # # Jon Hart # use diagnostics; use Getopt::Long; use HTML::LinkExtor; use HTTP::Headers; use LWP::UserAgent; use URI; GetOptions( "debug" => \(my $debug)) or &usage && die "Unknown option: $!\n"; my $base = $ARGV[0] || &usage; my @links = (); &crawl($base); &find_redirs(); sub check_redir { my $url = shift; my $ua = LWP::UserAgent->new; my $response = $ua->simple_request(HTTP::Request->new(GET => "$url")); if ($response->is_redirect) { my $location = $response->headers->header('location'); # ignore rewrites that simply tack on a '/' if (!($location eq $url . '/')) { return $location; } } return undef; } sub crawl { my $site = shift; my $ua = LWP::UserAgent->new; $ua->agent("Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.6) Gecko/20060728 Firefox/1.5.0.6"); my $parser = HTML::LinkExtor->new(\&get_links); if (defined($debug)) { print("Getting links from $site\n"); } my $response = $ua->simple_request(HTTP::Request->new(GET => $site), sub {$parser->parse($_[0])}); if ($response->is_redirect) { my $redir = $response->headers->header('location'); # ignore rewrites that simply tack on a '/' if (!($redir eq $site . '/')) { print("$site is a redirect to $redir\n"); } } } sub find_redirs { my $redir; foreach my $link (@links) { if (defined($debug)) { printf(STDERR "Checking $link... "); } if ($link =~ /^(http(|s)|ftp)/) { # link is really aboslute, do nothing } elsif ($link =~ /^[^\/]/) { # link is relative $link = $base . $link; } else { # link is mostly absolute (my $new_base = $base) =~ s/(:\/\/[^\/]+)\/.*/$1/g; $link = $new_base . $link; } if (defined($debug)) { printf(STDERR "(transformed to $link)\n"); } $redir = &check_redir($link); if (defined($redir)) { print("$link is a redirect to "); if ($redir =~ /^(http(|s)|ftp)/) { print("$redir\n"); } else { print("$base$redir\n"); } } } } sub get_links { my ($tag, %attr) = @_; foreach (values %attr) { if (!($_ =~ /^(mailto:.*|)$/)) { push(@links, $_); } } return; } sub usage { print < Simple, crawl a given URL, extracting all links and check them for redirection. Useful for detecting the use of mod_rewrite, mapping relationships between sites, and determining how a given site is designed. Usage: $0 [--debug] EOF }