#!/usr/bin/perl -Tw
#
# Crawl a site and try yer darndest to find directory indexes/listings
#
# If you know of other indexes that this doesn't find, write a regex and
# send it my way
#
# Bugs/oddities:
#  * UTF8 doesn't work (this is LWP's fault...)
#
# Jon Hart

use strict;
use warnings;
use diagnostics;
use LWP::UserAgent;
use HTML::LinkExtor;
use Getopt::Long;
use URI::URL;

$| = 1;

my %opts = ();
# contains all of the links that need to be crawled and are properly formatted
my @links_to_process = ();
# contains all of the links returned from the latest run of HTML::LinkExtor
my @new_links = ();
# skip all links whose scheme isn't followed by a '/', which helps us ignore
# javascript:, mailto:, etc.
my %skips = map { $_ => 1 } qr(^[A-Za-z]*:[^\/]);
# only parse the following mime types
my %types = map { $_ => 1 } qw(text/html text/plain application/xhtml+xml text/x-server-parsed-html);
# ignore these file types
my %ignores = map { $_ => 1 } qw(pdf tar.gz gz);
my %headers = ();
my %links = ();
my $it = 0;
my @ticks = qw( / - \ | );

GetOptions(\%opts, 'bruteforce', 'debug', 'force', 'headers=s@',
           'recurse:i', 'quiet', 'verbose') or &usage();
&usage() unless defined($ARGV[0]);

# fill in the headers hash
if (defined($opts{'headers'})) {
    foreach (@{$opts{'headers'}}) {
        # split only on the first colon so values that contain colons survive
        my ($name, $value) = split(/:\s*/, $_, 2);
        $headers{$name} = $value;
    }
}

# start it all off...
&crawl($ARGV[0]);

# take all components of the path and try to find indexes for each
sub bruteforce {
    my $cur_url = shift;
    my $new_path = $cur_url->path;
    &debug_print("Brute-forcing $cur_url\n");
    my $new_url = $cur_url->clone;

    # we must handle URLs with embedded queries or fragments specially
    foreach my $part (qw(fragment query)) {
        if (defined($new_url->$part)) {
            # try the url as is...
            &debug_print("Checking $new_url with $part\n");
            if (defined($opts{'recurse'})) {
                &get_links($new_url, $links{$cur_url}{'depth'});
            }
            &indexcheck($new_url);
            $links{$new_url}{'processed'} = 1;

            # now get rid of this part and do it again...
            &debug_print("Checking $new_url without $part\n");
            $new_url->$part(undef);
            if (defined($opts{'recurse'})) {
                &get_links($new_url, $links{$cur_url}{'depth'});
            }
            &indexcheck($new_url);
            $links{$new_url}{'processed'} = 1;
        }
    }

    # take the current cur_url /foo/bar/baf and try to hit /foo/bar, /foo and /
    while (!($new_path eq "" || $new_path eq "/")) {
        $new_url->path($new_path);
        if (defined($opts{'recurse'})) {
            &get_links($new_url, $links{$cur_url}{'depth'});
        }
        &indexcheck($new_url);
        $links{$new_url}{'processed'} = 1;
        $new_path =~ s/\/[^\/]*$//g;
    }
    $links{$new_url}{'processed'} = 1;
}

###
# given a url, crawl it.  while crawling the links are stored and
# acted upon like a stack.
###
sub crawl {
    my $start_url = shift;
    &get_links($start_url, 0);

    while (@links_to_process) {
        my $cur_url = URI->new(pop(@links_to_process));
        # ensure that the links found first are checked first...
        @links_to_process = reverse @links_to_process;

        if (defined($links{$cur_url}{'processed'})) { next; }

        unless (defined($opts{'force'})) {
            unless ((URI->new($start_url)->authority eq $cur_url->authority)) {
                &debug_print("Skipping offsite URL $cur_url\n");
                $links{$cur_url}{'processed'} = 1;
                next;
            }
        }

        if (defined($opts{'bruteforce'})) {
            &bruteforce($cur_url);
        } else {
            if (defined($opts{'recurse'})) {
                # pass the current link's depth so the --recurse limit can take effect
                &get_links($cur_url, $links{$cur_url}{'depth'});
            }
            &indexcheck($cur_url);
            $links{$cur_url}{'processed'} = 1;
        }
    }
}

###
# sub for HTML::LinkExtor to minimally check
# and store links in the temporary array for later
# processing
###
sub store_links {
    my ($tag, %attr) = @_;

    LINK: foreach my $link (values %attr) {
        foreach my $skip (keys %skips) {
            if ($link =~ /$skip/i) {
                &debug_print("Skipping $link\n");
                next LINK;
            }
        }
        &tick;
        push(@new_links, $link);
    }
}

###
# given a url, find all of the links contained therein,
# storing them in the appropriate manner
###
sub get_links {
    my $url = shift;
    my $depth = shift;
    @new_links = ();

    if (defined($links{$url}{'crawled'})) { return; }
    $links{$url}{'crawled'} = 1;

    if (defined($opts{'recurse'}) && $opts{'recurse'} > 0 && $depth > $opts{'recurse'}) {
        &debug_print("Max depth reached.\n");
        return;
    }

    &verbose_print("Crawling $url for links\n");
    my $ua = &my_ua;
    my $parser = HTML::LinkExtor->new(\&store_links);
    my $response = $ua->request(HTTP::Request->new(GET => $url),
                                sub { $parser->parse($_[0]) });

    foreach (@new_links) {
        my $new_url = URI->new_abs($_, $response->base);
        push(@links_to_process, $new_url);
        $links{$new_url}{'depth'} = $depth + 1;
        &tick;
    }
    push(@links_to_process, $url);
    $links{$url}{'depth'} = $depth + 1;

    unless ($response->is_success) {
        &debug_print("$url failed " . $response->status_line . "\n");
    }
}

###
# given a URL, does it appear to be an "index"
###
sub indexcheck {
    my $url = shift;
    my $index_type = undef;
    &tick;

    # this link has already been checked...
    if (defined($links{$url}{'checked'})) { return; }
    $links{$url}{'checked'} = 1;

    # two checks for URLs that are very similar ('foo' vs 'foo/')
    if (!($url =~ /\/$/) && defined($links{$url . "/"}{'index'})) { return; }
    if ($url =~ /\/$/) {
        (my $tmp_url = $url) =~ s/\/$//g;
        if (defined($links{$tmp_url}{'index'})) { return; }
    }

    foreach my $ignore (keys %ignores) {
        if ($url =~ /\.$ignore$/) {
            &debug_print("Skipping $url as .$ignore isn't worth downloading\n");
            $links{$url}{'crawled'} = 1;
            return;
        }
    }

    my $ua = &my_ua;
    my $res = $ua->request(HTTP::Request->new(GET => $url));

    if ($res->is_success) {
        unless (defined($types{$res->content_type})) {
            &debug_print("Skipping $url as " . $res->content_type . " isn't worth parsing\n");
            $links{$url}{'crawled'} = 1;
            return;
        }

        for ($res->content) {
            &debug_print("Checking $url for indexes\n");
            if (/Index of .*<\/title>/i) {
                $index_type = "Apache";
            } elsif (/Directory Listing For/) {
                $index_type = "Tomcat";
            } elsif (/\s*(\w+),\s+(\w+)\s+(\d+),\s+(\d{4})\s+(\d{1,2}):(\d\d) (AM|PM)\s+([\d\.]+)/) {
                $index_type = "IIS";
            } elsif (/Parent Directory<\/a>/) {
                $index_type = "Generic";
            }
        }

        if (defined($index_type)) {
            if ($index_type =~ /^[aeiou]/i) {
                print("$url is an $index_type index\n");
            } else {
                print("$url is a $index_type index\n");
            }
            $links{$url}{'index'} = $index_type;
        }
    } else {
        &debug_print("$url failed " . $res->status_line . "\n");
    }
}
"\n"); } } ### # silly helper function to make a spinning "ticker" ### sub tick { $it = 0 if ++$it > $#ticks; unless (defined($opts{'quiet'})) { print("\e[0E", $ticks[$it], "\e[0E"); } } sub my_ua { my $ua = LWP::UserAgent->new; $ua->agent("Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1"); my %cookies; $ua->cookie_jar(\%cookies); foreach (keys %headers) { $ua->default_header($_ => $headers{$_}); } return $ua; } sub usage { print <<EOF; $0: find webserver indexes Usage: [-b | --bruteforce] # bf URL fragments. from /foo/bar/baf, check /foo/bar and /foo [-d | --debug] # show debug output [-f | --force] # force recursion "offsite" -- use with caution. 75% of the time this spirals out of control [-h | --headers] # add headers of the form "Header: value". may be used repeatedly [-q | --quiet] # be quiet [-r | --recurse] # crawl the site recursively. Takes an optional depth limiter. [-v | --verbose] # be verbose EOF exit 1; } sub debug_print { my $msg = shift @_; if (defined($opts{'debug'})) { print "$msg"; } } sub verbose_print { my $msg = shift @_; if (defined($opts{'verbose'})) { print "$msg"; } }