#!/usr/bin/perl -Tw
#
# Crawl a site and try yer darndest to find directory indexes/listings
#
# If you know of other indexes that this doesn't find, write a regex and 
# send it my way
#
# Bugs/oddities:
#    * UTF8 doesn't work (this is LWP's fault...)
#    * 
#
# Jon Hart <jhart@spoofed.org>
use strict;
use warnings;
use diagnostics;
use LWP::UserAgent;
use HTML::LinkExtor;
use Getopt::Long;
use URI::URL;
$| = 1;
my %opts = ();
# contains all of the links that need to be crawled but are properly formatted
my @links_to_process = ();
# contains all of the links returned from the latest run of HTML::LinkExtor
my @new_links = ();
# skip all links that dont start with foo:/, which helps us ignore javascript:, mailto:, etc.
my %skips = map { $_ => 1 } qr(^[A-Za-z]*:[^\/]);
# only parse the following mime types
my %types = map { $_ => 1 } qw(text/html text/plain application/xhtml+xml text/x-server-parsed-html);
# ignore these file types
my %ignores = map { $_ => 1 } qw(pdf tar.gz gz);
my %headers = ();
my %links = ();
my $it = 0;
my @ticks = qw( / - \ | );
GetOptions( \%opts, 'bruteforce', 'debug', 'force', 'headers=s@', 'recurse:i', 'quiet', 'verbose') or
            die "Unknown option: $!\n" && &usage();
&usage() unless defined($ARGV[0]);
# fill in the headers hash
if (defined($opts{'headers'})) {
   foreach (@{$opts{'headers'}}) {
      my @header = split(/:/, $_);
      $headers{$header[0]} = $header[1];
   }
}
# start it all off...
&crawl($ARGV[0]);
# take all components of the path and try to find indexes for each 
sub bruteforce {
   my $cur_url = shift;
   my $new_path = $cur_url->path;
   &debug_print("Brute-forcing $cur_url\n");
   my $new_url = $cur_url->clone;
   # we must handle URLs with embedded queries or fragments specially
   foreach my $part (qw(fragment query)) {
      if (defined($new_url->$part)) {
         # try the url as is...
         &debug_print("Checking $new_url with $part\n");
         if (defined($opts{'recurse'})) {
            &get_links($new_url, $links{$cur_url}{'depth'});
         }
         &indexcheck($new_url);
         $links{$new_url}{'processed'} = 1;
         # now get rid of this part and do it again...
         &debug_print("Checking $new_url without $part\n");
         $new_url->$part(undef);
         if (defined($opts{'recurse'})) {
            &get_links($new_url, $links{$cur_url}{'depth'});
         }
         &indexcheck($new_url);
         $links{$new_url}{'processed'} = 1;
      }
   }
   # take the current cur_url /foo/bar/baf and try to hit /foo/bar, /foo and /
   while (!($new_path eq "" || $new_path eq "/")) {
      $new_url->path($new_path);
      if (defined($opts{'recurse'})) {
         &get_links($new_url, $links{$cur_url}{'depth'});
      }
      &indexcheck($new_url);
      $links{$new_url}{'processed'} = 1;
      $new_path =~ s/\/[^\/]*$//g;
   }
   $links{$new_url}{'processed'} = 1;
}
###
# given a url, crawl it.  while crawling the links are stored and
# acted upon like a stack.
###
sub crawl {
   my $start_url = shift;
   &get_links($start_url, 0);
   while (@links_to_process) {
      my $cur_url = URI->new(pop(@links_to_process));
      # ensure that the links found first are checked first...
      @links_to_process = reverse @links_to_process;
      if (defined($links{$cur_url}{'processed'})) { next; }
      unless (defined($opts{'force'})) {
         unless ((URI->new($start_url)->authority eq $cur_url->authority)) {
            &debug_print("Skipping offsite URL $cur_url\n");
            $links{$cur_url}{'processed'} = 1;
            next;
         }
      }
      if (defined($opts{'bruteforce'})) {
         &bruteforce($cur_url);
      } else {
            if (defined($opts{'recurse'})) {
               &get_links($cur_url, $links{$start_url}{'depth'});
            }
            &indexcheck($cur_url);
            $links{$cur_url}{'processed'} = 1;
      }
   }
}
###
# sub for HTML::LinkExtor to minimally check 
# and store links in the temporary array for later
# processing
###
sub store_links {
   my ($tag, %attr) = @_;
   LINK: foreach my $link (values %attr) { 
      foreach my $skip (keys %skips) {
         if ($link =~ /$skip/i) {
            &debug_print("Skipping $link\n"); 
            next LINK;
         }
      }
      &tick;
      push(@new_links, $link);
   }
}
###
# given a url, find all of the links contained therein,
# storing them in the appropriate manner
###
sub get_links {
   my $url = shift;
   my $depth = shift;
   @new_links = ();
   if (defined($links{$url}{'crawled'})) { return; }
   $links{$url}{'crawled'} = 1;
   if (defined($opts{'recurse'}) && $opts{'recurse'} > 0 && $depth > $opts{'recurse'}) { 
      &debug_print("Max depth reached.\n");
      return;
   }
   &verbose_print("Crawling $url for links\n");
   my $ua = &my_ua;
   my $parser = HTML::LinkExtor->new(\&store_links);
   my $response = $ua->request(HTTP::Request->new(GET => $url), sub {$parser->parse($_[0])});
   foreach (@new_links) {
      my $new_url = URI->new_abs($_, $response->base);
      push(@links_to_process, $new_url);
      $links{$new_url}{'depth'} = $depth + 1;
      &tick;
   }
   push(@links_to_process, $url);
   $links{$url}{'depth'} = $depth + 1;
   unless ($response->is_success) {
      &debug_print("$url failed " . $response->status_line . "\n");
   }
}
###
# given a URL, does it appear to be an "index"
###
sub indexcheck {
   my $url = shift;
   my $index_type = undef;
   &tick;
   # this link has already been checked...
   if (defined($links{$url}{'checked'})) { return; }
   $links{$url}{'checked'} = 1;
   # two checks for URLs that are very similar ('foo' vs 'foo/')
   if (!($url =~ /\/$/) && defined($links{$url . "/"}{'index'})) { return; }
   if ($url =~ /\/$/) {
      (my $tmp_url = $url) =~ s/\/$//g; 
      if (defined($links{$tmp_url}{'index'})) { return; }
   }
   foreach my $ignore (keys %ignores) {
      if ($url =~ /\.$ignore$/) {
         &debug_print("Skipping $url as .$ignore isn't worth downloading\n");
         $links{$url}{'crawled'} = 1;
         return;
      }
   }
   my $ua = &my_ua;
   my $res = $ua->request(HTTP::Request->new(GET => $url));
   if ($res->is_success) {
      unless (defined($types{$res->content_type})) { 
         &debug_print("Skipping $url as " . $res->content_type . " isn't worth parsing\n");
         $links{$url}{'crawled'} = 1;
         return;
      }
      for ($res->content) {
         &debug_print("Checking $url for indexes\n");
         if (/<title>Index of .*<\/title>/i) {
            $index_type = "Apache";
         } elsif (/Directory Listing For/) {
            $index_type = "Tomcat";
         } elsif (/\s*(\w+),\s+(\w+)\s+(\d+),\s+(\d{4})\s+(\d{1,2}):(\d\d) (AM|PM)\s+([\d\.]+)/) {
            $index_type = "IIS";
         } elsif (/Parent Directory<\/a>/) {
            $index_type = "Generic";
         }
      }
      if (defined($index_type)) {
         if ($index_type =~ /^[aeiou]/i) {
            print("$url is an $index_type index\n");
         } else {
            print("$url is a $index_type index\n");
         }
         $links{$url}{'index'} = $index_type;
      }
   } else {
      &debug_print("$url failed " . $res->status_line . "\n");
   }
}
###
# silly helper function to make a spinning "ticker"
###
sub tick {
   $it = 0 if ++$it > $#ticks;
   unless (defined($opts{'quiet'})) { print("\e[0E", $ticks[$it], "\e[0E"); }
}
sub my_ua {
   my $ua = LWP::UserAgent->new;
   $ua->agent("Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1");
   my %cookies;
   $ua->cookie_jar(\%cookies);
   foreach (keys %headers) {
      $ua->default_header($_ => $headers{$_});
   }
   return $ua;
}
sub usage {
   print <<EOF;
   $0: find webserver indexes
   Usage:
   [-b | --bruteforce] # bf URL fragments.  from /foo/bar/baf, check /foo/bar and /foo
   [-d | --debug]    # show debug output
   [-f | --force]    # force recursion "offsite" -- use with caution.  75% of the time this spirals out of control
   [-h | --headers]  # add headers of the form "Header: value".  may be used repeatedly
   [-q | --quiet]    # be quiet
   [-r | --recurse]  # crawl the site recursively.  Takes an optional depth limiter.
   [-v | --verbose]  # be verbose
EOF
exit 1;
}
sub debug_print { 
   my $msg = shift @_;
   if (defined($opts{'debug'})) { print "$msg"; }
}
sub verbose_print { 
   my $msg = shift @_;
   if (defined($opts{'verbose'})) { print "$msg"; }
}