#!/usr/bin/perl -Tw
#
# Given a URL, find and print all comments.
#
# Useful for finding "hidden" functionality, debugging methods,
# or other things that are interesting from a security perspective.
#
# Jon Hart

use strict;
use warnings;
use diagnostics;
use LWP::UserAgent;
use HTTP::Request;
use HTML::LinkExtor;
use Getopt::Long;
use URI::URL;

$| = 1;

my %opts = ();
# contains all of the links that need to be crawled but are properly formatted
my @links_to_process = ();
# skip all links that don't start with foo:/, which helps us ignore javascript:, mailto:, etc.
my %skips = map { $_ => 1 } qr(^[A-Za-z]*:[^\/]);
# only parse the following mime types
my %types = map { $_ => 1 } qw(text/html text/plain application/xhtml+xml text/x-server-parsed-html);
# ignore these file types
my %ignores = map { $_ => 1 } qw(pdf tar.gz gz);
my %headers = ();
my %links = ();
my @new_links = ();
my $it = 0;
my @ticks = qw( / - \ | );

GetOptions(
    \%opts,
    'A=i',
    'B=i',
    'debug',
    'force',
    'headers=s@',
    'quiet',
    'recurse:i',
    'verbose') or &usage() && die("Unknown option: $!\n");

# fill in the headers hash
if (defined($opts{'headers'})) {
    foreach (@{$opts{'headers'}}) {
        # split on the first ':' only so that header values may themselves contain ':'
        my @header = split(/:/, $_, 2);
        $headers{$header[0]} = $header[1];
    }
}

if (@ARGV) {
    foreach (@ARGV) {
        unless (/^(file|http|https):\/\//) {
            print(STDERR "No file/http/https specified. Assuming http\n");
            $_ = "http://$_";
        }
        if (defined($opts{'recurse'})) {
            &crawl($_);
        } else {
            &find_comments($_);
        }
    }
} else {
    &usage();
}

###
# given a url, crawl it.  while crawling the links are stored and
# acted upon like a stack.
###
sub crawl {
    my $start_url = shift;

    &get_links($start_url, 0);
    while (@links_to_process) {
        my $cur_url = URI->new(pop(@links_to_process));
        # ensure that the links found first are checked first...
        @links_to_process = reverse(@links_to_process);
        if (defined($links{$cur_url}{'processed'})) { next; }
        unless (defined($opts{'force'})) {
            unless ((URI->new($start_url)->authority eq $cur_url->authority) ||
                    (URI->new($start_url)->authority eq ('www.' . $cur_url->authority)) ||
                    (('www.' . URI->new($start_url)->authority) eq $cur_url->authority)) {
                &debug_print("Skipping offsite URL $cur_url\n");
                $links{$cur_url}{'processed'} = 1;
                next;
            }
        }
        if (defined($opts{'recurse'})) {
            &get_links($cur_url, $links{$cur_url}{'depth'});
        }
        &find_comments($cur_url);
        $links{$cur_url}{'processed'} = 1;
    }
}

sub find_comments {
    my $url = shift;
    my $ua = &my_ua();
    my $comment = "";
    my $num = -1;
    my $in_comment = 0;
    my $in_single_comment = 0;
    my $in_js = 0;
    my $comment_start = 0;

    my $res = $ua->get($url);
    if ($res->is_success) {
        my @content = split(/\n/, $res->content);
        foreach my $line (@content) {
            $line =~ s/^\s+//g;
            $line =~ s/\s+$//g;
            $num++;
            &tick;
            for ($line) {
                if (1 == 0) {
                    next;
                } elsif ( (/<script.*<\/script>/i) || # single line js. ignore
                          (/\/\/\s*-->/) ) {          # end of js crap. ignore.
                    if ($in_single_comment) {
                        &aft_print(\@content, $num-1);
                        $in_single_comment = 0;
                    }
                } elsif ($line =~ /<(style|script)[^>]*>/i) {
                    $in_js = 1;
                    if ($in_single_comment) {
                        &aft_print(\@content, $num-1);
                        $in_single_comment = 0;
                    }
                } elsif ($line =~ /<\/(style|script)>/i) {
                    $in_js = 0;
                    if ($in_single_comment) {
                        &aft_print(\@content, $num-1);
                        $in_single_comment = 0;
                    }
                } elsif ( ($line =~ /<!--.*-->/) || # single line html comment
                          (($line =~ /^\/\// || $line =~ /\s+\/\// || $line =~ /\/\*.*\*\//) && $in_js) ) { # single line js comment
                    if (!$in_single_comment) {
                        $comment_start = $num;
                        &quiet_print("#"x80 . "\n");
                        &quiet_print("$url line " . ($num+1) . ":\n");
":\n"); &prev_print(\@content, $comment_start); $in_single_comment = 1; } print("$line\n"); } elsif ( ($line =~ // && !$in_js) || # end of multi-line html comment ($line =~ /\*\// && $in_js) # end of multi-line js commment ) { $in_single_comment = 0; if ($in_comment) { $in_comment = 0; &quiet_print("#"x80 . "\n"); &quiet_print("$url line " . ($comment_start+1) . ":\n"); &prev_print(\@content, $comment_start); print("$comment$line\n"); &aft_print(\@content, $num); } else { &debug_print("Rogue end comment at $num: $line\n"); } } else { if ($in_single_comment) { &aft_print(\@content, $num); $in_single_comment = 0; } if ($in_comment) { $comment .= $line . "\n"; } } } } } else { printf(STDERR "$url bfailed: %s\n", $res->status_line); } } sub get_links { my $url = shift; my $depth = shift; @new_links = (); if (defined($links{$url}{'crawled'})) { return; } $links{$url}{'crawled'} = 1; if (defined($opts{'recurse'}) && $opts{'recurse'} > 0 && $depth > $opts{'recurse'}) { &debug_print("Max depth reached.\n"); return; } &verbose_print("Crawling $url for links\n"); my $ua = &my_ua; my $parser = HTML::LinkExtor->new(\&store_links); my $response = $ua->request(HTTP::Request->new(GET => $url), sub {$parser->parse($_[0])}); foreach (@new_links) { my $new_url = URI->new_abs($_, $response->base); push(@links_to_process, $new_url); $links{$new_url}{'depth'} = $depth + 1; &tick; } push(@links_to_process, $url); $links{$url}{'depth'} = $depth + 1; unless ($response->is_success) { &debug_print("$url failed " . $response->status_line . "\n"); } } sub my_ua { my $ua = LWP::UserAgent->new; $ua->agent("Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1"); my %cookies; $ua->cookie_jar(\%cookies); foreach (keys %headers) { $ua->default_header($_ => $headers{$_}); } return $ua; } sub aft_print { my @content = @{$_[0]}; my $num = $_[1]; if (defined($opts{'A'})) { for (my $aft = 1; $aft <= $opts{'A'} && $num+$aft <= $#content; $aft++) { print("$content[$num+$aft]\n"); } } } sub prev_print { my @content = @{$_[0]}; my $num = $_[1]; if (defined($opts{'B'})) { my $prev = $opts{'B'}; if ($opts{'B'} > $num) { $prev = $num; } for (; $prev >= 1; $prev--) { print("$content[$num-$prev]\n"); } } } ### # sub for HTML::LinkExtor to minimally check # and store links in the temporary array for later # processing ### sub store_links { my ($tag, %attr) = @_; LINK: foreach my $link (values %attr) { foreach my $skip (keys %skips) { if ($link =~ /$skip/i) { &debug_print("Skipping $link\n"); next LINK; } } &tick; push(@new_links, $link); } } sub usage { print < $#ticks; unless (defined($opts{'quiet'})) { print("\e[0E", $ticks[$it], "\e[0E"); } }