#!/usr/bin/perl -Tw
#
# Given a host/URL, find all cookies set by way of Set-Cookie headers.
# Optionally, crawl.
#
# Jon Hart

use strict;
use warnings;
use diagnostics;
use LWP::UserAgent;
use HTML::LinkExtor;
use Getopt::Long;
use Data::Dumper;
use URI::URL;

$| = 1;
my %opts = ();
# contains all of the links that still need to be crawled, properly formatted
my @links_to_process = ();
# skip links whose scheme is not followed by "//", which helps us
# ignore javascript:, mailto:, etc.
my %skips = map { $_ => 1 } qr(^[A-Za-z]*:[^\/]);
# only parse the following mime types
my %types = map { $_ => 1 } qw(text/html text/plain application/xhtml+xml text/x-server-parsed-html);
# ignore these file types
my %ignores = map { $_ => 1 } qw(pdf tar.gz gz);
my %headers = ();
my %links = ();
my @new_links = ();
my $it = 0;
my @ticks = qw( / - \ | );

GetOptions(\%opts, 'A=i', 'B=i', 'debug', 'force', 'headers=s@', 'quiet',
           'recurse:i', 'verbose') or &usage();

# fill in the headers hash
if (defined($opts{'headers'})) {
    foreach (@{$opts{'headers'}}) {
        my @header = split(/:\s*/, $_, 2);
        $headers{$header[0]} = $header[1];
    }
}

foreach (@ARGV) {
    unless (/^(file|http|https):\/\//) {
        print(STDERR "No file/http/https specified. Assuming http\n");
        $_ = "http://$_";
    }
    if (defined($opts{'recurse'})) {
        &crawl($_);
    } else {
        &find_cookies($_);
    }
}

###
# given a url, crawl it. while crawling, the links are stored and
# acted upon like a stack.
###
sub crawl {
    my $start_url = shift;
    &get_links($start_url, 0);
    while (@links_to_process) {
        my $cur_url = URI->new(pop(@links_to_process));
        # ensure that the links found first are checked first...
        @links_to_process = reverse @links_to_process;
        if (defined($links{$cur_url}{'processed'})) { next; }
        # unless --force was given, stay on the starting site
        unless (defined($opts{'force'})) {
            unless ((URI->new($start_url)->authority eq $cur_url->authority) ||
                    (URI->new($start_url)->authority eq ('www.' . $cur_url->authority)) ||
                    (('www.' . URI->new($start_url)->authority) eq $cur_url->authority)) {
                &debug_print("Skipping offsite URL $cur_url\n");
                $links{$cur_url}{'processed'} = 1;
                next;
            }
        }
        if (defined($opts{'recurse'})) {
            &get_links($cur_url, $links{$start_url}{'depth'});
        }
        &find_cookies($cur_url);
        $links{$cur_url}{'processed'} = 1;
    }
}

###
# fetch a single URL and print every Set-Cookie name/value pair
# found in the response headers
###
sub find_cookies {
    my $url = shift;
    my $ua = &my_ua();
    my $res = $ua->get($url);
    if ($res->is_success) {
        foreach my $header (split(/\n/, $res->headers_as_string)) {
            if ($header =~ /^Set-Cookie:\s*(\S+)=(\S+);.*$/) {
                my $key = $1;
                my $value = $2;
                print("$key:$value\n");
            }
        }
    }
}

###
# fetch a URL, extract its links, and queue them for later processing
###
sub get_links {
    my $url = shift;
    my $depth = shift;
    @new_links = ();
    if (defined($links{$url}{'crawled'})) { return; }
    $links{$url}{'crawled'} = 1;
    if (defined($opts{'recurse'}) && $opts{'recurse'} > 0 && $depth > $opts{'recurse'}) {
        &debug_print("Max depth reached.\n");
        return;
    }
    &verbose_print("Crawling $url for links\n");
    my $ua = &my_ua;
    my $parser = HTML::LinkExtor->new(\&store_links);
    my $response = $ua->request(HTTP::Request->new(GET => $url),
        sub { $parser->parse($_[0]) });
    foreach (@new_links) {
        my $new_url = URI->new_abs($_, $response->base);
        push(@links_to_process, $new_url);
        $links{$new_url}{'depth'} = $depth + 1;
        &tick;
    }
    push(@links_to_process, $url);
    $links{$url}{'depth'} = $depth + 1;
    unless ($response->is_success) {
        &debug_print("$url failed " . $response->status_line .
"\n"); } } sub my_ua { my $ua = LWP::UserAgent->new; $ua->agent("Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1"); foreach (keys %headers) { $ua->default_header($_ => $headers{$_}); } return $ua; } ### # sub for HTML::LinkExtor to minimally check # and store links in the temporary array for later # processing ### sub store_links { my ($tag, %attr) = @_; LINK: foreach my $link (values %attr) { foreach my $skip (keys %skips) { if ($link =~ /$skip/i) { &debug_print("Skipping $link\n"); next LINK; } } &tick; push(@new_links, $link); } } sub usage { print < $#ticks; unless (defined($opts{'quiet'})) { print("\e[0E", $ticks[$it], "\e[0E"); } }