#!/usr/bin/perl -Tw
#
# Given a host/URL, find all cookies set by way of Set-Cookie headers.
# Optionally, crawl.
#
# Jon Hart

use strict;
use warnings;
use diagnostics;
use LWP::UserAgent;
use HTML::LinkExtor;
use Getopt::Long;
use Data::Dumper;
use URI::URL;

$| = 1;
my %opts = ();
# contains all of the links that still need to be crawled, properly formatted
my @links_to_process = ();
# skip links whose scheme is not followed by "//", which helps us
# ignore javascript:, mailto:, etc.
my %skips = map { $_ => 1 } qr(^[A-Za-z]*:[^\/]);
# only parse the following mime types
my %types = map { $_ => 1 } qw(text/html text/plain application/xhtml+xml text/x-server-parsed-html);
# ignore these file types
my %ignores = map { $_ => 1 } qw(pdf tar.gz gz);
my %headers = ();
my %links = ();
my @new_links = ();
my $it = 0;
my @ticks = qw( / - \ | );

GetOptions(\%opts, 'A=i', 'B=i', 'debug', 'force', 'headers=s@', 'quiet',
           'recurse:i', 'verbose') or &usage();

# fill in the headers hash
if (defined($opts{'headers'})) {
    foreach (@{$opts{'headers'}}) {
        my @header = split(/:\s*/, $_, 2);
        $headers{$header[0]} = $header[1];
    }
}

foreach (@ARGV) {
    unless (/^(file|http|https):\/\//) {
        print(STDERR "No file/http/https specified. Assuming http\n");
        $_ = "http://$_";
    }
    if (defined($opts{'recurse'})) {
        &crawl($_);
    } else {
        &find_cookies($_);
    }
}

###
# given a url, crawl it. while crawling, the links are stored and
# acted upon like a stack.
###
sub crawl {
    my $start_url = shift;
    &get_links($start_url, 0);
    while (@links_to_process) {
        my $cur_url = URI->new(pop(@links_to_process));
        # ensure that the links found first are checked first...
        @links_to_process = reverse @links_to_process;
        if (defined($links{$cur_url}{'processed'})) { next; }
        # unless --force was given, stay on the starting site
        unless (defined($opts{'force'})) {
            unless ((URI->new($start_url)->authority eq $cur_url->authority) ||
                    (URI->new($start_url)->authority eq ('www.' . $cur_url->authority)) ||
                    (('www.' . URI->new($start_url)->authority) eq $cur_url->authority)) {
                &debug_print("Skipping offsite URL $cur_url\n");
                $links{$cur_url}{'processed'} = 1;
                next;
            }
        }
        if (defined($opts{'recurse'})) {
            &get_links($cur_url, $links{$start_url}{'depth'});
        }
        &find_cookies($cur_url);
        $links{$cur_url}{'processed'} = 1;
    }
}

###
# fetch a single URL and print every Set-Cookie name/value pair
# found in the response headers
###
sub find_cookies {
    my $url = shift;
    my $ua = &my_ua();
    my $res = $ua->get($url);
    if ($res->is_success) {
        foreach my $header (split(/\n/, $res->headers_as_string)) {
            if ($header =~ /^Set-Cookie:\s*(\S+)=(\S+);.*$/) {
                my $key = $1;
                my $value = $2;
                print("$key:$value\n");
            }
        }
    }
}

###
# fetch a URL, extract its links, and queue them for later processing
###
sub get_links {
    my $url = shift;
    my $depth = shift;
    @new_links = ();
    if (defined($links{$url}{'crawled'})) { return; }
    $links{$url}{'crawled'} = 1;
    if (defined($opts{'recurse'}) && $opts{'recurse'} > 0 && $depth > $opts{'recurse'}) {
        &debug_print("Max depth reached.\n");
        return;
    }
    &verbose_print("Crawling $url for links\n");
    my $ua = &my_ua;
    my $parser = HTML::LinkExtor->new(\&store_links);
    my $response = $ua->request(HTTP::Request->new(GET => $url),
        sub { $parser->parse($_[0]) });
    foreach (@new_links) {
        my $new_url = URI->new_abs($_, $response->base);
        push(@links_to_process, $new_url);
        $links{$new_url}{'depth'} = $depth + 1;
        &tick;
    }
    push(@links_to_process, $url);
    $links{$url}{'depth'} = $depth + 1;
    unless ($response->is_success) {
        &debug_print("$url failed " . $response->status_line .
"\n"); } } sub my_ua { my $ua = LWP::UserAgent->new; $ua->agent("Mozilla/5.0 (compatible; MSIE 6.0; Windows NT 5.1"); foreach (keys %headers) { $ua->default_header($_ => $headers{$_}); } return $ua; } ### # sub for HTML::LinkExtor to minimally check # and store links in the temporary array for later # processing ### sub store_links { my ($tag, %attr) = @_; LINK: foreach my $link (values %attr) { foreach my $skip (keys %skips) { if ($link =~ /$skip/i) { &debug_print("Skipping $link\n"); next LINK; } } &tick; push(@new_links, $link); } } sub usage { print < $#ticks; unless (defined($opts{'quiet'})) { print("\e[0E", $ticks[$it], "\e[0E"); } }