#!/usr/bin/ruby
# Creates a map of a given URI. Crawls the URI given and finds every link
# in a depth-first search. For each onsite link found, a node is created.
# For every link found by visiting that node, an edge is created. And so
# on.
#
# VERY RESOURCE INTENSIVE. I recommend 2G+ of RAM or a limitation on the
# depth of the crawl (-r).
#
# Requires the installation of ruby-graphviz:
#
#   gem install -r ruby-graphviz
#
# Hawler:
#
#   wget http://spoofed.org/files/hawler/Hawler.gem && gem install Hawler.gem
#
# graphviz/dot:
#
#   apt-get install graphviz (debian, ubuntu)
#
# Jon Hart
#
# Copyright (c) 2008, Jon Hart
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of the nor the
# names of its contributors may be used to endorse or promote products
# derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY Jon Hart ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED.
# IN NO EVENT SHALL BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

require 'rubygems'
require 'hawler'
require 'hawleroptions'
require 'hawlerhelper'
require 'graphviz'
require 'uri'
require 'digest/md5'

# Returns the path component of +uri+ as a String.
#
# +uri+ may be a URI object or anything whose string form parses as a URI.
# A nil path is returned as an empty string. URI.parse may raise
# URI::InvalidURIError on malformed input; that is not rescued here.
def uri_path(uri)
  URI.parse(uri.to_s).path.to_s
end

# Adds a node for the path of +node+ to the graph @m, unless a node for
# that path was already added.
#
# Nodes are identified by the MD5 hex digest of the path only, so URIs that
# differ only in host, query or fragment share a single node. The digest is
# prefixed with "f" to form the graph node ID, and the path itself is used
# as the node label. @nodes records which digests have been added.
def add_node(node)
  path = uri_path(node)
  name = Digest::MD5.hexdigest(path)
  return if @nodes[name]

  @m.add_node("f#{name}", 'label' => path)
  @nodes[name] = 1
end

# Adds a directed edge from the node for +fro+ to the node for +to+, unless
# that edge was already added.
#
# Both endpoints are identified the same way as in add_node (MD5 of the
# path, prefixed with "f"). @edges records which edges have been added.
def add_edge(to, fro)
  to_name = Digest::MD5.hexdigest(uri_path(to))
  fro_name = Digest::MD5.hexdigest(uri_path(fro))
  key = "#{fro_name} -> #{to_name}"
  return if @edges[key]

  @m.add_edge("f#{fro_name}", "f#{to_name}")
  @edges[key] = 1
end

# Callback handed to Hawler (see the Hawler.new call below).
#
# u        - the URI passed as the first callback argument
# r        - the URI passed as the second callback argument; presumably the
#            referer of +u+ -- confirm against Hawler
# response - the response for +u+, or nil; its body is harvested for links
#
# Does nothing unless both +u+ and +r+ pass HawlerHelper.valid_uri. Adds
# nodes for +u+ and +r+ and an edge r -> u (no edge when both share a path),
# then adds a node and an edge u -> link for every link harvested from the
# response body whose path differs from that of +u+.
def map(u, r, response)
  u = HawlerHelper.valid_uri(u)
  return unless u

  r = HawlerHelper.valid_uri(r)
  return unless r

  if u.path == r.path
    add_node(u)
  else
    add_node(u)
    add_node(r)
    add_edge(u, r)
  end

  return if response.nil?

  HawlerHelper.harvest(u, response.body).each do |l|
    # Skip links back to the page itself.
    next if l.path == u.path

    add_node(l)
    add_edge(l, u)
  end
end

@options = HawlerOptions.parse(ARGV)
@m = GraphViz.new("akljfdlkajdf")
#@m['concentrate'] = 'true'
@nodes = {}
@edges = {}

if ARGV.empty?
  puts @options.help
else
  @crawler = Hawler.new(ARGV[0], method(:map))

  # Copy every parsed option onto the crawler through its setter.
  @options.each_pair do |o, v|
    @crawler.send("#{o}=", v)
  end

  @crawler.start
  @m.add_graph("adf")

  # Derive the output file name from the URI: drop the scheme and replace
  # slashes so the result is a single file name.
  name = ARGV[0].to_s.gsub(/^https?:\/\//, '').gsub(/\//, '_')

  # you'll definitely want to tweak the output here, because
  # I've seen very spotty behavior with png. jpeg can't support
  # large enough graphs.
  @m.output("output" => "png", "file" => "#{name}-map.png")
  @m.output("output" => "dot", "file" => "#{name}-map.dot")
  puts "Done, map is #{name}-map.png, #{name}-map.dot"
end