#!/usr/local/bin/perl
#############################################################
# Program: spider.pl
# Copyright 2000
#
# Description: This program accepts a single URL from the
#              command line and then crawls or spiders the
#              URL to find all embedded local links in all
#              child pages. This program returns a list
#              of local links as standard output.
#
# Author: Dana French (dfrench@email.com)
#         (405) 329-6578
#
# Date: 12/14/2000
#
# Limitations: This script assumes that the URLs within a
#              web page all begin with "href=" and extend
#              across a single line. This script will NOT
#              spider a URL which is coded as multiple
#              lines within a web page. This implies that
#              the "href=" must also be on the same line.
#############################################################
# Modifications:
#
# 12/14/2000 Version: 1.0
#            Original Code
################################################################

# get the URL and category from the command line arguments
${url} = "${ARGV[0]}";
${category} = "${ARGV[1]}";
${hrefs[0]} = ${url};

# identification string sent as the HTTP User-Agent header
${spider} = "spider.pl/1.0";

# set a variable to reference the base URL.  Only those web pages
# which are a child of the URL referenced on the command line will
# be spidered.
${url_base} = ${url};
$_ = $url;
${url_base} = $1 if ( m|^(http://.*)/.*| );
${url_base} = $1 if ( m|^(https://.*)/.*| );

# make sure the base URL ends with exactly one "/"
${url_base} =~ s|/*$|/|;
print "url_base = ${url_base}\n";

# set a variable to reference the host name URL.
${url_top} = ${url_base};
${url_top} =~ s|(.*://.+?/).*/|$1|;
print "url_top = ${url_top}\n";

# Initialize Perl modules which are required to perform
# the http requests.
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;

################################################################
sub Crawler {
    # print "url = ${url}\n";

    # set a variable equal to the URL of the directory
    # immediately above the page to be spidered.
    ${url_dir} = ${url};
    ${url_dir} =~ s|^(.*/).*|$1|g;
    # print "url_dir = ${url_dir}\n";

    # download the HTML code at the specified URL
    $ua = LWP::UserAgent->new();
    $ua->agent("${spider}");
    $request  = HTTP::Request->new(GET => $url);
    $response = $ua->request($request);

    # obtain all the lines which contain an "href"
    foreach ( grep ( /[Hh][Rr][Ee][Ff]/, split ( /\n/, $response->content() ) ) )
    {
        # extract only the URL from the line of text in the HTML code;
        # each successive pattern overrides the previous one if it
        # matches, so the quoted forms take precedence
        ${url_child} = "";
        ${url_child} = $1 if m|.*[Hh][Rr][Ee][Ff] *= *(.+?)\076.*|;
        ${url_child} = $1 if m|.*[Hh][Rr][Ee][Ff] *= *(.+?)\s.*|;
        ${url_child} = $1 if m|.*[Hh][Rr][Ee][Ff] *= *\'(.+?)\'.*|;
        ${url_child} = $1 if m|.*[Hh][Rr][Ee][Ff] *= *\"(.+?)\".*|;
        next if ( "_${url_child}" eq "_" );

        # delete extraneous information from the end of the URL
        ${url_child} =~ s|\'||g;
        ${url_child} =~ s|\"||g;
        ${url_child} =~ s|#.*||g;
        ${url_child} =~ s|\?.*||g;
        ${url_child} =~ s|\&.*||g;
        # truncate at the first whitespace character
        ${url_child} =~ s|\s.*||g;

        # ignore HREFs which are not http
        next if ( ${url_child} =~ m|^mailto:.*| );
        next if ( ${url_child} =~ m|^javascript:.*| );
        next if ( ${url_child} =~ m|^.*:.*:.*| );
        next if ( ${url_child} =~ m|^ftp:.*| );
        next if ( ${url_child} =~ m|^telnet:.*| );
        next if ( ${url_child} =~ m|^#.*| );

        # for URLs which begin with a "/", prepend the top level
        # URL of the site
        if ( ${url_child} =~ m|^/.*| )
        {
            ${url_child} =~ s|^/|${url_top}|g;
        }

        # for URLs which don't begin with a "/", "http:", or "https:",
        # prepend the URL of the current directory
        if ( ( ${url_child} !~ m|^/.*| ) &&
             ( ${url_child} !~ m|^http:.*| ) &&
             ( ${url_child} !~ m|^https:.*| ) )
        {
            ${url_child} =~ s|^|${url_dir}|g;
        }

        # Check each URL to see if it matches a URL already in
        # the list of URLs to spider
        $matches = 0;
        foreach $href (@hrefs)
        {
            ++${matches} if ( $href =~ m|^\Q${url_child}\E$|i );
        }
        # print "matches = ", $matches, "\n";

        # If the URL is not in the list, and it is a child of the
        # base URL, add it to the end of the list of URLs to spider.
        if ( ${matches} == 0 )
        {
            $hlen = @hrefs;
            if ( ${url_child} =~ m|^\Q${url_base}\E.*|i )
            {
                ${hrefs[${hlen}]} = ${url_child};
                print "url_child = ${url_child}\n";

                # hand the new URL off to the search engine's
                # spider/index CGI
                ${url_spider} = "http://opensystems/cgi-bin/dfrench/search.cgi?action=spider_noemail&category=${category}&url=${url_child}";
                $request  = HTTP::Request->new(GET => ${url_spider});
                $response = $ua->request($request);
                $stitle   = $response->title();
                print "Spidering: ${stitle}\n";
            }
        }
    }
}

################################################################
# Main body of program begins
################################################################
${hindex} = 0;
${hend}   = @hrefs;
# print "hindex = ", $hindex, " hend = ", $hend, "\n";

# keep crawling until every URL collected so far has been spidered
while ( ${hindex} < ${hend} )
{
    # print "hindex = ", $hindex, " hend = ", $hend, "\n";
    ${url} = ${hrefs[${hindex}]};
    # print "inside while url = ", ${url}, "\n";
    chomp(${url});
    &Crawler;
    ${hend} = @hrefs;
    ++${hindex};
}
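################################################################
# Example invocation (a minimal sketch; the host name, path, and
# category value below are hypothetical and not part of this
# script):
#
#     ./spider.pl http://www.example.com/docs/ manuals
#
# The first argument becomes the base URL and limits the crawl to
# child pages of that URL; the optional second argument is passed
# through to search.cgi as the "category" parameter.  Local links
# found during the crawl are printed to standard output as
# "url_child = ..." lines.
################################################################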