#!/usr/local/bin/perl
#############################################################
# Program: spider.pl
# Copyright 2000
#
# Description: This program accepts a single URL from the
#              command line and then crawls or spiders the
#              URL to find all embedded local links in all
#              child pages. This program returns a list
#              of local links as standard output.
#
# Author: Dana French (dfrench@email.com)
#         (405) 329-6578
#
# Date: 12/14/2000
#
# Limitations: This script assumes that the URLs within a
#              web page all begin with "href=" and extend
#              across a single line. This script will NOT
#              spider a URL which is coded as multiple
#              lines within a web page. This implies that
#              the "href=" must also be on the same line.
#############################################################
# Modifications:
#
# 12/14/2000 Version: 1.0
#            Original Code
################################################################

# get the URL and category from the command line arguments
${url} = "${ARGV[0]}";
${category} = "${ARGV[1]}";
${hrefs[0]} = ${url};

# identification string sent as the HTTP User-Agent header
${spider} = "spider.pl/1.0";

# set a variable to reference the base URL.  Only those web pages
# which are a child of the URL referenced on the command line will
# be spidered.
${url_base} = ${url};
$_ = $url;
${url_base} = $1 if ( m|^(http://.*)/.*| );
${url_base} = $1 if ( m|^(https://.*)/.*| );

# make sure the base URL ends with exactly one "/"
${url_base} =~ s|/*$|/|;
print "url_base = ${url_base}\n";

# set a variable to reference the host name URL.
${url_top} = ${url_base};
${url_top} =~ s|(.*://.+?/).*/|$1|;
print "url_top = ${url_top}\n";

# Initialize Perl modules which are required to perform
# the http requests.
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;

################################################################
sub Crawler {
    # print "url = ${url}\n";

    # set a variable equal to the URL of the directory
    # immediately above the page to be spidered.
    ${url_dir} = ${url};
    ${url_dir} =~ s|^(.*/).*|$1|g;
    # print "url_dir = ${url_dir}\n";

    # download the HTML code at the specified URL
    $ua = LWP::UserAgent->new();
    $ua->agent("${spider}");
    $request  = HTTP::Request->new(GET => $url);
    $response = $ua->request($request);

    # obtain all the lines which contain an "href"
    foreach ( grep ( /[Hh][Rr][Ee][Ff]/, split ( /\n/, $response->content() ) ) )
    {
        # extract only the URL from the line of text in the HTML code;
        # each successive pattern overrides the previous one if it
        # matches, so the quoted forms take precedence
        ${url_child} = "";
        ${url_child} = $1 if m|.*[Hh][Rr][Ee][Ff] *= *(.+?)\076.*|;
        ${url_child} = $1 if m|.*[Hh][Rr][Ee][Ff] *= *(.+?)\s.*|;
        ${url_child} = $1 if m|.*[Hh][Rr][Ee][Ff] *= *\'(.+?)\'.*|;
        ${url_child} = $1 if m|.*[Hh][Rr][Ee][Ff] *= *\"(.+?)\".*|;
        next if ( "_${url_child}" eq "_" );

        # delete extraneous information from the end of the URL
        ${url_child} =~ s|\'||g;
        ${url_child} =~ s|\"||g;
        ${url_child} =~ s|#.*||g;
        ${url_child} =~ s|\?.*||g;
        ${url_child} =~ s|\&.*||g;
        # truncate at the first whitespace character
        ${url_child} =~ s|\s.*||g;

        # ignore HREFs which are not http
        next if ( ${url_child} =~ m|^mailto:.*| );
        next if ( ${url_child} =~ m|^javascript:.*| );
        next if ( ${url_child} =~ m|^.*:.*:.*| );
        next if ( ${url_child} =~ m|^ftp:.*| );
        next if ( ${url_child} =~ m|^telnet:.*| );
        next if ( ${url_child} =~ m|^#.*| );

        # for URLs which begin with a "/", prepend the top level
        # URL of the site
        if ( ${url_child} =~ m|^/.*| )
        {
            ${url_child} =~ s|^/|${url_top}|g;
        }

        # for URLs which don't begin with a "/", "http:", or "https:",
        # prepend the URL of the current directory
        if ( ( ${url_child} !~ m|^/.*| ) &&
             ( ${url_child} !~ m|^http:.*| ) &&
             ( ${url_child} !~ m|^https:.*| ) )
        {
            ${url_child} =~ s|^|${url_dir}|g;
        }

        # Check each URL to see if it matches a URL already in
        # the list of URLs to spider
        $matches = 0;
        foreach $href (@hrefs)
        {
            ++${matches} if ( $href =~ m|^\Q${url_child}\E$|i );
        }
        # print "matches = ", $matches, "\n";

        # If the URL is not in the list, and it is a child of the
        # base URL, add it to the end of the list of URLs to spider.
        if ( ${matches} == 0 )
        {
            $hlen = @hrefs;
            if ( ${url_child} =~ m|^\Q${url_base}\E.*|i )
            {
                ${hrefs[${hlen}]} = ${url_child};
                print "url_child = ${url_child}\n";

                # hand the new URL off to the search engine's
                # spider/index CGI
                ${url_spider} = "http://opensystems/cgi-bin/dfrench/search.cgi?action=spider_noemail&category=${category}&url=${url_child}";
                $request  = HTTP::Request->new(GET => ${url_spider});
                $response = $ua->request($request);
                $stitle   = $response->title();
                print "Spidering: ${stitle}\n";
            }
        }
    }
}

################################################################
# Main body of program begins
################################################################
${hindex} = 0;
${hend}   = @hrefs;
# print "hindex = ", $hindex, " hend = ", $hend, "\n";

# keep crawling until every URL collected so far has been spidered
while ( ${hindex} < ${hend} )
{
    # print "hindex = ", $hindex, " hend = ", $hend, "\n";
    ${url} = ${hrefs[${hindex}]};
    # print "inside while url = ", ${url}, "\n";
    chomp(${url});
    &Crawler;
    ${hend} = @hrefs;
    ++${hindex};
}
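################################################################
# Example invocation (a minimal sketch; the host name, path, and
# category value below are hypothetical and not part of this
# script):
#
#     ./spider.pl http://www.example.com/docs/ manuals
#
# The first argument becomes the base URL and limits the crawl to
# child pages of that URL; the optional second argument is passed
# through to search.cgi as the "category" parameter.  Local links
# found during the crawl are printed to standard output as
# "url_child = ..." lines.
################################################################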