#!/usr/local/bin/perl -w
##########################################################################
# Created March/April 1996, Michael D. Smith
# Part of WebGlimpse (GlimpseHTTP) research with Udi Manber
# Glimpse mailing list: glimpse@cs.arizona.edu
# WebGlimpse home page: http://glimpse.cs.arizona.edu/webglimpse
#
# Modified by Dachuan Zhang, May 22, 1996
# Some bugs fixed.
#
# 3/98 Added search-by-subdirectory section (wgall.html) --GB
# 6/98 Fixed search-by-subdirectory when urllist != archivedir --GB
# Added -w switch & cleaned up. Added .wgfilter_index check.
#
# Command line:  <script> [archivedir] [-r] [-k] [-q]
#   archivedir   first argument; the current directory is used when it is
#                missing or empty
#   -r           sets $removing (skips the configuration check, the search
#                page generation and the reading of the search box)
#   -k           sets $keep_nh
#   -q           sets $quiet
##########################################################################

##########################################################################
## GLOBALS
##########################################################################
$archivedir = shift(@ARGV);

# Default the archive directory right away.  This used to happen only after
# the .wgfilter-index path had been built from $archivedir, which produced
# "/.wgfilter-index" (plus an uninitialized-value warning under -w) when no
# directory was given.
if (!defined($archivedir) || $archivedir eq "") {
    $archivedir = ".";    # make it current dir
}

## Cleaner handling of options as per M. Ernst. --GB 9/24/97
@filelist = ();
@THEBOX   = ();
@THEPAGE  = ();
@urllist  = ();

# Added for checking .wgfilter_index --GB 6/30/98
%IndexAD  = ();
%IndexPAT = ();

# make_search_all indexes these names as ARRAYS ($#IndexPAT, $IndexPAT[$i],
# $IndexAD[$i]), so make sure the arrays exist as well.  The hashes above are
# kept in case code outside this file section relies on them.
# NOTE(review): presumably open_indexallowdeny fills the arrays -- confirm.
@IndexAD  = ();
@IndexPAT = ();

##########################################################################
## SETTINGS
##########################################################################
# to be changed
$WEBGLIMPSE_HOME = "/home/bugs/bugstopper-www";
$CGIBIN          = "cgi-bin";

# No longer used
#$FULLSEARCH="webglimpse-fullsearch";

# static
# NOTE(review): the dots in this pattern are unescaped and therefore match
# any character; left unchanged because its users are not visible here.
$HTMLFILE_RE    = "((.s?html)|(.sht)|(.htm))\$";
$WEBGLIMPSE_LIB = "$WEBGLIMPSE_HOME/lib";
#$BACKUPEXT=".wgbak"; # Now unused
$MADENH              = ".wg_madenh";
$SEARCHBOX           = ".wgbox.html";
$SEARCHPAGE_TEMPLATE = ".wgindex.html";
$SEARCHPAGE          = "wgindex.html";
$SEARCHALL_TEMPLATE  = ".wgall.html";
$SEARCHALL           = "wgall.html";
$nh_pre              = ".nh.";

##########################################################################
## ENTRY POINT
##########################################################################

#---------------------------------
# make my libraries more important
unshift(@INC, "$WEBGLIMPSE_LIB");
require "config.pl";

#---------------------------------
## Read the filter file to check files allowed to index.
## (The file name used on disk is the hyphenated ".wgfilter-index".)
$WGINDEX = '.wgfilter-index';
$WGINDEX = "$archivedir/$WGINDEX";
open_indexallowdeny("$WGINDEX");

## Cleaner handling of options as per M. Ernst. --GB 9/24/97
$removing = 0;
$keep_nh  = 0;
$quiet    = 0;
while (defined($arg = shift(@ARGV))) {
    if ($arg eq "-r") {
        $removing = 1;
    } elsif ($arg eq "-k") {
        $keep_nh = 1;
    } elsif ($arg eq "-q") {
        $quiet = 1;
    } else {
        die "Bad argument $arg (remaining args @ARGV)";
    }
}
## End option handling change --GB 9/24/97

# (The empty-$archivedir default is applied at the top of the file, before
# $archivedir is first used.)

# try to change the directory to indexdir
# Remember where we started; chomp so the path can be handed back to chdir
# (with the trailing newline from `pwd` the final chdir could never succeed).
$startpwd = `pwd`;
chomp $startpwd;

$retval = chdir($archivedir);
if ($retval == 0) {
    print "Cannot change directory to $archivedir.  Quitting.\n";
    exit -3;
}

# get the 'real' path
$archivepwd = `pwd`;
chomp $archivepwd;

# make sure it has a configuration file
if ($removing == 0 && TestConfig($archivepwd) == 0) {
    print "Cannot find configuration file for archive.  Quitting.\n";
    exit -4;
}

#----------------------
$MADENH    = "$archivepwd/$MADENH";
$SEARCHBOX = "$archivepwd/$SEARCHBOX";

# get the settings from the configuration file
# there should be no problem opening this file -- we know it exists

# read the settings
# ($title, $archiveurl, $traverse_type, $numhops,$nhhops,$addboxes) =
#	&ReadConfig($archivepwd);

# Initialize variables to avoid warnings
($title, $archiveurl, $traverse_type, $explicit_only, $numhops,
 $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem) =
    ('', '', '', '', '', '', '', '', '', '', '');

($title, $archiveurl, $traverse_type, $explicit_only, $numhops,
 $nhhops, $local_limit, $remote_limit, $addboxes, $vhost, $usemaxmem,
 @urllist) = ReadConfig($archivepwd);

### HUH?  What does this code do?  Can we remove it?
### here just in case.
# if we're told to do nothing, do nothing.
# Removed next 4 lines by bgopal oct/11/96
#if($addboxes==1 && $removing==0){
#	print "Possible problem with addsearch.  Please investigate (debug msg).\n";
#	exit(0);
#}

$archivetitle = $title;

if ($removing == 0) {
    # generate the wgindex.html file from the .wgindex.html
    read_search_page($SEARCHPAGE_TEMPLATE);
    make_search_page($SEARCHPAGE);

    # For subdirectory search, generate wgall.html that searches any subdirectory
    if ($traverse_type == 2) {
        read_search_page($SEARCHALL_TEMPLATE);
        make_search_all($SEARCHALL);
    }
}

# if we don't have to addboxes, just abort!
if ($addboxes == 0) {
    print "No search boxes used\n";
    exit(0);
}

# open the .wg_madenh
open(my $filelist_fh, '<', $MADENH)
    or die "Cannot open $MADENH for reading: $!";

# get the filelist from the file (one entry per line, newlines retained)
@filelist = <$filelist_fh>;
close($filelist_fh);

# read the search box
if ($removing == 0) {
    read_search_box($SEARCHBOX);
}

# make the additions to the corefiles
add_search_box(@filelist);

#----------------------
# change the dir back
chdir($startpwd)
    or warn "Cannot change directory back to $startpwd: $!\n";

##########################################################################
### PROCEDURES
##########################################################################

##########################################################################
# read_search_page(FILE)
#   Loads every line of the template FILE into the global @THEPAGE,
#   replacing whatever it held before.  Dies if FILE cannot be opened.
##########################################################################
sub read_search_page {
    my ($file) = @_;

    open(my $in, '<', $file)
        or die "Cannot open $file for reading: $!\n";
    @THEPAGE = <$in>;
    close($in);
    return;
}

##########################################################################
# make_search_page(FILE)
#   Writes the template lines held in @THEPAGE to FILE, replacing the
#   literal placeholders $ARCHIVETITLE, $ARCHIVEPWD, $CGIBIN and $FILE
#   with the globals $archivetitle, $archivepwd, $CGIBIN and with FILE
#   itself.  Dies if FILE cannot be opened for writing or closed.
##########################################################################
sub make_search_page {
    my ($file) = @_;

    open(my $out, '>', $file)
        or die "Cannot open $file for writing: $!\n";

    # output the contents of THEPAGE
    foreach my $template_line (@THEPAGE) {
        # Work on a copy: the loop variable aliases the @THEPAGE element and
        # the template itself must stay untouched.
        my $line = $template_line;

        # do the substitutions and output (order kept as in the original)
        $line =~ s/\$ARCHIVETITLE/$archivetitle/g;
        # Not using ARCHIVEURL starting with v1.6  Was only used for full
        # image paths in dist files and local copy pointers.  --GB 1/18/98
        # $line =~ s/\$ARCHIVEURL/$archiveurl/g;
        $line =~ s/\$ARCHIVEPWD/$archivepwd/g;
        $line =~ s/\$CGIBIN/$CGIBIN/g;
        $line =~ s/\$FILE/$file/g;

        print $out $line;
    }

    # Buffered write errors only surface at close time, so check it.
    close($out)
        or die "Cannot close $file after writing: $!\n";
    return;
}

##########################################################################
# make_search_all(FILE)
#   Opens FILE for writing and walks @THEPAGE; for a line containing the
#   $DIRECTORYOPTIONS placeholder it lists the subdirectories of every
#   absolute entry in @urllist and checks each against @IndexPAT/@IndexAD.
#   NOTE(review): `find $entry` goes through the shell with $entry
#   interpolated unquoted -- confirm @urllist entries are trusted.
##########################################################################
sub make_search_all{
    local($file)=@_;
    my($realline, $line, $newfile, $dirlist, $eachdir, $entry);
    my($noindex,$pattern,$i);

    $dirlist = '';
    open(OUTPUT,">$file") || die "Cannot open $file for writing.\n";

    # output the contents of THEPAGE
    foreach $realline(@THEPAGE){
        $line = $realline;

        # do the substitutions and output
        if ($line =~ /\$DIRECTORYOPTIONS/) {

            # Print some comments explaining what this is
            print OUTPUT "\n";

            # We need to search each directory in the URLLIST and its subdirs
            # NOT necessarily the subdirs of archivepwd.  --GB 6/30/98
            foreach $entry (@urllist) {

                # Skip entries that don't look like nice canonical directories
                # If indexing by directory, we expect canonicalized directories in urllist
                if ($entry !~ /^\//) {
                    next;
                }
                $dirlist = `find $entry -type d -print`;
                if ($dirlist ne '') {
                    foreach $eachdir (split(/\n/,$dirlist)) {

                        # Unless explicitly denied, allow search of this subdir
                        $noindex=1; # Default to allowing
                        foreach $i (0 .. $#IndexPAT) {
                            $pattern = $IndexPAT[$i];
                            if ($eachdir =~ /$pattern/) {
                                $noindex = $IndexAD[$i];
                                last;
                            }
                        }
                        if ($noindex) {
                            print OUTPUT '