Search Engine

The search script used to find bulletins on Erik's Rail News is based on Matt's Simple Search. Matt's original version generates a list of links to every file containing the word you're looking for. Mine actually prints every file, instead of linking them. If you're looking for the search page, it's right here.

My version also has a "next 10" feature, a log feature to see what people search for, and you can bookmark searches. The latest feature is a context-sensitive help feature which reacts according to your search terms and results. I'm working on an "intelligent" case sensitivity feature and a better wildcard feature so you can have better control over wildcard searches.

Below is the search script. I've left out the parts unchanged from Matt's version.
# parse_form()
#
# Decode the CGI request parameters into the global %FORM hash.
#
# Input:   $ENV{QUERY_STRING} for GET requests, or $ENV{CONTENT_LENGTH}
#          bytes of STDIN for POST requests.  Any other request method
#          yields no parameters.
# Output:  %FORM{name} = decoded value for every name=value pair.
#          $weird (global) is non-zero when any value contained one of the
#          symbols _ * [ ] that are stripped before searching.
# Returns: nothing meaningful.
sub parse_form {

    # The GET method puts the search query into the URL so that a search
    # can be bookmarked; POST bodies are still accepted.
    my $method = defined $ENV{'REQUEST_METHOD'} ? $ENV{'REQUEST_METHOD'} : '';
    my $buffer = '';
    if ($method eq 'GET') {
        $buffer = $ENV{'QUERY_STRING'};
    }
    elsif ($method eq 'POST') {
        read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'} || 0);
    }
    $buffer = '' unless defined $buffer;

    # Reset the flag once per request.  Resetting it for every pair would
    # let a later parameter (boolean, case, startat) erase the flag raised
    # by the search terms.
    $weird = 0;

    # Split the name-value pairs
    foreach my $pair (split(/&/, $buffer)) {
        next if $pair eq '';

        # LIMIT 2 keeps any literal "=" inside the value intact.
        my ($name, $value) = split(/=/, $pair, 2);
        $value = '' unless defined $value;

        # Undo form encoding: "+" is a space, %XX is a byte.
        $value =~ tr/+/ /;
        $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C", hex($1))/eg;

        # fix european letters
        # NOTE(review): as written each substitution replaces a character
        # with itself (a backslash before it in the replacement is a no-op).
        # Presumably these once produced HTML entities -- confirm against
        # the deployed script before changing them.
        $value =~ s/å/\å/g;
        $value =~ s/Å/\Å/g;
        $value =~ s/ä/\ä/g;
        $value =~ s/Ä/\Ä/g;
        $value =~ s/ö/\ö/g;
        $value =~ s/Ö/\Ö/g;
        $value =~ s/ü/\ü/g;
        $value =~ s/Ü/\Ü/g;
        $value =~ s/Ø/\Ø/g;
        $value =~ s/ø/\ø/g;

        # Delete symbols which may disturb the search.  "|" is blanked as
        # well, but only when one of _ * [ ] is also present.
        if ($value =~ /[_*\[\]]/) {
            $weird++;
            $value =~ tr/_|*[]/ /;
        }

        $FORM{$name} = $value;
    }
}

# return_html()
#
# Print the complete CGI response for a search: HTTP header, the search
# form pre-filled with the current terms, context-sensitive tips, and
# either a "no results" help list or one page (10 bulletins) of results
# followed by a "Next 10" link when more remain.
#
# Reads globals:  %FORM (terms, startat, boolean, case), @terms, %include
#                 (file name => 'yes' for every matching bulletin), $weird.
# Sets globals:   $words, $startat, $endat, $stop, $count, $boolean, $case,
#                 $i (number of search terms) and $numberofhits.  They stay
#                 global because print_stats reads several of them.
# Returns: nothing meaningful; all output goes to the selected filehandle.
sub return_html {

    # Escape text for use inside HTML content or a quoted attribute.
    my $escape_html = sub {
        my ($text) = @_;
        $text = '' unless defined $text;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        $text =~ s/"/&quot;/g;
        return $text;
    };

    # Percent-encode text for use as a query-string value.
    my $escape_url = sub {
        my ($text) = @_;
        $text = '' unless defined $text;
        $text =~ s/([^A-Za-z0-9_.~-])/sprintf('%%%02X', ord($1))/eg;
        return $text;
    };

    $words = $FORM{'terms'};
    $words = '' unless defined $words;

    # startat comes straight from the URL: force it to a non-negative
    # integer before it is used in arithmetic or echoed into the page.
    $startat = (defined $FORM{'startat'} && $FORM{'startat'} =~ /\A(\d+)\z/)
             ? $1 + 0
             : 0;
    $endat   = $startat + 10;
    $stop    = $startat + 11;    # no longer used here; kept in case other code reads it
    $count   = 0;
    $boolean = $FORM{'boolean'};
    $boolean = '' unless defined $boolean;
    $case    = $FORM{'case'};
    $case    = '' unless defined $case;
    $i       = scalar @terms;    # number of search terms

    print "Content-type: text/html\n\n";
    print "<html><HEAD><LINK REL=\"StyleSheet\" HREF=\"../railnews/railnews.css\" TYPE=\"text/css\">\n";
    print "<title>Results of Rail News Search</title></head>\n";
    print "<body><center><h1>Search Results</h1>\n\n";

    # Search form, pre-filled with the (escaped) current terms.
    print "<form method=GET action=\"bulletinsearch.pl\"><input type=text name=\"terms\" size=60 value=\"";
    print join(' ', map { $escape_html->($_) } @terms);
    print "\"><table border=\"0\"><tr><td>";

    if ($boolean eq 'AND') {
        print "<input type=\"radio\" name=\"boolean\" value=AND checked>AND<br><input type=\"radio\" name=\"boolean\" value=OR>OR</td><td>\n";
    }
    elsif ($boolean eq 'OR') {
        print "<input type=\"radio\" name=\"boolean\" value=AND>AND<br><input type=\"radio\" name=\"boolean\" value=OR checked>OR</td><td>\n";
    }
    if ($case eq 'Insensitive') {
        print "<input type=\"radio\" name=\"case\" value=Insensitive checked>Case Insensitive<br><input type=\"radio\" name=\"case\" value=Sensitive>Case Sensitive</td><td>\n";
    }
    elsif ($case eq 'Sensitive') {
        print "<input type=\"radio\" name=\"case\" value=Insensitive>Case Insensitive<br><input type=\"radio\" name=\"case\" value=Sensitive checked>Case Sensitive</td><td>\n";
    }

    print "</td><td><input type=\"hidden\" name=\"startat\" value=\"0\"><INPUT TYPE=submit VALUE=\"Find Bulletins\"></td></tr></table></form></center><p>\n\n";

    # Matching bulletins.  Sorted so that every request sees the same
    # order; hash key order is not stable between processes, and paging
    # with "Next 10" depends on a stable order.
    my @hits = sort grep { defined $include{$_} && $include{$_} eq 'yes' } keys %include;
    $numberofhits = scalar @hits;

    # Terms joined for use inside a query string.
    my $url_terms = join('+', map { $escape_url->($_) } @terms);

    if ($numberofhits == 0) {
        print "Sorry, couldn't find any bulletins. Tips:<ul>\n";
        if ($words !~ /[a-z]/) {
            if ($case eq 'Sensitive') {
                print "<li>Unless you're looking for something that is normally written with ALL CAPITALS, try pressing the \"Case Insensitive\" button.</li>\n";
            }
            if ($words =~ /\./) {
                print "<li>Erik's Rail News writes acronyms without dots (.).</li>\n";
            }
        }
        if ($weird != 0) {
            print "<li>Sorry, this search engine does not recognize weird symbols like \[, \], _, or *. All words are treated like wildcards, and quoted phrases are not supported.</li>\n";
        }
        if ($i > 1) {
            print "<li>Try using fewer or more general search terms.</li>\n";
        }
        print "<li>All words are treated like wildcards, so for example \"comput\" will find computer, computers and computing.</li>\n";
        print "<li>Try the Erik's Rail News <a href=\"../archive/search_and_archive.html\">Search and Archive page</a>.</li>\n";
        print "<li>If you're pretty sure what you're looking for is not here, <a href=\"http://www.altavista.com/cgi-bin/query?pg=q&text=yes&kl=XX&q=";
        print $url_terms;
        print "+rail+train&act=search\">click here to continue your search on AltaVista</a>. On AltaVista, quote phrases and put a plus sign in front of words (+word) or phrases (+\"high speed rail\")to force the appearance of a word or phrase on a page. Put a minus sign in front of a word (\"-word\") to exclude it. See the help page for powerful search options!</li>\n";
        print "</ul>\n";    # close the tips list
    }
    else {
        print "Found $numberofhits bulletins. Showing $startat to $endat in no particular order.<ul>";
        if ($words !~ /[a-z]/) {
            if ($case eq 'Insensitive') {
                print "<li>If you're looking for an acronym like ICE or EWS, try pressing the \"Case Sensitive\" button.</li>\n";
            }
        }
        if ($weird != 0) {
            print "<li>Sorry, this search engine does not recognize weird symbols like \[, \], _, or *. All words are treated like wildcards, and quoted phrases are not supported.</li>\n";
        }
        if ($words =~ /britain|canada|china|finland|france|germany|hungary|italy|sweden|norway|mexico|poland|portugal/i) {
            print"<li>Looking for information about a country? Use a wildcard (all words are treated as wildcards) to find the noun as well as the adjective. For example, to find information about Britain, just type in \"brit\", for Britain and British. For Sweden, type \"swed\" and so on.</li>\n";
        }
        if ($words =~ /rail|train/i) {
            print "<li>Unless words like \"rail\" or \"train\" form part of the name of an organisation you are interested in, try deleting these words from your search to get more accurate results.</li>\n";
        }
        if ($numberofhits > 20) {
            unless ($words =~ /\d\d\d\d|january|february|march|april|may|june|july|august|september|october|november|december/i) {
                print "<li>Try adding a month and/or year if you want to narrow the search results.</li>\n";
            }
        }
        print "</ul></p>";

        foreach my $key (@hits) {
            # 0-based position of this hit; the page covers positions
            # $startat .. $endat - 1, i.e. ten bulletins on every page.
            my $position = $count++;
            next if $position < $startat;

            if ($position < $endat) {
                print "<center><hr width=50%></center>\n\n";

                # Copy the bulletin file to the output without involving a
                # shell (portable, and safe for unusual file names).
                if (open(my $bulletin, '<', $key)) {
                    while (my $line = <$bulletin>) {
                        print $line;
                    }
                    close($bulletin);
                }
                else {
                    warn "return_html: cannot read bulletin '$key': $!\n";
                }
                next;
            }

            # At least one more hit exists beyond this page: offer the
            # next page and stop.
            print "<center><hr width=50%>\n\n";
            print "<a href=\"bulletinsearch.pl?terms=";
            print $url_terms;
            print "&startat=$endat&boolean=" . $escape_url->($boolean)
                . "&case=" . $escape_url->($case) . "\">Next 10</a></center>";
            print "\n\n";
            last;
        }
    }

    print "<center><p>Back to <a href=\"../railnews\">Erik's Rail News</a> front page</p></center>\n";
    print "</body>\n</html>\n";
}

# print_stats()
#
# Append one HTML paragraph describing the current search to the log file
# "searchwords" in the current directory: the hit count, the search terms,
# the boolean and case modes, and the client's User-Agent and host.
#
# Reads globals: $numberofhits, @terms, $boolean, $case (the scalars are
#                set by return_html), plus $ENV{HTTP_USER_AGENT} and
#                $ENV{REMOTE_HOST}.
# Returns: nothing meaningful.  Logging is best effort: a failure to open
#          or write the log is reported with warn() and never aborts the
#          request.
sub print_stats {

    # Terms, User-Agent and host are client-controlled and end up in an
    # HTML file, so neutralise markup before writing them.
    my $escape_html = sub {
        my ($text) = @_;
        $text = '' unless defined $text;
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        $text =~ s/"/&quot;/g;
        return $text;
    };

    my $hits  = defined $numberofhits ? $numberofhits : '';
    my $mode  = defined $boolean      ? $boolean      : '';
    my $match = defined $case         ? $case         : '';

    my $entry = "<p>$hits bulletins with the words ";

    # join() puts the separator only between terms; a hand-kept counter
    # shared with other subs cannot be trusted to find the last term.
    $entry .= join(', ', map { $escape_html->($_) } @terms);

    if ($mode eq 'AND') {
        $entry .= " AND ";
    }
    elsif ($mode eq 'OR') {
        $entry .= " OR ";
    }
    if ($match eq 'Insensitive') {
        $entry .= " case insensitive ";
    }
    elsif ($match eq 'Sensitive') {
        $entry .= " case sensitive ";
    }

    my $browser = $escape_html->($ENV{'HTTP_USER_AGENT'});
    my $host    = $escape_html->($ENV{'REMOTE_HOST'});
    $entry .= "<br>$browser $host</p>\n\n";

    my $log;
    unless (open($log, '>>', 'searchwords')) {
        warn "print_stats: cannot append to searchwords: $!\n";
        return;
    }
    print {$log} $entry;
    close($log) or warn "print_stats: error closing searchwords: $!\n";
    return;
}