Skip to main content.
home | support | download

Back to List Archive

[swish-e] Hit-highlighting of HTML pages

From: Ostermayr Richard Dr. <Richard.Ostermayr(at)not-real.dpma.de>
Date: Fri Oct 24 2008 - 08:01:01 GMT
Hi,

 

I'd like to present a simple demo CGI script for Hit-highlighting of HTML pages.

 

Any comments are welcome.

 

Best regards

 

 

Richard Ostermayr

 

 

 

#!D:\Perl\perl.exe -w

 

 

########################################################################

# demo CGI script for highlighting SWISH-E hits in HTML files.         #

#                                                                      #

# CGI params:                                                          #

# url=            URL of HTML file                                     #

# query =   SWISHE-E query                                             #

#                                                                      #

# Copyright 2008 Richard Ostermayr - All rights reserved.              #

#    This program is free software; you can redistribute it and/or     #

#    modify it under the terms of the GNU General Public License       #

#    as published by the Free Software Foundation; either version      #

#    2 of the License, or (at your option) any later version.          #

#                                                                      #

#    This program is distributed in the hope that it will be useful,   #

#    but WITHOUT ANY WARRANTY; without even the implied warranty of    #

#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the     #

#    GNU General Public License for more details.                      #

#                                                                      #

#    The above lines must remain at the top of this program            #

########################################################################

 

use strict;

use HTTP::Request::Common qw(POST);

use LWP::UserAgent;

use CGI qw(:standard);

 

my $q = new CGI;

 

# ISO 8859-1, wie im INDEX_HEADER, aber um A-Z&; erweitert

my $char = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ&;'.

               '';

 

my $ua = LWP::UserAgent->new;

 

my $url = $q->param('url');

my $query = uc($q->param('query'));

 

#Operatoren und Hochkommatas (Phrasesearch, Suche der Operatoren) entfernen

$query =~ s/ AND | OR | NOT | NEAR[0-9]* |\"|\'|\(|\)/ /g;

#Endtrunkierungen durch Perl-regex ersetzen

$query =~ s/\*(\s|$)/\[$char\]\* /g;  # & und ; wegen &auml;&ouml;&uuml;

$query =~ s/\?(\s|$)/\[$char\] /g;

#Verbliebenes internes ? durch . ersetzen fr regex

$query =~ s/\?/\[$char\]/g;

 

    my %entities = ( #GK-Schreibung nicht bercksichtigen

       #'&' => '(&amp;)',

        '>' => '(&gt;)',

        '<' => '(&lt;)',

        '"' => '(&quot;)',

        '' => '(&Auml;||&auml;|)',

        '' => '(&Auml;||&auml;|)',

        '' => '(&Ouml;||&ouml;|)',

        '' => '(&Ouml;||&ouml;|)',

            '' => '(&Uuml;||&uuml;|)',

            '' => '(&Uuml;||&uuml;|)',

        '' => '(&szlig;|)'

    );

 

$query =~ s/&/&amp;/g;  # " fix emacs

for (keys %entities){

    $query =~ s/(^|[^|])($_)/$1$entities{$2}/g;  # " fix emacs

}

 

my @keywords = split(/\s+/, $query);

 

my $response = $ua->get($url);

if (!$response->is_success) { 

   print "Content-type: text/html\n\n";

   print "<HTML><BODY>Fehler beim Laden von $url.</BODY></HTML>";

   die $response->status_line;   

}

 

my $html = $response->content;

 

my $hl_beg = '<span style="color:#000000;background-color:#FF8C00;font-weight:bold">';

my $hl_end = '</span>';

 

foreach my $kw (@keywords){

 

 if ($kw =~ /[$char]/){

 #Zuerst die $kw markieren, die direkt von einem Tag umschlossen sind : (>)($kw)(<)

 $html =~ s/(>)($kw)/$1$hl_beg$2$hl_end/ig; #Nicht innerhalb von HTML-Tags markieren!

 

 #Dann die $kw auerhalb von Tags markieren, die mind. 1 Zeichen von Tags entfernt sind

 while( $html =~ s/(>[^<>]+?)($kw)([^$char])/$1$hl_beg$2$hl_end$+/ig ) {} 

 #Nicht innerhalb von HTML-Tags markieren!

 }

}

 

#Damit die relativen Links funktionietren

$html =~ s/(<head>)/$1<base href="$url">/i;

 

print "Content-type: text/html\n\n";

print $html;

 

 



_______________________________________________
Users mailing list
Users@lists.swish-e.org
http://lists.swish-e.org/listinfo/users
Received on Fri Oct 24 04:00:54 2008