Skip to main content.
home | support | download

Back to List Archive

Re: SWISH::API and highlighting

From: Bill Moseley <moseley(at)not-real.hank.org>
Date: Sun Jul 18 2004 - 06:09:01 GMT
Thanks Jonas, I've been meaning to get to this.  Can you send me a
diff -u output?

Thanks,

On Thu, Jul 15, 2004 at 03:23:18AM -0700, Jonas Wolf wrote:
> Ok, I realise this was my mistake. SWISH::PhraseHighlight of course 
> requires SWISH::Stemmer to look up the stemmed word for every "real" word 
> in the text. Since SWISH::Stemmer is quite old and the SWISH::API access 
> to the stemming functions has changed, I altered SWISH::PhraseHighlight to 
> cope with this. Now it is passed a stemming function in the constructor 
> which is then used for stemming, if necessary. Here is the code, if anyone 
> is interested.
> 
> In swish.cgi, after getting the results into a record:
> 
> push @records, $result while $cnt-- && ($result = $results->NextResult);
> 
> We can now define our stemmer function:
>  
>     my $stemmer = (@records) ? sub
>     {
>         my $word = shift;
>         my $fw = $records[0]->FuzzyWord($word);
>         my ($newword) = $fw->WordList;
>         return $newword;
>     } : sub
>     {
>         my $word = shift;
>         return $word;
>     };
> 
> and pass this to the highlighting module
> 
> $highlighter = SWISH::PhraseHighlight->new( \%highlight_settings, 
> \%headers, $stemmer );
> 
> The following code is the new PhraseHighlight.pm
> 
> package SWISH::PhraseHighlight;
> use strict;
> 
> use constant DEBUG_HIGHLIGHT => 0;
> 
> sub new {
>     my ( $class, $settings, $headers, $stemmer ) = @_;
> 
>     my $self = bless {
>         settings => $settings,
>         headers  => $headers,
>     }, $class;
> 
> 
> 
>     if ( $self->header('stemming applied') =~ /^(?:1|yes)$/i ) {
>         $self->{stemmer_function} = \&$stemmer;
>     }
> 
> 
>     $self->{stopwords} = { map { $_, 1 } split /\s+/, 
> $self->header('stopwords') };
> 
> 
>     $self->set_match_regexp;
> 
> 
>     return $self;
> }
> 
> 
> sub header {
>     my $self = shift;
>     return '' unless ref $self->{headers} eq 'HASH';
>     return $self->{headers}{$_[0]} || '';
> }
> 
> 
> #=========================================================================
> # Highlight a single property -- returns true if any words highlighted
> # no, no returns true really means that the text was processed and most
> # importantly HTML escaped.
> 
> sub highlight {
> 
>     my ( $self, $text_ref, $phrase_list ) = @_;
> 
>     my $wc_regexp = $self->{wc_regexp};
>     my $extract_regexp = $self->{extract_regexp};
> 
> 
>     my $last = 0;
> 
>     my $found_phrase = 0;
> 
>     my $settings = $self->{settings};
> 
>     my $Show_Words = $settings->{show_words} || 10;
>     my $Occurrences = $settings->{occurrences} || 5;
>     my $Max_Words = $settings->{max_words} || 100;
> 
> 
> 
>     my $On = $settings->{highlight_on} || '<b>';
>     my $Off = $settings->{highlight_off} || '</b>';
> 
>     my $on_flag  = 'sw' . time . 'on';
>     my $off_flag = 'sw' . time . 'off';
> 
> 
>     my $stemmer_function = $self->{stemmer_function};
> 
>     # Should really call unescapeHTML(), but then would need to escape <b> 
> from escaping.
> 
> 
> 
>     # Split into "swish" words.  For speed, should work on a stream 
> method.
>     my @words = split /$wc_regexp/, $$text_ref;
>     return unless @words;
> 
>     my @flags;  # This marks where to start and stop display.
>     $flags[$#words] = 0;  # Extend array.
> 
>     my $occurrences = $Occurrences;
> 
> 
>     my $word_pos = $words[0] eq '' ? 2 : 0;  # Start depends on if first 
> word was wordcharacters or not
> 
> 
>     my $show_all_words = 0;
> 
>     # Remember, that the swish words are every other in @words.
> 
>     WORD:
>     while ( $Show_Words && $word_pos * 2 < @words ) {
> 
>         PHRASE:
>         foreach my $phrase ( @$phrase_list ) {
> 
>             print STDERR "  Search phrase '@$phrase'\n" if 
> DEBUG_HIGHLIGHT;
>             next PHRASE if ($word_pos + @$phrase -1) * 2 > @words;  # 
> phrase is longer than what's left
> 
> 
>             my $end_pos = 0;  # end offset of the current phrase
> 
>             # now compare all the words in the phrase
> 
>             my ( $begin, $word, $end );
> 
>             for my $match_word ( @$phrase ) {
> 
>                 my $cur_word = $words[ ($word_pos + $end_pos) * 2 ];
>                 unless ( $cur_word =~ /$extract_regexp/ ) {
> 
>                     my $idx = ($word_pos + $end_pos) * 2;
>                     my ( $s, $e ) = ( $idx - 10, $idx + 10 );
>                     $s = 0 if $s < 0;
>                     $e = @words-1 if $e >= @words;
> 
>                     warn  "Failed to  IgnoreFirst/Last from word '"
>                     . (defined $cur_word ? $cur_word : '*undef')
>                     . "' (index: $idx) word_pos:$word_pos end_pos:$end_pos 
> total:"
>                     . scalar @words
>                     . "\n-search pharse words-\n"
>                     . join( "\n", map { "$_ '$phrase->[$_]'" } 0..@$phrase 
> -1 )
>                     . "\n-Words-\n"
>                     . join( "\n", map { "$_ '$words[$_]'" . ($_ == $idx ? 
> ' <<< this word' : '') } $s..$e )
>                     . "\n";
> 
>                     next PHRASE;
>                 }
> 
> 
> 
> 
>                 # Strip ignorefirst and ignorelast
>                 ( $begin, $word, $end ) = ( $1, $2, $3 );  # this is a 
> waste, as it can operate on the same word over and over
> 
>                 my $check_word = lc $word;
> 
>                 if ( $end_pos && exists $self->{stopwords}{$check_word} ) 
> {
>                     $end_pos++;
>                     print STDERR " Found stopword '$check_word' in the 
> middle of phrase - * MATCH *\n" if DEBUG_HIGHLIGHT;
>                     redo if  ( $word_pos + $end_pos ) * 2 < @words;  # go 
> on to check this match word with the next word.
> 
>                     # No more words to match with, so go on to next 
> pharse.
>                     next PHRASE;
>                 }
> 
>                 if ( $stemmer_function ) {
>                     my $w = $stemmer_function->($check_word);
>                     print STDERR "Stemmer : $check_word => $w\n" if $w && 
> DEBUG_HIGHLIGHT;
>                     $check_word = $w if $w;
>                 }
> 
> 
> 
>                 print STDERR "     comparing source # (word:$word_pos 
> offset:$end_pos) '$check_word' == '$match_word'\n" if DEBUG_HIGHLIGHT;
> 
>                 if ( substr( $match_word, -1 ) eq '*' ) {
>                     next PHRASE if index( $check_word, substr($match_word, 
> 0, length( $match_word ) - 1) ) != 0;
> 
>                 } else {
>                     next PHRASE if $check_word ne $match_word;
>                 }
> 
> 
>                 print STDERR "      *** Word Matched '$check_word' *** \n" 
> if DEBUG_HIGHLIGHT;
>                 $end_pos++;
>             }
> 
>             print STDERR "      *** PHRASE MATCHED (word:$word_pos 
> offset:$end_pos) *** \n" if DEBUG_HIGHLIGHT;
> 
>                      $found_phrase++;
> 
> 
>             # We are currently at the end word, so it's easy to set that 
> highlight
> 
>             $end_pos--;
> 
>             if ( !$end_pos ) { # only one word
>                 $words[$word_pos * 2] = 
> "$begin$on_flag$word$off_flag$end";
>             } else {
>                 $words[($word_pos + $end_pos) * 2 ] = 
> "$begin$word$off_flag$end";
> 
>                 #Now, reload first word of match
>                 $words[$word_pos * 2] =~ /$extract_regexp/ or die "2 Why 
> didn't '$words[$word_pos]' =~ /$extract_regexp/?";
>                 # Strip ignorefirst and ignorelast
>                 ( $begin, $word, $end ) = ( $1, $2, $3 );  # probably 
> should cache this!
>                 $words[$word_pos * 2] = "$begin$on_flag$word$end";
>             }
> 
> 
>             # Now, flag the words around to be shown
>                      my ($start, $stop);
>                      if (!$show_all_words) {
>                                  $start = ($word_pos - $Show_Words + 1) * 
> 2;
>                                  $stop   = ($word_pos + $end_pos + 
> $Show_Words - 2) * 2;
>                                  if ( $start < 0 ) {
>                                      $stop = $stop - $start;
>                                      $start = 0;
>                                  }
> 
>                                  $stop = $#words if $stop > $#words;
> 
>                                  $show_all_words = 1 if ($start == 0 && 
> $stop == $#words);
>                                  if (!$show_all_words) { $flags[$_]++ for 
> $start .. $stop; }
>                      }
> 
> 
>             # All done, and mark where to stop looking
>             if ( --$occurrences <= 0 ) {
>                 $last = $stop;
>                 last WORD;
>             }
> 
> 
>             # Now reset $word_pos to word following
>             $word_pos += $end_pos; # continue will still be executed
>             next WORD;
>         }
>     } continue {
>         $word_pos ++;
>     }
> 
> 
> 
> 
>     my $dotdotdot = ' ... ';
> 
> 
>     my @output;
> 
>     my $printing;
>     my $first = 1;
>     my $some_printed;
> 
>     if ( $Show_Words && @words > 50 ) {  # don't limit context if a small 
> number of words
>         for my $i ( 0 ..$#words ) {
> 
> 
>             if ( $last && $i >= $last && $i < $#words ) {
>                 push @output, $dotdotdot;
>                 last;
>             }
> 
>             if ( $show_all_words || $flags[$i] ) {
> 
>                 push @output, $dotdotdot if !$printing++ && !$first;
>                 push @output, $words[$i];
>                 $some_printed++;
> 
>             } else {
>                 $printing = 0;
>             }
> 
>                      $first = 0;
> 
> 
>         }
>     }
> 
>     if ( !$some_printed ) {
>         for my $i ( 0 .. $Max_Words ) {
>             if ( $i > $#words ) {
>                 $printing++;
>                 last;
>             }
>             push @output, $words[$i];
>         }
>     }
> 
> 
> 
>     push @output, $dotdotdot if !$printing;
> 
>     $$text_ref = join '', @output;
>     my %entities = (
>         '&' => '&amp;',
>         '>' => '&gt;',
>         '<' => '&lt;',
>         '"' => '&quot;',
>     );
>     my %highlight = (
>         $on_flag => $On,
>         $off_flag => $Off,
>     );
> 
> 
>     $$text_ref =~ s/([&"<>])/$entities{$1}/ge;  # " fix emacs
> 
>     $$text_ref =~ s/($on_flag|$off_flag)/$highlight{$1}/ge;
> 
>     return 1;  # Means that prop was processed AND was html escaped.
>     return $found_phrase;
> 
>     # $$text_ref = join '', @words;  # interesting that this seems 
> reasonably faster
> 
> 
> 
> }
> 
> #============================================
> # Returns compiled regular expressions for matching
> #
> #
> 
> sub set_match_regexp {
>     my $self = shift;
> 
> 
> 
>     my $wc = $self->header('wordcharacters');
>     my $ignoref = $self->header('ignorefirstchar');
>     my $ignorel = $self->header('ignorelastchar');
> 
> 
>     $wc = quotemeta $wc;
> 
>     #Convert query into regular expressions
> 
> 
>     for ( $ignoref, $ignorel ) {
>         if ( $_ ) {
>             $_ = quotemeta;
>             $_ = "([$_]*)";
>         } else {
>             $_ = '()';
>         }
>     }
> 
> 
>     $wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';  # Warning: dependent on tolower 
> used while indexing
> 
> 
>     # Now, wait a minute.  Look at this more, as I'd hope that making a
>     # qr// go out of scope would release the compiled pattern.
> 
>     if ( $ENV{MOD_PERL} ) {
>         $self->{wc_regexp}      = qr/([^$wc]+)/;                     # 
> regexp for splitting into swish-words
>         $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i;  # 
> regexp for extracting out the words to compare
> 
>      } else {
>         $self->{wc_regexp}      = qr/([^$wc]+)/o;                    # 
> regexp for splitting into swish-words
>         $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi;  # 
> regexp for extracting out the words to compare
>      }
> }
> 
> 1;
> 

-- 
Bill Moseley
moseley@hank.org

Unsubscribe from or help with the swish-e list: 
   http://swish-e.org/Discussion/

Help with Swish-e:
   http://swish-e.org/current/docs
   swish-e@sunsite.berkeley.edu
Received on Sat Jul 17 23:09:24 2004