Skip to main content.
home | support | download

Back to List Archive

[swish-e] Hit-highlighting of large PDF files

From: Ostermayr Richard Dr. <Richard.Ostermayr(at)not-real.dpma.de>
Date: Fri Oct 24 2008 - 07:51:28 GMT
Hi,

 

Here I present a piece of CGI code for Hit-highlighting of (large or even huge) PDF files.

 

For a simpler implementation for small PDF files please refer to my previous posting "Hit-highlighting of PDF files".

 

The highlighting is achieved by simply appending to the PDF file a small piece of Javascript, 

which performs a query similar to the SWISH-E query every time the PDF file is opened.

For this purpose, The SWISH-E query is transformed to fit the requirements of a PDF query.

Even internal wildcards (normally not treated in Acrobat Reader) are handled for the first 5 pages.

 

The advantages of this approach are: 

1. The exploitation of the proximity search feature described in the Acrobat JavaScript Scripting Guide

(as far as I know, a proximity search is not possible in the Adobe Reader search menu).

2. The shift of the CPU load caused by the PDF search to the client.

3. The persistence of the generated query (it is a PDF OpenAction).

 

Since IO::String it not yet well suited for huge files, the script builds a temporary file instead. 

On our server this technique works for PDFs > 100 MB.

 

Note: Since there is aproblem with  $filesize > 32 MB

In this case we use  Text::PDF::File_neu which is a modification of Text::PDF::File

(see extra posting "problem with Text::PDF::File").

 

PDF files form a major part of every intranet. Therfore, it may be advantageous to incorporate 

this technique into the SWISH-E package.

 

Any comments are welcome.

 

Best regards

 

 

Richard Ostermayr

 

 

 

 

#!D:\Perl\perl.exe -w

 

 

########################################################################

# CGI script for transferring SWISH-E queries into PDF files.          #

#                                                                      #

# CGI params:                                                          #

# url=            URL of PDF file                                      #

# query =   SWISHE-E query                                             #

# type =    search type (word, near/proximity or phrase)               #

#                                                                      #

# Copyright 2008 Richard Ostermayr - All rights reserved.              #

#    This program is free software; you can redistribute it and/or     #

#    modify it under the terms of the GNU General Public License       #

#    as published by the Free Software Foundation; either version      #

#    2 of the License, or (at your option) any later version.          #

#                                                                      #

#    This program is distributed in the hope that it will be useful,   #

#    but WITHOUT ANY WARRANTY; without even the implied warranty of    #

#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the     #

#    GNU General Public License for more details.                      #

#                                                                      #

#    The above lines must remain at the top of this program            #

########################################################################

 

use strict;

use LWP::UserAgent;

use Time::localtime;

#use Text::PDF::File;

use Text::PDF::Utils;

use Text::PDF::Page;

use Text::PDF::Pages;

use Text::PDF::SFont;

use CAM::PDF;

use CGI qw(:standard);

use POSIX;

use IO::File;

use Time::HiRes qw ( sleep );

my $p; 

 

my $q = new CGI;

 

my $query_dist;

my $query_phrase;

my $match_type;

my $dist;

my $pdfstr;

 

# ISO 8859-1 

my $char = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ&;'.

               '';

 

my $url = $q->param('url');

my $query = uc($q->param('query'));

my $type = $q->param('type');

 

#PDF von URL laden und in Temp-Datei ablegen

my $tmp_datei = &Tmp_Datei_Verz.POSIX::tmpnam()."$$";

 

open(AUS, ">>$tmp_datei") or abbruch("Kann Tmp-Datei nicht ffnen: $!");

binmode(AUS);

 

my $ua = LWP::UserAgent->new;

my $req = new HTTP::Request ('GET', $url);

 

my $zaehler = 10; 

my $response;

do{ $response = $ua->request($req, \&callback, 10000)} 

while(!$response->is_success && $zaehler--);

if (!$response->is_success) { abbruch($response->status_line) }

 

sub callback{   

      my($data, $response, $protocol) = @_; 

      print AUS $data;

}

close(AUS);

 

if($type eq 'word'){   

 

my $pdf_hl = $query;

  #Vorbereitung fr Wortsuche, endstndige Trunkierungen entfernen

  $pdf_hl =~ s/[\*\?]+(\s|$|"|\))/ /g;

  #Operatoren und Hochkommatas (Phrasesearch, Suche der Operatoren) entfernen

  $pdf_hl =~ s/ AND | OR | NOT | NEAR[0-9]* |\"|\'|\(|\)/ /g;

  #Verbliebenes internes ? durch . ersetzen fr PERL-regex

  $pdf_hl =~ s/\?/\[$char\]/g;

 

  my @keywords = split(/\s+/, $pdf_hl);

  $pdf_hl = '';

  foreach (@keywords){$pdf_hl .= " $_";}

  $pdf_hl =~ s/\[$char\]/!/g;        #Urspr. Trunkierungszeichen als ! darstellen (vorher internes ?)

  

  if ($pdf_hl =~ /!/){

  #Auswertung der ertsen 5 Seiten, um aus ? innnerhalb von Suchtermen reale Suchbegriffe zu erhalten

  my %pdf_hl_trunk;

  my %pdf_hl_stamm;

  

  $zaehler = 10; 

  my $pdf;

  do{ sleep (0.1); $pdf = CAM::PDF->new($tmp_datei) } 

  while(!$pdf && $zaehler--);

  if(!$pdf){ abbruch("CAM::PDF: Oeffnen fehlgeschlagen: $!") }

 

  my $seiten = $pdf->numPages() or abbruch("CAM::PDF: Seitenzaehlen fehlgeschlagen: $!");

  if ($seiten > 5){$seiten = 5}  

  while ($seiten){

  my $text = $pdf->getPageContent($seiten--) or abbruch("CAM::PDF: Seitenextraktion fehlgeschlagen: $!");

  while($text =~ s/\][^\[]+\[//g){}            #Schnelle Textextraktion

  while($text =~ s/\)[0-9\.\-]*\(//g){}

  while($text =~ s/\\\(|\\\)//g){}

  while($text =~ s/\.\.//g){}

  $text =~ s/[^$char ]//g;

  foreach my $kw (@keywords){

   if($kw =~ /\[$char\]/){

          while ($text =~ s/($kw)//i) {

               #print "$1<br>";

               $pdf_hl_trunk{$1}++;

               $pdf_hl_stamm{$kw}++;

         }

   }

  }}

  

 

  for (keys %pdf_hl_stamm){

      my $stamm = $_;

      $stamm =~ s/\[$char\]/!/g;

      $pdf_hl =~ s/ $stamm//ig;

  } #Wenn echte Suchworte gefunden wurden, werden die ursp. Terme entfernt

      

  for (keys %pdf_hl_trunk){if($pdf_hl !~ /$_/i){ $pdf_hl .= " $_"; }}

  $pdf_hl =~ s/!/\?/g;

  } #Ende Auswertung der ersten 5 Seiten

  

  $pdf_hl =~ s/^\s+//;

 

 if ($query =~ /[$char][\*\?]/){ 

     $pdfstr = "search.matchWholeWord = false;";  

 }else{ 

     $pdfstr = "search.matchWholeWord = true;";

 }

 $pdfstr .= "search.wordMatching = 'MatchAnyWord';search.query('$pdf_hl', 'ActiveDoc');";

}

 

if($type eq 'near'){

if(!($query =~ /([$char]+)[\*\?]*\s+NEAR([0-9]*)\s+([$char]+)[\*\?]*/)){

      aussteigen("Abstandsoperator NEAR konnte nicht gefunden werden.");

 }

 $query_dist = "$1 $3";

 $dist = $2;

 if ($& =~ /[$char][\*\?]/){ 

     $pdfstr = "search.matchWholeWord = false;";  

 }else{ 

     $pdfstr = "search.matchWholeWord = true;";

 }

 $pdfstr .= "search.wordMatching = 'MatchAllWords';search.proximity = true;".

                  "search.proximityRange = '$dist';search.query('$query_dist', 'ActiveDoc');";

}

 

if($type eq 'phrase'){  #Hier Trunkierung nur am uersten rechten Ende erlaubt

if(!($query =~ /"\s*(([$char]+\s+)*[$char]+)[\*\?]*\s*"/)){

      aussteigen("Phrase (in Anf&uuml;hrungszeichen) konnte nicht gefunden werden (Trunkierung nur am rechten Ende erlaubt)."); 

 }

 $query_phrase = $1;

 if ($& =~ /[$char][\*\?]/){ 

     $pdfstr = "search.matchWholeWord = false;";  

 }else{ 

     $pdfstr = "search.matchWholeWord = true;";

 }

 $pdfstr .= "search.wordMatching = 'MatchPhrase';search.query('$query_phrase', 'ActiveDoc');";

}

 

# Search-Objekt im PDF anlegen

my $filesize = -s $tmp_datei;

if($filesize > 32000000){

      require Text::PDF::File_neu;

      $p = Text::PDF::File_neu->open($tmp_datei, 1); # 1 = write

}else{

      require Text::PDF::File;

      $p = Text::PDF::File->open($tmp_datei, 1); # 1 = write     

}

 

my $r = $p->read_obj($p->{'Root'});

my $t = $p->read_obj($r->{'Pages'});                                                                                             

my $x=PDFDict();

$x->{'Type'}=PDFName('Action');

$x->{'S'}=PDFName('JavaScript');

$x->{'JS'}=PDFStr($pdfstr);

$p->new_obj($x);

$r->{'OpenAction'} = $x;

$p->out_obj($r);

$p->append_file();                                                                                  

$p->release();

 

open(IN, "<$tmp_datei") or abbruch("Kann Tmp-Datei nicht ffnen: $!");

binmode(IN);

 

print "Content-type:application/pdf\n\n";

while (read(IN, my $buf, 10000)){ print $buf } 

close (IN);

unlink $tmp_datei;

 

sub aussteigen{

   #Temp-File schlieen und lschen

   close(IN);

   close(AUS);

   if(defined($p)){ $p->release(); }

   unlink $tmp_datei;

   print "Content-type:text/html\n\n";

   print "<html><body>";

   print shift;

   print "</body></html>";

   exit (0);

}

 

sub abbruch{

   #Temp-File schlieen und lschen

   close(IN);

   close(AUS);

   if(defined($p)){ $p->release(); }

   unlink $tmp_datei;

   die $_;

}

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 



_______________________________________________
Users mailing list
Users@lists.swish-e.org
http://lists.swish-e.org/listinfo/users
Received on Fri Oct 24 09:37:35 2008