Skip to main content.
home | support | download

Back to List Archive

swish-diffs (fwd) and performance comments...

From: Tom Brown <tbrown(at)not-real.baremetal.com>
Date: Mon Oct 18 1999 - 06:16:30 GMT
Please find attached follow diffs to fix a couple bugs and minor
improvements to getMeta() in index.c ... the comment covers the
descriptions and the case that caused core dumps (although strangely
enough, the core dumps only showed up with the -O2 optimization flag) ... 

I'd note that it _seems_ (my tests haven't been rigorous), that for folks
indexing text (like 18000 threads of e-mail tech support :-), that
significant performance gains can be achieved by tweaking the config.h
file back towards what it was in the days of the original swish version...
(min word length of 3, skip all numeric, vowel, or consonant 'words',
etc... read config.h, it's well commented)

NEW! ... there's another set of patches combined with this patch file...
they patch merge.c and save about 45% of CPU usage on the sample sets
(1000 files per index) that I was merging (e.g. -M) ... basically the old
encodefilenum()  and decodefilenum() routines were a horrible hack... 

... if you throw out the getMeta() changes, at least keep the merge.c
changes

-Tom

----------------------------------------------------------------------
tbrown@BareMetal.com   | Ours is a world where people don't know what they
http://BareMetal.com/  | want and are willing to go through hell to get it.
                       | - Don Marquis

---------- Forwarded message ----------
Date: Sun, 17 Oct 1999 19:41:56 -0700
From: root <root@bronze.baremetal.com>
To: tbrown@baremetal.com
Subject: swish-diffs

*** index.c.old	Sun Oct 17 19:31:00 1999
--- index.c	Sun Oct 17 19:32:47 1999
***************
*** 1086,1091 ****
--- 1087,1105 ----
  
  /* Get the MetaData index when the whole tag is passed */
  
+ /* TAB, this routine is/was somewhat pathetic... but it was pathetic in
+  1.2.4 too ... someone needed a course in defensive programming... there are
+  lots of tests below for temp != NULL, but what is desired is *temp != '\0' 
+  (e.g. simply *temp) ... I'm going to remove some strncmp(temp,constant,1)
+  which are must faster as *temp != constant ... 
+  
+  Anyhow, the test case I've got that's core dumping is:
+     <META content=3D"MSHTML 5.00.2614.3401" name=3DGENERATOR>
+  no trailing quote, no trailing space... and with the missing/broken check for
+  end of string it scribbles over the stack...
+ 
+ */
+ 
  int getMeta(tag, docPropName)
  char* tag;
  int* docPropName;
***************
*** 1108,1133 ****
  	temp += strlen("NAME");
  	
  	/* Get to the '=' sign disreguarding blanks */
! 	while (temp != NULL) {
! 		if (strncmp(temp, "=",1))
  			temp++;
! 		else {
  			temp++;
  			break;
  		}
  	}
! 	
  	/* Get to the beginning of the word disreguarding blanks and quotes */
! 	while (temp != NULL) {
! 		if (!strncmp(temp," ",1) || !strncmp(temp,"\"",1) )
! 			temp++;
! 		else
  			break;
  	}
  	
  	/* Copy the word and convert to lowercase */
! 	while (temp !=NULL && strncmp(temp," ",1) 
! 		&& strncmp(temp,"\"",1) && i<= MAXWORDLEN ) {
  		word[i] = *temp++;
  		word[i] = tolower(word[i]);
  		i++;
--- 1122,1153 ----
  	temp += strlen("NAME");
  	
  	/* Get to the '=' sign disreguarding blanks */
! 	while (temp != NULL && *temp) {
! 		if (*temp && (*temp != '=' )) {
  			temp++;
! 		} else {
  			temp++;
  			break;
  		}
  	}
! 
  	/* Get to the beginning of the word disreguarding blanks and quotes */
! 	while (temp != NULL && *temp) {
! 		if (*temp == ' ' || *temp == '"' )
  			break;
+ 		else
+ 			temp++;
  	}
  	
  	/* Copy the word and convert to lowercase */
! 	/* while (temp !=NULL && strncmp(temp," ",1)   */
! 	/*	&& strncmp(temp,"\"",1) && i<= MAXWORDLEN ) {  }*/
! 
! 	/* and the above <= was wrong, should be < which caused the
! 	   null insertion below to be off by two bytes */
! 
! 	while (temp !=NULL && *temp && *temp != ' ' 
! 		&& *temp != '"' && i< MAXWORDLEN ) {
  		word[i] = *temp++;
  		word[i] = tolower(word[i]);
  		i++;
***************
*** 1191,1203 ****
  	* the check is done here istead of before because META tags do not have
  	* a fixed length that can be checked
  	*/
! 	if (temp != NULL)
      {
  		temp += strlen("CONTENT");
  		
  		/* Get to the " sign disreguarding blanks */
! 		while (temp != NULL) {
! 			if (strncmp(temp, "\"",1))
  				temp++;
  			else {
  				temp++;
--- 1211,1223 ----
  	* the check is done here istead of before because META tags do not have
  	* a fixed length that can be checked
  	*/
! 	if (temp != NULL && *temp)
      {
  		temp += strlen("CONTENT");
  		
  		/* Get to the " sign disreguarding blanks */
! 		while (temp != NULL && *temp ) {
! 			if ( *temp != '"' ) 
  				temp++;
  			else {
  				temp++;
*** merge.c.old	Mon Nov 30 15:14:40 1998
--- merge.c	Sun Oct 17 23:05:07 1999
***************
*** 1,3 ****
--- 1,4 ----
+ 
  /*
  ** Copyright (C) 1995, 1996, 1997, 1998 Hewlett-Packard Company
  ** Originally by Kevin Hughes, kev@kevcom.com, 3/11/94
***************
*** 25,30 ****
--- 26,32 ----
  ** G. Hill 3/26/97 ghill@library.berkeley.edu
  */
  
+ #include <assert.h> /* for bug hunting */
  #include "swish.h"
  #include "merge.h"
  #include "error.h"
***************
*** 339,344 ****
--- 341,347 ----
  	if (limit == ftell(fp))
  		return NULL;
  	for (i = 0; (c = fgetc(fp)) != 0; ) {
+ 		assert( i < MAXWORDLEN); /* TAB */
  		if (c == ':') {
  			fileword[i] = '\0';
  			break;
***************
*** 731,736 ****
--- 734,761 ----
  	}
  }
  
+ /* gprof suggests that this is a major CPU eater  :-(, that's 
+    because it gets called a _lot_ rather than because it is inefficient...
+ 
+    ... and it's probably an indicator that free() is a major CPU hog :-(
+ 
+    you'd think that putting the NULL assignment into the if() condition
+    would be fastest, but either gcc is really stupid, or really smart, 
+    because gprof showed that unconditionally setting it after reading it
+    saved about 10% over setting it unconditionally at the end of the loop
+    (that could be sampling error though), and setting it inside the loop
+    definitely increased it by 15-20% ... go figure?   - TAB oct/99
+ 
+    For reference:
+    Reading specs from /usr/lib/gcc-lib/i386-redhat-linux/2.7.2.3/specs
+    gcc version 2.7.2.3
+ 
+    hhmm... I wonder if we should consider making it a macro? No, there are
+    routines using 1/4 the CPU, getting called 20 times as often (compress)
+    so obviously subrouting overhead isn't the issue...
+ 
+ */
+ 
  void initmarkentrylistMerge()
  {
  	int i;
***************
*** 738,746 ****
  	
  	for (i = 0; i < BIGHASHSIZE; i++) {
  		mp = markentrylistMerge[i];
! 		if (mp != NULL)
  			free(mp);
! 		markentrylistMerge[i] = NULL;
  	}
  }      
  
--- 763,772 ----
  	
  	for (i = 0; i < BIGHASHSIZE; i++) {
  		mp = markentrylistMerge[i];
! 		markentrylistMerge[i] = NULL; /* minor optimization */
! 		if (mp != NULL) {
  			free(mp);
! 		}
  	}
  }      
  
***************
*** 801,806 ****
--- 827,833 ----
  /* Translates a file number into something that can be compressed.
  */
  
+ #if 0
  int encodefilenum(num)
  int num;
  {
***************
*** 813,822 ****
--- 840,865 ----
  	}
  	return j;
  }
+ #else
+ /* I tested this to 30,000 and it's return values are identical to ^^ - TAB */
+ int encodefilenum(num)
+ int num;
+ {
+ 	int i = (num -1 ) / 127;
+ 	return num + i;
+ 
+ }
+ #endif
  
  /* Translates a compressed file number into a correct file number.
  */
  
+ /* as with the encoding algorithm, I ran the old and the new algorithms
+    with 0 to 30,000 as input and their identical in everything but _speed_ :-)
+    - TAB - oct/99
+ */
+ 
+ #if 0
  int decodefilenum(num)
  int num;
  {
***************
*** 831,836 ****
--- 874,887 ----
  	
  	return num;
  }
+ #else
+ int decodefilenum(num)
+ int num;
+ {
+ 	int i = (num -1 ) / 128;
+ 	return num - i;
+ }
+ #endif
  
  /* Similar to addtoresultlist, but also adding the meta name
  */
Received on Sun Oct 17 23:15:17 1999