Skip to main content.
home | support | download

Back to List Archive

Patch

From: Shawn P. Garbett <listman(at)not-real.garbett.org>
Date: Wed Jun 18 2003 - 14:17:01 GMT
Okay, since the mail manager deleted the attached patch, you can piece it 
together after your mail viewer chops up the lines from this:

diff --unified --recursive --new-file swish-e-2.4.0-pr1/src/dump.c swish-e-2.4.0-pr1-spg/src/dump.c
--- swish-e-2.4.0-pr1/src/dump.c	2003-03-28 10:31:30.000000000 -0600
+++ swish-e-2.4.0-pr1-spg/src/dump.c	2003-06-13 15:28:36.000000000 -0500
@@ -136,7 +136,75 @@
     fflush(stdout);
 }
 
+/* 1 if word is non-NULL and every byte is 7-bit ASCII (< 0x80), else 0. */
+int isValidAscii(char* word)
+{
+  const unsigned char *p = (const unsigned char *) word;
+
+  for (; p && *p; ++p)
+    if (*p >= 0x80) return 0;
+  return p ? 1 : 0;
+}
 
+/*
+ * Escape word for output as HTML/XML text or a double-quoted attribute.
+ *
+ *   &  ->  &amp;     <  ->  &lt;     >  ->  &gt;     "  ->  &quot;
+ *   bytes >= 0x80  ->  three-digit numeric reference, e.g. &#233;
+ *
+ * High bytes are converted one at a time from their unsigned value, so
+ * the input is effectively treated as a single-byte encoding.  The
+ * apostrophe is left as is, so the result must not be placed inside a
+ * single-quoted attribute.
+ * A NULL word yields "".  Output that would not fit in the buffer is
+ * truncated on a whole character/entity boundary and always terminated.
+ *
+ * Returns a pointer to a static buffer: the contents are overwritten by
+ * the next call, and the function is not reentrant.
+ */
+char *convert_to_html(char* word)
+{
+  static char buffer[1024];
+  const unsigned char *in = (const unsigned char *) word;
+  size_t out = 0;
+
+  for (; in && *in; ++in)
+  {
+    char        numref[8];      /* room for "&#255;" and the NUL */
+    const char *rep = NULL;     /* replacement text, NULL = copy byte */
+    size_t      len;
+
+    switch (*in)
+    {
+      case '&': rep = "&amp;";  break;
+      case '<': rep = "&lt;";   break;
+      case '>': rep = "&gt;";   break;
+      case '"': rep = "&quot;"; break;
+      default:
+        if (*in >= 0x80)
+        {
+          /* Format the unsigned byte; where char is signed the value
+           * would be sign-extended and print as a ten-digit number. */
+          sprintf(numref, "&#%03u;", (unsigned) *in);
+          rep = numref;
+        }
+        break;
+    }
+
+    len = rep ? strlen(rep) : 1;
+    if (out + len >= sizeof buffer)   /* keep one byte for the NUL */
+      break;
+
+    if (rep)
+      memcpy(buffer + out, rep, len);
+    else
+      buffer[out] = (char) *in;
+    out += len;
+  }
+  buffer[out] = '\0';
+
+  return buffer;
+}
 
 
 /* Prints out the data in an index DB */
@@ -209,8 +277,128 @@
         }
         DB_EndReadWords(sw, indexf->DB);
     }
+    else if (DEBUG_MASK & DEBUG_INDEX_XML)
+    {
+        int     *meta_used;
+        int     end_meta = 0;
+
+        printf("<index>\n");
+
+        for(i = 0; i < indexf->header.metaCounter; i++)
+            if ( indexf->header.metaEntryArray[i]->metaID > end_meta )
+                end_meta = indexf->header.metaEntryArray[i]->metaID;
+
+        meta_used = emalloc( sizeof(int) * ( end_meta + 1) );  
+
+        /* _META only reports which tags the words are found in */
+        for(i = 0; i <= end_meta; i++)
+            meta_used[i] = 0;
diff --unified --recursive --new-file swish-e-2.4.0-pr1/src/dump.c swish-e-2.4.0-pr1-spg/src/dump.c
--- swish-e-2.4.0-pr1/src/dump.c	2003-03-28 10:31:30.000000000 -0600
+++ swish-e-2.4.0-pr1-spg/src/dump.c	2003-06-13 15:28:36.000000000 -0500
@@ -136,7 +136,75 @@
     fflush(stdout);
 }
 
+/* 1 if word is non-NULL and every byte is 7-bit ASCII (< 0x80), else 0. */
+int isValidAscii(char* word)
+{
+  const unsigned char *p = (const unsigned char *) word;
+
+  for (; p && *p; ++p)
+    if (*p >= 0x80) return 0;
+  return p ? 1 : 0;
+}
 
+/*
+ * Escape word for output as HTML/XML text or a double-quoted attribute.
+ *
+ *   &  ->  &amp;     <  ->  &lt;     >  ->  &gt;     "  ->  &quot;
+ *   bytes >= 0x80  ->  three-digit numeric reference, e.g. &#233;
+ *
+ * High bytes are converted one at a time from their unsigned value, so
+ * the input is effectively treated as a single-byte encoding.  The
+ * apostrophe is left as is, so the result must not be placed inside a
+ * single-quoted attribute.
+ * A NULL word yields "".  Output that would not fit in the buffer is
+ * truncated on a whole character/entity boundary and always terminated.
+ *
+ * Returns a pointer to a static buffer: the contents are overwritten by
+ * the next call, and the function is not reentrant.
+ */
+char *convert_to_html(char* word)
+{
+  static char buffer[1024];
+  const unsigned char *in = (const unsigned char *) word;
+  size_t out = 0;
+
+  for (; in && *in; ++in)
+  {
+    char        numref[8];      /* room for "&#255;" and the NUL */
+    const char *rep = NULL;     /* replacement text, NULL = copy byte */
+    size_t      len;
+
+    switch (*in)
+    {
+      case '&': rep = "&amp;";  break;
+      case '<': rep = "&lt;";   break;
+      case '>': rep = "&gt;";   break;
+      case '"': rep = "&quot;"; break;
+      default:
+        if (*in >= 0x80)
+        {
+          /* Format the unsigned byte; where char is signed the value
+           * would be sign-extended and print as a ten-digit number. */
+          sprintf(numref, "&#%03u;", (unsigned) *in);
+          rep = numref;
+        }
+        break;
+    }
+
+    len = rep ? strlen(rep) : 1;
+    if (out + len >= sizeof buffer)   /* keep one byte for the NUL */
+      break;
+
+    if (rep)
+      memcpy(buffer + out, rep, len);
+    else
+      buffer[out] = (char) *in;
+    out += len;
+  }
+  buffer[out] = '\0';
+
+  return buffer;
+}
 
 
 /* Prints out the data in an index DB */
@@ -209,8 +277,128 @@
         }
         DB_EndReadWords(sw, indexf->DB);
     }
+    else if (DEBUG_MASK & DEBUG_INDEX_XML)
+    {
+        int     *meta_used;
+        int     end_meta = 0;
+
+        printf("<index>\n");
+
+        for(i = 0; i < indexf->header.metaCounter; i++)
+            if ( indexf->header.metaEntryArray[i]->metaID > end_meta )
+                end_meta = indexf->header.metaEntryArray[i]->metaID;
+
+        meta_used = emalloc( sizeof(int) * ( end_meta + 1) );  
+
+        /* _META only reports which tags the words are found in */
+        for(i = 0; i <= end_meta; i++)
+            meta_used[i] = 0;
+
+        for(j=1;j<256;j++)
+        {
+            word[0] = (unsigned char) j; word[1] = '\0';
+            DB_ReadFirstWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB);
+
+            while(wordID && (((int)((unsigned char)resultword[0]))== j))
+            {
+                if(isValidAscii(resultword))
+                {
+                printf("  <word>\n");
+                printf("    <name>%s</name>\n",resultword);
+                }
+
+                /* Read Word's data */
+                DB_ReadWordData(sw, wordID, &worddata, &sz_worddata, &saved_bytes, indexf->DB);
+                uncompress_worddata(&worddata, &sz_worddata, saved_bytes);
+
+                /* parse and print word's data */
+                s = worddata;
+
+                tmpval = uncompress2(&s);     /* tfrequency */
+                metaID = uncompress2(&s);     /* metaID */
+                metadata_length = uncompress2(&s);
+
+                filenum = 0;
+                start = s;
+                while(1)
+                {                   /* Read on all items */
+                    uncompress_location_values(&s,&flag,&tmpval,&frequency);
+                    filenum += tmpval;
+                    posdata = (int *) emalloc(frequency * sizeof(int));
+                    uncompress_location_positions(&s,flag,frequency,posdata);
+
+
+                    struct metaEntry    *m;
+                        
+                    /* Get path from property list */
+                    if ( (m = getPropNameByName( &sw->indexlist->header, AUTOPROPERTY_DOCPATH )) )
+                    {
+                        RESULT r;
+                        DB_RESULTS db_results;
+                        char  *s;
+                        PropValue *p;
+
+                        memset( &r, 0, sizeof( RESULT ) );
+                        memset( &db_results, 0, sizeof( DB_RESULTS ) );
+                        db_results.indexf = indexf;
+
+                        r.db_results = &db_results;
+                        r.filenum = filenum;
+                        r.fi.filenum = filenum;
 
+                        s = getResultPropAsString( &r, m->metaID);
 
+                        p = getResultPropValue( &r, AUTOPROPERTY_TITLE, 0);
+
+                        if(isValidAscii(resultword))
+                        {
+                        printf("    <path freq=\"%d\" title=\"%s\">%s</path>\n",
+                               frequency,
+                               convert_to_html(p->value.v_str),
+                               s);
+                        }
+                        freeResultPropValue(p);
+
+                        efree( s );
+                            
+                    }
+                    else
+                    {
+                        printf("    <ERROR>Failed to lookup meta entry</ERROR>\n");
+                    }
+                            
+                    efree(posdata);
+
+                    /* Check for end of worddata */
+                    if ((s - worddata) == sz_worddata)
+                        break;   /* End of worddata */
+
+                    /* Check for end of current metaID data */
+                    if ( metadata_length == (s - start))
+                    {
+                        filenum = 0;
+                        metaID = uncompress2(&s);
+                        metadata_length = uncompress2(&s);
+                        start = s;
+                    }
+                }
+
+                if(isValidAscii(resultword))
+                {
+                printf("  </word>\n");
+                }
+
+                efree(worddata);
+                efree(resultword);
+                DB_ReadNextWordInvertedIndex(sw, word,&resultword,&wordID,indexf->DB);
+             }
+        }
+        DB_EndReadWords(sw, indexf->DB);
+
+        efree( meta_used );
+
+        printf("</index>\n");
+    }
     else if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_WORDS | DEBUG_INDEX_WORDS_FULL | DEBUG_INDEX_WORDS_META)  )
     {
         int     *meta_used;
diff --unified --recursive --new-file swish-e-2.4.0-pr1/src/result_output.c swish-e-2.4.0-pr1-spg/src/result_output.c
--- swish-e-2.4.0-pr1/src/result_output.c	2003-05-12 16:29:19.000000000 -0500
+++ swish-e-2.4.0-pr1-spg/src/result_output.c	2003-06-13 10:28:33.000000000 -0500
@@ -928,6 +928,7 @@
 
     for ( i = 0; i < md->numPropertiesToDisplay; i++ )
     {
+        printf("\nmetaIDs[%d] = %d\n", i, metaIDs[i]);
         propValue = s = getResultPropAsString( r, metaIDs[ i ] );
 
         if (sw->ResultOutput->stdResultFieldDelimiter)
diff --unified --recursive --new-file swish-e-2.4.0-pr1/src/swish.c swish-e-2.4.0-pr1-spg/src/swish.c
--- swish-e-2.4.0-pr1/src/swish.c	2003-05-15 11:42:11.000000000 -0500
+++ swish-e-2.4.0-pr1-spg/src/swish.c	2003-06-10 22:16:44.000000000 -0500
@@ -81,7 +81,8 @@
     {"INDEX_STOPWORDS", DEBUG_INDEX_STOPWORDS, "List stopwords stored in index"},
     {"INDEX_FILES", DEBUG_INDEX_FILES, "List file data stored in index"},
     {"INDEX_METANAMES", DEBUG_INDEX_METANAMES, "List metaname table stored in index"},
-    {"INDEX_ALL", DEBUG_INDEX_ALL, "Dump data ALL above data from index file\n\n-- indexing --\n"},
+    {"INDEX_ALL", DEBUG_INDEX_ALL, "Dump data ALL above data from index file"},
+    {"INDEX_XML", DEBUG_INDEX_XML, "Dump words in index to xml\n\n-- indexing --\n"},
 
     /* These trace indexing */
     {"INDEXED_WORDS", DEBUG_WORDS, "Display words as they are indexed"},
diff --unified --recursive --new-file swish-e-2.4.0-pr1/src/swish.h swish-e-2.4.0-pr1-spg/src/swish.h
--- swish-e-2.4.0-pr1/src/swish.h	2003-05-12 16:29:19.000000000 -0500
+++ swish-e-2.4.0-pr1-spg/src/swish.h	2003-06-10 22:15:18.000000000 -0500
@@ -1007,6 +1007,7 @@
 #define DEBUG_INDEX_ALL			(1<<6)
 #define DEBUG_INDEX_WORDS_ONLY	(1<<7)
 #define DEBUG_INDEX_WORDS_META	(1<<8)
+#define DEBUG_INDEX_XML (1<<9)
 
 /* These are only checked while indexing */
 #define DEBUG_WORDS				(1<<0)
Received on Wed Jun 18 14:17:43 2003