Okay, since the mail manager deleted the attached patch, you can piece it
together after your mail viewer chops up the lines from this:
diff --unified --recursive --new-file swish-e-2.4.0-pr1/src/dump.c
swish-e-2.4.0-pr1-spg/src/dump.c
--- swish-e-2.4.0-pr1/src/dump.c 2003-03-28 10:31:30.000000000 -0600
+++ swish-e-2.4.0-pr1-spg/src/dump.c 2003-06-13 15:28:36.000000000 -0500
@@ -136,7 +136,75 @@
fflush(stdout);
}
+int isValidAscii(char* word) /* Returns 1 when every character of word passes isascii(), 0 otherwise; an empty string yields 1. */
+{
+ int i;
+
+ for(i=0; i<strlen(word); ++i) /* strlen() is re-evaluated on each pass; int i is compared against its size_t result */
+ if(!isascii(word[i])) return 0; /* bail out at the first non-ASCII character */
+
+ return 1; /* loop finished without finding a non-ASCII character */
+}
+char *convert_to_html(char* word) /* Copies word into a static buffer, replacing ampersand, double quote and non-ASCII characters with entity text; returns that buffer. */
+{
+ static char buffer[1024]; /* static storage: the returned pointer refers to this array, so a later call overwrites an earlier result */
+ int i; /* read index into word */
+ int out; /* write index into buffer */
+
+ for(i=0, out=0; i<strlen(word) && out<1023; ++out, ++i) /* NOTE(review): out<1023 is tested once per input character, yet a branch below stores up to 6 bytes, so writes past buffer[1023] look possible near the limit */
+ {
+ if(word[i] == '&') /* ampersand becomes the five characters of the amp entity */
+ {
+ buffer[out] = '&';
+ ++out;
+ buffer[out] = 'a';
+ ++out;
+ buffer[out] = 'm';
+ ++out;
+ buffer[out] = 'p';
+ ++out;
+ buffer[out] = ';'; /* the final ++out for this character comes from the for statement */
+ }
+ else if (word[i] == '"') /* double quote becomes the six characters of the quot entity */
+ {
+ buffer[out] = '&';
+ ++out;
+ buffer[out] = 'q';
+ ++out;
+ buffer[out] = 'u';
+ ++out;
+ buffer[out] = 'o';
+ ++out;
+ buffer[out] = 't';
+ ++out;
+ buffer[out] = ';';
+ }
+ else if (!isascii(word[i])) /* non-ASCII character: emit a numeric character reference built from three characters of num */
+ {
+ char num[10]; /* scratch space for the formatted number */
+ buffer[out] = '&';
+ ++out;
+ buffer[out] = '#';
+ ++out;
+ sprintf(num,"%03ud",word[i]); /* NOTE(review): the format is %03u followed by a literal d; if char is signed a byte >= 0x80 arrives as a negative int, which with a 32-bit unsigned prints 10 digits and, with the d and the terminator, would not fit in num[10] - confirm */
+ buffer[out] = num[0]; /* only the first three characters of num are copied */
+ ++out;
+ buffer[out] = num[1];
+ ++out;
+ buffer[out] = num[2];
+ ++out;
+ buffer[out] = ';';
+ }
+ else /* every other character, including the less-than and greater-than signs, is copied unchanged */
+ {
+ buffer[out] = word[i];
+ }
+ }
+ buffer[out] = 0; /* terminate the output string at the current write index */
+
+ return buffer; /* pointer to the function-local static array, not a fresh allocation */
+}
/* Prints out the data in an index DB */
@@ -209,8 +277,128 @@
}
DB_EndReadWords(sw, indexf->DB);
}
+ else if (DEBUG_MASK & DEBUG_INDEX_XML)
+ {
+ int *meta_used;
+ int end_meta = 0;
+
+ printf("<index>\n");
+
+ for(i = 0; i < indexf->header.metaCounter; i++)
+ if ( indexf->header.metaEntryArray[i]->metaID > end_meta )
+ end_meta = indexf->header.metaEntryArray[i]->metaID;
+
+ meta_used = emalloc( sizeof(int) * ( end_meta + 1) );
+
+ /* _META only reports which tags the words are found in */
+ for(i = 0; i <= end_meta; i++)
+ meta_used[i] = 0;
diff --unified --recursive --new-file swish-e-2.4.0-pr1/src/dump.c
swish-e-2.4.0-pr1-spg/src/dump.c
--- swish-e-2.4.0-pr1/src/dump.c 2003-03-28 10:31:30.000000000 -0600
+++ swish-e-2.4.0-pr1-spg/src/dump.c 2003-06-13 15:28:36.000000000 -0500
@@ -136,7 +136,75 @@
fflush(stdout);
}
+int isValidAscii(char* word) /* Returns 1 when every character of word passes isascii(), 0 otherwise; an empty string yields 1. */
+{
+ int i;
+
+ for(i=0; i<strlen(word); ++i) /* strlen() is re-evaluated on each pass; int i is compared against its size_t result */
+ if(!isascii(word[i])) return 0; /* bail out at the first non-ASCII character */
+
+ return 1; /* loop finished without finding a non-ASCII character */
+}
+char *convert_to_html(char* word) /* Copies word into a static buffer, replacing ampersand, double quote and non-ASCII characters with entity text; returns that buffer. */
+{
+ static char buffer[1024]; /* static storage: the returned pointer refers to this array, so a later call overwrites an earlier result */
+ int i; /* read index into word */
+ int out; /* write index into buffer */
+
+ for(i=0, out=0; i<strlen(word) && out<1023; ++out, ++i) /* NOTE(review): out<1023 is tested once per input character, yet a branch below stores up to 6 bytes, so writes past buffer[1023] look possible near the limit */
+ {
+ if(word[i] == '&') /* ampersand becomes the five characters of the amp entity */
+ {
+ buffer[out] = '&';
+ ++out;
+ buffer[out] = 'a';
+ ++out;
+ buffer[out] = 'm';
+ ++out;
+ buffer[out] = 'p';
+ ++out;
+ buffer[out] = ';'; /* the final ++out for this character comes from the for statement */
+ }
+ else if (word[i] == '"') /* double quote becomes the six characters of the quot entity */
+ {
+ buffer[out] = '&';
+ ++out;
+ buffer[out] = 'q';
+ ++out;
+ buffer[out] = 'u';
+ ++out;
+ buffer[out] = 'o';
+ ++out;
+ buffer[out] = 't';
+ ++out;
+ buffer[out] = ';';
+ }
+ else if (!isascii(word[i])) /* non-ASCII character: emit a numeric character reference built from three characters of num */
+ {
+ char num[10]; /* scratch space for the formatted number */
+ buffer[out] = '&';
+ ++out;
+ buffer[out] = '#';
+ ++out;
+ sprintf(num,"%03ud",word[i]); /* NOTE(review): the format is %03u followed by a literal d; if char is signed a byte >= 0x80 arrives as a negative int, which with a 32-bit unsigned prints 10 digits and, with the d and the terminator, would not fit in num[10] - confirm */
+ buffer[out] = num[0]; /* only the first three characters of num are copied */
+ ++out;
+ buffer[out] = num[1];
+ ++out;
+ buffer[out] = num[2];
+ ++out;
+ buffer[out] = ';';
+ }
+ else /* every other character, including the less-than and greater-than signs, is copied unchanged */
+ {
+ buffer[out] = word[i];
+ }
+ }
+ buffer[out] = 0; /* terminate the output string at the current write index */
+
+ return buffer; /* pointer to the function-local static array, not a fresh allocation */
+}
/* Prints out the data in an index DB */
@@ -209,8 +277,128 @@
}
DB_EndReadWords(sw, indexf->DB);
}
+ else if (DEBUG_MASK & DEBUG_INDEX_XML)
+ {
+ int *meta_used;
+ int end_meta = 0;
+
+ printf("<index>\n");
+
+ for(i = 0; i < indexf->header.metaCounter; i++)
+ if ( indexf->header.metaEntryArray[i]->metaID > end_meta )
+ end_meta = indexf->header.metaEntryArray[i]->metaID;
+
+ meta_used = emalloc( sizeof(int) * ( end_meta + 1) );
+
+ /* _META only reports which tags the words are found in */
+ for(i = 0; i <= end_meta; i++)
+ meta_used[i] = 0;
+
+ for(j=1;j<256;j++)
+ {
+ word[0] = (unsigned char) j; word[1] = '\0';
+ DB_ReadFirstWordInvertedIndex(sw,
word,&resultword,&wordID,indexf->DB);
+
+ while(wordID && (((int)((unsigned char)resultword[0]))== j))
+ {
+ if(isValidAscii(resultword))
+ {
+ printf(" <word>\n");
+ printf(" <name>%s</name>\n",resultword);
+ }
+
+ /* Read Word's data */
+ DB_ReadWordData(sw, wordID, &worddata, &sz_worddata,
&saved_bytes, indexf->DB);
+ uncompress_worddata(&worddata, &sz_worddata, saved_bytes);
+
+ /* parse and print word's data */
+ s = worddata;
+
+ tmpval = uncompress2(&s); /* tfrequency */
+ metaID = uncompress2(&s); /* metaID */
+ metadata_length = uncompress2(&s);
+
+ filenum = 0;
+ start = s;
+ while(1)
+ { /* Read on all items */
+ uncompress_location_values(&s,&flag,&tmpval,&frequency);
+ filenum += tmpval;
+ posdata = (int *) emalloc(frequency * sizeof(int));
+ uncompress_location_positions(&s,flag,frequency,posdata);
+
+
+ struct metaEntry *m;
+
+ /* Get path from property list */
+ if ( (m = getPropNameByName( &sw->indexlist->header,
AUTOPROPERTY_DOCPATH )) )
+ {
+ RESULT r;
+ DB_RESULTS db_results;
+ char *s;
+ PropValue *p;
+
+ memset( &r, 0, sizeof( RESULT ) );
+ memset( &db_results, 0, sizeof( DB_RESULTS ) );
+ db_results.indexf = indexf;
+
+ r.db_results = &db_results;
+ r.filenum = filenum;
+ r.fi.filenum = filenum;
+ s = getResultPropAsString( &r, m->metaID);
+ p = getResultPropValue( &r, AUTOPROPERTY_TITLE, 0);
+
+ if(isValidAscii(resultword))
+ {
+ printf(" <path freq=\"%d\"
title=\"%s\">%s</path>\n",
+ frequency,
+ convert_to_html(p->value.v_str),
+ s);
+ }
+ freeResultPropValue(p);
+
+ efree( s );
+
+ }
+ else
+ {
+ printf(" <ERROR>Failed to lookup meta
entry</ERROR>\n");
+ }
+
+ efree(posdata);
+
+ /* Check for end of worddata */
+ if ((s - worddata) == sz_worddata)
+ break; /* End of worddata */
+
+ /* Check for end of current metaID data */
+ if ( metadata_length == (s - start))
+ {
+ filenum = 0;
+ metaID = uncompress2(&s);
+ metadata_length = uncompress2(&s);
+ start = s;
+ }
+ }
+
+ if(isValidAscii(resultword))
+ {
+ printf(" </word>\n");
+ }
+
+ efree(worddata);
+ efree(resultword);
+ DB_ReadNextWordInvertedIndex(sw,
word,&resultword,&wordID,indexf->DB);
+ }
+ }
+ DB_EndReadWords(sw, indexf->DB);
+
+ efree( meta_used );
+
+ printf("</index>\n");
+ }
else if (DEBUG_MASK & (DEBUG_INDEX_ALL | DEBUG_INDEX_WORDS |
DEBUG_INDEX_WORDS_FULL | DEBUG_INDEX_WORDS_META) )
{
int *meta_used;
diff --unified --recursive --new-file swish-e-2.4.0-pr1/src/result_output.c
swish-e-2.4.0-pr1-spg/src/result_output.c
--- swish-e-2.4.0-pr1/src/result_output.c 2003-05-12 16:29:19.000000000 -0500
+++ swish-e-2.4.0-pr1-spg/src/result_output.c 2003-06-13 10:28:33.000000000
-0500
@@ -928,6 +928,7 @@
for ( i = 0; i < md->numPropertiesToDisplay; i++ )
{
+ printf("\nmetaIDs[%d] = %d\n", i, metaIDs[i]);
propValue = s = getResultPropAsString( r, metaIDs[ i ] );
if (sw->ResultOutput->stdResultFieldDelimiter)
diff --unified --recursive --new-file swish-e-2.4.0-pr1/src/swish.c
swish-e-2.4.0-pr1-spg/src/swish.c
--- swish-e-2.4.0-pr1/src/swish.c 2003-05-15 11:42:11.000000000 -0500
+++ swish-e-2.4.0-pr1-spg/src/swish.c 2003-06-10 22:16:44.000000000 -0500
@@ -81,7 +81,8 @@
{"INDEX_STOPWORDS", DEBUG_INDEX_STOPWORDS, "List stopwords stored in
index"},
{"INDEX_FILES", DEBUG_INDEX_FILES, "List file data stored in index"},
{"INDEX_METANAMES", DEBUG_INDEX_METANAMES, "List metaname table stored in
index"},
- {"INDEX_ALL", DEBUG_INDEX_ALL, "Dump data ALL above data from index
file\n\n-- indexing --\n"},
+ {"INDEX_ALL", DEBUG_INDEX_ALL, "Dump data ALL above data from index
file"},
+ {"INDEX_XML", DEBUG_INDEX_XML, "Dump words in index to xml\n\n-- indexing
--\n"},
/* These trace indexing */
{"INDEXED_WORDS", DEBUG_WORDS, "Display words as they are indexed"},
diff --unified --recursive --new-file swish-e-2.4.0-pr1/src/swish.h
swish-e-2.4.0-pr1-spg/src/swish.h
--- swish-e-2.4.0-pr1/src/swish.h 2003-05-12 16:29:19.000000000 -0500
+++ swish-e-2.4.0-pr1-spg/src/swish.h 2003-06-10 22:15:18.000000000 -0500
@@ -1007,6 +1007,7 @@
#define DEBUG_INDEX_ALL (1<<6)
#define DEBUG_INDEX_WORDS_ONLY (1<<7)
#define DEBUG_INDEX_WORDS_META (1<<8)
+#define DEBUG_INDEX_XML (1<<9)
/* These are only checked while indexing */
#define DEBUG_WORDS (1<<0)
Received on Wed Jun 18 14:17:43 2003