Fix wiktionary-based generation code to handle accent-stripped words

This lets us map a word with its accents stripped to the word with the accents in place.
2024-10-04 12:17:35 +03:00 · 2014-06-01 06:33:16 -07:00 · 2014-06-01 06:33:16 -07:00 · 6f704d3d6a
commit 6f704d3d6a
parent f16414b774
2 changed files with 35 additions and 3 deletions
--- a/Wiktionary.cpp
+++ b/Wiktionary.cpp
@ -1932,6 +1932,18 @@ bool Wiktionary::compile ( ) {
 			//lastWid = *(long long *)data;
 			// it matches!
 			formCount++;
+
+			// if it has accent marks then we count the stripped
+			// version as a form, but we do not have to
+			// store the stripped version in wiktionary-buf.txt
+			// because it is just a waste of space.
+			char a[1024];
+			long stripLen = stripAccentMarks(a,
+							 1023,
+							 (unsigned char *)word,
+							 gbstrlen(word));
+			if ( stripLen > 0 ) 
+				formCount++;
 		}
 		// need 2+ forms!
 		if ( formCount <= 1 ) continue;
@ -2015,6 +2027,26 @@ bool Wiktionary::compile ( ) {
 			// so maybe allow dup keys in syntable?
 			//

+			// . also strip accent marks and add that key as well
+			// . so we can map a stripped word to the original
+			//   word with accent marks, although it might
+			//   actually map to multiple words! so who knows
+			//   what to pick, maybe all of them!
+			char a[1024];
+			long stripLen = stripAccentMarks(a,
+							 1023,
+							 (unsigned char *)word,
+							 gbstrlen(word));
+			if ( stripLen > 0 ) {
+				long long swid = hash64Lower_utf8(word);
+				// xor in the langid
+				swid ^= g_hashtab[0][langId];
+				// only add this word form once per langId
+				if ( dd.isInTable ( &swid ) ) continue;
+				dd.addKey ( &swid );
+			}
+
+
 			// count em up
 			count++;
 			// limit to 100 synonyms per synset
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -3656,7 +3656,7 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, long long docId, long long uh48 ){
 	// alloc the buffer
 	char *ubuf = (char *) mmalloc ( need1 , "xdtrb" );
 	// return NULL with g_errno set on error
-	if ( ! ubuf ) return NULL;
+	if ( ! ubuf ) return false;
 	// serialize into it
 	char *p = ubuf;
 	// copy our crap into there
@ -3744,7 +3744,7 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, long long docId, long long uh48 ){
 		log("db: Failed to compress document of %li bytes. "
 		    "Provided buffer of %li bytes.",
 		    size, (need2 - hdrSize ) );
-		return NULL;
+		return false;
 	}
 	// check for error
 	if ( err != Z_OK ) {
@ -3752,7 +3752,7 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, long long docId, long long uh48 ){
 		tbuf->purge();
 		g_errno = ECOMPRESSFAILED; 
 		log("db: Failed to compress document.");
-		return NULL;
+		return false;
 	}
 	// calc cbufSize, the uncompressed header + compressed stuff
 	//cbufSize = hdrSize + size ;