Fix wiktionary-based generation code so we can map a word with its
accents stripped to the word with the accents in place.
This commit is contained in:
Matt Wells 2014-06-01 06:33:16 -07:00
parent f16414b774
commit 6f704d3d6a
2 changed files with 35 additions and 3 deletions

View File

@ -1932,6 +1932,18 @@ bool Wiktionary::compile ( ) {
//lastWid = *(long long *)data;
// it matches!
formCount++;
// if it has accent marks then we count the stripped
// version as a form, but we do not have to
// store the stripped version in wiktionary-buf.txt
// because it is just a waste of space.
char a[1024];
long stripLen = stripAccentMarks(a,
1023,
(unsigned char *)word,
gbstrlen(word));
if ( stripLen > 0 )
formCount++;
}
// need 2+ forms!
if ( formCount <= 1 ) continue;
@ -2015,6 +2027,26 @@ bool Wiktionary::compile ( ) {
// so maybe allow dup keys in syntable?
//
// . also strip accent marks and add that key as well
// . so we can map a stripped word to the original
// word with accent marks, although it might
// actually map to multiple words! so who knows
// what to pick, maybe all of them!
char a[1024];
long stripLen = stripAccentMarks(a,
1023,
(unsigned char *)word,
gbstrlen(word));
if ( stripLen > 0 ) {
long long swid = hash64Lower_utf8(word);
// xor in the langid
swid ^= g_hashtab[0][langId];
// only add this word form once per langId
if ( dd.isInTable ( &swid ) ) continue;
dd.addKey ( &swid );
}
// count em up
count++;
// limit to 100 synonyms per synset

View File

@ -3656,7 +3656,7 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, long long docId, long long uh48 ){
// alloc the buffer
char *ubuf = (char *) mmalloc ( need1 , "xdtrb" );
// return NULL with g_errno set on error
if ( ! ubuf ) return NULL;
if ( ! ubuf ) return false;
// serialize into it
char *p = ubuf;
// copy our crap into there
@ -3744,7 +3744,7 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, long long docId, long long uh48 ){
log("db: Failed to compress document of %li bytes. "
"Provided buffer of %li bytes.",
size, (need2 - hdrSize ) );
return NULL;
return false;
}
// check for error
if ( err != Z_OK ) {
@ -3752,7 +3752,7 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, long long docId, long long uh48 ){
tbuf->purge();
g_errno = ECOMPRESSFAILED;
log("db: Failed to compress document.");
return NULL;
return false;
}
// calc cbufSize, the uncompressed header + compressed stuff
//cbufSize = hdrSize + size ;