mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
Fix the Wiktionary-based generation code so we can map
an accent-stripped word back to the word with its accents in place.
This commit is contained in:
parent
f16414b774
commit
6f704d3d6a
@ -1932,6 +1932,18 @@ bool Wiktionary::compile ( ) {
|
||||
//lastWid = *(long long *)data;
|
||||
// it matches!
|
||||
formCount++;
|
||||
|
||||
// if it has accent marks then we count the stripped
|
||||
// version as a form, but we do not have to
|
||||
// store the stripped version in wiktionary-buf.txt
|
||||
// because it is just a waste of space.
|
||||
char a[1024];
|
||||
long stripLen = stripAccentMarks(a,
|
||||
1023,
|
||||
(unsigned char *)word,
|
||||
gbstrlen(word));
|
||||
if ( stripLen > 0 )
|
||||
formCount++;
|
||||
}
|
||||
// need 2+ forms!
|
||||
if ( formCount <= 1 ) continue;
|
||||
@ -2015,6 +2027,26 @@ bool Wiktionary::compile ( ) {
|
||||
// so maybe allow dup keys in syntable?
|
||||
//
|
||||
|
||||
// . also strip accent marks and add that key as well
|
||||
// . so we can map a stripped word to the original
|
||||
// word with accent marks, although it might
|
||||
// actually map to multiple words! so who knows
|
||||
// what to pick, maybe all of them!
|
||||
char a[1024];
|
||||
long stripLen = stripAccentMarks(a,
|
||||
1023,
|
||||
(unsigned char *)word,
|
||||
gbstrlen(word));
|
||||
if ( stripLen > 0 ) {
|
||||
long long swid = hash64Lower_utf8(word);
|
||||
// xor in the langid
|
||||
swid ^= g_hashtab[0][langId];
|
||||
// only add this word form once per langId
|
||||
if ( dd.isInTable ( &swid ) ) continue;
|
||||
dd.addKey ( &swid );
|
||||
}
|
||||
|
||||
|
||||
// count em up
|
||||
count++;
|
||||
// limit to 100 synonyms per synset
|
||||
|
@ -3656,7 +3656,7 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, long long docId, long long uh48 ){
|
||||
// alloc the buffer
|
||||
char *ubuf = (char *) mmalloc ( need1 , "xdtrb" );
|
||||
// return false with g_errno set on error
|
||||
if ( ! ubuf ) return NULL;
|
||||
if ( ! ubuf ) return false;
|
||||
// serialize into it
|
||||
char *p = ubuf;
|
||||
// copy our crap into there
|
||||
@ -3744,7 +3744,7 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, long long docId, long long uh48 ){
|
||||
log("db: Failed to compress document of %li bytes. "
|
||||
"Provided buffer of %li bytes.",
|
||||
size, (need2 - hdrSize ) );
|
||||
return NULL;
|
||||
return false;
|
||||
}
|
||||
// check for error
|
||||
if ( err != Z_OK ) {
|
||||
@ -3752,7 +3752,7 @@ bool XmlDoc::setTitleRecBuf ( SafeBuf *tbuf, long long docId, long long uh48 ){
|
||||
tbuf->purge();
|
||||
g_errno = ECOMPRESSFAILED;
|
||||
log("db: Failed to compress document.");
|
||||
return NULL;
|
||||
return false;
|
||||
}
|
||||
// calc cbufSize, the uncompressed header + compressed stuff
|
||||
//cbufSize = hdrSize + size ;
|
||||
|
Loading…
Reference in New Issue
Block a user