// Thesaurus.cpp
#include "gb-include.h"
#include "Thesaurus.h"
#include "HashTable.h"
#include "HttpServer.h"
#include "Dns.h"
#include "StopWords.h"
//#include "TitleRec.h" // for gb(un)compress
#include "Speller.h"
#include "Words.h"
#include "Bits.h"
#include "Phrases.h"
#include "sort.h"
/*
// returns false if fails and sets g_errno, true if succeeded
bool Thesaurus::load () {
// open the thesaurus.txt file
char filename[1024];
sprintf(filename,"%sdict/thesaurus.txt",g_hostdb.m_dir);
File f;
f.open ( filename );
// read it all in
int32_t fsize = f.getFileSize();
// alloc space
char *buf = mmalloc(fsize,"thesaurusinit");
if ( ! buf ) return false;
// read it in
int32_t n = f.read ( buf , fsize );
// g_errno should be set in this case
if ( n != fsize ) return false;
char *p = buf;
char *pend = buf + fsize;
loop:
// skip til we hit '|'
while ( p<pend && *p!='|' && *p!='\n')
*/
// TODO: Add support for multiple languages ("dict/en/", "dict/de/", etc)
// stores synonym lists and affinity data
// when computing the affinity, order is important:
// affinity measures how often one word occurs together with another and is
// computed as a ratio of result-list sizes, so
// affinity(a,b) = |A intersect B| / |A|, the fraction of pages containing a
//                 that also contain b, and affinity(b,a) = |A intersect B| / |B|
#define OFFSET(x) ((x) & 0x07FFFFFF)
#define AFFINITY(x) ((x) >> 32)
#define ISSYN(x) (!((x) & 0x80000000))
#define TYPE(x) (((x) & 0x78000000) >> 27)
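// A worked example of the packing (the value is made up, purely for
// illustration): for a slot value v = 0x0000199A08000040LL
//   AFFINITY(v) = 0x199A   (high 32 bits)
//   ISSYN(v)    = true     (bit 31 of the low word is clear)
//   TYPE(v)     = 1        (bits 27..30 of the low word)
//   OFFSET(v)   = 0x40     (low 27 bits, a byte offset into m_synonymText)
// which matches how gotGroupAffinityPairs() assembles table entries:
//   v = ((int64_t)affinity << 32) + ((int64_t)type << 27) + offset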
Thesaurus g_thesaurus;
// TODO: Replace this with a member variable at some point to support multiple
// languages
static char *s_dictDir = "dict/";
static char *s_affFile = "thesaurus-affinity.txt";
#define MAX_STIDS 8
// quick and dirty
static int32_t findTermIds(char *s, int64_t *tids,
bool hasSpace, int32_t slen = 0) {
static int64_t pid = 0;//getPrefixHash(NULL, 0, "", 0);
char buf[256];
if (!slen) slen = gbstrlen(s);
if (slen > 255) return 0;
gbmemcpy(buf, s, slen);
buf[slen] = '\0';
Words words;
Bits bits;
Phrases phrases;
if (hasSpace) {
// 0 for niceness
words.set(buf, TITLEREC_CURRENT_VERSION, true, 0);
bits.set(&words, TITLEREC_CURRENT_VERSION, 0);
//spam.reset(words.getNumWords());
phrases.set(&words, &bits, true, false,
TITLEREC_CURRENT_VERSION, 0);
int32_t i = 0, j = 0;
int64_t tid;
while (i < words.getNumWords() && j < MAX_STIDS) {
tid = phrases.getPhraseId2(i++);
if (!tid) continue;
tids[j++] = g_indexdb.getTermId(pid, tid);
};
return j;
} else {
words.set(buf, TITLEREC_CURRENT_VERSION, true, 0);
tids[0] = g_indexdb.getTermId(pid, words.getWordId(0));
return 1;
}
}
bool isvowel(char c) {
c = tolower(c);
return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u' ||
c == 'h';
}
SynonymInfo::SynonymInfo() {
m_ballocSize = 0;
m_tallocSize = 0;
m_tidSize = 0;
reset();
}
// m_syn (char *), m_affinity, m_offset, m_len, m_firstId, m_numIds (int32_t),
// m_type, m_sort (char), m_hasSpace (bool),
// m_leftSynHash, m_rightSynHash, m_synHash ((u)int64_t)
// m_termId is stored in a separate array due to it not necessarily being 1:1
static int32_t s_synSize = sizeof(char *) + sizeof(int32_t) * 5 +
sizeof(char) * 2 + sizeof(int64_t) * 3 +
sizeof(bool);
void SynonymInfo::reset() {
m_word = NULL;
m_h = 0;
m_wordLen = 0;
m_numSyns = 0;
m_numIds = 0;
m_slots = sizeof(m_buf) / s_synSize;
m_syn = NULL;
m_affinity = NULL;
m_offset = NULL;
m_len = NULL;
m_type = NULL;
m_sort = NULL;
m_hasSpace = NULL;
m_leftSynHash = NULL;
m_rightSynHash = NULL;
m_synHash = NULL;
if (m_tidSize) mfree(m_termId, m_tidSize, "SynonymTID");
m_termId = m_tidBuf;
m_tidSize = 0;
if (m_ballocSize) mfree(m_balloc, m_ballocSize, "SynonymB");
m_balloc = m_buf;
m_ballocSize = 0;
if (m_tallocSize) mfree(m_talloc, m_tallocSize, "SynonymT");
m_talloc = m_tbuf;
m_tallocSize = 0;
m_tbufLen = 0;
}
SynonymInfo::~SynonymInfo() {
reset();
}
bool SynonymInfo::growSyns() {
int32_t newSize = 0;
char *newBuf;
char **newSyn;
int32_t *newAffinity, *newOffset, *newLen, *newFirstId, *newLastId;
char *newType, *newSort;
bool *newHasSpace;
int64_t *newLeftId, *newRightId;
uint64_t *newSynHash;
char *p;
int32_t newSlots;
if (!m_ballocSize) newSize = sizeof(m_buf) * 2;
else newSize = m_ballocSize + sizeof(m_buf);
newBuf = (char *)mmalloc(newSize, "SynonymB");
if (!newBuf) return false;
newSlots = newSize / s_synSize;
p = newBuf;
newSyn = (char **) p; p += newSlots * sizeof(char *);
newAffinity = (int32_t *) p; p += newSlots * sizeof(int32_t);
newOffset = (int32_t *) p; p += newSlots * sizeof(int32_t);
newLen = (int32_t *) p; p += newSlots * sizeof(int32_t);
newFirstId = (int32_t *) p; p += newSlots * sizeof(int32_t);
newLastId = (int32_t *) p; p += newSlots * sizeof(int32_t);
newType = p; p += newSlots * sizeof(char);
newSort = p; p += newSlots * sizeof(char);
newHasSpace = (bool *) p; p += newSlots * sizeof(bool);
newLeftId = (int64_t *)p; p += newSlots * sizeof(int64_t);
newRightId = (int64_t *)p; p += newSlots * sizeof(int64_t);
newSynHash = (uint64_t*)p; p += newSlots * sizeof(uint64_t);
gbmemcpy(newSyn , m_syn , m_numSyns * sizeof(char *));
gbmemcpy(newAffinity, m_affinity, m_numSyns * sizeof(int32_t));
gbmemcpy(newOffset , m_offset , m_numSyns * sizeof(int32_t));
gbmemcpy(newLen , m_len , m_numSyns * sizeof(int32_t));
gbmemcpy(newFirstId , m_firstId , m_numSyns * sizeof(int32_t));
gbmemcpy(newLastId , m_lastId , m_numSyns * sizeof(int32_t));
gbmemcpy(newType , m_type , m_numSyns * sizeof(char));
gbmemcpy(newSort , m_sort , m_numSyns * sizeof(char));
gbmemcpy(newHasSpace, m_hasSpace, m_numSyns * sizeof(bool));
gbmemcpy(newLeftId , m_leftSynHash , m_numSyns * sizeof(int64_t));
gbmemcpy(newRightId , m_rightSynHash , m_numSyns * sizeof(int64_t));
gbmemcpy(newSynHash , m_synHash , m_numSyns * sizeof(uint64_t));
m_syn = newSyn;
m_affinity = newAffinity;
m_offset = newOffset;
m_len = newLen;
m_firstId = newFirstId;
m_lastId = newLastId;
m_type = newType;
m_sort = newSort;
m_hasSpace = newHasSpace;
m_leftSynHash = newLeftId;
m_rightSynHash = newRightId;
m_synHash = newSynHash;
if (m_ballocSize) mfree(m_balloc, m_ballocSize, "SynonymB");
m_balloc = newBuf;
m_ballocSize = newSize;
return true;
}
bool SynonymInfo::growText() {
int32_t newSize = m_tallocSize + sizeof(m_tbuf) * 2;
char *newBuf;
newBuf = (char *)mmalloc(newSize, "SynonymT");
if (!newBuf) return false;
for (int32_t i = 0; i < m_numSyns; i++) {
m_syn[i] = newBuf + (m_syn[i] - m_talloc);
}
gbmemcpy(newBuf, m_talloc, m_tbufLen);
if (m_tallocSize) mfree(m_talloc, m_tallocSize, "SynonymT");
m_talloc = newBuf;
m_tallocSize = newSize;
return true;
}
bool SynonymInfo::growTids() {
int32_t newSize;
if (!m_tidSize) {
newSize = sizeof(m_tidBuf) + sizeof(int64_t) * MAX_STIDS;
} else {
newSize = m_tidSize + sizeof(int64_t) * MAX_STIDS;
}
int64_t *newBuf;
newBuf = (int64_t *)mcalloc(newSize, "SynonymTID");
if (!newBuf) return false;
// m_tidSize is still 0 while we're using the static m_tidBuf, so size
// the copy off m_numIds or the ids already stored would be lost
gbmemcpy(newBuf, m_termId, m_numIds * sizeof(int64_t));
if (m_tidSize > (int32_t)sizeof(m_tidBuf)) {
mfree(m_termId, m_tidSize, "SynonymTID");
}
m_termId = newBuf;
m_tidSize = newSize;
return true;
}
bool SynonymInfo::setWord(char *s, int32_t len, uint64_t h) {
m_h = h;
// callers sometimes only have the hash (see getSynonymInfo())
if (!s || len <= 0) return true;
// theoretically we shouldn't need this, but it's safer
int32_t tbufSize = m_tallocSize;
if (!tbufSize) tbufSize = sizeof(m_tbuf);
if ((len + m_tbufLen > tbufSize) && !growText()) {
return log("query: ran out of memory producing synonyms");
}
gbmemcpy(m_talloc, s, len);
return true;
}
bool SynonymInfo::addSynonym(char *syn, int32_t affinity,
int32_t offset, int32_t len,
char type, char sort, bool hasSpace,
int64_t leftSynHash, int64_t rightSynHash) {
int32_t bufSize = m_ballocSize;
int32_t tbufSize = m_tallocSize;
int32_t tidSize = m_tidSize;
int32_t bufNeed = (m_numSyns + 1) * s_synSize;
// check for duplicates
uint64_t h = hash64Lower_utf8(syn, len);
if (h == m_h) {
return log(LOG_DEBUG, "query: Synonym dup hash %016"XINT64"", m_h);
}
int64_t tids[MAX_STIDS];
int32_t addIds = findTermIds(syn, tids, hasSpace, len);
int32_t tidNeed = (m_numIds + addIds) * sizeof(int64_t);
int32_t i, j;
// check for duplicates
for (i = 0; i < m_numSyns; i++) {
// if the number of ids is different, definitely not a match
// prevents false positive "sled dog" and "sled dog_iron"
if (m_lastId[i] - m_firstId[i] + 1 != addIds) continue;
for (j = m_firstId[i]; j <= m_lastId[i]; j++) {
int k = j - m_firstId[i];
if (m_termId[j] != tids[k]) break; // mismatch
}
if (j <= m_lastId[i]) continue; // mismatch, check next one
return log(LOG_DEBUG, "query: Synonym dup by tids %"INT32"", i);
}
// grow the buffers if need be
if (!bufSize) bufSize = sizeof(m_buf);
if (!tbufSize) tbufSize = sizeof(m_tbuf);
if (!tidSize) tidSize = sizeof(m_tidBuf);
if (((bufNeed > bufSize) && !growSyns()) ||
((m_tbufLen + len + 1 > tbufSize) && !growText()) ||
((tidNeed > tidSize) && !growTids())) {
return log("query: ran out of memory producing synonyms");
}
// assign pointers if necessary
if (!m_syn) {
int32_t slots = bufSize / s_synSize;
char *p = m_buf;
m_syn = (char **) p; p += slots * sizeof(char *);
m_affinity = (int32_t *) p; p += slots * sizeof(int32_t);
m_offset = (int32_t *) p; p += slots * sizeof(int32_t);
m_len = (int32_t *) p; p += slots * sizeof(int32_t);
m_firstId = (int32_t *) p; p += slots * sizeof(int32_t);
m_lastId = (int32_t *) p; p += slots * sizeof(int32_t);
m_type = p; p += slots * sizeof(char);
m_sort = p; p += slots * sizeof(char);
m_hasSpace = (bool *) p; p += slots * sizeof(bool);
m_leftSynHash = (int64_t *)p; p += slots * sizeof(int64_t);
m_rightSynHash = (int64_t *)p; p += slots * sizeof(int64_t);
m_synHash = (uint64_t*)p; p += slots * sizeof(uint64_t);
}
// check the sort
if (m_highSort < sort) {
m_highSort = sort;
}
// and finally, load all the info into the structure
// NUL-terminate the stored copy: save() and the affinity rebuild code
// treat m_syn[] entries as C strings (gbstrlen, strcmp, "%s")
gbmemcpy(m_talloc + m_tbufLen, syn, len);
m_talloc[m_tbufLen + len] = '\0';
m_syn[m_numSyns] = m_talloc + m_tbufLen;
m_tbufLen += len + 1;
m_affinity[m_numSyns] = affinity;
m_offset[m_numSyns] = offset;
m_firstId[m_numSyns] = m_numIds;
m_lastId[m_numSyns] = m_numIds + addIds - 1;
m_len[m_numSyns] = len;
m_type[m_numSyns] = type;
m_sort[m_numSyns] = sort;
m_hasSpace[m_numSyns] = hasSpace;
m_leftSynHash[m_numSyns] = leftSynHash;
m_rightSynHash[m_numSyns] = rightSynHash;
m_synHash[m_numSyns] = h;
m_numSyns++;
for (i = 0; i < addIds; i++) {
m_termId[m_numIds++] = tids[i];
}
return true;
}
Thesaurus::Thesaurus() {
m_rebuilding = false;
m_affinityState = NULL;
m_synonymTable.reset();
m_synonymTable.set(0, m_buf, 2 * HT_BUF_SIZE, true);
m_synonymText = NULL;
m_synonymLen = 0;
m_synonymSize = 0;
m_numSynonyms = 0;
m_totalPairs = 0;
}
Thesaurus::~Thesaurus() {
reset();
m_synonymTable.reset();
}
void Thesaurus::reset() {
if (m_reps) {
mfree(m_reps, sizeof(char *) * m_numReps, "stemmer");
}
if (m_repLens) {
mfree(m_repLens, sizeof(int32_t) * m_numReps, "stemmer");
}
m_reps = NULL;
m_repLens = NULL;
m_numReps = 0;
if (m_suffixes) {
mfree(m_suffixes, sizeof(Suffix) * m_numSuffixes, "stemmer");
}
m_suffixes = NULL;
m_numSuffixes = 0;
m_suffixBuffer.reset();
m_stemTable.reset();
m_stemTable.set(0, 0, 0, true);
m_stemBuffer.reset();
m_rebuilding = false;
// note that we DON'T reset affinityState here; it needs to be
// cleaned up by the affinity code so we can detect that there are
// still outstanding requests to deal with
//m_synonymTable.reset();
//m_synonymTable.set(0, m_buf, 2 * HT_BUF_SIZE, true);
if (m_synonymText) mfree(m_synonymText, m_synonymSize, "thesaurus");
m_synonymText = NULL;
m_synonymLen = 0;
m_synonymSize = 0;
m_numSynonyms = 0;
m_totalPairs = 0;
}
bool Thesaurus::rebuild(char *server, bool fullRebuild) {
log(LOG_INFO, "build: rebuilding Thesaurus synonyms");
if (!rebuildSynonyms()) {
log("build: Couldn't rebuild Thesaurus synonyms");
return true;
}
if (!load()) {
log("build: Couldn't load rebuilt thesaurus data, disk "
"problem?");
return true;
}
// this function starts a callback loop, returns true on error
return rebuildAffinity(server, fullRebuild);
}
void Thesaurus::cancelRebuild() {
m_rebuilding = false;
}
char *Thesaurus::getSynonymFromOffset(int32_t offset) {
// corner cases first
if (offset == 0) return m_synonymText;
if (offset >= m_synonymLen || offset < 0) return NULL;
// if the character just before the offset is a null byte, we're at
// the beginning of a word, so as long as the rest of the code is
// sane this is valid
if (m_synonymText[offset-1] == '\0') return m_synonymText + offset;
// otherwise, no, we're in the middle of a word and this isn't valid
return NULL;
}
bool Thesaurus::getAllInfo(char *s, SynonymInfo *info, int32_t slen,
int32_t bits) {
bool r = false;
if (!slen) slen = gbstrlen(s);
if (slen > 256) return false;
if (!bits) return false;
log(LOG_DEBUG, "query: getAllInfo(%32s, %"INT32", %p, %"XINT32")",
s, slen, info, bits);
// do stems first so SYN_STEM overrides
if (bits & SYNBIT_STEM) r |= getStems(s, slen, info);
r |= getSynonymInfo(s, info, slen, bits);
/*
MDW: take this out until it works!
if (bits & SYNBIT_SPELLING) {
bool found;
int32_t score, popularity;
char buf[256];
if (g_speller.m_language[langEnglish].getRecommendation(
s, slen, buf, 256,
&found, &score, &popularity, false) &&
buf[0] ) {
r |= info->addSynonym(buf, DEF_AFFINITY, -1,
gbstrlen(buf), SYN_SPELLING,
strchr(buf, ' ') != NULL,
0, 0);
}
}
*/
if (bits & SYNBIT_NUMBER) r |= parseNumbers(s, slen, info);
if (bits & SYNBIT_PHRASE) r |= generatePhrases(s, slen, info, bits);
return r;
}
bool Thesaurus::getSynonymInfo(char *s, SynonymInfo *info,
int32_t slen, int32_t bits) {
if (!slen) slen = gbstrlen(s);
uint64_t h = hash64Lower_utf8(s, slen);
// debug
log(LOG_DEBUG,"query: get syn info for %s",s);
// do not get synonyms of stop words...
//if ( ! isStopWord ( s , gbstrlen(s) , h ) )
// this returns true if we found some synonyms
bool r = getSynonymInfo(h, info, bits);
if (!r) info->setWord(s, slen, h);
return r;
}
bool Thesaurus::getSynonymInfo(uint64_t h, SynonymInfo *info, int32_t bits) {
int32_t slot = m_synonymTable.getSlot(h);
if (slot < 0) return false;
log(LOG_DEBUG, "query: getSynonymInfo(%"XINT64", %p, %"XINT32")", h, info, bits);
// this is NOW the first synonym
//char *p = m_synonymText + OFFSET(m_synonymTable.
// getValueFromSlot(slot));
//int32_t len = gbstrlen(p);
//info->setWord(p, len, h);
info->setWord(NULL,0,h);
do {
if (m_synonymTable.getKey(slot) == h) {
int64_t v = m_synonymTable.getValueFromSlot(slot);
int32_t o = OFFSET(v);
char *p = m_synonymText + o;
int32_t a = AFFINITY(v);
int32_t t = TYPE(v);
bool sp = strchr(p, ' ') != NULL;
char sr;
//if (!a) continue;
if (t == SYN_STEM) sr = 1;
else sr = 2;
if (bits & (1 << t)) {
info->addSynonym(p, a, o, gbstrlen(p), t, sr,
sp, 0, 0);
}
}
if (++slot >= m_synonymTable.getNumSlots()) slot = 0;
} while (m_synonymTable.getKey(slot));
return true;
}
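// Minimal usage sketch (commented out, not called anywhere; the word and
// the bit mask are only illustrative): callers typically fill a
// SynonymInfo on the stack and then walk its parallel arrays.
//
//   SynonymInfo info;
//   if (g_thesaurus.getAllInfo("dog", &info, 3, SYNBIT_STEM | SYNBIT_NUMBER))
//       for (int32_t i = 0; i < info.m_numSyns; i++)
//           log(LOG_DEBUG, "query: syn=%.*s aff=%"INT32"",
//               info.m_len[i], info.m_syn[i], info.m_affinity[i]);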
int32_t Thesaurus::getAffinity(char *s1, char *s2, int32_t l1, int32_t l2) {
if (!l1) l1 = gbstrlen(s1);
if (!l2) l2 = gbstrlen(s2);
return getAffinity(hash64Lower_utf8(s1, l1), hash64Lower_utf8(s2, l2));
}
int32_t Thesaurus::getAffinity(uint64_t h1, uint64_t h2) {
if (h1 == h2) return MAX_AFFINITY;
int32_t slot = m_synonymTable.getSlot(h1);
if (slot < 0) return -1;
while (m_synonymTable.getKey(slot)) {
if (m_synonymTable.getKey(slot) == h1) {
int64_t v = m_synonymTable.getValueFromSlot(slot);
char *p = m_synonymText + OFFSET(v);
if (h2 == hash64Lower_utf8(p, gbstrlen(p)))
return AFFINITY(v);
}
if (++slot >= m_synonymTable.getNumSlots()) slot = 0;
}
return -1;
}
int32_t Thesaurus::getAffinityN(char *s, int32_t n, int32_t l) {
if (!l) l = gbstrlen(s);
return getAffinityN(hash64Lower_utf8(s, l), n);
}
int32_t Thesaurus::getAffinityN(uint64_t h, int32_t n) {
int32_t slot = m_synonymTable.getSlot(h);
if (slot < 0) return -1;
while (m_synonymTable.getKey(slot)) {
if (m_synonymTable.getKey(slot) == h && !n--)
return AFFINITY(m_synonymTable.getValueFromSlot(slot));
if (++slot >= m_synonymTable.getNumSlots()) slot = 0;
}
return -1;
}
char *Thesaurus::getSynonymN(char *s, int32_t n, int32_t l) {
if (!l) l = gbstrlen(s);
return getSynonymN(hash64Lower_utf8(s, l), n);
}
char *Thesaurus::getSynonymN(uint64_t h, int32_t n) {
int32_t slot = m_synonymTable.getSlot(h);
if (slot < 0) return NULL;
while(m_synonymTable.getKey(slot)) {
if (m_synonymTable.getKey(slot) == h && !n--)
return m_synonymText +
OFFSET(m_synonymTable.getValueFromSlot(slot));
if (++slot >= m_synonymTable.getNumSlots()) slot = 0;
}
return NULL;
}
int32_t Thesaurus::getNumSyns(char *s, int32_t l) {
if (!l) l = gbstrlen(s);
return getNumSyns(hash64Lower_utf8(s, l));
}
int32_t Thesaurus::getNumSyns(uint64_t h) {
int32_t slot = m_synonymTable.getSlot(h);
if (slot < 0) return 0;
// every matching slot is a synonym now (the word itself is no longer
// stored in the first slot), so count them all and wrap like the
// other lookups do
int32_t r = 0;
while(m_synonymTable.getKey(slot)) {
if (m_synonymTable.getKey(slot) == h) r++;
if (++slot >= m_synonymTable.getNumSlots()) slot = 0;
}
return r;
}
int32_t Thesaurus::getSlot(char *s1, char *s2, int32_t l1, int32_t l2) {
if (!l1) l1 = gbstrlen(s1);
if (!l2) l2 = gbstrlen(s2);
return getSlot(hash64Lower_utf8(s1, l1), hash64Lower_utf8(s2, l2));
}
int32_t Thesaurus::getSlot(uint64_t h1, uint64_t h2) {
int32_t slot = m_synonymTable.getSlot(h1);
if (slot < 0) return -1;
if (h1 == h2) return slot;
while(m_synonymTable.getKey(slot)) {
if (m_synonymTable.getKey(slot) == h1) {
char *p = m_synonymText +
OFFSET(m_synonymTable.getValueFromSlot(slot));
if (hash64Lower_utf8(p, gbstrlen(p)) == h2) return slot;
}
if (++slot >= m_synonymTable.getNumSlots()) slot = 0;
}
return -1;
}
int32_t Thesaurus::getSlotN(char *s, int32_t n, int32_t l) {
if (!l) l = gbstrlen(s);
return getSlotN(hash64Lower_utf8(s, l), n);
}
int32_t Thesaurus::getSlotN(uint64_t h, int32_t n) {
int32_t slot = m_synonymTable.getSlot(h);
if (slot < 0) return -1;
while(m_synonymTable.getKey(slot)) {
if (m_synonymTable.getKey(slot) == h && !n--) return slot;
if (++slot >= m_synonymTable.getNumSlots()) slot = 0;
}
return -1;
}
int32_t Thesaurus::getOffset(char *s, int32_t l) {
if (!l) l = gbstrlen(s);
return getOffset(hash64Lower_utf8(s, l));
}
int32_t Thesaurus::getOffset(uint64_t h) {
int32_t slot = m_synonymTable.getSlot(h);
if (slot < 0) return -1;
return OFFSET(m_synonymTable.getValueFromSlot(slot));
}
char Thesaurus::getFlag(char *s1, char *s2, int32_t l1, int32_t l2) {
if (!l1) l1 = gbstrlen(s1);
if (!l2) l2 = gbstrlen(s2);
return getFlag(hash64Lower_utf8(s1, l1), hash64Lower_utf8(s2, l2));
}
char Thesaurus::getFlag(uint64_t h1, uint64_t h2) {
int32_t slot = m_synonymTable.getSlot(h1);
if (slot < 0) return SYN_INVALID;
while(m_synonymTable.getKey(slot)) {
if (m_synonymTable.getKey(slot) == h1) {
int64_t v = m_synonymTable.getValueFromSlot(slot);
char *p = m_synonymText + OFFSET(v);
if (hash64Lower_utf8(p) == h2) return TYPE(v);
}
if (++slot >= m_synonymTable.getNumSlots()) slot = 0;
}
return SYN_INVALID;
}
char Thesaurus::getFlagN(char *s, int32_t n, int32_t l) {
if (!l) l = gbstrlen(s);
return getFlagN(hash64Lower_utf8(s, l), n);
}
char Thesaurus::getFlagN(uint64_t h, int32_t n) {
int32_t slot = m_synonymTable.getSlot(h);
if (slot < 0) return SYN_INVALID;
while(m_synonymTable.getKey(slot)) {
if (m_synonymTable.getKey(slot) == h && !n--)
return TYPE(m_synonymTable.getValueFromSlot(slot));
if (++slot >= m_synonymTable.getNumSlots()) slot = 0;
}
return SYN_INVALID;
}
int64_t Thesaurus::getValue(char *s1, char *s2, int32_t l1, int32_t l2) {
if (!l1) l1 = gbstrlen(s1);
if (!l2) l2 = gbstrlen(s2);
return getValue(hash64Lower_utf8(s1, l1), hash64Lower_utf8(s2, l2));
}
int64_t Thesaurus::getValue(uint64_t h1, uint64_t h2) {
int32_t slot = m_synonymTable.getSlot(h1);
if (slot < 0) return -1LL;
while(m_synonymTable.getKey(slot)) {
if (m_synonymTable.getKey(slot) == h1) {
int64_t v = m_synonymTable.getValueFromSlot(slot);
char *p = m_synonymText + OFFSET(v);
if (hash64Lower_utf8(p) == h2) return v;
}
if (++slot >= m_synonymTable.getNumSlots()) slot = 0;
}
return -1LL;
}
int64_t Thesaurus::getValueN(char *s, int32_t n, int32_t l) {
if (!l) l = gbstrlen(s);
return getValueN(hash64Lower_utf8(s, l), n);
}
int64_t Thesaurus::getValueN(uint64_t h, int32_t n) {
int32_t slot = m_synonymTable.getSlot(h);
if (slot < 0) return -1LL;
while(m_synonymTable.getKey(slot)) {
if (m_synonymTable.getKey(slot) == h && !n--)
return m_synonymTable.getValueFromSlot(slot);
if (++slot >= m_synonymTable.getNumSlots()) slot = 0;
}
return -1LL;
}
static int removePunctuation(char *src, int srcLen) {
int i = 0, j = 0;
while (i < srcLen) {
if (src[i] != '-' && src[i] != '.' && src[i] != ',')
src[j++] = src[i];
i++;
}
return j;
}
bool Thesaurus::getStems(char *s, int32_t slen, SynonymInfo *info) {
int32_t lang = 1; // FIXME: add support for other languages
if (slen > 255) return false;
if (!m_suffixes) return false;
bool r = false;
char s2[256];
bool dbl = false; // double consonant?
// sanity check - for to_lower_utf8 to work
if ( s[slen] ) { char *xx=NULL;*xx=0; }
// store it as a lower case string into "s2"
to_lower_utf8(s2 , s2+250, s);
uint32_t h = hash32Lower_utf8(s, slen);
// do not do this on stop words! (isStopWord() wants the 64-bit hash,
// not the 32-bit stem-table hash)
if ( isStopWord ( s , slen , hash64Lower_utf8(s, slen) ) )
return false;
int32_t slot = m_stemTable.getSlot(h);
// check for exceptions
if (slot >= 0) {
char *p = m_stemTable.getValueFromSlot(slot);
int32_t plen = gbstrlen(p);
if (p[0] != '.') {
r |= info->addSynonym(p, -1, -1, plen,
SYN_STEM, 1, false,
0, 0);
}
// probe the stem table (not the synonym table) for more exceptions
do {
if (++slot >= m_stemTable.getNumSlots()) slot = 0;
if (m_stemTable.getKey(slot) == h) {
p = m_stemTable.getValueFromSlot(slot);
plen = gbstrlen(p);
if (p[0] == '.') continue;
r |= info->addSynonym(p, -1, -1, plen,
SYN_STEM, 1, false,
0, 0);
}
} while (m_stemTable.getKey(slot));
}
char buf[256], buf2[256];
int32_t bufLen = 0, buf2Len = 0;
// see if we can remove punctuation first
gbmemcpy(buf, s2, slen);
bufLen = removePunctuation(buf, slen);
if (bufLen != slen) {
r |= info->addSynonym(buf, -1, -1, bufLen,
SYN_STEM, 1, false, 0, 0);
}
Suffix *suf = m_suffixes;
Suffix *sufend = m_suffixes + m_numSuffixes;
int32_t sufLen;
for (; suf < sufend; suf++) {
sufLen = suf->m_len;
// if replacing the suffix would shorten the word below 3
// chars, skip it
if (sufLen >= slen - 1) continue;
if (!memcmp(s2 + slen - sufLen, suf->m_suffix, sufLen)) break;
}
char **rep;
char **repend;
int32_t *repLenp, repLen;
int32_t best = -1;
// found a usable suffix, so try to stem it
if (suf != sufend) {
rep = suf->m_reps;
repend = suf->m_reps + suf->m_numReps;
repLenp = suf->m_repLens;
// find the most likely word
while (rep < repend) {
gbmemcpy(buf, s2, slen);
repLen = *repLenp;
int32_t stemLen = slen - sufLen;
bool mdbl = false;
bufLen = stemLen + repLen;
if (bufLen <= 1) { rep++; repLenp++; continue; }
// attach the replacement
gbmemcpy(buf + stemLen, *rep, repLen + 1);
rep++;
repLenp++;
// needs to be hash64d because that's what the speller
// is expecting
uint64_t h2 = hash64d(buf, bufLen);
int32_t pop = g_speller.getPhrasePopularity(buf, h2,
false, lang);
if (g_conf.m_logDebugQuery) {
char buf3[256];
gbmemcpy(buf3, buf, bufLen);
buf3[bufLen] = '\0';
log(LOG_DEBUG, "query: maybe stem %s (%"INT32")",
buf3, pop);
}
// if the replacement is empty, see if removing a
// double consonant is a better match
// note that some other code assumes this only
// happens if the replacement is empty so if you
// change this please change the other code as well
// - bcc
if (!repLen && stemLen > 1 &&
buf[stemLen-1] == buf[stemLen-2]) {
char buf3[256];
gbmemcpy(buf3, buf, bufLen - 1);
buf3[bufLen - 1] = '\0';
h2 = hash64d(buf3, bufLen - 1);
int32_t pop2 = g_speller.getPhrasePopularity(
buf3, h2, false, lang);
if (pop2 > pop) {
log(LOG_DEBUG, "query: Double "
"consonant removed \"%s\""
" (%"INT32")",
buf3, pop2);
gbmemcpy(buf, buf3, bufLen);
pop = pop2;
bufLen--;
mdbl = true;
}
}
if (!pop) continue;
if (best < pop) {
best = pop;
gbmemcpy(buf2, buf, bufLen + 1);
buf2Len = bufLen;
dbl = mdbl;
}
}
}
// if we found something, add it in
if (best >= 0) {
log(LOG_DEBUG, "query: Stemming %s to %s (%"INT32")",
s, buf2, best);
r |= info->addSynonym(buf2, -1, -1, buf2Len,
SYN_STEM, 1, false, 0, 0);
} else {
// else just copy this in to make the next section simpler
gbmemcpy(buf2, s2, slen);
buf2Len = slen;
}
// loop through all the other suffixes and see if we can
// attach them and get a usable word
suf = m_suffixes;
while (suf < sufend) {
rep = suf->m_reps;
repend = suf->m_reps + suf->m_numReps;
repLenp = suf->m_repLens;
while (rep < repend) {
repLen = *repLenp;
char *rep2 = *rep;
char buf3[256];
int32_t buf3Len;
int32_t pop2 = 0;
rep++;
repLenp++;
if (memcmp(buf2 + buf2Len - repLen, rep2, repLen))
continue;
// found a possible replacement, so add the
// suffix to it and see what we get
gbmemcpy(buf, buf2, buf2Len);
bufLen = buf2Len - repLen;
gbmemcpy(buf + bufLen, suf->m_suffix, suf->m_len + 1);
bufLen += suf->m_len;
// needs to be hash64d because that's what the speller
// is expecting
uint64_t h2 = hash64d(buf, bufLen);
int32_t pop = g_speller.getPhrasePopularity(buf, h2,
false, lang);
// if we removed a double consonant, add it back and
// evaluate it with the new suffix
if (dbl) {
// if we reached here, repLen is always 0; only the first
// buf2Len bytes of buf2 are valid
gbmemcpy(buf3, buf2, buf2Len);
buf3[buf2Len] = buf3[buf2Len - 1];
gbmemcpy(buf3 + buf2Len + 1, suf->m_suffix,
suf->m_len + 1);
buf3Len = buf2Len + 1 + suf->m_len;
h2 = hash64d(buf3, buf3Len);
pop2 = g_speller.getPhrasePopularity(buf3, h2,
false, lang);
}
if (pop) { // got a potential suffix
log(LOG_DEBUG, "query: adding unstem \"%s\" "
"%"INT32"", buf, pop);
r |= info->addSynonym(buf, -1, -1, bufLen,
SYN_STEM, 1, false, 0, 0);
}
if (pop2) {
log(LOG_DEBUG, "query: adding unstem \"%s\" "
"%"INT32"", buf3, pop2);
r |= info->addSynonym(buf3, -1, -1, bufLen + 1,
SYN_STEM, 1, false, 0, 0);
}
}
suf++;
}
return r;
}
struct Number {
char *m_word;
int64_t m_number;
int32_t m_len;
uint32_t m_h;
};
static Number s_smallNumbers[] = {
{ "zero", 0 },
{ "one", 1 },
{ "two", 2 },
{ "three", 3 },
{ "four", 4 },
{ "five", 5 },
{ "six", 6 },
{ "seven", 7 },
{ "eight", 8 },
{ "nine" , 9 },
{ "ten", 10 },
{ "eleven", 11 },
{ "twelve", 12 },
{ "thirteen", 13 },
{ "fourteen", 14 },
{ "fifteen", 15 },
{ "sixteen", 16 },
{ "seventeen", 17 },
{ "eighteen", 18 },
{ "nineteen", 19 },
{ 0 }
};
static Number s_tens[] = {
{ "twenty", 20 },
{ "thirty", 30 },
{ "forty", 40 },
{ "fifty", 50 },
{ "sixty", 60 },
{ "seventy", 70 },
{ "eighty", 80 },
{ "ninety", 90 },
{ "hundred", 100 },
{ 0 }
};
static Number s_bigNumbers[] = {
{ "quintillion", 1000000000000000000LL },
{ "quadrillion", 1000000000000000LL },
{ "trillion", 1000000000000LL },
{ "billion", 1000000000 },
{ "million", 1000000 },
{ "thousand", 1000 },
{ 0 }
};
// TODO: Make this work again when other number systems are added in
#if 0
void testNumber() {
SynonymInfo info;
for(int i = 0; i < 10000; i++) {
int64_t n1 = ((int64_t)rand() << 32) + rand();
bool r1 = g_thesaurus.parseNumbers(n1, &info);
if (!r1) log(LOG_INFO, "query: %lld failure", n1);
bool r2 = g_thesaurus.parseNumbers(info.m_syn[0],
info.m_len[0],
&info);
int64_t n2 = strtoll(info.m_, 0, 0);
if (n1 != n2 && n1 >= 0)
log(LOG_INFO, "query: %lld %lld %s", n1, n2, buf1);
}
}
#endif
bool Thesaurus::parseNumbers(int64_t n, SynonymInfo *info) {
SafeBuf buf(256);
bool sp = false, r = true;
// break it down until there's nothing left
while (n > 0) {
Number *mult = NULL;
for (Number *number = s_bigNumbers; number->m_word;
number++) {
if (n >= number->m_number) {
mult = number;
break;
}
}
int64_t base = n;
if (mult) base /= mult->m_number;
int32_t hundred = base / 100;
int32_t tens;
int32_t small;
if (base % 100 < s_tens[0].m_number) {
tens = 0;
small = base % 100;
} else {
tens = (base % 100) / 10;
small = base % 10;
}
if (hundred) {
sp = true;
r &= buf.safePrintf("%s hundred",
s_smallNumbers[hundred].m_word);
if (tens || small)
r &= buf.safePrintf(" ");
}
if (tens) {
r &= buf.safePrintf("%s", s_tens[tens-2].m_word);
if (small) {
sp = true;
r &= buf.safePrintf(" ");
}
}
if (small) r &= buf.safePrintf("%s",
s_smallNumbers[small].m_word);
if (mult) {
sp = true;
r &= buf.safePrintf(" %s", mult->m_word);
n %= mult->m_number;
if (n >= 100) { r &= buf.safePrintf(", "); }
else if (n) { r &= buf.safePrintf(" and "); }
} else {
n = 0;
}
}
if (!buf.length() && n == 0) r &= buf.safePrintf("zero");
if (buf.length() && r) {
r &= info->addSynonym(buf.getBufStart(), -1, -1,
buf.length(), SYN_NUMBER, 2, sp, 0, 0);
}
return r;
}
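// By my reading of the loop above, a few illustrative conversions (not
// exercised anywhere in this file):
//   parseNumbers(105,  info) adds the synonym "one hundred five"
//   parseNumbers(1200, info) adds the synonym "one thousand, two hundred"
//   parseNumbers(1005, info) adds the synonym "one thousand and five"
// i.e. "and" only appears after a multiplier when the remainder is under
// one hundred.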
bool Thesaurus::parseNumbers(char *s, int32_t slen, SynonymInfo *info) {
// TODO: Make this language specific
// init the hashes if they don't exist yet
if (!s_smallNumbers[0].m_len)
for (Number *number = s_smallNumbers; number->m_word;
number++) {
number->m_len = gbstrlen(number->m_word);
number->m_h = hash32n(number->m_word);
}
if (!s_tens[0].m_len)
for (Number *number = s_tens; number->m_word;
number++) {
number->m_len = gbstrlen(number->m_word);
number->m_h = hash32n(number->m_word);
}
if (!s_bigNumbers[0].m_len)
for (Number *number = s_bigNumbers; number->m_word;
number++) {
number->m_len = gbstrlen(number->m_word);
number->m_h = hash32n(number->m_word);
}
if (!slen) slen = gbstrlen(s);
// first check to see if we have digits
char *p = s, *pend = s + slen;
char *send;
int64_t n = strtoll(s, &send, 10);
if (s != send && send == pend) {
return parseNumbers(n, info);
}
SafeBuf buf(256);
n = 0;
int64_t m = 0;
Number *sm = NULL, *tn = NULL, *hn = NULL, *md = NULL;
while (p < pend) {
while (p < pend && (isspace(*p) || *p == ',')) p++;
char *sp = p;
while (sp < pend && !isspace(*sp) && *sp != ',') sp++;
// skip a joining "and", e.g. "one hundred and five"
if (sp - p == 3 && !strncmp(p, "and", 3)) {
p = sp;
continue;
}
uint32_t h = hash32(p, sp - p);
bool match = false;
if (!md) for (Number *number = s_bigNumbers;
number->m_word; number++) {
if (h == number->m_h && sp - p == number->m_len) {
match = true;
md = number;
if (!sm && !tn && !hn)
{ sm = s_smallNumbers + 1; }
break;
}
}
if ((!hn || !tn) && !match) for (Number *number = s_tens;
number->m_word; number++) {
if (h == number->m_h && sp - p == number->m_len) {
if (number->m_number == 100) {
if (hn) break;
if (!sm) { sm = s_smallNumbers + 1; }
hn = sm;
sm = NULL;
} else {
if (tn) break;
tn = number;
}
match = true;
break;
}
}
if (!sm && !match) for (Number *number = s_smallNumbers;
number->m_word; number++) {
if (h == number->m_h && sp - p == number->m_len) {
match = true;
sm = number;
break;
}
}
if (match) {
if (md) {
if (hn) m += 100 * hn->m_number;
if (tn) m += tn->m_number;
if (sm) m += sm->m_number;
m *= md->m_number;
n += m;
m = 0;
sm = tn = hn = md = NULL;
}
} else if (p < pend) {
//log(LOG_DEBUG, "query: mismatch %s %lld", p, n);
return false;
}
p = sp;
}
if (hn) n += 100 * hn->m_number;
if (tn) n += tn->m_number;
if (sm) n += sm->m_number;
if (buf.safePrintf("%lld", n)) {
return info->addSynonym(buf.getBufStart(), -1, -1,
buf.length(), SYN_NUMBER, 2, false, 0, 0);
}
return false;
}
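// An illustrative round trip (assuming the "and" skip above behaves as
// intended): parseNumbers("two thousand and one", 20, info) picks up
// "two", multiplies by "thousand" into n=2000, skips "and", adds "one"
// at the end for n=2001, and registers the synonym "2001".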
static char *s_articlesEng[] = { "the",
"an",
"a"};
// MDW: "some is not a stop word and should be omitted
// "some"};
static int32_t s_numArticlesEng = 3;
bool Thesaurus::generatePhrases(char *s, int32_t slen,
SynonymInfo *info, int32_t bits) {
char *w1, *w2, *p1, *p2, *end, *mid = NULL;
int32_t w1Len, w2Len, midLen;
int64_t leftSynHash, rightSynHash;
// disable this lest we get into an infinite recursive loop
bits &= ~SYNBIT_PHRASE;
p1 = s;
// find first non-stopword
bool isStop;
end = s + slen;
char **articles = s_articlesEng;
int32_t numArticles = s_numArticlesEng;
do {
while (*p1 && isspace(*p1) && p1 < end) p1++;
w1 = p1;
while (*p1 && !isspace(*p1) && p1 < end) p1++;
w1Len = p1 - w1;
isStop = isStopWord(w1, w1Len, hash64Lower_utf8(w1, w1Len));
} while (p1 < end && *p1 && isStop);
// we reached the end without finding a non-stopword
// probably shouldn't have gotten fed this
if (!w1 || p1 == w1 || !*p1 || isStop) {
// not sure if this should be a logic error?
log(LOG_DEBUG, "query: non-phrase fed to generatePhrases");
return false;
}
// find second non-stopword
do {
while (*p1 && isspace(*p1) && p1 < end) p1++;
w2 = p1;
if (!mid) mid = p1;
while (*p1 && !isspace(*p1) && p1 < end) p1++;
w2Len = p1 - w2;
isStop = isStopWord(w2, w2Len, hash64Lower_utf8(w2, w2Len));
} while (p1 < end && *p1 && isStop);
// carve out the middle for later
midLen = w2 - mid;
while (midLen > 0 && isspace(mid[midLen - 1])) midLen--;
// we reached the end without finding a second non-stopword
// this happens with certain phrase segments, e.g. "cheese and"
if (!w2 || p1 == w2 || isStop) return false;
SynonymInfo syn1, syn2;
bool r = getAllInfo(w1, &syn1, w1Len, bits);
r |= getAllInfo(w2, &syn2, w2Len, bits);
if (!r) return false;
// check to see if there is an article for the first stop word
p1 = mid;
char *stop;
int32_t stopLen;
while(*p1 && isspace(*p1) && p1 < w2) p1++;
stop = p1;
while(*p1 && !isspace(*p1) && p1 < w2) p1++;
stopLen = p1 - stop;
int32_t artIndex = -1;
if (stopLen > 0) {
artIndex = numArticles - 1;
while (artIndex >= 0) {
if (!strncmp(stop, articles[artIndex], stopLen)) {
break;
}
artIndex--;
}
}
r = false;
// -1 is for the original source
// i is for the first part, j is for the second part, and k is for
// article substitution
for (int i = -1; i < syn1.m_numSyns; i++) {
for (int j = -1; j < syn2.m_numSyns; j++) {
for (int k = -1; k < numArticles; k++) {
if (artIndex < 0) k = numArticles;
if ((i < 0 && j < 0) &&
(artIndex < 0 || artIndex == k))
continue;
// check for 'an' and only use if w2 starts
// with a vowel, and don't use 'a' if w2
// starts with a vowel
char vw;
if (j < 0) vw = w2[0];
else vw = syn2.m_syn[j][0];
if ((k == 1 && !isvowel(vw)) ||
(k == 2 && isvowel(vw)))
continue;
char buf[2048];
char sort = 0;
p1 = s;
p2 = buf;
int32_t n1, n2;
// copy the fragment before w1
n1 = w1 - p1;
n2 = n1;
gbmemcpy(p2, p1, n2);
p1 += n1;
p2 += n2;
// copy the w1 synonym
n1 = w1Len;
if (i < 0) {
n2 = n1;
gbmemcpy(p2, w1, n1);
leftSynHash = 0;
} else {
n2 = syn1.m_len[i];
gbmemcpy(p2, syn1.m_syn[i], n2);
//lid = syn1.m_termId[i];
leftSynHash = syn1.m_synHash[i];
sort += syn1.m_sort[i];
}
p1 += n1;
p2 += n2;
if (k < numArticles) {
n1 = stopLen;
if (k == -1) {
n2 = 0;
} else {
*p2++ = ' ';
n2 = gbstrlen(articles[k]);
gbmemcpy(p2, articles[k], n2);
}
p1 += n1 + 2;
p2 += n2;
*p2++ = ' ';
if (midLen > stopLen) {
n1 = midLen - stopLen;
n2 = n1;
gbmemcpy(p2, p1, n2);
p1 += n1 + 1;
*p2++ = ' ';
}
} else {
// copy the fragment between w1 and 2
n1 = w2 - (w1 + w1Len);
n2 = n1;
gbmemcpy(p2, p1, n2);
p1 += n1;
p2 += n2;
}
// copy the w2 synonym
n1 = w2Len;
if (j < 0) {
n2 = n1;
gbmemcpy(p2, w2, n1);
rightSynHash = 0;
} else {
n2 = syn2.m_len[j];
gbmemcpy(p2, syn2.m_syn[j], n2);
//rid = syn2.m_termId[j];
rightSynHash = syn2.m_synHash[j];
sort += syn2.m_sort[j];
}
p1 += n1;
p2 += n2;
// copy the fragment after w2
n1 = (s + slen) - (w2 + w2Len);
n2 = n1;
gbmemcpy(p2, p1, n2);
p1 += n1;
p2 += n2;
*p2 = '\0';
r |= info->addSynonym(buf, -1, -1,
p2 - buf, SYN_PHRASE,
sort, true,
leftSynHash,
rightSynHash);
}
}
}
return r;
}
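// A hedged example of what the nested loops above produce: for the input
// "crack a joke" (assuming "a" is a stop word and "crack"/"joke" are not),
// w1 = "crack", the middle stop word is the article "a" and w2 = "joke",
// so the loops emit variants such as "crack joke" (article dropped) and
// "crack the joke" (article swapped), plus the same variants with any
// available synonyms of "crack" or "joke" substituted in.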
struct synType {
char *m_word;
char m_type;
};
static synType s_types[] = {
{ "synonym", SYN_SYNONYM },
{ "stem", SYN_STEM },
{ "spelling", SYN_SPELLING },
{ "acronym", SYN_ACRONYM },
{ "number", SYN_NUMBER },
{ "phrase", SYN_PHRASE },
{ "translation", SYN_TRANS },
{ "unknown", SYN_UNKNOWN },
{ "invalid", SYN_INVALID }
};
bool Thesaurus::rebuildSynonyms() {
uint64_t startTime = gettimeofdayInMilliseconds();
char ff[PATH_MAX];
char *p1, *pend;
HashTableT<uint64_t, SynonymLinkGroup> linkTable;
// read in all files that fit the pattern
Dir dir;
dir.set(g_hostdb.m_dir, s_dictDir);
if (!dir.open())
return log("build: Couldn't open directory %s%s",
g_hostdb.m_dir, s_dictDir);
char *synFile;
SafeBuf addBuffer;
int32_t unknown = 0; // number of missing synonym types
while((synFile = dir.getNextFilename("thesaurus-*"))) {
// don't read this, the format is different
if (!strcmp(synFile, s_affFile)) continue;
snprintf(ff, PATH_MAX, "%s%s%s",
g_hostdb.m_dir, s_dictDir, synFile);
SafeBuf addFile;
if (!addFile.fillFromFile(ff)) {
log("build: Couldn't load %s", ff);
continue;
}
log(LOG_INFO, "build: Reading synonym pairs from %s", ff);
p1 = addFile.getBufStart();
pend = addFile.getBuf();
// one word/phrase pair per line, read first as master, second
// as synonym, delimited by a pipe '|' character to support
// phrase synonyms
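// a line may also carry an optional type word and an optional affinity,
// in either order, e.g. (hypothetical entries):
//   car|automobile|synonym
//   car|auto|synonym|30000
//   dog|dogs|stem
// (the code below swaps the third and fourth fields when the third one
// is numeric)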
while(p1 < pend) {
char *a = p1, *b = NULL, *c = NULL,
*d = NULL, *e = NULL;
while (*p1 != '\n' && p1 < pend) p1++;
if (*p1 == '\n') *p1++ = '\0';
b = strchr(a, '|');
if (b) c = strchr(b + 1, '|');
if (c) d = strchr(c + 1, '|');
if (d) e = strchr(d + 1, '|');
if (!b || e) {
log(LOG_DEBUG, "build: Line in %s doesn't "
"contain the right number of pipes: "
"\"%s\", skipping line", ff, a);
continue;
} else {
*b++ = '\0';
if (c) *c++ = '\0';
if (d) *d++ = '\0';
}
SynonymLinkGroup w, *wp1, *wp2;
uint64_t h1, h2;
int32_t alen = gbstrlen(a), blen = gbstrlen(b);
char type = SYN_UNKNOWN;
int32_t aff = -1;
// if we have both but the third field is a number,
// assign it to d for affinity
if (c && d && isdigit(*c)) {
char *s = c;
c = d;
d = s;
}
if (c) {
for (synType *typep = s_types;
typep->m_type != SYN_UNKNOWN;
typep++) {
if (!strcmp(c, typep->m_word)) {
type = typep->m_type;
break;
}
}
}
if (d) aff = strtol(d, &e, 0);
if (type >= SYN_UNKNOWN) {
if (c) {
log(LOG_DEBUG, "build: Unknown synonym "
"type: %s", c);
} else {
log(LOG_DEBUG, "build: Missing synonym "
"type: %s, %s", a, b);
}
unknown++;
}
if (e && *e) log("build: Extra characters in "
"affinity: %s", e);
h1 = hash64Lower_utf8(a, alen);
h2 = hash64Lower_utf8(b, blen);
int32_t slot1 = linkTable.getSlot(h1);
bool x = true;
if (slot1 < 0) {
w.m_n = 0;
w.m_h[0] = h1;
w.m_syn[0] = addBuffer.length();
addBuffer.safeMemcpy(a, alen+1);
x &= linkTable.addKey(h1, w, &slot1);
}
int32_t slot2 = linkTable.getSlot(h2);
if (slot2 < 0) {
w.m_n = 0;
w.m_h[0] = h2;
w.m_syn[0] = addBuffer.length();
addBuffer.safeMemcpy(b, blen+1);
x &= linkTable.addKey(h2, w, &slot2);
// this slot may have moved so grab it again
slot1 = linkTable.getSlot(h1);
}
if (!x) { // ran out of memory
return log("build: Out of memory rebuilding "
"synonym list, aborting.");
}
wp1 = linkTable.getValuePointerFromSlot(slot1);
wp2 = linkTable.getValuePointerFromSlot(slot2);
int i, j;
// make sure they aren't already in the lists
for (i = 0; i < wp1->m_n; i++)
if (h2 == wp1->m_h[i+1]) break;
for (j = 0; j < wp2->m_n; j++)
if (h1 == wp2->m_h[j+1]) break;
if (i == wp1->m_n) { // couldn't find it, so add it
if (i >= MAX_SYNS - 1) {
log("build: Too many links in "
"thesaurus for %s, not adding "
"%s", a, b);
} else {
i++;
wp1->m_n++;
wp1->m_h[i] = h2;
wp1->m_syn[i] = wp2->m_syn[0];
wp1->m_type[i] = type;
wp1->m_aff[i] = aff;
}
} else if (aff >= 0) { // found it and we override
wp1->m_aff[i] = aff;
}
if (j == wp2->m_n) { // couldn't find it, so add it
if (j >= MAX_SYNS - 1) {
log("build: Too many links in "
"thesaurus for %s, not adding "
"%s", b, a);
} else {
j++;
wp2->m_n++;
wp2->m_h[j] = h1;
wp2->m_syn[j] = wp1->m_syn[0];
wp2->m_type[j] = type;
wp2->m_aff[j] = -1;
}
}
}
}
// make sure it's null-terminated in case of bad formatting in the
// add files
if (addBuffer.length() > 0 && *(addBuffer.getBuf()-1) != '\0')
addBuffer.pushChar('\0');
int32_t numSynonyms = 0;
int32_t totalPairs = 0;
// count up groups that have at least 2 members
for (int32_t slot = 0; slot < linkTable.getNumSlots(); slot++) {
SynonymLinkGroup w;
w = linkTable.getValueFromSlot(slot);
if (w.m_n) numSynonyms++;
totalPairs += w.m_n;
}
log(LOG_INFO, "build: Built %"INT32" synonym groups and %"INT32" pairs",
numSynonyms, totalPairs);
if (unknown) log(LOG_WARN, "build: %"INT32" synonyms pairs were missing "
"valid types, check your input files", unknown);
SafeBuf thesFile;
thesFile.safePrintf("|lastRebuild|%"INT32"\n", m_lastRebuild);
thesFile.safePrintf("|numSynonyms|%"INT32"\n", numSynonyms);
thesFile.safePrintf("|totalPairs|%"INT32"\n", totalPairs);
thesFile.safePrintf("|totalSlots|0\n");
for (int32_t slot = 0; slot < linkTable.getNumSlots(); slot++) {
if (!linkTable.getKey(slot)) continue;
SynonymLinkGroup w;
w = linkTable.getValueFromSlot(slot);
// this won't run if w.m_n = 0
for (int j = 1; j <= w.m_n; j++) {
char *s1, *s2;
if (w.m_h[0] == w.m_h[j]) continue;
s1 = addBuffer.getBufStart() + w.m_syn[0];
s2 = addBuffer.getBufStart() + w.m_syn[j];
int32_t aff;
if (w.m_aff[j] >= 0) aff = w.m_aff[j];
else aff = getAffinity(s1, s2);
if (aff >= 0) {
thesFile.safePrintf("%s|%s|0x%08"XINT32"|%"INT32"\n",
s1, s2, aff, (int32_t)w.m_type[j]);
} else {
thesFile.safePrintf("%s|%s|%"INT32"|%"INT32"\n",
s1, s2, aff, (int32_t)w.m_type[j]);
}
}
}
snprintf(ff, PATH_MAX, "%s%sthesaurus.txt", g_hostdb.m_dir, s_dictDir);
if (!thesFile.dumpToFile(ff)) return log("build: Couldn't save %s", ff);
log(LOG_TIMING, "build: took %"INT64"ms to rebuild synonyms",
gettimeofdayInMilliseconds() - startTime);
return true;
}
class StateAffinityGroup {
public:
StateAffinityGroup();
StateAffinity *m_aff;
SynonymInfo m_info;
char *m_syn;
int32_t m_i;
int32_t m_sent;
int32_t m_cache;
int32_t m_recv;
bool m_next;
};
StateAffinityGroup::StateAffinityGroup() {
m_aff = NULL;
m_syn = NULL;
m_i = 0;
m_sent = 0;
m_cache = 0;
m_recv = 0;
m_next = false;
}
static void buildAffinity(StateAffinity *aff);
static StateAffinityGroup *getNextAffinityGroup(StateAffinity *aff);
static StateAffinityGroup *buildAffinityGroup(StateAffinityGroup *group);
static void gotAffinityDoc(void *state, TcpSocket *socket);
static void affinityRetry(StateAffinityGroup *group, TcpSocket *socket);
static void affinityAbort(StateAffinityGroup *group);
static void gotAffinityIP(void *state, int32_t ip);
static void gotAllAffinityPairs(void *state);
static void gotGroupAffinityPairs(StateAffinityGroup *group);
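// Rough shape of the rebuild, as far as I can tell from the code below:
//   rebuildAffinity() -> gotAffinityIP() -> buildAffinity()
//   buildAffinity() pulls groups via getNextAffinityGroup() and feeds them
//   to buildAffinityGroup(), which issues one search query per
//   (word, synonym) pair through g_httpServer.getDoc()
//   gotAffinityDoc() records the hit count and keeps the loop going
//   gotGroupAffinityPairs() turns hit counts into affinities for a group
//   gotAllAffinityPairs() applies overrides, saves and propagates the table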
// start the callback loop; it stops issuing requests once
// g_conf.m_maxAffinityRequests are outstanding and resumes from gotAffinityDoc()
static void buildAffinity(StateAffinity *aff) {
StateAffinityGroup *group = getNextAffinityGroup(aff);
do {
if (aff->m_n >= aff->m_next) {
log(LOG_INFO, "build: %"INT32" out of %"INT32" pairs built",
aff->m_n, aff->m_oldTable->getNumSlotsUsed());
aff->m_next = aff->m_n + 1000;
QUICKPOLL(1); // just in case we're hogging the cpu
// with lots of already built pairs
}
} while((group = buildAffinityGroup(group)));
}
static StateAffinityGroup *getNextAffinityGroup(StateAffinity *aff) {
StateAffinityGroup *group;
if (!aff->m_thes->m_rebuilding) return NULL;
if (aff->m_syn == aff->m_synend) return NULL;
try { group = new(StateAffinityGroup); }
catch( ... ) {
log("build: Couldn't allocate %i bytes for thesaurus, "
"aborting rebuild", sizeof(StateAffinityGroup));
aff->m_thes->m_rebuilding = false;
return NULL;
}
mnew(group, sizeof(StateAffinityGroup), "thesaurus");
group->m_aff = aff;
group->m_syn = aff->m_syn;
aff->m_thes->getSynonymInfo(group->m_syn, &group->m_info);
group->m_i = group->m_info.m_numSyns - 1;
aff->m_n += group->m_i + 2;
aff->m_syn += gbstrlen(aff->m_syn) + 1;
return group;
}
// returns a pointer to the group to process (sometimes ourselves, sometimes
// the next group), or NULL if we need to stop
static StateAffinityGroup *buildAffinityGroup(StateAffinityGroup *group) {
StateAffinity *aff = group->m_aff;
SynonymInfo *info = &group->m_info;
// too many requests going at once, return without doing anything
// the callback will call us again when a request is finished
if (aff->m_sent >= aff->m_recv + g_conf.m_maxAffinityRequests)
return NULL;
// we're done sending out requests for this chain or we aborted
if (group->m_i < -1 || !aff->m_thes->m_rebuilding) {
StateAffinityGroup *group2 = NULL;
// grab the next group if need be
if (!group->m_next) {
// this will be NULL if we aborted or we're done
group2 = getNextAffinityGroup(aff);
group->m_next = true;
}
// we sent all our requests AND we have all our responses
if (group->m_recv == group->m_sent)
gotGroupAffinityPairs(group);
// call the final callback if EVERYTHING is back and we have
// nothing else to send
if (aff->m_recv == aff->m_sent && !group2)
aff->m_callback(aff);
return group2;
}
int32_t i = group->m_i;
uint64_t hh;
char *s1 = group->m_syn;
char *s2 = "";
// use unsigned hashes so the combined cache key matches gotAffinityDoc()
uint32_t h1 = hash32n(group->m_syn);
uint32_t h2;
SafeBuf b;
b.safePrintf("http://%s/search?q=", aff->m_server);
if (strchr(s1, ' ')) {
b.urlEncode("\"", 1);
b.urlEncode(s1, gbstrlen(s1));
b.urlEncode("\"", 1);
} else {
b.urlEncode(s1, gbstrlen(s1));
}
if (i == -1) {
hh = h1;
} else {
s2 = info->m_syn[i];
h2 = hash32n(s2);
b.urlEncode(" .. ", 4);
if (strchr(s2, ' ')) {
b.urlEncode("\"", 1);
b.urlEncode(s2, gbstrlen(s2));
b.urlEncode("\"", 1);
} else {
b.urlEncode(s2, gbstrlen(s2));
}
// hits for (i,j) is the same as (j,i), but the hash will be
// different so we need to order it properly
if (strcmp(s1, s2) < 0) {
hh = h1 + ((uint64_t)h2 << 32);
} else {
hh = h2 + ((uint64_t)h1 << 32);
}
}
b.safeMemcpy(g_conf.m_affinityParms, gbstrlen(g_conf.m_affinityParms)+1);
uint64_t *llp;
if (!aff->m_fullRebuild && i >= 0 && info->m_affinity[i] >= 0) {
log(LOG_DEBUG, "build: old value: (%s, %s, %08"XINT32")",
s1, s2, info->m_affinity[i]);
aff->m_old++;
} else if (i == -1 && !group->m_sent && !group->m_cache) {
// we used nothing but old values for this group so we don't
// need to send this either
log(LOG_DEBUG, "build: old value: (%s)", s1);
aff->m_old++;
} else if (!(llp = aff->m_hitsTable.getValuePointer(hh))) {
//Url url;
//url.set(b.getBufStart(), b.length(), aff->m_ip, 0, 0, 0, 0);
if (!g_httpServer.getDoc( b.getBufStart(),//&url,
0,//ip
0, -1, 0,
group, gotAffinityDoc,
g_conf.m_affinityTimeout, 0, 0,
32768, 32768,
0)) {
group->m_sent++;
aff->m_sent++;
} else {
log("build: getDoc error: %s (%s)", mstrerror(g_errno),
b.getBufStart());
affinityAbort(group);
// let it fall through so it cleans itself up
}
} else {
if (i >= 0) {
log(LOG_DEBUG, "build: cache hit (%s, %s, %"INT64")",
s1, s2, *llp);
} else {
log(LOG_DEBUG, "build: cache hit (%s, %"INT64")",
s1, *llp);
}
aff->m_cache++;
group->m_cache++;
}
group->m_i--;
// return ourselves so we run again
return group;
}
static void gotAffinityDoc(void *state, TcpSocket *socket) {
StateAffinityGroup *group = (StateAffinityGroup *)state;
StateAffinity *aff = group->m_aff;
SynonymInfo *info = &group->m_info;
int32_t i = -1;
char *q = strstr(socket->m_sendBuf, "q=") + 2;
Xml xml;
group->m_recv++;
aff->m_recv++;
// the stuff below might no longer be valid (synonyms specifically)
if (!aff->m_thes->m_rebuilding) {
// do cleanup
buildAffinityGroup(group);
return;
}
char *qend = strchr(q, '&');
char buf[1024];
int32_t qlen = urlDecode(buf, q, qend - q);
buf[qlen] = '\0';
char *sep = buf + gbstrlen(group->m_syn);
if (buf[0] == '\"') sep += 2;
if (!strncmp(sep, " .. ", 4)) { // are we a pair?
char *syn = sep + 4; // step over the " .. " in the middle
for (int32_t j = 0; j < info->m_numSyns; j++) {
uint32_t h1 = hash32n(info->m_syn[j]);
uint32_t h2;
int32_t slen = buf - syn + qlen;
if (syn[0] == '\"') {
h2 = hash32(syn + 1, slen - 2);
} else {
h2 = hash32(syn, slen);
}
if (h1 == h2) {
i = j;
break;
}
}
if (i == -1) {
log("build: i == -1 but we have a pair");
char *xx = NULL; *xx = 0;
}
}
if (g_errno) {
log("build: affinity socket error: %s", mstrerror(g_errno));
affinityRetry(group, socket);
return;
}
if (!socket->m_totalToRead) {
log("build: affinity socket error, no data to read");
affinityRetry(group, socket);
return;
}
if (socket->m_totalRead != socket->m_totalToRead) {
log("build: affinity socket error, read did not complete");
affinityRetry(group, socket);
return;
}
char *s;
s = strstr(socket->m_readBuf, "\r\n\r\n");
if (!s || (s - socket->m_readBuf) > socket->m_readOffset) {
log("build: invalid HTTP response during affinity rebuild");
affinityRetry(group, socket);
return;
}
s += 4;
int32_t len;
len = socket->m_readOffset - (s - socket->m_readBuf);
if (strncmp(s, "<?xml", 5)) {
log("build: Non-XML response during affinity rebuild");
log("build: s = %32s", s);
affinityRetry(group, socket);
return;
}
if (!xml.set(s, len, false, 0, false, 0)) {
log("build: len = %"INT32"", len);
log("build: s = %32s", s);
affinityRetry(group, socket);
return;
}
int64_t hits;
hits = xml.getLongLong("Report.hits", -1LL);
if (hits == -1LL) {
log("build: hits tag not found in XML response");
log("build: s = %256s", s);
affinityRetry(group, socket);
return;
}
uint64_t hh;
uint32_t h1 = hash32n(group->m_syn);
if (i == -1) {
hh = (uint64_t)h1;
} else {
char *s1 = group->m_syn;
char *s2 = info->m_syn[i];
uint32_t h2 = hash32n(s2);
if (strcmp(s1, s2) < 0) {
hh = h1 + ((uint64_t)h2 << 32);
} else {
hh = h2 + ((uint64_t)h1 << 32);
}
}
if (!aff->m_hitsTable.addKey(hh, hits)) {
log("build: Ran out of memory while rebuilding affinity, "
"aborting.");
affinityAbort(group);
return;
}
// send the next request and/or do cleanup
do {
if (aff->m_n >= aff->m_next) {
log(LOG_INFO, "build: %"INT32" out of %"INT32" pairs built",
aff->m_n, aff->m_oldTable->getNumSlotsUsed());
aff->m_next = aff->m_n + 1000;
QUICKPOLL(1); // just in case we're hogging the cpu
// with lots of already built pairs
}
} while((group = buildAffinityGroup(group)));
}
static void affinityRetry(StateAffinityGroup *group, TcpSocket *socket) {
StateAffinity *aff = group->m_aff;
if (!aff->m_thes->m_rebuilding) return;
if (aff->m_errors >= g_conf.m_maxAffinityErrors) {
if (aff->m_thes->m_rebuilding)
log("build: exceeded affinity retry limit, aborting");
affinityAbort(group);
return;
}
aff->m_errors++;
log(LOG_DEBUG, "build: affinity error #%"INT32"", aff->m_errors);
// rebuild the url from the sendBuf
char buf[1024], *p;
p = buf;
strcpy(p, "http://"); p += 7;
// first, the host
char *p2, *p2end;
p2 = strstr(socket->m_sendBuf, "Host: ") + 6;
p2end = strstr(p2, "\r\n");
strncpy(p, p2, p2end - p2); p += p2end - p2;
// then the port
p += sprintf(p, ":%i", socket->m_port);
// then the rest of the url
p2 = socket->m_sendBuf + 4;
p2end = strstr(socket->m_sendBuf, " HTTP");
strncpy(p, p2, p2end - p2); p += p2end - p2;
*p = '\0';
//Url url;
//url.set(buf, gbstrlen(buf), aff->m_ip, 0, 0, 0, 0);
// resend the request
if (!g_httpServer.getDoc(buf,//&url,
0,//ip
0, -1, 0,
group, gotAffinityDoc,
30000, 0, 0,
32768, 32768,
0)) {
group->m_sent++;
aff->m_sent++;
return;
}
log("build: getDoc error: %s (%s)", mstrerror(g_errno), buf);
affinityAbort(group);
}
static void affinityAbort(StateAffinityGroup *group) {
group->m_aff->m_thes->m_rebuilding = false;
// do cleanup
buildAffinityGroup(group);
}
static void gotAffinityIP(void *state, int32_t ip) {
StateAffinity *aff = (StateAffinity *)state;
if (!ip) {
log("build: Couldn't resolve %s for affinity rebuild",
aff->m_server);
aff->m_thes->m_rebuilding = false;
aff->m_thes->m_affinityState = NULL;
mdelete(aff, sizeof(StateAffinity), "thesaurus");
delete(aff);
return;
}
aff->m_ip = ip;
// start the loop
buildAffinity(aff);
}
static void gotGroupAffinityPairs(StateAffinityGroup *group) {
StateAffinity *aff = group->m_aff;
SynonymInfo *info = &group->m_info;
char *s1 = group->m_syn;
//int32_t s1len = gbstrlen(s1);
uint64_t key = hash64Lower_utf8(s1);
int64_t v = aff->m_thes->getValueN(key, 0);
int32_t numSyns = info->m_numSyns;
log(LOG_DEBUG, "build: gotGroupAffinityPairs(%p)", group);
aff->m_newTable->addKey(key, v);
for(int32_t i = 0; i < numSyns; i++) {
if (!aff->m_thes->m_rebuilding) continue;
char *s2 = info->m_syn[i];
//int32_t s2len = gbstrlen(s2);
uint64_t hh;
uint32_t hh1 = hash32n(s1), hh2 = hash32n(s2);
if (info->m_affinity[i] >= 0 && !aff->m_fullRebuild) {
// if we're not doing a full rebuild, use the old value
// if it exists and is valid
// these values never have bit 31 set
v = ((int64_t)info->m_affinity[i] << 32) +
((int64_t)info->m_type[i] << 27) +
info->m_offset[i];
aff->m_newTable->addKey(key, v);
continue;
}
if (strcmp(s1, s2) > 0) {
hh = hh2 + ((uint64_t)hh1 << 32);
} else {
hh = hh1 + ((uint64_t)hh2 << 32);
}
uint64_t k = 0, l = 0, *pk, *pl;
pk = aff->m_hitsTable.getValuePointer((uint64_t)hh1);
pl = aff->m_hitsTable.getValuePointer(hh);
if (pk) k = *pk;
if (pl) l = *pl;
int32_t a = -1;
if (k && l) {
double f = (double)l / (double)k;
// we never want synonym affinity to be 100%
if (f > 0.99) f = 0.99;
a = (int32_t)(f * MAX_AFFINITY);
} else if (pk && pl) {
a = 0;
} else {
log(LOG_WARN, "build: hits=%s,%08"XINT32",%p,%"INT64","
"%s,%016"XINT64",%p,%"INT64"",
s1, (uint32_t) hh1, pk, k,
s2, hh, pl, l);
continue;
}
log(LOG_DEBUG, "build: affinity(%s,%s)=%"INT32"(%"INT64",%"INT64")",
s1, s2, a, k, l);
// if (a < MAX_AFFINITY * 0.01) {
// aff->m_skip++;
// continue;
// }
v = ((int64_t)a << 32) + ((int64_t)info->m_type[i] << 27) +
info->m_offset[i];
aff->m_newTable->addKey(key, v);
aff->m_built++;
}
mdelete(group, sizeof(StateAffinityGroup), "thesaurus");
delete group;
}
// for logic warnings
/*static uint64_t s_cmpKey;
static int slotCmp(const void *p1, const void *p2) {
int64_t v1 = *(const int64_t *)p1,
v2 = *(const int64_t *)p2;
// check for invalid affinity (negative) and push those back
// else sort by the affinity (stored in the high 32 bits) and then
// by offset into the text buffer if there's a tie for some reason
if (v2 < 0) return -1; // v2 is invalid, push it back
else if (v1 < 0) return 1; // v1 is invalid, push it back
else if (v1 > v2) return -1; // v1 has higher affinity, push it up
else if (v1 < v2) return 1; // v2 has higher affinity, push it up
else {
// if this happens the code elsewhere is borked
log(LOG_LOGIC, "build: duplicate entry (%016"XINT64",%016"XINT64")",
s_cmpKey, v1);
return 0;
}
} */
/*
static void sortTable(HashTableT<uint64_t, int64_t> *table) {
int32_t n1 = table->getNumSlots();
// this is a bit ugly but it's the best way to get the synonyms sorted
// as far as I can figure out
for (int32_t i = 0; i < n1; i++) {
uint64_t key = table->getKey(i);
// check for an empty slot or if we're at the first slot for
// this key, if we're not we already sorted this key set
if (!key || (table->getSlot(key) != i)) continue;
// if not, count up all the slots that use this key and store
// them in a temporary array
int32_t n2 = 0, j = i;
int32_t slots[MAX_SYNS];
int64_t vals[MAX_SYNS];
while(table->getKey(j)) {
if (table->getKey(j) == key) {
slots[n2] = j;
vals[n2] = table->getValueFromSlot(j);
n2++;
}
if (++j >= table->getNumSlots()) j = 0;
}
// and then sort them
s_cmpKey = key;
gbmergesort(vals, n2, sizeof(int64_t), slotCmp);
// and then throw them back in the table
for (int32_t j = 0; j < n2; j++) {
table->setValue(slots[j], vals[j]);
}
}
}
*/
static void gotAllAffinityPairs(void *state) {
StateAffinity *aff = (StateAffinity *)state;
log(LOG_DEBUG, "build: gotAllAffinityPairs(%p)", state);
if (aff->m_thes->m_rebuilding) {
log(LOG_INFO, "build: Rebuilt %"INT32" affinity pairs, sent "
"%"INT32" total requests, hit cache %"INT32" times, used %"INT32" "
// "old values, had %"INT32" errors, dropped %"INT32" pairs for "
// "values below the threshold, and took %"INT64" seconds"
"old values, had %"INT32" errors, and took %"INT64" seconds"
"(%s rebuild)",
aff->m_built, aff->m_sent, aff->m_cache,
aff->m_old, aff->m_errors, //aff->m_skip,
(gettimeofdayInMilliseconds() - aff->m_time) / 1000,
aff->m_fullRebuild ? "full" : "partial");
// do the overrides now, before we copy the table over
char ff[PATH_MAX];
SafeBuf addFile;
snprintf(ff, PATH_MAX, "%s%s%s",
g_hostdb.m_dir, s_dictDir, s_affFile);
addFile.fillFromFile(ff);
char *pstart = addFile.getBufStart();
char *p = pstart;
char *pend = addFile.getBuf();
// format is "%s|%s|%f/d", word pair (a, b), and either a float
// or 32 bit hex integer designating the affinity, floating
// point is probably more 'portable' in case MAX_AFFINITY ever
// changes, also supports the word 'max' (case sensitive) to
// designate maximum affinity
// pipe-delimited triplet per line
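// e.g. (hypothetical override lines):
//   car|automobile|0.75
//   car|auto|0x00004000
//   dog|hound|max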
while (p < pend) {
char *a = p;
while (*p != '\n' && p < pend) p++;
if (*p == '\n') *p++ = '\0';
// verify that there are two pipes per line
char *b = NULL, *c = NULL, *e = NULL;
b = strchr(a, '|');
if (b) c = strchr(b + 1, '|');
if (c) e = strchr(c + 1, '|');
if (!b || !c || e) {
log(LOG_DEBUG, "build: Bad format in %s, line "
"does not contain exactly two pipes: "
"\"%s\", skipping line", ff, a);
continue;
} else {
*b++ = '\0';
*c++ = '\0';
}
int32_t val;
char *d = NULL;
if (strcmp(c, "max") == 0) {
val = MAX_AFFINITY;
} else if (strchr(c, '.')) { // floating point
float f = strtod(c, &d);
if (f > 0.99) f = 0.99;
if (f < 0.0 ) f = 0.0;
val = (int32_t)(f * MAX_AFFINITY);
} else {
val = strtol(c, &d, 0);
}
if (d && *d) log(LOG_DEBUG, "build: Extra characters "
"in affinity value: %s", d);
uint64_t h = hash64Lower_utf8(a);
int32_t slot = aff->m_newTable->getSlot(h);
int32_t offset = aff->m_thes->getOffset(b);
if (slot < 0) {
log("build: Couldn't find synonym slot for "
"(%s)", a);
continue;
}
if (offset < 0) {
log("build: Couldn't find synonym slot for "
"(%s)", b);
continue;
}
uint64_t k;
int64_t v;
do {
if (++slot >= aff->m_newTable->getNumSlots())
slot = 0;
k = aff->m_newTable->getKey(slot);
v = aff->m_newTable->getValueFromSlot(slot);
} while (k && (k != h || OFFSET(v) != offset));
if (!k) {
log("build: Couldn't find synonym slot for "
"(%s,%s)", a, b);
} else {
int64_t nv = ((int64_t)val << 32) +
(v & 0xFFFFFFFF);
aff->m_newTable->setValue(slot, nv);
}
}
if (aff->m_fullRebuild || !aff->m_old) {
aff->m_thes->m_lastRebuild =
gettimeofdayInMilliseconds() / 1000;
log(LOG_INFO, "build: Affinity timestamp updated");
}
//sortTable(aff->m_newTable);
aff->m_oldTable->copy(aff->m_newTable);
if (aff->m_thes->save()) {
log(LOG_INFO, "build: propagating thesaurus data to "
"all hosts");
char cmd[512];
for ( int32_t i = 0; i < g_hostdb.getNumHosts() ; i++ ) {
Host *h = g_hostdb.getHost(i);
snprintf(cmd, 512,
"rcp -r "
"%s%sthesaurus* "
"%s:%s%s &",
g_hostdb.m_dir,
s_dictDir,
iptoa(h->m_ip),
h->m_dir,
s_dictDir);
log(LOG_INFO, "admin: %s", cmd);
system( cmd );
}
} else {
log("build: Couldn't save thesaurus data: (%s), "
"will try again later", mstrerror(g_errno));
}
} else {
log(LOG_INFO, "build: Affinity rebuild aborted, table "
"unchanged");
}
	// clear the flag and detach the state before freeing it; aff must
	// not be dereferenced after the delete
	aff->m_thes->m_rebuilding = false;
	mdelete(aff, sizeof(StateAffinity), "thesaurus");
	delete (aff);
g_thesaurus.m_affinityState = NULL;
}
bool Thesaurus::rebuildAffinity(char *server, bool fullRebuild) {
char *syn = m_synonymText, *synend = syn + m_synonymLen;
if (g_conf.m_maxAffinityAge >= 0 &&
((gettimeofdayInMilliseconds() / 1000 - m_lastRebuild)
/ 86400 > g_conf.m_maxAffinityAge)
) {
fullRebuild = true;
}
if (m_rebuilding) {
log("build: Ignoring rebuild request while already rebuilding");
return true;
}
if (m_affinityState) {
log("build: Still cleaning up affinity from abort, not "
"restarting");
return true;
}
if (!syn) {
log("build: Synonyms need to be built before affinity");
return true;
}
// Use default if server is blank
if (!server || !gbstrlen(server)) server = g_conf.m_affinityServer;
log(LOG_INFO, "build: rebuilding Thesaurus word affinity from "
"server %s", server);
m_rebuilding = true;
StateAffinity *aff;
try { aff = new (StateAffinity); }
catch ( ... ) {
log("build: Couldn't allocate %i bytes for thesaurus, "
"aborting rebuild", sizeof(StateAffinity));
m_rebuilding = false;
return true;
}
g_thesaurus.m_affinityState = aff;
mnew(aff, sizeof(StateAffinity), "thesaurus");
memset(aff, 0, sizeof(StateAffinity));
aff->m_time = gettimeofdayInMilliseconds();
aff->m_synstart = syn;
aff->m_syn = syn;
aff->m_synend = synend;
	strncpy(aff->m_server, server, MAX_URL_LEN);
	// strncpy() won't terminate if server fills the buffer
	aff->m_server[MAX_URL_LEN-1] = '\0';
aff->m_newTable = &aff->m_synTable;
aff->m_newTable->set(0, NULL, 0, true);
aff->m_oldTable = &m_synonymTable;
aff->m_next = 1000;
aff->m_thes = this;
aff->m_fullRebuild = fullRebuild;
aff->m_callback = gotAllAffinityPairs;
char *c = strchr(server, ':');
int32_t len;
int32_t ip;
if (c)
len = c - server;
else
len = gbstrlen(server);
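	// if the lookup can be answered without blocking, getIp() returns
	// true with "ip" already filled in, so invoke the callback directly;
	// otherwise gotAffinityIP() gets called later when the reply arrives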
if (g_dns.getIp(server, len, &ip,
aff, gotAffinityIP,
0, 30000))
gotAffinityIP(aff, ip);
return false;
}
bool Thesaurus::save() {
char *p1 = m_synonymText, *p1end = m_synonymText + m_synonymLen;
SafeBuf b;
bool x = true;
//x &= b.safePrintf("|lastRebuild|%"INT32"\n", m_lastRebuild);
//x &= b.safePrintf("|numSynonyms|%"INT32"\n", m_numSynonyms);
//x &= b.safePrintf("|totalPairs|%"INT32"\n", m_totalPairs);
//x &= b.safePrintf("|totalSlots|%"INT32"\n", m_synonymTable.getNumSlots());
while (p1 < p1end && x) {
SynonymInfo syn;
getSynonymInfo(p1, &syn);
for (int32_t i = 0; i < syn.m_numSyns; i++) {
char *p2 = syn.m_syn[i];
int32_t a = syn.m_affinity[i];
float af = a / (float)MAX_AFFINITY;
int32_t f = syn.m_type[i];
if (a >= 0) {
x &= b.safePrintf("%s|%s|%f|%"INT32"\n",
p1, p2, af, f);
} else {
x &= b.safePrintf("%s|%s|%"INT32"|%"INT32"\n",
p1, p2, a, f);
}
}
p1 += gbstrlen(p1) + 1;
}
char ff[PATH_MAX];
snprintf(ff, PATH_MAX, "%s%sthesaurus.txt", g_hostdb.m_dir, s_dictDir);
if (x) x &= (b.dumpToFile(ff) != 0);
return x;
}
static char s_suffixEng[] = {
"ational|ate\n"
"ization|ize\n"
"iveness|ive\n"
"fulness|ful\n"
"ousness|ous\n"
"tional|tion\n"
"ation|ate\n"
"alism|al\n"
"iment|y\n"
"ator|ate\n"
"ying|ie|ye|y\n"
"ment|.\n"
"sses|ss\n"
"ings|e|.\n"
"enci|ence\n"
"anci|ance\n"
"izer|ize\n"
"ing|e|.\n"
"ied|y\n"
"men|man\n"
"ies|y|i\n"
"eed|ee\n"
"bli|ble\n"
"'re| are\n"
" are|'re\n"
"'ve| have\n"
" have|'ve\n"
"'ll| will\n"
" will|'ll\n"
"n't| not\n"
" not|n't\n"
"es|e|.\n"
"ed|e|.\n"
"'m| am\n"
" am|'m\n"
"'s| is|.\n"
" is|'s\n"
"'d| would| had\n"
" would|'d\n"
" had|'d\n"
"s|.\n"
"y|.\n"
};
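// each line above is "suffix|replacement[|replacement...]"; a replacement of
// "." stands for the empty string, i.e. the suffix is simply stripped. so,
// illustratively, "ies|y|i" lets a word ending in "ies" be rewritten to end
// in "y" or "i" ("studies" -> "study"/"studi"), and "s|." just drops a
// trailing "s"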
bool Thesaurus::initStems() {
// mdw: disable for now
return true;
char ff[256];
// prevents certain words from stemming into the wrong thing
// and provides stems for irregular words ("children" -> "child")
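	// each line of that file is expected to look like "exception|stem",
	// e.g. "children|child"; both directions are hashed below so a
	// lookup works from either side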
sprintf(ff, "%s%sstemmer.exceptions", g_hostdb.m_dir, s_dictDir);
if (!m_stemBuffer.fillFromFile(ff)) return false;
char *p = m_stemBuffer.getBufStart();
char *pend = m_stemBuffer.getBuf();
while (p < pend) {
char *a = NULL, *b = NULL, *e = NULL;
a = strchr(p, '|');
b = strchr(p, '\n');
if (a) e = strchr(a+1, '|');
if (b) *b = '\0';
if (!a || (e && e < b)) {
log("query: Stem exception file is corrupt, line does "
"not contain exactly one pipe: %s", p);
m_stemBuffer.reset();
m_stemTable.reset();
break;
}
*a = '\0';
uint32_t h1 = hash32n(p), h2 = hash32n(a+1);
// add it in both ways now
m_stemTable.addKey(h1, a+1);
m_stemTable.addKey(h2, p);
if (b) p = b+1;
else p = pend;
}
int32_t used = m_stemTable.getNumSlotsUsed();
if (used) log(LOG_INIT, "query: Loaded %"INT32" stem exceptions", used);
else log(LOG_INIT, "query: Couldn't load stem exceptions");
m_suffixBuffer.reset();
m_suffixBuffer.safeMemcpy(s_suffixEng, sizeof(s_suffixEng) - 1);
p = m_suffixBuffer.getBufStart();
pend = m_suffixBuffer.getBuf();
while (p < pend) {
// Count number of newlines, change all pipes to null bytes
if (*p == '\n') m_numSuffixes++;
if (*p == '|') { *p = '\0'; m_numReps++; }
p++;
}
if (!m_numSuffixes || !m_numReps) {
log("query: No suffixes or no replacements in %s", ff);
return used != 0;
} else {
m_suffixes = (Suffix *)mmalloc(sizeof(Suffix) * m_numSuffixes,
"stemmer");
m_reps = (char **)mmalloc(sizeof(char *) * m_numReps,
"stemmer");
m_repLens = (int32_t *)mmalloc(sizeof (int32_t) * m_numReps,
"stemmer");
if (!m_suffixes || !m_reps || !m_repLens) {
if (m_suffixes) mfree(m_suffixes, sizeof(Suffix) *
m_numSuffixes, "stemmer");
if (m_reps) mfree(m_reps, sizeof(char *) *
m_numReps, "stemmer");
if (m_repLens) mfree(m_repLens, sizeof(int32_t) *
m_numReps, "stemmer");
m_suffixes = NULL;
m_reps = NULL;
m_repLens = NULL;
log("query: Couldn't allocate memory for stemmer");
return used != 0;
}
}
p = m_suffixBuffer.getBufStart();
Suffix *suf = m_suffixes;
char **rep = m_reps;
int32_t *repLenp = m_repLens;
while (p < pend) {
// first entry in a line is the suffix
suf->m_suffix = p;
int32_t len = gbstrlen(p);
p += len;
char *p2 = p + 1, *p2end;
suf->m_numReps = 0;
suf->m_len = len;
suf->m_reps = rep;
suf->m_repLens = repLenp;
		// count null bytes (from the converted pipes) to get the
		// number of replacements
		while (p < pend && *p != '\n') {
if (*p == '\0') {
suf->m_numReps++; rep++; repLenp++;
}
p++;
}
p2end = p;
if (p2end < pend) *p2end = '\0';
p++;
// input validation
if (!suf->m_numReps) {
log("query: no replacement suffixes for %s",
suf->m_suffix);
continue;
}
char **rep2 = suf->m_reps;
int32_t *repLenp2 = suf->m_repLens;
while (p2 < p2end) {
int32_t len = gbstrlen(p2);
if (*p2 == '.') {
*rep2 = "";
*repLenp2 = 0;
} else if (len) {
*rep2 = p2;
*repLenp2 = len;
			} else {
				log("query: zero-length replacement for %s",
					suf->m_suffix);
				// skip the empty entry instead of spinning on
				// it, and don't count it as a usable slot
				suf->m_numReps--;
				p2++;
				continue;
			}
rep2++;
repLenp2++;
p2 += len + 1;
}
suf++;
}
log(LOG_INIT, "query: Loaded suffixes");
return true;
}
bool Thesaurus::load() {
char ff[PATH_MAX], ff2[PATH_MAX];
snprintf(ff, PATH_MAX, "%s%sthesaurus.txt", g_hostdb.m_dir, s_dictDir);
snprintf(ff2, PATH_MAX, "%s%sthesaurus.dat", g_hostdb.m_dir, s_dictDir);
struct stat stats;
bool x = true;
// do the reset up here so we don't have to do a table copy, and we can
// just use m_synonymTable directly
m_synonymTable.reset();
SafeBuf b, b2;
if (!b.fillFromFile(ff))
return log("build: Couldn't load thesaurus from %s", ff);
	if (stat(ff, &stats)) return log("build: Loaded %s but couldn't "
		"stat it", ff);
// load in the additional buffer, "m_synonymText" as well as hash table,
// because the offsets in the hash table reference that additional buffer.
// it is fairly common for a hash table to do this, so it is built-in to
// HashTableT::load()/save() now
if ( m_synonymTable.load ( ff2 , &m_synonymText , &m_synonymLen ) &&
m_synonymTable.m_numSlots > 0 ) {
// let gb know how many bytes to free...
m_synonymSize = m_synonymLen;
log(LOG_INFO,"admin: Loaded thesaurus from thesaurus.dat.");
initStems();
return true;
}
log(LOG_INIT, "build: Loading thesaurus from %s", ff);
char *pstart = b.getBufStart(), *p = pstart, *pend = b.getBuf();
SafeBuf synonymTextB;
int32_t warn = 0;
int32_t unknown = 0;
// allow dups in this table
m_synonymTable.setAllowDupKeys(true);
	// verify that each line has two or three pipes (the type field
	// is optional)
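	// each line is "word|synonym|affinity[|type]", where the affinity is
	// parsed as an integer and the optional type defaults to SYN_UNKNOWN;
	// an illustrative line: "dog|canine|1000|1"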
while (p < pend) {
char *w1 = p, *w2 = NULL, *w3 = NULL, *w4 = NULL, *e = NULL;
		while (p < pend && *p != '\n') p++;
		if (p < pend) *p++ = '\0';
w2 = strchr(w1, '|');
if (w2) w3 = strchr(w2 + 1, '|');
if (w3) w4 = strchr(w3 + 1, '|');
if (w4) e = strchr(w4 + 1, '|');
if (!w2 || !w3 || e) {
log("build: Bad format in %s, line does not "
"contain the right number of pipes: %s",
ff, w1);
continue;
} else {
*w2++ = '\0';
*w3++ = '\0';
if (w4) *w4++ = '\0';
}
		//int32_t w2len;
		int32_t a, typ;
		//w1len = gbstrlen(w1);
		//w2len = gbstrlen(w2);
		a = strtol(w3, &e, 0);
		if (w4) typ = strtol(w4, &e, 0);
		if (!w4 || typ >= SYN_UNKNOWN) {
			typ = SYN_UNKNOWN;
unknown++;
}
if (e && *e)
log("build: Extra characters in affinity data: %s", e);
uint64_t h1 = hash64Lower_utf8(w1), h2 = hash64Lower_utf8(w2);
if (h1 == h2) {
log(LOG_WARN, "build: Thesaurus pair has same hash "
"(%s,%s)", w1, w2);
continue;
}
int64_t v;
// warp h2 since we don't want it matching h1 ever
// because we are only adding it to the table to "save" the word ptr,
// we are not adding it as a "synonym entry" per se. so this table
// is really storing two different types of things.
h2 ^= 0x987fce44;
int32_t slot2 = m_synonymTable.getSlot(h2);
int32_t offset2;
if (slot2 < 0) {
// point into our word buffer
offset2 = synonymTextB.length();
// copy our word into our word buffer
x &= synonymTextB.safeMemcpy(w2, gbstrlen(w2) + 1);
// . set the offset of the "word" with hash "h2"
			// . use a fake affinity of 0x7fffffff and a syn type of 8?
v = 0x7FFFFFFF80000000LL + offset2;
// only add this to the table because we want to "save" the
// offset of its text for re-use
x &= m_synonymTable.addKey(h2, v);
}
// otherwise, we already stored it into the word buffer, recycle!
else {
v = m_synonymTable.getValueFromSlot(slot2);
offset2 = OFFSET(v);
// sanity check, affinity better be 0x7fffffff, otherwise
// there might have been a collision?
int32_t a2 = AFFINITY(v);
if ( a2 != 0x7fffffff ) { char *xx = NULL; *xx = 0; }
}
// add the actual synonym info for the hash of word1, "h1"
if (a < 0) warn++;
// "a" is the synonym affinity
v = ((int64_t)a << 32) + offset2;
		// typ is the synonym type, see Thesaurus.h for these, #define'd
		v += (typ << 27);
x &= m_synonymTable.addKey(h1, v);
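		// layout of v, per the shifts above: the affinity sits in the
		// high 32 bits, the type is shifted up 27 bits, and the low
		// bits hold the offset into the synonym text buffer (what
		// OFFSET() extracts). e.g. with made-up numbers a = 1000,
		// type = 2 and offset2 = 12345, v = (1000LL << 32) +
		// (2 << 27) + 12345, so AFFINITY(v) == 1000 and
		// OFFSET(v) == 12345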
}
if (!x) return log("build: Thesaurus loading failure, memory low?");
if (warn)
log(LOG_INIT, "build: %"INT32" invalid/missing affinity "
"values, recommend rebuild", warn);
if (unknown)
log(LOG_INIT, "build: %"INT32" synonyms with missing/"
"invalid type", unknown);
// this no longer resets m_synonymTable, why did we
// want to do that anyway??? MDW
reset();
// preserve the word buffer
m_synonymText = synonymTextB.getBufStart();
m_synonymLen = synonymTextB.length();
m_synonymSize = synonymTextB.getCapacity();
synonymTextB.detachBuf(); // we own this now
relabel(m_synonymText, m_synonymSize, "thesaurus");
log(LOG_INIT,"build: Loaded %"INT32" synonym pairs.",
m_synonymTable.m_numSlotsUsed);
// save it as "thesaurus.dat", and include the text buffer,
// m_synonymText, that it references
if ( ! g_conf.m_readOnlyMode )
m_synonymTable.save(ff2,m_synonymText,m_synonymLen);
initStems();
return true;
}