open-source-search-engine/Speller.cpp

#include "gb-include.h"

#include "Mem.h"
#include "Conf.h"
#include "Dns.h"
#include "HttpServer.h"
#include "Loop.h"
#include <sys/resource.h>  // setrlimit

#include "Speller.h"
#include <stdio.h>
#include <ctype.h>

/*
static void handleRequestSpeller ( UdpSlot *slot , int32_t netnice );

static void gotSpellerReplyWrapper (void *state, void *state2);

bool Speller::registerHandler ( ) {
	// . register ourselves with the udp server
	// . it calls our callback when it receives a msg of type 0x3d
	if ( ! g_udpServer.registerHandler ( 0x3d, handleRequestSpeller ))
		return false;
	return true;
}

// . handle a request to spell check the null-terminated string in the slot
// . returns false if slot should be nuked and no reply sent
// . sometimes sets g_errno on error
void handleRequestSpeller ( UdpSlot *slot , int32_t netnice ) {
	// The request is the string to be spellchecked, null ended
	char *request = slot->m_readBuf;

	// first tells us if we should narrow the search stuff
	bool narrowP = *(bool *) request;
	request += sizeof(bool);

	// is it found in dict or pop words
	bool found;
	int32_t score;
	char reco[MAX_PHRASE_LEN];
	int32_t pop;
	int64_t start = gettimeofdayInMilliseconds();
	bool recommendation = g_speller.m_language[langEnglish].
		getRecommendation( request, gbstrlen(request),
				   reco, MAX_PHRASE_LEN,
				   &found, &score,
				   &pop );

	log ( LOG_DEBUG,"speller: %s --> %s", request, reco );

	int32_t numNarrow = 0;
	char narrow[MAX_NARROW_SEARCHES * MAX_PHRASE_LEN];
	int32_t narrowPops[MAX_NARROW_SEARCHES];
	//if ( narrowP )
	//	numNarrow = g_speller.m_language[langEnglish].
	//		narrowPhrase ( request, narrow, narrowPops,
	//			       MAX_NARROW_SEARCHES );

	// calculate total reply size
	// int32_t replySize = found + recommendation + score + pop + reco
	int32_t replySize = sizeof(bool) + sizeof(bool) + 4 + 4 +
		gbstrlen(reco) + 1;

	if ( narrowP ){
		replySize += 4; // numPhrases
		for ( int32_t i = 0; i < numNarrow; i++ )
			replySize += 4 + gbstrlen(&narrow[i*MAX_FRAG_SIZE]) + 1;
	}

	char *reply = (char*) mmalloc(replySize, "SpellerReplyBuf");
	if ( !reply ) {
		g_errno = ENOMEM;
		//g_udpServer.sendReply_ass( NULL, 0, NULL, 0, slot );
		g_udpServer.sendErrorReply( slot , g_errno );
		return;
	}
	char *p = reply;

	*(bool *)p = found;
	p += sizeof(bool);

	*(bool *)p = recommendation;
	p += sizeof(bool);

	// store the score and pop
	*(int32_t *) p = score; p += 4;
	*(int32_t *) p = pop; p += 4;

	// store the recommendation
	strcpy( p, reco );
	p += gbstrlen(reco) + 1;
	if ( narrowP ){
		// store the number of narrow phrases found
		*(int32_t *) p = numNarrow;
		p += 4;
		for ( int32_t i = 0; i < numNarrow; i++ ){
			*(int32_t *)p = narrowPops[i];
			p += 4;
			strcpy(p, &narrow[i * MAX_FRAG_SIZE]);
			p += gbstrlen(&narrow[i * MAX_FRAG_SIZE]) + 1;
		}
	}

	//sanity check
	if ( p - reply != replySize ){
		char *xx = NULL; *xx = 0;
	}

	int64_t end = gettimeofdayInMilliseconds();
	if ( end - start > 1 )
		log (LOG_INFO,"speller: took %"INT64" ms to spellcheck "
		     "fragment %s", end-  start, request);
	g_udpServer.sendReply_ass ( reply   ,
				    replySize,
				    reply   ,
				    replySize,
				    slot    );
}
*/

// the global Speller instance, defined at file scope
Speller g_speller;

// . default constructor, intentionally does no work
// . the dictionary is loaded later through init()
Speller::Speller() {
	// legacy raw-buffer members, no longer initialised here:
	//   m_unifiedBuf = NULL;
	//   mm_unifiedBufSize = 0;
}

// . destructor
// . hands off to reset() to let go of the unified buffer and dictionary
Speller::~Speller() {
	this->reset();
}
// NOTE(review): global string pointer, initially NULL. nothing in the
// visible part of this file reads or writes it -- confirm it is still used
char *g_str=NULL;
// . one-time startup hook for the speller
// . loads the unified dictionary through loadUnifiedDict()
// . returns true once the dictionary has been loaded, and on every call
//   after that without doing any more work
// . if the load fails it returns the result of log() (presumably false --
//   confirm) and leaves the "already initialised" flag clear, so a later
//   call tries the load again instead of reporting success
// . the per-language initialisation at the bottom is disabled
bool Speller::init(){

	// only set once loadUnifiedDict() has succeeded
	static bool s_init = false;
	if ( s_init ) return true;

	/*
	m_hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits;
	m_hostsPerSplit /= g_hostdb.m_numHostsPerShard;
	if ( m_hostsPerSplit <= 0 )
		return log("db: the <indexSplit> in gb.conf is probably not "
			   "too big. Are you using the wrong hosts.conf?");
	// check if we've got enough multicasts available
	if ( m_hostsPerSplit > MAX_UNIQUE_HOSTS_PER_SPLIT ){
		log( LOG_WARN,"speller: not enough multicasts available for "
		     "this host configuration. Increase multicasts" );
		return false;
	}
	*/

	if ( !loadUnifiedDict() )
		return log("spell: Could not load unified dict from "
			   "unifiedDict-buf.txt and unifiedDict-map.dat");

	// the dictionary is in memory now, later calls can short-circuit
	s_init = true;

	// this seems to slow our startup way down!!!
	log("speller: turning off spell checking for now");
	return true;

	/*
	int32_t myHash = g_hostdb.m_hostId %
		( m_hostsPerSplit * g_hostdb.m_indexSplits );
	myHash /= g_hostdb.m_indexSplits;

	//for ( int32_t i = 0; i < MAX_LANGUAGES; i++ )
	m_language[langEnglish].init ( m_unifiedBuf.getBufStart(),
				       m_unifiedBuf.length(),
				       langEnglish,
				       m_hostsPerSplit,
				       myHash );

	return true;
	*/
}

void Speller::reset(){
	//if ( m_unifiedBuf && m_unifiedBufSize > 0 )
	//	mfree ( m_unifiedBuf, m_unifiedBufSize, "SpellerBuf" );
	m_unifiedBuf.purge();

	m_unifiedDict.reset();
	/*
	for(int32_t i = 0; i < MAX_LANGUAGES; i++)
		m_language[i].reset();
	*/

	//m_unifiedBuf = NULL;
	//m_unifiedBufSize = 0;
}

// test it.
// . test it.
// . reads the file named by "ff" line by line and parses each line into
//   a Query
// . the recommendation lookup that used to follow is commented out, so
//   nothing is reported per line
// . logs a message and returns if the file cannot be opened
void Speller::test ( char *ff ) {
	//char *ff = "/tmp/sctest";
	FILE *fd = fopen ( ff, "r" );
	if ( ! fd ) {
		log("speller: test: Could not open %s for "
		    "reading: %s.", ff,strerror(errno));
		return;
	}

	char buf[1026];
	//char dst[1026];
	// never let fgets() write past "buf", whatever MAX_FRAG_SIZE is
	int32_t maxRead = MAX_FRAG_SIZE;
	if ( maxRead > (int32_t)sizeof(buf) ) maxRead = (int32_t)sizeof(buf);
	// go through the words in dict/words
	while ( fgets ( buf , maxRead , fd ) ) {
		// length of word(s), including the terminating \n if any
		int32_t wlen = gbstrlen(buf) ;
		// skip if empty
		if ( wlen <= 0 ) continue;
		// . fgets() only keeps the \n when the whole line fit
		// . the last line of a file may have none, and neither does
		//   an over-long line, so only chop the byte if it really
		//   is a newline
		if ( buf[wlen-1] == '\n' ) buf[wlen-1]='\0';
		Query q;
		q.set2 ( buf , langUnknown , false );

		//if ( getRecommendation ( &q, dst , 1024 ) )
		//  log(LOG_INIT,"speller: %s-->%s",buf,dst);
		//  else
		// log(LOG_INIT,"speller: %s",buf);
	}
	fclose(fd);
}

/*
///////////////////////////////////////////////////////
// RECOMMENDATION ROUTINES BELOW HERE
//
// These will spellcheck and give recommendations
///////////////////////////////////////////////////////

bool Speller::canStart( QueryWord *qw ) {
	// can only start with a alpha character, no numeric
	if ( ! is_alnum_utf8 ( qw->m_word+0 ) ) return false;

	if ( qw->m_ignoreWord &&
	     qw->m_ignoreWord != IGNORE_CONNECTED &&
	     qw->m_ignoreWord != IGNORE_QUOTED ) return false;

	// don't check 'rom' in phrase "cd-rom", or 't' in "ain't"
	if ( qw->m_leftConnected )
		return false;

	// don't start with a stop word
	if ( qw->m_isStopWord )
		return false;

	// a lot of field terms should not be spell checked
	if ( qw->m_fieldCode ) {
		if ( qw->m_fieldCode != FIELD_TITLE   &&
		     qw->m_fieldCode != FIELD_CITY    &&
		     qw->m_fieldCode != FIELD_AUTHOR  &&
		     qw->m_fieldCode != FIELD_COUNTRY   )
			return false;
	}
	return true;
}


// . returns false if blocked, true otherwise
// . "dst" is left empty unless something different than the original
//   query, "q", was recommended
// . also returns false and sets g_errno on error
// . stores recommended query in "dst" and NULL terminates it
// . if dst is too small it will bitch and return true with g_errno set
bool Speller::getRecommendation ( Query *q,
				  bool   spellcheck,
				  char  *dst, // recommendation destination
				  int32_t   dstLen, // recommendation max len
				  bool   narrowSearch,
				  char  *narrow, // narrow search
				  int32_t   narrowLen,  // narrow search len
				  int32_t  *numNarrows, // num narrows found
				  void  *state,
				  void (*callback)(void *state) ){
	*dst = '\0';
	*narrow = '\0';
	// no narrowing search if spellchecking is off
	if ( !spellcheck )
		return true;

	// don't spellcheck queries that are MAX_FRAG_SIZE bytes or longer.
	if ( q->getQueryLen() >= MAX_FRAG_SIZE )
		return true;

	StateSpeller *st ;
	try { st = new (StateSpeller); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("Speller: new(%i): %s", sizeof(StateSpeller),
		    mstrerror(g_errno));
		return true;
	}
	mnew ( st , sizeof(StateSpeller) , "State00" );

	st->m_state = state;
	st->m_callback = callback;
	st->m_q = q;
	st->m_spellcheck = spellcheck;
	st->m_dst = dst;
	st->m_dend = dst + dstLen;
	st->m_narrowSearch = narrowSearch;
	st->m_nrw = narrow;
	st->m_nend = narrow + narrowLen;
	st->m_numNarrow = numNarrows;
	*st->m_numNarrow = 0;
	st->m_start = gettimeofdayInMilliseconds();
	st->m_numFrags = 0;
	st->m_numFragsReceived = 0;

	// . break query down into fragments
	// . each fragment is a string of words
	// . quotes and field names will separate fragments
	// . TODO: make field data in its own fragment
	int32_t nqw = q->m_numWords;

	for ( int32_t i = 0 ; i < nqw ; i++ ) {
		// get a word in the Query to start a fragment with
		QueryWord *qw = &q->m_qwords[i];
		// can he start the phrase?
		if ( ! canStart( qw ) )
			continue;

		bool inQuotes  = qw->m_inQuotes;
		char fieldCode = qw->m_fieldCode;
		// . get longest continual fragment that starts with word #i
		// . get the following words that can be in a fragment
		//   that starts with word #i
		// . start of the frag
		int32_t  endQword = i;
		int32_t  startQword = i;
		for ( ; i < nqw ; i++ ) {
			// . skip if we should
			// . keep punct, however
			QueryWord *qw1 = &q->m_qwords[i];
			if ( qw1->m_opcode                 ) break;
			if ( qw1->m_inQuotes  != inQuotes  ) break;
			if ( qw1->m_fieldCode != fieldCode ) break;
			if ( qw1->m_ignoreWord == IGNORE_FIELDNAME ) break;
			if ( qw1->m_phraseSign &&
			     !qw1->m_rightConnected ) break;
			// are we punct?
			if ( ! is_alnum_utf8(qw1->m_word) )
				endQword = i - 1;
			else
				endQword = i;
		}
		// revisit this i in big loop since we did not include it
		i = endQword;

		//create a new stateFrag
		StateFrag *stFrag;
		try { stFrag = new (StateFrag); }
		catch ( ... ) {
			mdelete ( st, sizeof(StateSpeller),  "StateSpeller" );
			delete (st);
			g_errno = ENOMEM;
			log("Speller: new(%i): %s", sizeof(StateFrag),
			    mstrerror(g_errno));
			//continue;
			return true;
		}
		mnew ( stFrag, sizeof(StateFrag),
		       "StateFrag" );

		stFrag->m_state = (void*) st;
		stFrag->m_narrowPhrase = st->m_narrowSearch;
		stFrag->m_q = q;
		stFrag->m_startQword = startQword;
		stFrag->m_endQword = endQword;
		stFrag->m_errno = 0;
		st->m_stFrag[st->m_numFrags] = stFrag;
		st->m_numFrags++;
		// blocked
		if ( !getRecommendation( stFrag ) ){
			continue;
		}
		st->m_numFragsReceived++;
	}
	// if outstanding frags
	if ( st->m_numFragsReceived < st->m_numFrags )
		return false;
	gotFrags(st);
	// delete state
	mdelete ( st, sizeof(StateSpeller),  "StateSpeller" );
	delete (st);
	return true;
}

bool Speller::getRecommendation ( StateFrag *st ){
	st->m_recommended = false;
	st->m_numFound = 0;
	st->m_numNarrowPhrases = 0;
	char *dst = st->m_dst;

	// normalize this fragment and store in "dst"
	bool wasAlnum = true;
	for ( int32_t i = st->m_startQword; i <= st->m_endQword; i++ ){
		// start of each word
		st->m_wp[i] = dst;
		char *p = st->m_q->m_qwords[i].m_word;
		int32_t  plen = st->m_q->m_qwords[i].m_wordLen;
		for ( int32_t j = 0; dst-st->m_dst <MAX_FRAG_SIZE&&j<plen;j++ ) {
			if ( !getClean_utf8(p+j) )
				continue;
			// skip back to back punct/spaces
			if (j>0 && !is_alnum_utf8(p+j) &&!wasAlnum)
				continue;
			*dst = p[j];
			dst++;
			wasAlnum = is_alnum_utf8 ( p+j );
		}
		st->m_wplen[i] = dst - st->m_wp[i];
		st->m_isfound[i] = false;
	}
	*dst = '\0';

	// debug msg
	log(LOG_DEBUG,"speller: Getting recommendation for frag=%s",
	    st->m_dst);

	// give each word in the phrase a chance to start the subphrase
	int32_t maxPhrase = st->m_endQword - st->m_startQword;
	if ( maxPhrase > MAX_WORDS_PER_PHRASE )
		maxPhrase = MAX_WORDS_PER_PHRASE;

	// store the phraseLen and posn
	st->m_pLen = maxPhrase;
	st->m_pPosn = st->m_startQword;

	return launchReco(st);
}

bool Speller::launchReco(StateFrag *st){
	// if we checked all the phrases or found all the words
	if ( st->m_numFound == st->m_endQword - st->m_startQword + 1 ||
	     st->m_pLen < 0 ){
		return true;
	}

	bool launchPhrase = false;
 	for ( ; st->m_pLen >= 0; st->m_pLen-- ){
		for ( ; st->m_pPosn + st->m_pLen <= st->m_endQword;
		      st->m_pPosn++ ) {
			// find a word that can start the phrase
			QueryWord *qw = &st->m_q->m_qwords[st->m_pPosn];
			if ( !canStart (qw) )
				continue;
			// don't do this phrase if we have found even one
			// word in the phrase
			bool found = false;
			for ( int32_t k = st->m_pPosn;
			      k <= st->m_pPosn + st->m_pLen; k++ ) {
				if ( st->m_isfound[k] ){
					found = true;
					break;
				}
			}
			if ( found )
				continue;

			// cannot end on a stop word, punct, right-connected
			// word
			QueryWord *qwEnd =
				&st->m_q->m_qwords[st->m_pPosn + st->m_pLen];
			if ( qwEnd->m_isStopWord || qwEnd->m_isPunct ||
			     qwEnd->m_rightConnected )
				continue;

			// found someone to start the phrase with
			// what is the new phrase parms?
			st->m_a = st->m_wp[st->m_pPosn];
			st->m_b = st->m_wp[st->m_pLen + st->m_pPosn]+
				st->m_wplen[st->m_pLen + st->m_pPosn];

			// also store the tmp char that we are changing
			st->m_c = *(st->m_b);
			*(st->m_b) = '\0';

			// if it is just a number, don't get a recommendation
			// lest we embarrass ourselves
			if ( st->m_pPosn == 0 && is_digit(st->m_a[0]) ) {
				char *k = st->m_a+1;
				while ( is_digit(*k) ) k++;
				if ( ! *k ) {
					*st->m_b = st->m_c ;
					continue;
				}
			}

			// if it is an adult phrase, don't get a recommendation
			// check if isAdult really finds a word.
			char *adultLoc = NULL;
			if ( isAdult(st->m_a, gbstrlen(st->m_a), &adultLoc) &&
			     ( adultLoc == st->m_a || *(adultLoc-1) == ' ' ) ){
				// mark as found
				for ( int32_t k = st->m_pPosn;
				      k <= st->m_pPosn + st->m_pLen; k++ )
					st->m_isfound[k] = true;
				*(st->m_b) = st->m_c;
				continue;
			}
			// if the phrase is in dict or in the top pop words,
			// phrase is found. Don't check if we are narrowing
			// the phrase because we need to multicast anyways
			uint64_t h ;
			h = hash64d(st->m_a, gbstrlen(st->m_a) );
			if ( !st->m_narrowPhrase &&
			     getPhrasePopularity( st->m_a, h, false ) > 0 ){
				// mark as found
				for ( int32_t k = st->m_pPosn;
				      k <= st->m_pPosn + st->m_pLen; k++ )
					st->m_isfound[k] = true;
				*(st->m_b) = st->m_c;
				continue;
			}
			launchPhrase = true;
			break;
		}
		if ( launchPhrase )
			break;
		st->m_pPosn = st->m_startQword;
	}

	if ( st->m_pLen < 0 ){
		return true;
	}

	// debug msg
	log(LOG_DEBUG,"speller: ----------");
	log(LOG_DEBUG,"speller: Checking phrase=%s", st->m_a);


	// launch for all the splits
	st->m_numRequests = 0;
	st->m_numReplies = 0;


	int32_t hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits;
	// don't send to twins...
	hostsPerSplit /= g_hostdb.m_numHostsPerShard;
	int32_t mySplit = g_hostdb.m_hostId % g_hostdb.m_indexSplits;

	int32_t key = st->m_q->getQueryHash();//0;
	int32_t timeout = 30;
	int32_t niceness = 0;
	char request[MAX_FRAG_SIZE + 4];
	char *p = request;
	*(bool *)p = st->m_narrowPhrase;
	p += sizeof(bool);
	strcpy ( p, st->m_a );
	// send the null end too
	p += gbstrlen(st->m_a)+1;
	int32_t plen = p - request;
	for ( int32_t i = 0; i < hostsPerSplit; i++ ){
		// get the hostId of the host we're sending to
		uint32_t hostId =
			mySplit + ( i * g_hostdb.m_indexSplits );
		Host *h = g_hostdb.getHost(hostId);
		st->m_mcast[i].reset();

		bool status = st->m_mcast[i].
			send(request   ,
			     plen      , // request size
			     0x3d      , // msgType 0x3d
			     false     , // multicast owns m_request?
			     h->m_groupId, // group to send to (groupKey)
			     false     , // send to whole group?
			     key       ,
			     st        , // state data
			     NULL      , // state data
			     gotSpellerReplyWrapper ,
			     timeout      , // in seconds
			     niceness  ,
			     false     , // realtime?
			     -1        , // m_q->m_bestHandlingHostId ,
			     NULL      , // m_replyBuf   ,
			     0         , // MSG39REPLYSIZE,
			     // this is true if multicast should free
			     // the
			     // reply, otherwise caller is responsible
			     // for freeing it after calling
			     // getBestReply).
			     // actually, this should always be false,
			     // there
			     // is a bug in Multicast.cpp.
			     false        );

		if (!status){
			st->m_numReplies++;
			log("speller: Multicast had error: %s",
			    mstrerror(g_errno));
			st->m_errno = g_errno;
			continue;
		}
		// blocked
		else
			st->m_numRequests++;
	}

	if ( st->m_numReplies == st->m_numRequests )
		return true;
	return false;
}

void gotSpellerReplyWrapper( void *state, void *state2 ){
	StateFrag *stFrag = (StateFrag *) state;
	stFrag->m_numReplies++;
	if ( stFrag->m_numReplies < stFrag->m_numRequests )
		return;
	// blocked
	if ( !g_speller.gotSpellerReply(stFrag) )
		return;

	StateSpeller *st = (StateSpeller *)stFrag->m_state;
	// One more frag received
	st->m_numFragsReceived++;
	if ( st->m_numFragsReceived < st->m_numFrags )
		return;

	g_speller.gotFrags(st);
	// callback
	st->m_callback( st->m_state );
	// delete state
	mdelete ( st, sizeof(StateSpeller),  "StateSpeller" );
	delete (st);
}

bool Speller::gotSpellerReply( StateFrag *st ){
	int32_t minScore = LARGE_SCORE;
	int32_t maxPop = -1;
	char *bestReco = NULL;

	char *reply[MAX_UNIQUE_HOSTS_PER_SPLIT];
	int32_t  replySize[MAX_UNIQUE_HOSTS_PER_SPLIT];
	int32_t  replyMaxSize[MAX_UNIQUE_HOSTS_PER_SPLIT];
	bool  freeit;
	bool  found = false; //phrase was found in dict or pop words
	int32_t hostsPerSplit = g_hostdb.m_numHosts / g_hostdb.m_indexSplits;
	// don't send to twins...
	hostsPerSplit /= g_hostdb.m_numHostsPerShard;

	int32_t  numNarrowPhrases[MAX_UNIQUE_HOSTS_PER_SPLIT];
	char *narrowPtrs[MAX_UNIQUE_HOSTS_PER_SPLIT];

	// init narrowSearch arrays
	for ( int32_t i = 0; i < MAX_UNIQUE_HOSTS_PER_SPLIT; i++ ){
		numNarrowPhrases[i] = 0;
		narrowPtrs[i] = NULL;
	}

	for ( int32_t i = 0; i < hostsPerSplit; i++ ){
		reply[i] = st->m_mcast[i].getBestReply( &replySize[i] ,
							&replyMaxSize[i] ,
							&freeit );
		// multicast may have an empty reply buffer if there was an
		// OOM error or something. m_errno should have been set, but
		// we have to loop through all the multicasts to free the
		// reply buffers.
		char *p = reply[i];

		if ( g_errno || st->m_errno || !p){
			continue;
		}
		// was it found in dict
		bool foundInDict = *(bool *)p;
		p += sizeof(bool);
		if ( foundInDict )
			found = true;

		// first is if there is a recommendation or not
		bool recommendation = *(bool *) p;
		p += sizeof (bool);

		if ( !recommendation && !st->m_narrowPhrase )
			continue;

		int32_t score = *(int32_t *)p;
		p += 4;
		int32_t pop = *(int32_t *)p;
		p += 4;

		if ( recommendation ){
			log ( LOG_DEBUG,"speller: Received reco %s, "
			      "score=%"INT32", pop=%"INT32"", p, score, pop );

			// we have a recommendation with score and pop
			// choose the one with the lowest score, and if the
			// score is same then the max pop
			// HACK: we are getting bad recommendations for smaller
			// popularities. So don't consider them
			if ( pop > 8 && ( score < minScore ||
				   ( score == minScore && pop > maxPop ) ) ){
				bestReco = p;
				minScore = score;
				maxPop = pop;
			}
		}

		p += gbstrlen(p) + 1;
		if ( st->m_narrowPhrase ){
			numNarrowPhrases[i] = *(int32_t *)p;
			p += 4;
			narrowPtrs[i] = p;
		}
	}

	// merge all the narrow results
	if ( st->m_narrowPhrase ){
		int32_t currPhrase[MAX_UNIQUE_HOSTS_PER_SPLIT];
		for ( int32_t i = 0; i < MAX_UNIQUE_HOSTS_PER_SPLIT; i++ )
			currPhrase[i] = 0;
		for ( int32_t i = 0; i < MAX_NARROW_SEARCHES; i++ ){
			int32_t maxHost = -1;
			int32_t maxPop = 0;
			for ( int32_t j = 0; j < hostsPerSplit; j++ ){
				if ( numNarrowPhrases[j] <= currPhrase[j] )
					continue;
				int32_t pop = *(int32_t *)narrowPtrs[j];
				if ( pop <= maxPop )
					continue;
				maxPop = pop;
				maxHost = j;
			}
			if ( maxHost < 0 )
				break;
			//
			narrowPtrs[maxHost] += 4;
			strcpy( st->m_narrowPhrases[i], narrowPtrs[maxHost] );
			narrowPtrs[maxHost] +=gbstrlen(narrowPtrs[maxHost]) + 1;
			currPhrase[maxHost]++;
			st->m_numNarrowPhrases++;
		}
	}

	// make narrowPhrase false here, so that its not launched a second time
	// for the same frag;
	st->m_narrowPhrase = false;

	// revert
	*(st->m_b) = st->m_c;

	// if we found a recommendation,or if the phrase was found in the
	// dictionary or pop words then mark all the
	// words that fall under the phrase as found
	if ( found || bestReco ){
		for ( int32_t k = st->m_pPosn;
		      k <= st->m_pLen + st->m_pPosn; k++ )
			st->m_isfound[k] = true;
		st->m_numFound += st->m_pLen + 1;
	}

	// if not found in the dictionary or a recommendation, copy the phrase
	if ( !found && bestReco){
		// this fragment is going to be recommended
		st->m_recommended = true;
		// insert our recommendation into the phrase to get a new one
		char *s1    = st->m_wp[st->m_startQword];
		int32_t  slen1 = st->m_a - st->m_wp[st->m_startQword];
		char *s2    = bestReco;
		int32_t  slen2 = gbstrlen(bestReco);
		char *s3    = st->m_b ;
		// store the difference in length between the reco and the
		// original string
		int32_t  diff = slen2 - ( st->m_b - st->m_a );
		int32_t  slen3 = st->m_wp[st->m_endQword] +
			st->m_wplen[st->m_endQword] - st->m_b;

		if ( slen3 < 0 )
			slen3 = 0;

		int32_t  tlen = slen1 + slen2 + slen3 ;
		if ( tlen > MAX_FRAG_SIZE ){
			log(LOG_LOGIC,"speller: buf too small. Fix me 3.");
			// blocked
			if ( !launchReco(st) )
				return false;
			return true;
		}
		// make substitution and store in "dst"
		char buf2 [ MAX_FRAG_SIZE];
		char *nf = buf2;
		memcpy ( nf , s1 , slen1 ) ; nf += slen1;
		memcpy ( nf , s2 , slen2 ) ; nf += slen2;
		memcpy ( nf , s3 , slen3 ) ;
		nf += slen3;

		// don't forget to NULL terminate
		*nf = '\0';
		// debug msg
		log( LOG_DEBUG,"speller: Trying substitution \"%s\"",
		     buf2 );

		strcpy ( st->m_dst , buf2 );

		// the pointers might have to be changed if the
		// recommendation was not of the same length as the words
		if ( diff != 0 ){
			for ( int32_t k = st->m_pLen+st->m_pPosn+1;
			      k <= st->m_endQword; k++ )
				st->m_wp[k] += diff;
		}
	}

	// don't forget to free the replies
	for ( int32_t i = 0; i < hostsPerSplit; i++ )
		if ( reply[i] && replyMaxSize[i] > 0 )
			mfree( reply[i], replyMaxSize[i], "SpellerReplyBuf" );

	// go to the next position in the phrase. if we have reached the end
	// of the phrase position, decrement the phrase length and start again
	if ( st->m_pPosn + st->m_pLen >= st->m_endQword - 1 ){
		st->m_pLen--;
		st->m_pPosn = st->m_startQword;
	}
	else
		st->m_pPosn++;

	if ( !launchReco(st) )
		return false;
	return true;
}
*/
// . break a NULL-terminated string down into a list of ptrs to the words
// . return the number of words stored into "wp"
/*
int32_t Speller::getWords ( const char *s ,
			 char *wp     [MAX_FRAG_SIZE] ,
			 int32_t  wplen  [MAX_FRAG_SIZE] ,
			 bool *isstop                   ) {
	int32_t nwp = 0;
 loop:
	// skip initial punct
	while ( *s && ! is_alnum ( *s ) ) s++;
	// bail if done
	if ( ! *s ) return nwp;
	// point to word
	wp [ nwp ] = (char *)s;
	// convenience ptr
	char *ww = (char *)s;
	// count over it
	while ( is_alnum ( *s ) ) s++;
	// how long is the word?
	int32_t slen = s - wp [ nwp ];
	// set length
	wplen [ nwp ] = slen ;
	// is it a stop word?
	if ( isstop ) {
		// TODO: make the stop words utf8!!!
		int64_t h = hash64Lower_utf8 ( ww , slen ) ;
		bool stop = ::isStopWord       ( ww , slen , h ) ;
		// BUT ok if Capitalized or number
		if ( stop ) {
			if ( is_digit (ww[0])    ) stop = false;
			if ( is_cap   (ww,slen ) ) stop = false;
			// e-mail, c file, c. s. lewis
			if ( slen  == 1 && ww[0] != 'a' ) stop = false;
		}
		isstop[nwp] = stop;
	}
	nwp++;
	goto loop;
}
*/
/*
void Speller::gotFrags( void *state ){
	StateSpeller *st = (StateSpeller *) state;

	char *dptr = st->m_dst;
	char *nptr = st->m_nrw;
	bool recommendation = false;
	Query *q = st->m_q;

	// . break query down into fragments
	// . each fragment is a string of words
	// . quotes and field names will separate fragments
	// . TODO: make field data in its own fragment
	int32_t nqw = q->m_numWords;
	int32_t currFrag = 0;
	for ( int32_t i = 0 ; i < nqw ; i++ ) {
		// get a word in the Query to start a fragment with
		QueryWord *qw = &q->m_qwords[i];
		// if he has a phraseSign, put it right away
		//if ( qw->m_phraseSign ) {
		// *dptr = qw->m_phraseSign;
		// dptr++;
		// }
		// can he start the phrase?
		// if he can't start our fragment, just copy over to "dst"
		if ( !canStart( qw )) {
			// copy to rp and get next word
			char *w    = qw->m_word;
			int32_t  wlen = qw->m_wordLen;
			if ( dptr + wlen >= st->m_dend ) {
				g_errno = EBUFTOOSMALL; continue; }
			// watch out for LeFtP and RiGhP
			if      ( qw->m_opcode == OP_LEFTPAREN ) *dptr++ = '(';
			else if ( qw->m_opcode == OP_RIGHTPAREN) *dptr++ = ')';
			else if ( qw->m_opcode == OP_PIPE      ) *dptr++ = '|';
			else {
				memcpy ( dptr , w , wlen );
				dptr += wlen;
			}
			*dptr = '\0';
			continue;
		}
		bool inQuotes  = qw->m_inQuotes;
		char fieldCode = qw->m_fieldCode;
		// . get longest continual fragment that starts with word #i
		// . get the following words that can be in a fragment
		//   that starts with word #i
		// . start of the frag
		int32_t  endQword = i;
		for ( ; i < nqw ; i++ ) {
			// . skip if we should
			// . keep punct, however
			QueryWord *qw1 = &q->m_qwords[i];
			if ( qw1->m_opcode                 ) break;
			if ( qw1->m_inQuotes  != inQuotes  ) break;
			if ( qw1->m_fieldCode != fieldCode ) break;
			if ( qw1->m_ignoreWord== IGNORE_FIELDNAME ) break;
			if ( qw1->m_phraseSign && !qw1->m_rightConnected )
				break;
			// are we punct?
			if ( ! is_alnum_utf8 (qw1->m_word) )
				endQword = i - 1;
			else
				endQword = i;
		}
		// revisit this i in big loop since we did not include it
		i = endQword;

		// OOM errors might cause us not to launch frags
		if ( currFrag >= st->m_numFrags )
			continue;
		StateFrag *stFrag = st->m_stFrag[currFrag];
		// don't breech
		if ( dptr + gbstrlen(stFrag->m_dst) >= st->m_dend ) {
			g_errno = EBUFTOOSMALL;
		}
		else {
			// store it
			strcpy ( dptr, stFrag->m_dst );
			dptr += gbstrlen ( dptr );
			// add a space between fragments
			//			*dptr = ' ';
			//dptr++;
			*dptr = '\0';
			// set the flag
			if ( stFrag->m_recommended )
				recommendation = true;
		}
		// copy over all the narrow searches that can fit
		for ( int32_t j = 0; j < stFrag->m_numNarrowPhrases; j++ ){
			// don't breech
			if ( nptr +gbstrlen(stFrag->m_narrowPhrases[j]) >
			     st->m_nend )
				break;
			strcpy(nptr, stFrag->m_narrowPhrases[j]);
			nptr += gbstrlen(stFrag->m_narrowPhrases[j]) + 1;
			(*st->m_numNarrow)++;
		}

		mdelete(stFrag, sizeof(StateFrag), "StateFrag");
		delete (stFrag);
		// now we get the next frag
		currFrag++;
	}
	if ( !recommendation )
		*st->m_dst = '\0';

	int64_t now = gettimeofdayInMilliseconds();
	if ( now - st->m_start > 50 )
		log(LOG_INFO,"speller: Took %"INT64" ms to spell check %s",
		    now - st->m_start, st->m_q->getQuery() );
	return;
}
*/


bool Speller::generateDicts ( int32_t numWordsToDump , char *coll ){
	m_language[2].setLang(2);
	//m_language[2].generateDicts ( numWordsToDump, coll );
	return false;
}

// . returns a pointer to a pseudo-randomly chosen entry in m_unifiedBuf
// . picks a random byte offset with rand(), backs up to the closest \0 at
//   or before it and returns the text right after that \0
// . the returned pointer aims inside m_unifiedBuf
// . returns an empty string if the buffer is empty
char *Speller::getRandomWord() {
	char   *start = m_unifiedBuf.getBufStart();
	int32_t len   = m_unifiedBuf.length();//Size;
	// nothing loaded: "rand() % 0" is undefined, so hand back an
	// empty word instead of dividing by zero
	if ( ! start || len <= 0 ) {
		static char s_empty[1] = { '\0' };
		return s_empty;
	}
	char *end = start + len;
	// find nearest \0
	char *p = start + ( rand() % len );
	// backup until we hit \0
	for ( ; p > start && *p ; p-- );
	// now advance!
	if ( p > start ) p++;
	// we landed on the \0 that ends the buffer, so nothing follows it.
	// wrap around to the first entry rather than point past the end.
	if ( p >= end ) p = start;
	// that is the word
	return p;
}

// The unified dict is the combination of the word list, title rec and the top
// query dict of all languages. It has to be created by loading each languages
// dict into memory using Language.loadWordList(), loadTitleRecDict(), etc
bool Speller::loadUnifiedDict() {

	bool building = false;

 reload:

	bool needRebuild = false;

	m_unifiedBuf.purge();
	m_unifiedBuf.setLabel("unibuf");

	// this MUST be there
	if ( m_unifiedBuf.fillFromFile(g_hostdb.m_dir,
				       "unifiedDict-buf.txt" ) == 0 )
		needRebuild = true;

	// . give it a million slots
	// . unified dict currently has 1340223 entries
	m_unifiedDict.set ( 8,4, 2*1024*1024,NULL,0,false,0,"udictht");

	// try to load in the hashtable and the buffer directly
	if ( ! m_unifiedDict.load(g_hostdb.m_dir,"unifiedDict-map.dat"))
		needRebuild = true;

	if ( ! needRebuild ) {
		// convert unifiedBuf \n's to \0's
		char *start = m_unifiedBuf.getBufStart();
		char *end   = start + m_unifiedBuf.length();
		for ( char *p = start ; p < end ; p++ )
			if ( *p == '\n' ) *p = '\0';
		log(LOG_DEBUG,"speller: done loading successfully");

		// a quick little checksum
		if ( ! g_conf.m_isLive ) return true;

		// the size
		int64_t h1 = m_unifiedDict.getNumSlotsUsed();
		int64_t h2 = m_unifiedBuf .length();
		int64_t h = hash64 ( h1 , h2 );
		char *tail1 = (char *)m_unifiedDict.m_keys;
		char *tail2 = m_unifiedBuf.getBufStart()+h2-1000;
		h = hash64 ( tail1 , 1000 , h );
		h = hash64 ( tail2 , 1000 , h );
		//int64_t n = 8346765853685546681LL;
		int64_t n = -14450509118443930LL;
		if ( h != n ) {
			log("gb: unifiedDict-buf.txt or "
			    "unifiedDict-map.dat "
			    "checksum is not approved for "
			    "live service (%"INT64" != %"INT64")" ,h,n);
			//return false;
		}

		return true;
	}

	if ( building ) {
		log("gb: rebuild failed. exiting.");
		exit(0);
	}

	building = true;

	log("gb: REBUILDING unifiedDict-buf.txt and unifiedDict-map.dat");

	// just in case that was there and the buf wasn't
	m_unifiedDict.clear();
	// or vice versa
	m_unifiedBuf.purge();

	// load the .txt file. this is REQUIRED for rebuild
	SafeBuf ub;
	if ( ub.fillFromFile (g_hostdb.m_dir,"unifiedDict.txt") <= 0 )
		return false;

	//
	// change \n to \0
	// TODO: filter out the first word from each line?
	//
	char *start = ub.getBufStart();
	char *end   = start + ub.length();
	for ( char *p = start ; p < end ; p++ )
		if ( *p == '\n' ) *p = '\0';


	// now scan wikitionary file wiktionary-lang.txt to get even
	// more words! this file is generated from Wiktionary.cpp when
	// it scans the wiktionary xml dump to generate the other
	// wiktionary-syns.dat and wiktionary-buf.txt files. it also
	// cranks this file out because we can use it since we do not
	// have czech in the unifiedDict.txt file.
	SafeBuf wkfBuf;
	if ( wkfBuf.fillFromFile ( g_hostdb.m_dir,"wiktionary-lang.txt") <= 0 )
		return false;

	// scan each line
	char *p = wkfBuf.getBufStart();
	char *pend = p + wkfBuf.length();
	HashTableX wkfMap;
	// true = allow dups. because same word can appear in multiple langs
	if ( ! wkfMap.set ( 8,1,1000000,NULL,0,true,0,"wkfmap") )
		return false;

	// "fr|livre" is how it's formatted
	for ( ; p && p < pend ; p = wkfBuf.getNextLine(p) ) {
		char *start = p;
		// skip til |
		for ( ; *p && *p != '|' ; p++ );
		// sanity check
		if ( *p != '|' ) { char *xx=NULL;*xx=0; }
		// tmp NULL that
		*p = '\0';
		char langId = getLangIdFromAbbr(start);
		// revert
		*p = '|';
		if ( langId == langUnknown )
			continue;
		if ( langId == langTranslingual )
			continue;
		// skip |
		p++;
		// that's the word
		char *word = p;
		// find end
		char *end = p;
		for ( ; *end && *end != '\n' ; end++ ) ;
		// so hash it up
		int64_t wid = hash64d ( word , end - word );
		// debug point
		//if ( wid == 5000864073612302341LL )
		//	log("download");
		// add it to map
		if ( ! wkfMap.addKey ( &wid , &langId ) ) return false;
	}


	//
	// scan unifiedDict.txt file
	//
	int32_t totalCollisions = 0;
	uint64_t atline = 0;
	p = start;
	while ( p < end ) {
		atline++;
		char *phrase = p;
		// if line is a comment skip it
		if ( *p == '#' ){
			p += gbstrlen(p) + 1;
			continue;
		}
		// skip phrase
		while ( *p != '\t' )
			p++;
		// Null end the phrase
		*p = '\0';

		// skip empty phrases
		if(gbstrlen(phrase) < 1) {
			log(LOG_WARN,
				"spell: Got zero length entry in unifiedDict "
			    "at line %"UINT64", skipping\n",
				atline);
			p += gbstrlen(p) + 1;
			continue;
		}

		// skip single byte words that are not alphabetic
		// Anything over 'Z' is likely unicode, so don't bother
		if(gbstrlen(phrase) == 1 && (phrase[0] < 'a')) {
			log(LOG_WARN,
				"spell: Got questionable entry in "
			    "unifiedDict at line %"UINT64", skipping: %s\n",
				atline,p);
			p += gbstrlen(p) + 1;
			continue;
		}
		// . i need to move everything over to utf8!!!
		// . this is the same hash function used by Words.cpp so that
		p++;
		// phonet
		char *phonet = p;
		// next is the phonet
		while ( *p != '\t' )
			p++;
		// Null end the phonet
		*p = '\0';
		p++;

		uint64_t key = hash64d(phrase,gbstrlen(phrase));

		// make sure we haven't added this word/phrase yet
		if ( m_unifiedDict.isInTable ( &key ) ) {
			totalCollisions++;
			p += gbstrlen(p) + 1;
			continue;
		}

		// reset lang vector
		int64_t pops[MAX_LANGUAGES];
		memset ( pops , 0 , MAX_LANGUAGES * 8 );

		// see how many langs this key is in in unifiedDict.txt file
		char *phraseRec = p;
		getPhraseLanguages2 ( phraseRec , pops );

		// make all pops positive if it has > 1 lang already
		//int32_t count = 0;
		//for ( int32_t i = 0 ; i < MAX_LANGUAGES ; i++ )
		//	if ( pops[i] ) count++;

		int32_t imax = MAX_LANGUAGES;
		//if ( count <= 1 ) imax = 0;
		// assume none are in official dict
		// seems like nanny messed things up, so undo that
		// and set it negative if in wiktionary in loop below
		for ( int32_t i = 0 ; i < imax ; i++ )
			// HOWEVER, if it is -1 leave it be, i think it
			// was probably correct in that case for some reason.
			// Wiktionary fails to get a TON of forms for
			// many foreign languages in the english dict.
			// so nanny got these from some dict, so try to
			// keep them.
			// like 'abelhudo'
			// http://pt.wiktionary.org/wiki/abelhudo
			// and is not in en.wiktionary.org
			// . NO! because it has "ein" as english with
			//   a -1 popularity as well as "ist"! reconsider
			if ( pops[i] < -1 ) pops[i] *= -1;
			//if ( pops[i] < 0 ) pops[i] *= -1;

		// debug
		//if ( strcmp(phrase,"download") == 0 )
		//	log("hey");

		// now add in from wiktionary
		int32_t slot = wkfMap.getSlot ( &key );
		for ( ; slot >= 0 ; slot = wkfMap.getNextSlot(slot,&key) ) {
			uint8_t langId = *(char *)wkfMap.getDataFromSlot(slot);
			if ( langId == langUnknown ) continue;
			if ( langId == langTranslingual ) continue;
			// if it marked as already in that dictionary, cont
			if ( pops[langId] < 0 ) continue;
			// if it is positive, make it negative to mark
			// it as being in the official dictionary
			// -1 means pop unknown but in dictionary
			if ( pops[langId] == 0 ) pops[langId]  = -1;
			else                     pops[langId] *= -1;
		}

		// save the offset
		int32_t offset = m_unifiedBuf.length();

		// print the word/phrase and its phonet, if any
		m_unifiedBuf.safePrintf("%s\t%s\t",phrase,phonet);

		int32_t count = 0;
		// print the languages and their popularity scores
		for ( int32_t i = 0 ; i < MAX_LANGUAGES ; i++ ) {
			if ( pops[i] == 0 ) continue;
			// skip "unknown" what does that really mean?
			if ( i == 0 ) continue;
			m_unifiedBuf.safePrintf("%"INT32"\t%"INT32"\t",
						i,(int32_t)pops[i]);
			count++;
		}
		// if none, revert
		if ( count == 0 ) {
			m_unifiedBuf.setLength(offset);
			// skip "p" to next line in unifiedBuf.txt
			p += gbstrlen(p) + 1;
			continue;
		}

		// trim final tab i guess
		m_unifiedBuf.incrementLength(-1);
		// end line
		m_unifiedBuf.pushChar('\n');

		// directly point to the (lang, score) tuples
		m_unifiedDict.addKey(&key, &offset);

		// skip "p" to next line in unifiedBuf.txt
		p += gbstrlen(p) + 1;
	}

	log (LOG_WARN,"spell: got %"INT32" TOTAL collisions in unified dict",
	     totalCollisions);


	HashTableX dedup;
	dedup.set(8,0,1000000,NULL,0,false,0,"dmdm");

	// . now add entries from wkfBuf that were not also in "ub"
	// . format is "<langAbbr>|<word>\n"
	p = wkfBuf.getBufStart();
	end = p + wkfBuf.length();
	for ( ; p ; p = wkfBuf.getNextLine(p) ) {
		//char *langAbbr = p;
		for ( ; *p && *p !='\n' && *p !='|' ; p++ );
		if ( *p != '|' ) {
			log("speller: bad format in wiktionary-lang.txt");
			char *xx=NULL;*xx=0;
		}
		//*p = '\0';
		//uint8_t langId = getLangIdFromAbbr ( langAbbr );
		//*p = '|';
		// get word
		char *word = p + 1;
		// get end of it
		for ( ; *p && *p !='\n' ; p++ );
		if ( *p != '\n' ) {
			log("speller: bad format in wiktionary-lang.txt");
			char *xx=NULL;*xx=0;
		}
		int32_t wordLen = p - word;
		// wiktinary has like prefixes ending in minus. skip!
		if ( word[wordLen-1] == '-' ) continue;
		// suffix in wiktionary? skip
		if ( word[0] == '-' ) continue;
		// .zr .dd
		if ( word[0] == '.' ) continue;

		// hash the word
		int64_t key = hash64d ( word , wordLen );

		// skip if we did it in the above loop
		if ( m_unifiedDict.isInTable ( &key ) ) continue;

		// skip if already did it in this loop
		if ( dedup.isInTable ( &key ) ) continue;
		if ( ! dedup.addKey ( &key ) ) return false;

		// reset lang vector
		int64_t pops[MAX_LANGUAGES];
		memset ( pops , 0 , MAX_LANGUAGES * 8 );

		// now add in from wiktionary map
		int32_t slot = wkfMap.getSlot ( &key );
		for ( ; slot >= 0 ; slot = wkfMap.getNextSlot(slot,&key) ) {
			uint8_t langId = *(char *)wkfMap.getDataFromSlot(slot);
			if ( langId == langUnknown ) continue;
			if ( langId == langTranslingual ) continue;
			if ( pops[langId] ) continue;
			// -1 means pop unknown but in dictionary
			pops[langId] = -1;
		}


		// save the offset
		int32_t offset = m_unifiedBuf.length();

		// . print the word/phrase and its phonet, if any
		// . phonet is unknown here...
		//char *phonet = "";
		m_unifiedBuf.safeMemcpy ( word, wordLen );
		m_unifiedBuf.safePrintf("\t\t");//word,phonet);

		int32_t count = 0;
		// print the languages and their popularity scores
		for ( int32_t i = 0 ; i < MAX_LANGUAGES ; i++ ) {
			if ( pops[i] == 0 ) continue;
			// skip "unknown" what does that really mean?
			if ( i == 0 ) continue;
			m_unifiedBuf.safePrintf("%"INT32"\t%"INT32"\t",
						i,(int32_t)pops[i]);
			count++;
		}
		// if none, revert
		if ( count == 0 ) {
			m_unifiedBuf.setLength(offset);
			continue;
		}

		// trim final tab i guess
		m_unifiedBuf.incrementLength(-1);
		// end line
		m_unifiedBuf.pushChar('\n');

		// directly point to the (lang, score) tuples
		m_unifiedDict.addKey(&key, &offset);

	}


	// save the text too! a merge of unifiedDict.txt and
	// wiktionary-lang.txt!!!
	if ( m_unifiedBuf.saveToFile(g_hostdb.m_dir,"unifiedDict-buf.txt") <=0)
		return false;

	// save it
	if ( m_unifiedDict.save(g_hostdb.m_dir,"unifiedDict-map.dat")<=0 )
		return false;

	// start over and load what we created
	goto reload;

	// hmmm... seems like we need to re-run for some reason
	log("spell: PLEASE RERUN gb");
	log("spell: PLEASE RERUN gb");
	log("spell: PLEASE RERUN gb");
	exit(0);

	return true;
}

// . returns the popularity of the word/phrase whose hash is "h", read from
//   its "<word>\t<phonet>\t<langId>\t<pop>\t<langId>\t<pop>..." record
// . "str" is only used for a few shortcut checks and may be NULL
// . "checkTitleRecDict" is not used
// . if langId is langUnknown, returns the highest popularity over all the
//   languages in the record
// . otherwise returns the popularity for langId, or 0 as soon as a larger
//   langId is seen (the tuples are sorted by langId)
// . returns 0 if the word is not in the dictionary or its record is
//   malformed
// . NOTE(review): if a specific langId is requested and every langId in the
//   record is smaller, this falls through and returns the max over the other
//   languages rather than 0. kept as is; confirm whether that is intended.
int32_t Speller::getPhrasePopularity ( char *str, uint64_t h,
				    bool checkTitleRecDict,
				    unsigned char langId ){

	// hack fixes.
	// common word like "and"?
	if ( isCommonWord(h) ) return MAX_PHRASE_POP;
	// another common word check
	if ( isQueryStopWord(NULL,0,h) ) return MAX_PHRASE_POP;
	// single letter?
	if ( str && str[0] && str[1] == '\0' ) return MAX_PHRASE_POP;
	// 0-99 only
	if ( str && is_digit(*str) ) {
		if ( !str[1]) return MAX_PHRASE_POP;
		if ( is_digit(str[1])&& !str[2]) return MAX_PHRASE_POP;
	}

	int32_t slot = m_unifiedDict.getSlot(&h);
	// if not in dictionary assume 0 popularity
	if ( slot < 0 ) return 0;
	int32_t offset =  *(int32_t *)m_unifiedDict.getValueFromSlot(slot);
	char *p = m_unifiedBuf.getBufStart() + offset;
	char *pend = p + gbstrlen(p);

	// skip the word itself and then the phonet, if any. both are
	// tab-terminated. never scan past the end of this record.
	for ( int32_t field = 0 ; field < 2 ; field++ ) {
		while ( p < pend && *p != '\t' ) p++;
		// record ended before the (lang,pop) tuples
		if ( p >= pend ) return 0;
		p++;
	}

	int32_t max = 0;

	// the tuples are in ascending order of the langid
	// get to the right language
	while ( p < pend ){

		int32_t currLang = atoi(p);

		// the pops are sorted by langId, return 0 if the lang
		// was not found
		if ( langId != langUnknown && currLang > langId )
			return 0;

		// skip language
		while ( p < pend && *p != '\t' ) p++;
		// a langid with no score after it
		if ( p >= pend ) break;
		p++;

		int32_t score = atoi(p);

		// the sign of the stored score is a flag, only the magnitude
		// is the popularity.
		// NOTE(review): the old comment here said negative means
		// "only from titlerec", but loadUnifiedDict() writes negative
		// scores for words found in wiktionary. confirm.
		if ( score < 0 )
			score *= -1;

		if ( currLang == langId && langId != langUnknown )
			return score;

		// if lang is unknown get max
		if ( score > max ) max = score;

		// skip that score and go to the next <lang> <pop> tuple
		while ( p < pend && *p != '\t' ) p++;
		p++;

	}
	return max;
}

// splits words and checks if they form a porn word or not. montanalinux.org
// is showing up as porn because it has 'anal' in the hostname. So try to
// find a combination of words such that they are NOT porn.
// try this only after isAdult() succeeds.
// Always tries to find longer words first. so 'montanalinux' is split as
// 'montana' and 'linux' and not as 'mont', 'analinux'
// if it finds a sequence of words leading up to a porn word, then it returns
// true. eg. shall split montanalinux into 'mont', 'anal', and return true
// without checking if 'inux' is a word. Need to do this because isAdult()
// cannot define where an adult word has ended.
// TODO: chatswingers.com NOT identified as porn because it is split as
// 'chats' and 'wingers'.
//
// . s/slen is the string to split
// . *isPorn is set to true if the split ended on an adult word
// . splitWords receives the words found, each followed by one space, and is
//   NUL-terminated. the caller must supply room for that; the size is not
//   passed in so it can not be checked here.
// . returns true if a split was found (or slen is 0), false otherwise
// . also returns false if the split would need more word boundaries than
//   the local index[] array can hold

bool Speller::canSplitWords( char *s, int32_t slen, bool *isPorn,
			     char *splitWords,
			     unsigned char langId, int32_t encodeType ){

	*isPorn = false;
	// always hand back a valid (empty) string, even for empty input
	*splitWords = '\0';
	if ( slen == 0 )
		return true;

	// . index[k] is where word #k starts
	// . index[curr] is the limit the next word must end at or before
	char *index[1024];
	const int32_t maxIndex = (int32_t)(sizeof(index)/sizeof(index[0]));

	// this is the current word we're on
	int32_t curr = 0;
	index[curr++] = s;
	index[curr] = s + slen;
	while ( curr > 0 ){
		char *nextWord = NULL;
		while ( findNext( index[curr-1], index[curr],
				  &nextWord, isPorn, langId, encodeType ) ){
			// next word in chain
			index[curr++] = nextWord;
			// found a porn word OR
			// finished making a sequence of words
			if ( *isPorn || nextWord == s + slen ){
				char *p = splitWords;
				for ( int32_t k = 1; k < curr; k++ ){
					memcpy (p, index[k - 1],
						index[k] - index[k - 1]);
					p += index[k] - index[k - 1];
					*p = ' ';
					p++;
				}
				*p = '\0';
				return true;
			}
			// no room left to record another boundary. give up
			// instead of writing past the end of index[].
			if ( curr >= maxIndex )
				return false;
			index[curr] = s + slen;
		}

		// did not find any word. back up to the previous word and make
		// it end one byte earlier, if it can still shrink.
		while ( --curr > 0 ){
			if ( index[curr] > index[curr-1] ){
				index[curr]--;
				break;
			}
		}
	}
	return false;
}

// . finds the longest dictionary word that starts at "s" and ends at or
//   before "send", and stores its end in *nextWord
// . if isAdult() reports an adult word located exactly at "s" (and the string
//   does not begin with "adult"), sets *isPorn, sets *nextWord to "send" and
//   returns true right away
// . a candidate counts as a word if getPhrasePopularity() gives it a
//   popularity above 0 and it is not one of the hard-coded rejects below
// . returns false if no candidate qualifies
// . encodeType is not used
bool Speller::findNext( char *s, char *send, char **nextWord, bool *isPorn,
			unsigned char langId, int32_t encodeType ){

	// . candidates that must never be accepted as a word, because accepting
	//   them lets an adult word be split into harmless looking pieces
	// . a candidate is rejected when its length is exactly m_len and its
	//   first m_prefixLen bytes equal m_prefix
	// . the original if-chain tested "ul" twice (the second time under the
	//   "lashaxxxnothing" comment). one entry behaves the same.
	static const struct {
		int32_t     m_len;
		int32_t     m_prefixLen;
		const char *m_prefix;
	} s_rejects[] = {
		// do not allow "ult" to be a word because it always will
		// split "adult" into "ad+ult"
		{ 3 , 3 , "ult"      },
		// adultsiteratings = "ad ul ts it era tings"
		// lashaxxxnothing = "lash ax xx nothing"
		{ 2 , 2 , "ul"       },
		// livesexasian = "lives ex asian"
		{ 2 , 2 , "ex"       },
		// fuckedtits = "fu ck edt its"
		{ 2 , 2 , "ck"       },
		// blogsexe = "blogs exe" ... many others
		// any 3 letter word starting with "ex"
		{ 3 , 2 , "ex"       },
		// shemales = "*s hem ales"
		{ 4 , 4 , "ales"     },
		// grooverotica = "groove rot ica"
		{ 3 , 3 , "ica"      },
		// dinerotik = dinero tik
		{ 3 , 3 , "tik"      },
		// nudeslutpics = "nud esl ut pics"
		{ 3 , 3 , "nud"      },
		// seepornos = "seep or nos"
		{ 3 , 3 , "nos"      },
		// bookslut = "books lut"
		{ 3 , 3 , "lut"      },
		// lesexegratuit = "lese xe gratuit"
		{ 2 , 2 , "xe"       },
		// mooiemensensexdating = "mens ense xd a ting"
		{ 2 , 2 , "xd"       },
		// mpornlinks = mpo rn links
		{ 2 , 2 , "rn"       },
		// ukpornbases = ukp or nba bes
		{ 2 , 2 , "or"       },
		// slut
		{ 2 , 2 , "lu"       },
		// independentstockholmescorts = "tock holme sco rts"
		{ 3 , 3 , "sco"      },
		// relatosexcitantes = relat ose xci tan tes
		{ 3 , 3 , "xci"      },
		// babe = * bes
		{ 3 , 3 , "bes"      },
		// xpornreviews "xp orn reviews "
		{ 3 , 3 , "orn"      },
		// shemal fix
		{ 3 , 3 , "hem"      },
		// adultswim = adults wim
		{ 3 , 3 , "wim"      },
		// bdsm
		{ 3 , 3 , "dsm"      },
		// anal
		{ 3 , 3 , "nal"      },
		// vibrator = bra
		{ 3 , 3 , "bra"      },
		// sitiospornox = sitio spor nox
		{ 4 , 4 , "spor"     },
		// orn*
		{ 4 , 3 , "orn"      },
		// hotescorts = hote scor
		{ 4 , 4 , "scor"     },
		// uniformsluts = uniformts lutz
		{ 4 , 4 , "lutz"     },
		// free porn login = freep ornl
		{ 5 , 5 , "freep"    },
		// shemal fix
		{ 5 , 5 , "hemal"    },
		// inbondage = inbond age
		{ 6 , 6 , "inbond"   },
		// swingers = wingers
		{ 7 , 7 , "wingers"  },
		// free sex contents = freese xc ont ents
		{ 2 , 2 , "xc"       },
		// mosexstore = mose xs tore
		{ 2 , 2 , "xs"       },
		// phonesexfootsies
		{ 8 , 8 , "phonesex" },
		// cybersex
		{ 8 , 8 , "cybersex" }
	};
	static const int32_t s_numRejects =
		(int32_t)(sizeof(s_rejects)/sizeof(s_rejects[0]));

	// check if there is an adult word in there
	// NOTE: The word 'adult' gives a lot of false positives, so even
	// though it is in the isAdult() list, skip it.
	// s/slen constitutes an individual word.
	char *adultLoc = NULL;
	int32_t slen = send - s;
	// if this string starts with the adult word, don't check further
	if ( isAdult ( s, slen, &adultLoc ) &&
	     strncmp ( s, "adult", 5 ) != 0 &&
	     adultLoc == s ) {
		*isPorn = true;
		*nextWord = send;
		return true;
	}

	// try the longest candidate first and shrink it a byte at a time
	for ( char *a = send; a > s; a-- ){
		int32_t candLen = a - s;
		// a hack, if the word is only one letter long, check if it
		// is 'a' or 'i'. If not then continue
		if ( candLen == 1 && *s != 'a' && *s != 'i')
			continue;
		// another hack, the end word of the string cannot be 2 letters
		// or less. freesex was being split as 'frees ex'
		if ( a == send && candLen <= 2 )
			continue;

		// is it one of the hard-coded rejects?
		bool rejected = false;
		for ( int32_t r = 0 ; r < s_numRejects ; r++ ) {
			if ( s_rejects[r].m_len != candLen ) continue;
			if ( memcmp ( s ,
				      s_rejects[r].m_prefix ,
				      s_rejects[r].m_prefixLen ) != 0 )
				continue;
			rejected = true;
			break;
		}
		if ( rejected )
			continue;

		// check if the word has popularity. if it is in the
		// unifiedDict, then it is considered to be a word
		uint64_t h = hash64d(s, a-s);
		int32_t pop = getPhrasePopularity(s, h, false, langId);

		// this is our next word
		if ( pop > 0 ) {
			*nextWord = a;
			return true;
		}
		// otherwise did not find it, try a shorter candidate
	}
	return false;
}

//similar to one above but using recursion
/*bool Speller::canSplitWords( char *s, int32_t slen, bool *isPorn,
  char *splitWords,
  unsigned char langId, int32_t encodeType ){

  if ( slen == 0 )
  return true;
  char *loc = NULL;
  // check if there is an adult word in there
  if ( isAdult ( s, slen, &loc ) ){
  // if this string starts with the adult word
  if ( loc == s ){
  memcpy ( splitWords, s, slen );
  splitWords[slen] = ' ';
  splitWords[slen + 1] = '\0';
  *isPorn = true;
  return true;
  }
  }

  char *b = s + slen;
  // split the phrase into two or more phrases.
  for ( char *a = b; a > s; a-- ){
  //	while ( a > s ){
  // a hack, if the word is only one letter long, check if it
  // is 'a' or 'i'. If not then continue
  if ( a - s == 1 && *s != 'a' && *s != 'i')
  continue;

  // check if the word has popularity. if it is in the
  // unifiedDict, then it is considered to be a word
  uint64_t h = hash64d(s, a - s, encodeType);
  int32_t pop = getPhrasePopularity(s, h, false, langId);

  // continue if did not find it
  if ( pop <= 0 )
  continue;
  memcpy ( splitWords, s, a - s );
  splitWords[a - s] = ' ';
  splitWords[a - s + 1] = '\0';
  // see if we can split the rest
  if ( canSplitWords ( a, b - a, isPorn,
  splitWords + (a - s + 1),
  langId, encodeType ) )
  return true;
  }
  // did not find any sequence of words that can make this string
  return false;
  }*/

bool Speller::createUnifiedDict (){
	// first get all the tuples from wordlist and query file
	//HashTableT <uint64_t, char*> ht[MAX_LANGUAGES];
	HashTableX ht[MAX_LANGUAGES];
	char ff[1024];
	for ( int32_t i = 0; i < MAX_LANGUAGES; i++ ){
		ht[i].set ( 8,4,0,NULL,0,false,0,"cud");
		sprintf ( ff , "%sdict/%s/%s.wl.phonet", g_hostdb.m_dir,
			  getLanguageAbbr(i), getLanguageAbbr(i) );
		populateHashTable(ff, &ht[i], i);

		sprintf ( ff , "%sdict/%s/%s.query.phonet.top", g_hostdb.m_dir,
			  getLanguageAbbr(i), getLanguageAbbr(i) );
		populateHashTable(ff, &ht[i], i);

		for ( int32_t j = 0; j < NUM_CHARS; j++ ){
			sprintf ( ff , "%sdict/%s/%s.dict.%"INT32"", g_hostdb.m_dir,
				  getLanguageAbbr(i), getLanguageAbbr(i), j );
			populateHashTable(ff, &ht[i], i);
		}
	}

	//sprintf ( ff, "%sdict/unifiedDict",g_hostdb.m_dir );
	sprintf ( ff, "%sunifiedDict.txt",g_hostdb.m_dir );
	// delete it first
	unlink ( ff );
	// then open a new one for appending
	int fdw = open ( ff ,
			 O_CREAT | O_RDWR | O_APPEND ,
			 S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH);
	if ( fdw < 0 ){
		return log("lang: Could not open for %s "
			   "writing: %s.",ff, strerror(errno));
	}

	log(LOG_INIT,"spell: Making %s.", ff );

	//HashTableT <uint64_t, int32_t> phrases;
	HashTableX phrases;
	phrases.set(8,4,0,NULL,0,false,0,"phud");
	char buf[1024];
	for ( int32_t  i = 0; i < MAX_LANGUAGES; i++ ){
		// get each slot
		for ( int32_t j = 0; j < ht[i].getNumSlots(); j++ ){
			uint64_t key = *(uint64_t *)ht[i].getKey(j);
			if ( key == 0 )
				continue;
			// if key is already found
			int32_t slot = phrases.getSlot(&key);
			if ( slot != -1 )
				continue;

			char *tuple = *(char **)ht[i].getValueFromSlot(j);

			// here we print the phrase and the phonet if present
			// skip the score
			while ( *tuple != '\t' )
				tuple++;
			tuple++;

			sprintf( buf, "%s", tuple );

			char *p = buf;
			p += gbstrlen(buf);

			// if there wasn't a phonet, its from the titleRec.
			// add another tab
			bool fromTitleRec = false;
			if ( strstr (tuple,"\t") == NULL ){
				*p = '\t';
				p++;
				fromTitleRec = true;
			}

			for ( int32_t k = 0; k < MAX_LANGUAGES; k++ ){
				slot = ht[k].getSlot(&key);
				if ( slot == -1 )
					continue;
				char *val = *(char **)ht[k].getValueFromSlot(slot);
				int32_t pop = atoi(val);
				if ( fromTitleRec ) pop *= -1;
				sprintf(p,"\t%"INT32"\t%"INT32"",k,pop);
				p += gbstrlen(p);
			}
			// write out the trailing \n as well
			*p = '\n';
			p++;
			*p = '\0';
			p++;
			int32_t bufLen = gbstrlen(buf);
			int32_t wn = write ( fdw , buf , bufLen ) ;
			if ( wn != bufLen )
				return log("lang:  write: %s",strerror(errno));
			int32_t val = 1;
			phrases.addKey(&key, &val);
		}
	}
	return true;
}


bool Speller::populateHashTable( char *ff, HashTableX *htable,
				 unsigned char langId ){
	File f;
	f.set(ff);
	// open file
	if ( ! f.open ( O_RDONLY ) ) {
		log("spell: open: %s",mstrerror(g_errno));
		return false;
	}

	// get file size
	int32_t fileSize = f.getFileSize() ;

	int32_t bufSize = fileSize + 1;
	char *buf = (char *) mmalloc(bufSize, "SpellerTmpBuf");
	if (!buf)
		return false;
	if ( !f.read(buf, fileSize,0) ){
		log("spell: read: %s", mstrerror(g_errno));
		return false;
	}
	for ( int32_t i = 0; i < bufSize; i++ ){
		if ( buf[i] == '\n' )
			buf[i] = '\0';
	}

	char *p = buf;
	while ( p < buf + fileSize ){
		char *tuple = p;
		int32_t score = atoi(p);
		// many scores in dict have a pop of 0. ignore them
		if ( score <= 0 ){
			p += gbstrlen(p) + 1;
			continue;
		}
		while ( *p != '\t' )
			p++;
		p++;
		// at the phrase
		char *phrase = p;
		while ( *p != '\t' && *p != '\0' )
			p++;
		uint64_t key = hash64d(phrase, p-phrase );
		int32_t slot = htable->getSlot(&key);
		if ( slot == -1 )
			htable->addKey(&key,&tuple);
		p += gbstrlen(p) + 1;
	}
	return true;
}

// . looks up the unified dict record for phrase/len
// . returns a pointer into m_unifiedBuf at the offset stored for the phrase's
//   hash, or NULL if "phrase" is NULL or its hash is not in m_unifiedDict
// . This isn't really much use except for the spider language detection, to
//   keep it from making 32 sequential calls for the same phrase to isolate
//   the language.
char *Speller::getPhraseRecord(char *phrase, int len ) {
	if ( phrase == NULL ) return NULL;
	// the table is keyed by the hash of the phrase
	int64_t phraseHash = hash64d(phrase, len);
	int32_t slot = m_unifiedDict.getSlot(&phraseHash);
	if ( slot < 0 ) return NULL;
	// the value is the byte offset of the record in the text buffer
	int32_t *offsetPtr = (int32_t *)m_unifiedDict.getValueFromSlot(slot);
	return m_unifiedBuf.getBufStart() + *offsetPtr;
}

/*
uint8_t Speller::getUniqueLang ( int64_t *wid ) {
	int32_t slot = m_unifiedDict.getSlot(wid);
	if (slot < 0) return langUnknown;
	//char *p = *(char **)m_unifiedDict.getValueFromSlot(slot);
	int32_t offset =  *(int32_t *)m_unifiedDict.getValueFromSlot(slot);
	char *p = m_unifiedBuf.getBufStart() + offset;
	int32_t langId = langUnknown;
	char langCount = 0;
	// skip over word
	for ( ; *p && *p != '\t' ; ) p++;
	// nothing after?
	if ( !*p ) return langUnknown;
	// skip tab
	p++;
	// skip over phonet
	for ( ; *p && *p != '\t' ; ) p++;
	// nothing after?
	if ( !*p ) return langUnknown;
	// skip tab
	p++;
	// loop over langid/pop pairs
	while ( *p ) {
		// get langid
		langId = atoi(p);
		// skip to next delimiter
		for ( ; *p && *p != '\t' ; p++ );
		// error?
		if ( ! *p ) break;
		// skip tab
		p++;
		// error?
		if ( ! *p ) break;
		// . if pop is zero ignore it
		// . we now set pops to zero when generating
		//   unifiedDict-buf.txt if they are not in the wiktionary
		//   map for that language. seems like to many bad entries
		//   were put in there by john nanny.
		//char pop = 1;
		//if ( *p == '0' ) pop = 0;
		// require it be in the official dictionary here
		bool official;
		if ( *p == '-' ) official = true;
		else             official = false;
		// skip pop
		for ( ; *p && *p != '\t' ; p++ );
		// multi lang count
		if ( langId != langUnknown && official ) langCount++;
		// no unique lang
		//if ( langCount >= 2 ) return langTranslingual;
		if ( langCount >= 2 ) return langUnknown;
		// done?
		if ( ! *p ) break;
		// skip tab
		p++;
	}
	// unique lang!
	return langId;
}
*/

// . returns a bit vector of the languages the word with hash *wid is
//   "official" in, i.e. whose popularity field starts with '-'
// . bit (langId-1) is set for such a language, so langId 1 is bit 0
// . langUnknown, langTranslingual and lang ids outside 1..64 are ignored
// . returns 0 if the word is not in the dictionary or its record has no
//   (langId,pop) tuples
int64_t Speller::getLangBits64 ( int64_t *wid ) {
	int32_t slot = m_unifiedDict.getSlot(wid);
	if (slot < 0) return 0LL;
	int32_t offset =  *(int32_t *)m_unifiedDict.getValueFromSlot(slot);
	char *p = m_unifiedBuf.getBufStart() + offset;
	// skip over word
	for ( ; *p && *p != '\t' ; ) p++;
	// nothing after?
	if ( !*p ) return 0LL;
	// skip tab
	p++;
	// skip over phonet
	for ( ; *p && *p != '\t' ; ) p++;
	// nothing after?
	if ( !*p ) return 0LL;
	// skip tab
	p++;
	// init
	int64_t bits = 0LL;
	// loop over langid/pop pairs
	while ( *p ) {
		// . get langid
		// . keep the full value so an out of range id is dropped
		//   below and can not wrap onto a valid one
		int32_t langId = atoi(p);
		// skip to next delimiter
		for ( ; *p && *p != '\t' ; p++ );
		// error?
		if ( ! *p ) break;
		// skip tab
		p++;
		// error?
		if ( ! *p ) break;
		// if not official, cancel it
		if ( *p != '-' ) langId = langUnknown;
		// skip pop
		for ( ; *p && *p != '\t' ; p++ );
		// . only ids 1..64 have a bit in a 64-bit vector
		// . shifting by anything else is undefined
		if ( langId != langTranslingual &&
		     langId != langUnknown &&
		     langId >= 1 && langId <= 64 )
			// make english "1"
			bits |= (int64_t)( 1ULL << (langId-1) );
		// done?
		if ( ! *p ) break;
		// skip tab
		p++;
	}
	return bits;
}

/*
int64_t *Speller::getPhraseLanguages(char *phrase, int len ) {
	//char *xx=NULL;*xx=0;

	char *phraseRec = getPhraseRecord(phrase, len );
	if(!phraseRec) return(NULL);
	int64_t *rv = (int64_t *)mmalloc(sizeof(int64_t) * MAX_LANGUAGES,
					     "PhraseRec");
	if(!rv) return(NULL);
	if(!getPhraseLanguages(phrase, len, rv)) {
		mfree(rv, sizeof(int64_t) * MAX_LANGUAGES,
		      "PhraseRec");
		return(NULL);
	}
	return(rv);
}
*/

// . fills "array" (MAX_LANGUAGES entries, indexed by langId) with the
//   popularity of phrase/len in each language
// . returns false if "array" is NULL or the phrase is not in the dictionary
bool Speller::getPhraseLanguages(char *phrase, int len,
				 int64_t *array) {
	if ( ! array ) return false;
	char *phraseRec = getPhraseRecord(phrase, len);
	if ( ! phraseRec ) return false;
	// . the record starts with "<word>\t<phonet>\t", but
	//   getPhraseLanguages2() expects to start at the first
	//   "<langId>\t<pop>" tuple, so skip those two fields first
	// . otherwise the word is parsed as a langId, and a record with an
	//   empty phonet yields no languages at all
	for ( int32_t field = 0 ; field < 2 ; field++ ) {
		while ( *phraseRec && *phraseRec != '\t' ) phraseRec++;
		// record ended early. the call below then just zeroes array.
		if ( ! *phraseRec ) break;
		phraseRec++;
	}
	return getPhraseLanguages2 ( phraseRec,array );
}

// . parses "<langId>\t<pop>\t<langId>\t<pop>..." from "phraseRec" into
//   "array", which must have MAX_LANGUAGES entries: array[langId] = pop
// . array is zeroed first, so languages not listed are 0
// . a langId outside 0..MAX_LANGUAGES-1 is ignored
// . stops at the end of the string, at a langId that has no pop, or at an
//   empty pop field
// . returns false if either argument is NULL, true otherwise
bool Speller::getPhraseLanguages2 (char *phraseRec , int64_t *array) {

	if ( ! phraseRec || ! array ) return false;

	memset(array, 0, sizeof(int64_t)*MAX_LANGUAGES);

	while(*phraseRec) {
		// skip leading whitespace
		while(*phraseRec == ' ' || *phraseRec == '\t')
			phraseRec++;

		if(!*phraseRec) break;

		int64_t lang = atoi(phraseRec);

		// skip to next delimiter
		if(!(phraseRec = strchr(phraseRec, '\t'))) break;

		// skip tab
		phraseRec++;

		if(!*phraseRec) break;

		// empty pop field, give up on the rest of the record
		if ( *phraseRec == '\t' ) return true;

		// Save score, but never write outside of array[]
		if ( lang >= 0 && lang < MAX_LANGUAGES )
			array[lang] = atoi(phraseRec);

		// skip to next delimiter
		if(!(phraseRec = strchr(phraseRec, '\t'))) break;

		// skip over tab
		phraseRec++;
	}
	return(true);
}

// . returns true if the word w/wlen is marked as being in the official
//   english dictionary (its langEnglish pop starts with '-') and is NOT
//   marked that way for "nativeLang"
// . returns false if w is NULL, starts with a digit, is not in the
//   dictionary, or its record is malformed
// . "wikiLang" is not used
bool Speller::getSynsInEnglish ( char *w ,
				 int32_t wlen ,
				 char nativeLang ,
				 char wikiLang ) {
	if ( ! w ) return false;
	// no digits please!
	if ( is_digit(w[0]) ) return false;

	char *p = getPhraseRecord(w,wlen);
	if ( ! p ) return false;
	bool inEnglish = false;
	// skip the word and then the phonet, both tab-terminated, without
	// running off the end of the record
	for ( int32_t field = 0 ; field < 2 ; field++ ) {
		for ( ; *p && *p != '\t' ; p++ );
		// no (langId,pop) tuples in this record
		if ( ! *p ) return false;
		// skip tab
		p++;
	}

	while ( *p ) {
		// get language id
		int32_t l = atoi(p);
		// skip langid
		for ( ; *p && *p != '\t' ; p++ );
		// end of line?
		if ( !*p ) return inEnglish;
		// skip tab
		p++;
		// . get popularity. if not negative undo inEnglish.
		// . it has to be negative because that means it is in the
		//   OFFICIAL wiktionary dictionary for that language
		if ( l == langEnglish && p[0] == '-' ) inEnglish = true;
		// if this word is in the doc's primary/native language
		// then do not try to get english synonyms of it
		if ( l == nativeLang && p[0] == '-' ) return false;
		// no chance? it MUST be in english, and these are
		// sorted by langid...
		if ( l > langEnglish && ! inEnglish ) return false;
		// skip popularity
		for ( ; *p && *p != '\t' ; p++ );
		// no more?
		if ( ! *p )
			return inEnglish;
		// skip tab
		p++;
	}
	return inEnglish;
}

/*
static inline int s_findMaxVal(int64_t *vals, int numVals) {
	int64_t max, oldmax, val;
	if(!vals) return(0);
	max = oldmax = INT_MIN;
	val = 0;
	for(int x = 0; x < numVals; x++) {
		if(vals[x] >= max) {
			oldmax = max;
			max = vals[x];
			val = x;
		}
	}
	if(oldmax == max) return(0);
	return(val);
}

char Speller::getPhraseLanguage(char *phrase, int len) {
	//char *xx=NULL;*xx=0;

	char lang;
	int64_t *langs = getPhraseLanguages(phrase, len);
	if(!langs) return(0);
	lang = s_findMaxVal(langs, MAX_LANGUAGES);
	if ( lang < 0 ) { char *xx=NULL;*xx=0; }
	if(langs[(uint8_t)lang] == 0) lang = 0;
	mfree(langs, sizeof(int) * MAX_LANGUAGES, "PhraseRec");
	return(lang);
}
*/

void Speller::dictLookupTest ( char *ff ){
	//char *ff = "/tmp/sctest";
	FILE *fd = fopen ( ff, "r" );
	if ( ! fd ) {
		log("speller: test: Could not open %s for "
		    "reading: %s.", ff,strerror(errno));
		return;
	}
	int64_t start = gettimeofdayInMilliseconds();
	char buf[1026];
	int32_t count = 0;
	// go through the words
	while ( fgets ( buf , MAX_FRAG_SIZE , fd ) ) {
		// length of word(s), including the terminating \n
		int32_t wlen = gbstrlen(buf) ;
		// skip if empty
		if ( wlen <= 0 ) continue;
		buf[wlen-1]='\0';
		uint64_t h = hash64d ( buf, gbstrlen(buf));
		int32_t pop = g_speller.getPhrasePopularity(buf, h, true);
		if ( pop < 0 ){
			char *xx = NULL; *xx = 0;
		}
		count++;
	}
	log ( LOG_WARN,"speller: dictLookupTest took %"INT64" ms to do "
	      "%"INT32" words. Compare against 46-66ms taken for dict/words file.",
	      gettimeofdayInMilliseconds() - start, count );
	fclose(fd);
}