open-source-search-engine/Phrases.cpp
2021-05-06 01:52:55 +10:00

777 lines
23 KiB
C++

#include "gb-include.h"
#include "Phrases.h"
#include "Mem.h"
Phrases::Phrases ( ) {
m_buf = NULL;
//m_phraseScores = NULL;
m_phraseSpam = NULL;
//m_phraseIds = NULL;
}
Phrases::~Phrases ( ) {
reset();
}
void Phrases::reset() {
if ( m_buf && m_buf != m_localBuf )
mfree ( m_buf , m_bufSize , "Phrases" );
m_buf = NULL;
//m_phraseScores = NULL;
m_phraseSpam = NULL;
//m_phraseIds = NULL;
}
// initialize this token array with the string, "s" of length, "len".
bool Phrases::set( Words *words,
Bits *bits ,
bool useStopWords ,
bool useStems ,
int32_t titleRecVersion,
int32_t niceness) {
// reset in case being re-used
reset();
// always reset this
//m_phraseScores = NULL;
m_phraseSpam = NULL;
//m_phraseIds = NULL;
// now we never use stop words and we just index two-word phrases
// so that a search for "get a" in quotes will match a doc that has
// the phrase "get a clue". it might impact performance, but it should
// be insignificant... but we need to have this level of precision.
// ok -- but what about 'kick a ball'. we might not have that phrase
// in the results for "kick a" AND "a ball"!! so we really need to
// index "kick a ball" as well as "kick a" and "a ball". i don't think
// that will cause too much bloat.
//useStopWords = false;
// ensure we have words
if ( ! words ) return true;
// set the words' scores array, m_wordScores
//if ( scores ) m_wordScores = scores->m_scores;
//else m_wordScores = NULL;
// . we have one phrase per word
// . a phrase #n is "empty" if spam[n] == PSKIP
m_numPhrases = words->getNumWords();
// replaces scores
//m_sections = m_sections;
//m_sectionPtrs = NULL;
//if ( m_sections ) m_sectionPtrs = m_sections->m_sectionPtrs;
// how much mem do we need?
//int32_t need = (18+1+(3+8*3)) * m_numPhrases;
int32_t need = m_numPhrases * (8+8+1+1+1);
//if ( m_wordScores ) need += 4 * m_numPhrases;
// alloc if we need to
if ( need > PHRASE_BUF_SIZE )
m_buf = (char *)mmalloc ( need , "Phrases" );
else
m_buf = m_localBuf;
if ( ! m_buf )
return log("query: Phrases::set: %s",mstrerror(g_errno));
m_bufSize = need;
// set up arrays
char *p = m_buf;
//m_phraseIds = (int64_t *)p ; p += m_numPhrases * 8;
// phrase not using stop words
m_phraseIds2 = (int64_t *)p ; p += m_numPhrases * 8;
m_phraseIds3 = (int64_t *)p ; p += m_numPhrases * 8;
//m_phraseIds4 = (int64_t *)p ; p += m_numPhrases * 8;
//m_phraseIds5 = (int64_t *)p ; p += m_numPhrases * 8;
//m_stripPhraseIds = (int64_t *)p ; p += m_numPhrases * 8;
//if ( m_wordScores ) {
// m_phraseScores = (int32_t *)p ;
// p += m_numPhrases * 4;
//}
m_phraseSpam = (unsigned char *)p ; p += m_numPhrases * 1;
//m_numWordsTotal = (unsigned char *)p ; p += m_numPhrases * 1;
m_numWordsTotal2= (unsigned char *)p ; p += m_numPhrases * 1;
m_numWordsTotal3= (unsigned char *)p ; p += m_numPhrases * 1;
//m_numWordsTotal4= (unsigned char *)p ; p += m_numPhrases * 1;
//m_numWordsTotal5= (unsigned char *)p ; p += m_numPhrases * 1;
// sanity
if ( p != m_buf + need ) { char *xx=NULL;*xx=0; }
// clear this
//memset ( m_numWordsTotal , 0 , m_numPhrases );
memset ( m_numWordsTotal2 , 0 , m_numPhrases );
memset ( m_numWordsTotal3 , 0 , m_numPhrases );
//memset ( m_numWordsTotal4 , 0 , m_numPhrases );
//memset ( m_numWordsTotal5 , 0 , m_numPhrases );
// point to this info while we parse
m_words = words;
m_wptrs = words->getWords();
m_wlens = words->getWordLens();
m_wids = words->getWordIds();
m_bits = bits;
m_useStopWords = useStopWords;
m_useStems = useStems;
// we now are dependent on this
m_titleRecVersion = titleRecVersion;
// . set the phrases
// . sets m_phraseIds [i]
// . sets m_phraseSpam[i] to PSKIP if NO phrase exists
for ( int32_t i = 0 ; i < words->getNumWords() ; i++ ) {
if ( ! m_wids[i] ) continue;
setPhrase ( i , niceness);
}
// success
return true;
}
// . add the phrase that starts with the ith word
// . "read Of Mice and Men" should make 3 phrases:
// . read.ofmice
// . ofmice
// . mice.andmen
void Phrases::setPhrase ( int32_t i, int32_t niceness ) {
// . if the ith word cannot start a phrase then we have no phrase
// . we indicate NULL phrasesIds with a spam of PSKIP
// . we now index all regardless! we want to be able to search
// for "a thing" or something. so do it!
//if ( ! m_bits->canStartPhrase ( i ) ) {
// m_phraseSpam[i] = PSKIP;
// m_phraseIds [i] = 0LL;
// return;
//}
// MDW: now Weights.cpp should encompass all this logic
// or if score <= 0, set in Scores.cpp
//if ( m_wordScores && m_wordScores[i] <= 0 ) {
// m_phraseSpam[i] = PSKIP;
// m_phraseIds [i] = 0LL;
// return;
//}
// hash of the phrase
int64_t h = 0LL;
// the hash of the two-word phrase (now we do 3,4 and 5 word phrases)
int64_t h2 = 0LL;
int64_t h3 = 0LL;
//int64_t h4 = 0LL;
//int64_t h5 = 0LL;
// reset
unsigned char pos = 0;
// now look for other tokens that should follow the ith token
int32_t nw = m_words->getNumWords();
int32_t numWordsInPhrase = 1;
// use the min spam from all words in the phrase as the spam for phrase
char minSpam = -1;
// we need to hash "1 / 8" differently from "1.8" from "1,000" etc.
char isNum = is_digit(m_wptrs[i][0]);
// min score
//int32_t minScore ;
//if ( m_wordScores ) minScore = m_wordScores[i];
// if i is not a stop word, it can set the min spam initially
//if ( ! m_bits->isStopWord(i) &&m_spam ) minSpam = m_spam->getSpam(i);
// do not include punct/tag words in the m_numWordsTotal[j] count
// of the total words in the phrase. these are just usesless tails.
int32_t lastWordj = -1;
// loop over following words
int32_t j;
bool hasHyphen ;
bool hasStopWord2 ;
// . NOTE: a token can start a phrase but NOT be in it.
// . like a large number for example.
// . wordId is the lower ascii hash of the ith word
// . NO... this is allowing the query operator PiiPe to start
// a phrase but not be in it, then the phrase id ends up just
// being the following word's id. causing the synonyms code to
// give a synonym which it should not un Synonyms::set()
if ( ! m_bits->canBeInPhrase(i) )
// so indeed, skip it then
goto nophrase;
//h = hash64 ( h, m_words->getWordId(i));
h = m_wids[i];
// set position
pos = (unsigned char)m_wlens[i];
//if (m_words->getStripWordId(i))
// h2 = hash64 ( h2, m_words->getStripWordId(i));
//else h2 = h;
hasHyphen = false;
hasStopWord2 = m_bits->isStopWord(i);
// this makes it true now too
//if ( m_wlens[i] <= 2 ) hasStopWord = true;
for ( j = i + 1 ; j < nw ; j++ ) {
QUICKPOLL(niceness);
// . do not allow more than 32 alnum/punct "words" in a phrase
// . this prevents phrases with 100,000 words from slowing
// us down. would put us in a huge double-nested for loop
if ( j > i + 32 ) goto nophrase;
// deal with punct words
if ( ! m_wids[j] ) {
// if we cannot pair across word j then break
if ( ! m_bits->canPairAcross (j) ) break;
// does it have a hyphen?
if (j==i+1 && m_words->hasChar(j,'-')) hasHyphen=true;
/*
// "D & B" --> dandb
if (j==i+1 && m_words->hasChar(j,'&')) {
// set this
hasStopWord = true;
// insert "and"
int32_t conti=pos;
h = hash64Lower_utf8_cont("and",3,h,&conti);
pos=conti;
// the two-word phrase, set it if we need to
h2 = h;
m_numWordsTotal2[i] = j-i+1;
}
*/
continue;
}
// . if this word can not be in a phrase then continue our
// search for a word that can
// . no punctuation can be in a phrase currently (++?)
//if ( m_bits->canBeInPhrase (j) ) {
//}
// keep this set right
//if (m_bits->isStopWord(j)||m_wlens[j]<=2) hasStopWord = true;
//if ( m_bits->isStopWord(j) ) hasStopWord = true;
// record lastWordj to indicate that word #j was a true word
lastWordj = j;
// . stop words should have a 0 spam value so don't count those
// . added by mdw in march 2002
/*
if ( ! m_bits->isStopWord(j) && m_spam ) {
// maintain the min spam
char spam = m_spam->getSpam ( j );
if ( minSpam == -1 || spam < minSpam ) minSpam = spam;
// . min weight from score vector
// . normal score here is 256, not 128, so shift
// down 3 to normalize it relatively
//if ( m_wordScores && (m_wordScores[j]>>3)<minScore)
// minScore = m_wordScores[j]>>3;
//if ( m_wordScores && m_wordScores[j] < minScore )
// minScore = m_wordScores[j];
}
*/
// if word #j can be in phrase then incorporate it's hash
if ( m_bits->canBeInPhrase (j) ) {
// continue the hash
//unsigned char *p= (unsigned char *)m_wptrs[j];
//unsigned char *pend = p + m_wlens[j];
//for ( ; p < pend ; p++ )
// h ^= g_hashtab[pos++][*p];
int32_t conti = pos;
// . get the punctuation mark separating two numbers
// . use space if can't find one
// . 1/234 1,234 1.234 10/11 "1 234" 1-5
//if (isNum && j==i + 2 && is_digit(m_wptrs[j][0]) ) {
// // get punct mark
// char c = m_wptrs[i+1][0];
// // if space try next
// if(c==' '&&m_wlens[i+1]>1) c=m_wptrs[i+1][1];
// // treat comma as nothing
// if ( c==',' ) c='\0';
// // treat / and . and - as they are, everything
// // else should be treated as a space
// else if(c!='/'&&c !='.'&& c!='-'&&c!=':')c=' ';
// // incorporate into hash if c is there
// if (c)h=hash64Lower_utf8_cont(&c,1,h,&conti);
//}
// hash the jth word into the hash
h = hash64Lower_utf8_cont(m_wptrs[j],
m_wlens[j],
h,
&conti );
pos = conti;
//h = hash64 ( h , m_words->getWordId (j) );
//if (m_words->getStripWordId(j))
// h2 = hash64 ( h2, m_words->getStripWordId(j));
//else h2 = hash64(h2, m_words->getWordId(j));
numWordsInPhrase++;
// N-word phrases?
if ( numWordsInPhrase == 2 ) { // h != h2 ) {
h2 = h;
m_numWordsTotal2[i] = j-i+1;
if ( m_bits->isStopWord(j) )
hasStopWord2 = true;
continue;
}
if ( numWordsInPhrase == 3 ) {
h3 = h;
m_numWordsTotal3[i] = j-i+1;
//continue;
break;
}
/*
if ( numWordsInPhrase == 4 ) {
h4 = h;
m_numWordsTotal4[i] = j-i+1;
continue;
}
if ( numWordsInPhrase == 5 ) {
h5 = h;
m_numWordsTotal5[i] = j-i+1;
continue;
}
*/
}
// if we cannot pair across word j then break
if ( ! m_bits->canPairAcross (j) ) break;
// keep chugging?
if ( numWordsInPhrase >= 5 ) {
// if we're not using stop words then break
if ( ! m_useStopWords ) break;
// if it's not a stop word then break
if ( ! m_bits->isStopWord (j) ) break;
}
// otherwise, get the next word
}
// if we had no phrase then use 0 as id (need 2+ words to be a pharse)
if ( numWordsInPhrase <= 1 ) {
nophrase:
m_phraseSpam[i] = PSKIP;
//m_phraseIds [i] = 0LL;
m_phraseIds2[i] = 0LL;
m_phraseIds3[i] = 0LL;
//m_stripPhraseIds [i] = 0LL;
//m_numWordsTotal[i] = 0;
m_numWordsTotal2[i] = 0;
m_numWordsTotal3[i] = 0;
return;
}
// don't jump the edge
//if ( j >= nw ) j = nw - 1;
// sanity check
if ( lastWordj == -1 ) { char *xx = NULL; *xx = 0; }
// set the phrase length (from word #i up to & including word #j)
//m_numWordsTotal[i] = j - i + 1;
//m_numWordsTotal [i] = lastWordj - i + 1;
// sanity check
if ( lastWordj - i + 1 > 255 ) { char *xx=NULL;*xx=0; }
// set the phrase spam
if ( minSpam == -1 ) minSpam = 0;
m_phraseSpam[i] = minSpam;
// return the phraseId
//m_phraseIds [i] = h;
// hyphen between numbers does not count (so 1-2 != 12)
if ( isNum ) hasHyphen = false;
// . the two word phrase id
// . "cd rom" -> cdrom
// . "fly paper" -> flypaper
// . "i-phone" -> iphone
// . "e-mail" -> email
if ( hasHyphen || ! hasStopWord2 ) {
//m_phraseIds [i] = h;
m_phraseIds2[i] = h2;
}
// . "st. and" !-> stand
// . "the rapist" !-> therapist
else {
//m_phraseIds [i] = h ^ 0x768867;
m_phraseIds2[i] = h2 ^ 0x768867;
}
// forget hyphen logic for these
m_phraseIds3[i] = h3;
//m_phraseIds4[i] = h4;
//m_phraseIds5[i] = h5;
//if ( h != h2 ) m_stripPhraseIds[i] = h2;
//else m_stripPhraseIds[i] = 0LL;
// the score weight, if any
//if ( m_phraseScores ) m_phraseScores [i] = minScore;
// sanity check
//if(m_phraseScores && minScore == 0x7fffffff ) {char *xx =NULL;*xx=0;}
// debug msg
//char *w = m_words->getWord(i) ;
//int32_t wlen = m_words->getWordLen(i) ;
//for ( int32_t k = 0 ; k < wlen ; k++ )
// fprintf(stderr,"%c",w[k]);
//fprintf(stderr,"--> hash=%"UINT64"\n",(uint64_t)h);
}
// . store phrase that starts with word #i into "printBuf"
// . return bytes stored in "printBuf"
char *Phrases::getPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
// return 0 if no phrase
if ( m_phraseSpam[i] == PSKIP ) return NULL;
// store the phrase in here
static char buf[256];
// . how many words, including punct words, are in phrase?
// . this should never be 1 or less
//int32_t n = m_numWordsTotal[i] ;
int32_t n ;
if ( npw == 2 ) n = m_numWordsTotal2[i] ;
else if ( npw == 3 ) n = m_numWordsTotal3[i] ;
else { char *xx=NULL; *xx=0; }
//char *w1 = m_words->getWord(i);
//char *w2 = m_words->getWord(i+n-1);
//int32_t wlen2 = m_words->getWordLen(i+n-1);
//int32_t plen = ( w2 - w1 ) + wlen2;
char *s = buf;
char *send = buf + 255;
for (int32_t w = i;w<i+n;w++){
if (!m_words->isAlnum(w)){
// skip spaces for now since we has altogether now
*s++ = ' ';
/*
// . get the punctuation mark separating two numbers
// . use space if can't find one
// . 1/234 1,234 1.234 10/11 "1 234" 1-5
if ( is_digit(m_wptrs[i][0]) && w == i + 1 &&
is_digit(m_wptrs[i+2][0]) ) {
// get punct mark
char c = m_wptrs[i+1][0];
// if space try next
if(c==' '&&m_wlens[i+1]>1) c=m_wptrs[i+1][1];
// treat comma as nothing
if ( c==',' ) continue;//c='\0';
// treat / and . and - as they are, everything
// else should be treated as a space
else if(c!='/'&&c !='.'&& c!='-'&&c!=':')c=' ';
// print that
*s++ = c;
}
*/
continue;
}
char *w1 = m_words->getWord(w);
char *wend = w1 + m_words->getWordLen(w);
for ( int32_t j = 0 ; j < m_words->getWordLen(w) && s<send ; j++){
// write the lower case char from w1+j into "s"
int32_t size = to_lower_utf8 ( s , send , w1 + j , wend );
// advance
j += size;
s += size;
}
}
// null terminate
*s = '\0';
// set length we wrote into "buf"
*phrLen = s - buf;
// return ptr to buf
return buf;
}
/*
// . store phrase that starts with word #i into "printBuf"
// . return bytes stored in "printBuf"
char *Phrases::getNWordPhrase ( int32_t i , int32_t *phrLen , int32_t npw ) {
// return 0 if no phrase
if ( m_phraseSpam[i] == PSKIP ) return NULL;
// store the phrase in here
static char buf[512];
// . how many words, including punct words, are in phrase?
// . this should never be 1 or less
int32_t n = m_numWordsTotal[i] ;
char *dst = buf;
char *dend = buf + 255;
int32_t count = 0;
for (int32_t w = i; w<i+n && count<npw; w++ ) {
// do not breach the buffer
if ( dst + 4 >= dend ) break;
// all non alnum chars are spaces now
if ( ! m_words->isAlnum(w) ) {
// skip spaces for now since we has altogether now
*dst++ = ' ';
continue;
}
count++;
char *w1 = m_words->getWord(w);
int32_t wlen = m_words->getWordLen(w);
// store the word in lower case into "dst"
to_lower_utf8 ( dst , dend , w1 , w1 + wlen );
// advance destination cursor
dst += wlen;
}
// null terminate
*dst = '\0';
// set length we wrote into "buf"
*phrLen = dst - buf;
// return ptr to buf
return buf;
}
*/
/*
char *Phrases::getStripPhrase ( int32_t i , int32_t *phrLen ) {
// return 0 if no phrase
if ( m_phraseSpam[i] == PSKIP ) return NULL;
// store the phrase in here
static char buf[512];
// . how many words, including punct words, are in phrase?
// . this should never be 1 or less
int32_t n = m_numWordsTotal[i] ;
//char *w1 = m_words->getWord(i);
//char *w2 = m_words->getWord(i+n-1);
//int32_t wlen2 = m_words->getWordLen(i+n-1);
//int32_t plen = ( w2 - w1 ) + wlen2;
char *s = buf;
char *send = buf + 255;
for (int32_t w = i;w<i+n;w++){
if (!m_words->isAlnum(w)){
*s++ = ' ';
continue;
}
char *w1 = m_words->getWord(w);
for ( int32_t j = 0 ; j < m_words->getWordLen(w) && s<send ; j++){
// write the lower case char from w1+j into "s"
int32_t size = to_lower_ascii_utf8 ( s , send , w1 + j );
// advance
j += size;
s += size;
}
}
// null terminate
*s = '\0';
// set length we wrote into "buf"
*phrLen = s - buf;
// return ptr to buf
return buf;
}
*/
/*
// for getTermId()
#include "Indexdb.h"
// . hash all the words into "table"
bool Phrases::hash ( TermTable *table ,
Weights *weightsPtr ,
uint32_t baseScore ,
uint32_t maxScore ,
int64_t startHash ,
char *prefix1 ,
int32_t prefixLen1 ,
char *prefix2 ,
int32_t prefixLen2 ,
bool hashUniqueOnly ,
int32_t titleRecVersion,
int32_t niceness ) {
// don't hash if score is 0 or less.
if (baseScore <= 0) return true;
// point to the phrase weights array, m_pw[]
int32_t *weights = NULL;
if ( weightsPtr ) weights = weightsPtr->m_pw;
// is the table storing the terms as strings, too? used by
// PageParser.cpp
SafeBuf *pbuf = table->getParserBuf();
// . now add each phraseId to the index table
// . TODO: might want to add w/ uniqueOnly on if spam is 100%
uint32_t score;
bool huo;
for (int32_t i =0; i < m_numPhrases; i++) {
// should we hash this phraseId only if it's not hashed yet?
huo = hashUniqueOnly;
// a phraseSpam of PSKIP means word #i does not start a phrase
if ( m_phraseSpam[i] == PSKIP ) continue;
// don't hash it if it's heavily spammed (spam of 100%)
score = baseScore - ( baseScore * m_phraseSpam[i] ) / 100;
// . use weights instead if we have them
// . default weight should be 128!
if ( weights ) {
// skip if the weight is 0, we probably have menu
// eelimination technology turned on...
if ( weights[i] == 0 ) continue;
// . the old way: we used a signed int32_t which could
// overflow before the divide and make artificially
// high term scores
//if(titleRecVersion < 85)
// score = (int32_t)((int32_t)score * weights[i]) / DW;
//else score = (score * weights[i]) / DW;
score = (score * weights[i]) / DW;
}
// weight by score if we need to
// if score is 0 because it's heavily spammed then we
// should hash just enough to index the phrase
if ( ! score ) { score = 1; huo = true; }
// get the phrase hash (includes coll,field prefixes)
int64_t h = g_indexdb.getTermId (startHash ,m_phraseIds[i]) ;
//int64_t h2 = 0LL;
//if (m_stripPhraseIds[i])
// h2 = g_indexdb.getTermId (startHash ,
// m_stripPhraseIds[i]) ;
int64_t h2 = g_indexdb.getTermId(startHash,m_phraseIds2[i]);
// we must mask it before adding it to the table because
// this table is also used to hash IndexLists into that come
// from LinkInfo classes (incoming link text). And when
// those IndexLists are hashed they used masked termIds.
// So we should too...
//h = h & TERMID_MASK;
// add to table
//int32_t score2;
//if ( titleRecVersion >= 36 ) {
//score2 = score >> 1;
//if ( score2 <= 0 ) score2 = 1;
//}
//else
// score2 = score;
QUICKPOLL(niceness);
if ( ! pbuf ) {
if ( ! table->addTerm ( h, score, maxScore, huo,
titleRecVersion ))
return false;
// hash the two-word phrase if h is not two words
if ( h2 && h2 != h &&
! table->addTerm ( h2, score, maxScore,
huo, titleRecVersion ))
return false;
continue;
}
// add phrase as string to hash table if we need to as well
int32_t plen;
char *p = getPhrase ( i , &plen );
int32_t slen;
//#if 1
char *s = table->storeTerm ( p , plen ,
prefix1 , prefixLen1 ,
prefix2 , prefixLen2 ,true,&slen);
//#else
//char *s = table->storeTerm ( p , plen ,
// "phrase" , 6 ,
// prefix2 , prefixLen2 , true, &slen );
//#endif
if ( ! table->addTerm( h, score, maxScore, huo ,
titleRecVersion, s, slen ) )
return false;
// if no strippable chars in phrase, we're done
if ( ! h2 || h2 == h ) continue;
p = getTwoWordPhrase(i, &plen);
s = table->storeTerm ( p , plen ,
prefix1 , prefixLen1 ,
prefix2 , prefixLen2 ,true,&slen);
if ( ! table->addTerm( h2, score , maxScore, huo ,
titleRecVersion , s, slen ) )
return false;
}
// . TODO: print spam %'s for phrases!!!
// . see Words.cpp for template code to do this
return true;
}
*/
// . word #n is in a phrase if he has [word][punct] or [punct][word]
// before/after him and you can pair across the punct and include both
// in a phrase
// . used by SimpleQuery class to see if a word is in a phrase or not
// . if it is then the query may choose not to represent the word by itself
bool Phrases::isInPhrase ( int32_t n ) {
// returns true if we started a phrase (our phraseSpam is not PSKIP)
if ( m_phraseSpam[n] != PSKIP ) return true;
// . see if we were in a phrase started by a word before us
// . this only words since stop words - whose previous word cannot be
// paired across - are able to start phrases
if ( n < 2 ) return false;
if ( ! m_bits->canPairAcross(n-1) ) return false;
if ( ! m_bits->canBeInPhrase(n-2) ) return false;
return true;
}
/*
// . get the index of the word that starts this phrase
// . returns -1 if none...factored out for
// . getLeftPhraseId and getLeftStripPhraseId
int32_t Phrases::getLeftPhraseIndex( int32_t i ) {
// return 0 if we no words before us
while ( i > 0 ) {
// check punct before
i--;
// can he be paired across
if ( m_words->isPunct(i)){
if ( ! m_bits->canPairAcross(i) ) return -1;
}
else{
// if word before him not in a phrase, bail
if ( ! isInPhrase ( i ) ) return -1;
// can he start ?
if ( ! m_bits->canStartPhrase ( i ) ) continue;
// yes he can
return i;
}
}
// none
return -1;
}
// . get the id of the phrase we are in that we do not start
// . returns 0 if none, even though 0 may be a valid phraseId!! TODO: fix
int64_t Phrases::getLeftPhraseId ( int32_t i ) {
int32_t index = getLeftPhraseIndex(i);
if ( index < 0 ) return 0LL;
return getPhraseId(index);
}
int64_t Phrases::getLeftStripPhraseId ( int32_t i ) {
int32_t index = getLeftPhraseIndex(i);
if ( index < 0 ) return 0LL;
return getStripPhraseId(index);
}
*/
int32_t Phrases::getMaxWordsInPhrase ( int32_t i , int64_t *pid ) {
*pid = 0LL;
/*
if ( m_numWordsTotal5[i] ) {
*pid = m_phraseIds5[i];
return m_numWordsTotal5[i];
}
if ( m_numWordsTotal4[i] ) {
*pid = m_phraseIds4[i];
return m_numWordsTotal4[i];
}
*/
if ( m_numWordsTotal3[i] ) {
*pid = m_phraseIds3[i];
return m_numWordsTotal3[i];
}
if ( m_numWordsTotal2[i] ) {
*pid = m_phraseIds2[i];
return m_numWordsTotal2[i];
}
return 0;
}
int32_t Phrases::getMinWordsInPhrase ( int32_t i , int64_t *pid ) {
*pid = 0LL;
if ( m_numWordsTotal2[i] ) {
*pid = m_phraseIds2[i];
return m_numWordsTotal2[i];
}
return 0;
}