getWord(e),w->getWordLen(e)))
// assume no match then!!
continue;
// if we had a previous guy, reset the end of the
// previous candidate
if ( prev ) {
bs[n-2] = k;
bs[n-1] = k;
}
// . ok, we got two more candidates
// . well, only one more if this is not the 1st time
if ( ! prev ) {
cptrs [n] = cptrs [i];
htmlEnc [n] = htmlEnc [i];
scores [n] = scores [i];
types [n] = types [i];
as [n] = lasta;
bs [n] = k;
parent [n] = i;
n++;
added++;
}
// the 2nd one
cptrs [n] = cptrs [i];
htmlEnc [n] = htmlEnc [i];
scores [n] = scores [i];
types [n] = types [i];
as [n] = e + 1;
bs [n] = bs [i];
parent [n] = i;
n++;
added++;
// now add in the last pair as a whole token
cptrs [n] = cptrs [i];
htmlEnc [n] = htmlEnc [i];
scores [n] = scores [i];
types [n] = types [i];
as [n] = lasta;
bs [n] = bs [i];
parent [n] = i;
n++;
added++;
// nuke the current candidate then since it got
// split up to not contain the root title...
//cptrs[i] = NULL;
// update this
lasta = k+1;
// if we encounter another delimeter we will have
// to revise bs[n-1], so note that
prev = true;
}
// nuke the current candidate then since it got
// split up to not contain the root title...
if ( added ) {
scores[i] = 0.001;
//cptrs[i] = NULL;
}
// erase the pair if that there was only one token
if ( added == 3 ) n--;
}
//logf(LOG_DEBUG,"title: took5=%"INT64"",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// set base score
for ( int32_t i = 0 ; i < n ; i++ ) baseScore[i] = scores[i];
// set # alnum words
for ( int32_t i = 0 ; i < n ; i++ ) {
// point to the words
Words *w = cptrs[i];
// skip if got nuked above
if ( ! w ) continue;
// get the word boundaries
int32_t a = as[i];
int32_t b = bs[i];
int32_t count = 0;
// scan the words in this title candidate
for ( int32_t j = a ; j < b ; j++ )
if ( w->isAlnum(j) ) count++;
// store it
numAlnum[i] = count;
}
//
// . now punish by 0.85 for every lower case non-stop word it has
// . reward by 1.1 if has a non-stopword in the query
//
for ( int32_t i = 0 ; i < n ; i++ ) {
// point to the words
Words *w = cptrs[i];
// skip if got nuked above
if ( ! w ) continue;
// the word ptrs
char **wptrs = w->getWordPtrs();
// skip if empty
if ( w->getNumWords() <= 0 ) continue;
// get the word boundaries
int32_t a = as[i];
int32_t b = bs[i];
// record the boosts
float ncb = 1.0;
float qtb = 1.0;
// a flag
char uncapped = false;
// scan the words in this title candidate
for ( int32_t j = a ; j < b ; j++ ) {
// skip stop words
if ( w->isQueryStopWord(j,xd->m_langId) ) continue;
// punish if uncapitalized non-stopword
if ( ! w->isCapitalized(j) ) uncapped = true;
// skip if no query
if ( ! q ) continue;
// convert the word id into a term id
//int64_ttermid=g_indexdb.getTermId(0,w->getWordId(j));
int64_t wid = w->getWordId(j);
// reward if in the query
if ( q->getWordNum(wid) >= 0 ) {
qtb *= 1.5;
scores[i] *= 1.5;
}
}
// . only punish once if missing a capitalized word
// . hurts us for:
// http://content-uk.cricinfo.com/ausvrsa2008_09/engine/
// current/match/351682.html
if ( uncapped ) {
ncb *= 1.00;//0.85;
scores[i] *= 1.00;//0.85;
}
// punish if a http:// title thingy
char *s = wptrs[a];//w->getWord(a);
int32_t size = w->getStringSize(a,b);
if ( size > 9 && memcmp("http://",s,7)==0 )
ncb *= .10;
if ( size > 14 && memcmp("h\0t\0t\0p\0:\0/\0/",s,14)==0 )
ncb *= .10;
// set these guys
scores [i] *= ncb;
noCapsBoost[i] = ncb;
qtermsBoost[i] = qtb;
}
//logf(LOG_DEBUG,"title: took6=%"INT64"",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// . now compare each candidate to the other candidates
// . give a boost if matches
for ( int32_t i = 0 ; i < n ; i++ ) {
// point to the words
Words *w1 = cptrs[i];
// skip if got nuked above
if ( ! w1 ) continue;
int32_t a1 = as [i];
int32_t b1 = bs [i];
//int32_t nw1 = b1 - a1;
// reset our array
//int32_t found[512];
// sanity check
//if ( nw1 > 512 ) { char *xx=NULL;*xx=0; };
//memset ( found , 0 , 4*512);
// reset some flags
char localFlag1 = 0;
char localFlag2 = 0;
// record the boost
float iccb = 1.0;
// total boost
float total = 1.0;
//int32_t count = 0;
// to each other candidate
for ( int32_t j = 0 ; j < n ; j++ ) {
// not to ourselves
if ( j == i ) continue;
// or our derivatives
if ( parent[j] == i ) continue;
// or derivates to their parent
if ( parent[i] == j ) continue;
// only check parents now. do not check kids.
// this was only for when doing percent contained
// not getSimilarity() per se
//if ( parent[j] != -1 ) continue;
//
// TODO: do not accumulate boosts from a parent
// and its kids, subtitles...
//
// do not compare type X to type Y
if ( types[i] == TT_TITLETAG ) {
if ( types[j] == TT_TITLETAG ) continue;
}
// do not compare a div candidate to another div cand
// http://friendfeed.com/foxiewire?start=30
// likewise, a TD to another TD
// http://content-uk.cricinfo.com/ausvrsa2008_09/
// engine/match/351681.html ... etc.
if ( types[i] == TT_BOLDTAG ||
types[i] == TT_HTAG ||
types[i] == TT_DIVTAG ||
types[i] == TT_TDTAG ||
types[i] == TT_FONTTAG ) {
if ( types[j] == types[i] ) continue;
}
// . do not compare one kid to another kid
// . i.e. if we got "x | y" as a title and "x | z"
// as a link text, it will emphasize "x" too much
// http://content-uk.cricinfo.com/ausvrsa2008_09/
// engine/current/match/351682.html
if ( parent[j] != -1 && parent[i] != -1 ) continue;
// . body type tags are mostly mutually exclusive
// . for the legacy.com url mentioned below, we have
// good stuff in tags, so this hurts us...
// . but for the sake of
// http://larvatusprodeo.net/2009/01/07/partisanship
// -politics-and-participation/ i put bold tags back
//if ( types[i] == TT_BOLDTAG )
// if ( types[j] == TT_BOLDTAG ) continue;
/*
if ( types[i] == TT_BOLDTAG ||
types[i] == TT_HTAG ||
types[i] == TT_DIVTAG ||
types[i] == TT_TDTAG ||
types[i] == TT_FONTTAG ) {
if ( types[j] == TT_HTAG ) continue;
if ( types[j] == TT_BOLDTAG ) continue;
if ( types[j] == TT_DIVTAG ) continue;
if ( types[j] == TT_TDTAG ) continue;
if ( types[j] == TT_FONTTAG ) continue;
}
*/
if ( types[i] == TT_LINKTEXTLOCAL ) {
if ( types[j] == TT_LINKTEXTLOCAL ) continue;
}
if ( types[i] == TT_RSSITEMLOCAL ) {
if ( types[j] == TT_RSSITEMLOCAL ) continue;
}
// only compare to one local link text for each i
if ( types[j] == TT_LINKTEXTLOCAL && localFlag1 )
continue;
if ( types[j] == TT_RSSITEMLOCAL && localFlag2 )
continue;
if ( types[j] == TT_LINKTEXTLOCAL ) localFlag1 = 1;
if ( types[j] == TT_RSSITEMLOCAL ) localFlag2 = 1;
// not link title attr to link title attr either
// fixes http://www.spiritualwoman.net/?cat=191
if ( types[i] == TT_TITLEATT &&
types[j] == TT_TITLEATT )
continue;
// get our words
Words *w2 = cptrs[j];
// skip if got nuked above
if ( ! w2 ) continue;
int32_t a2 = as [j];
int32_t b2 = bs [j];
// use body scores if we can
//Scores *scores1 = NULL;
//Scores *scores2 = NULL;
//if ( w1 == WW ) scores1 = SS;
//if ( w2 == WW ) scores2 = SS;
/*
// make his hashtable
HashTable ht;
char hbuf[5000];
// but we cannot have more than 1024 slots then
if ( ! ht.set ( 256 , hbuf,5000) ) return false;
// and table auto grows when 90% full, so limit us here
int32_t count = 0;
// loop over all words in "w1" and hash them
for ( int32_t k = a2 ; k < b2 && count<128; k++ ) {
// the word id
int32_t wid = (int32_t) w2->m_wordIds[k] ;
// skip if not indexable
if ( wid == 0 ) continue;
// count it
count++;
// add to table
if ( ! ht.addKey ( (int32_t)wid , 1 , NULL ) )
return false;
}
// which words are found in another candidate
for ( int32_t k = 0 ; k < nw1 ; k++ ) {
// get word id
int32_t wid = (int32_t)w1->m_wordIds[a1 + k];
// skip if punct. set it to -1
if ( wid == 0LL ) { found[k] = -1; continue; }
// see if in table
int32_t slot = ht.getSlot ( wid );
// this word was found in another candidate
if ( slot >= 0 ) found[k]++;
}
*/
// how similar is title #i to title #j ?
float fp = getSimilarity ( w2 , a2 , b2 ,
w1 , a1 , b1 );
// TODO: scores1 , scores2 );
// error?
if ( fp == -1.0 ) return false;
// give a 1.1 boost per word i guess
//float boost = 1.0;
// get # of "matched words" in the two titles
//int32_t nw1 = (int32_t)(fp * (float)numAlnum[i]);
//for ( int32_t v = 0 ; v < nw1 ; v++ )
// boost *= 1.1;
// custom boosting...
float boost = 1.0;
if ( fp >= .95 ) boost = 3.0;
else if ( fp >= .90 ) boost = 2.0;
else if ( fp >= .85 ) boost = 1.5;
else if ( fp >= .80 ) boost = 1.4;
else if ( fp >= .75 ) boost = 1.3;
else if ( fp >= .70 ) boost = 1.2;
else if ( fp >= .60 ) boost = 1.1;
else if ( fp >= .50 ) boost = 1.08;
else if ( fp >= .40 ) boost = 1.04;
// limit total
total *= boost;
if ( total > 100.0 ) break;
// if you are matching the url path, that is pretty
// good so give more!
// actually, that would hurt:
// http://michellemalkin.com/2008/12/29/gag-worthy/
/*
if ( types[j] == TT_URLPATH ) {
float delta = boost - 1.0;
// double the delta boost
boost = boost + delta;
}
*/
// . boost by that!
// . if 100% similar give x3.0
// . if 0% similar x1.0
//float boost = 1.0 + (2.0 * fp);
//float boost = ((1.0 + fp)*(1.0 + fp));
// custom boosting!
if ( fp > 0.0 && g_conf.m_logDebugTitle )
logf(LOG_DEBUG,"title: i=%"INT32" j=%"INT32" fp=%.02f "
"b=%.02f", i,j,fp,boost);
// apply it
scores[i] *= boost;
iccb *= boost;
}
// . boost from words that word found in other candidates
// . TODO: dedup the found vector so we don't count the same
// word twice!!
/*
float boost = 1.0;
for ( int32_t k = 0 ; k < nw1 ; k++ ) {
// skip punct
if ( found[k] == -1 ) continue;
// boost or punish
if ( found[k] ) boost *= 1.20;
else boost *= 0.85;
}
// assigne
scores [i] = boost;
inCommonCandBoost[i] = boost;
*/
inCommonCandBoost[i] = iccb;
}
//logf(LOG_DEBUG,"title: took7=%"INT64"",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// loop over all n candidates
for ( int32_t i = 0 ; i < n ; i++ ) {
// skip if not in the document body
if ( cptrs[i] != WW ) continue;
// point to the words
int32_t a1 = as [i];
int32_t b1 = bs [i];
// . loop through this candidates words
// . TODO: use memset here?
for ( int32_t j = a1 ; j <= b1 && j < NW ; j++ )
// flag it
flags[j] |= 0x01;
}
Section **sp = NULL;
if ( sections ) sp = sections->m_sectionPtrs;
//logf(LOG_DEBUG,"title: took8=%"INT64"",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
/*
MDW: removed since SEC_ARTICLE was removed ----
// . now compare each candidate to the words in the positive scoring
// body of the document.
// . hash each word in the document with a positive score
// . go up to the first 5000 "words"
// . hash up to 1000 "words"
HashTableT ht;
inLink = false;
for ( int32_t i = 0 ; i < NW && i < 5000 ; i++ ) {
// see whose in a link tag
if ( tids[i] == TAG_A ) inLink = true;
if ( tids[i] == (TAG_A | BACKBIT) ) inLink = false;
// must be alnum word
if ( wids[i] == 0LL ) continue;
// skip if not in article section
if ( sp && ! (sp[i]->m_flags & SEC_ARTICLE ) ) continue;
// skip if 0 score
//if ( SS && SS->m_scores[i] <= 0 ) continue;
// . skip if this word is in a candidate title
// . for http://www.legacy.com/shelbystar/Obituaries.asp?Pa
// ge=LifeStory&PersonId=122245831
// the body is actually a candidate and the first
// td candidate is a good title and is unable to get boost
// from the body because it is a | candidate! so remove
// this logic for now
if ( flags[i] & 0x01 ) continue;
// or in a link as determined with the flags
if ( flags[i] & 0x02 ) continue;
// skip if in a link
if ( inLink ) continue;
// skip if stop word
if ( WW->isQueryStopWord(i) ) continue;
// . hash it. return false if error adding it.
// . store the word # so we can avoid comparing to ourselves
// in case the title candidate intersect this part of the doc
if ( ! ht.addKey ( wids[i] , i ) ) {
if ( flags!=localBuf ) mfree (flags,need,"TITLEflags");
return false;
}
}
*/
// free our stuff
if ( flags!=localBuf ) mfree (flags,need, "TITLEflags");
//logf(LOG_DEBUG,"title: took9=%"INT64"",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// ok, now compare each candidate to that hash table
for ( int32_t i = 0 ; i < n ; i++ ) {
// record the boost
float icbb = 1.0;
/*
MDW: removed since SEC_ARTICLE was removed ----
// point to the words
Words *w1 = cptrs[i];
int32_t a1 = as [i];
int32_t b1 = bs [i];
int32_t nw1 = w1->getNumWords();
int64_t *wids1 = w1->getWordIds ();
// loop through this candidates words
for ( int32_t j = a1 ; j <= b1 && j < nw1 ; j++ ) {
// skip if not alnum
if ( wids1[j] == 0LL ) continue;
// is it in the positive scoring body?
if ( ! ht.getValuePtr ( wids1[j] ) ) continue;
// boost score by 20% for every term we have that
// is also in the positive scoring body
icbb *= 1.20;
scores[i] *= 1.20;
}
*/
inCommonBodyBoost[i] = icbb;
}
//logf(LOG_DEBUG,"title: took10=%"INT64"",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// now get the highest scoring candidate title
float max = -1.0;
int32_t winner = -1;
for ( int32_t i = 0 ; i < n ; i++ ) {
// skip if got nuked
if ( ! cptrs[i] ) continue;
if ( winner != -1 && scores[i] <= max ) continue;
// url path's cannot be titles in and of themselves
if ( types[i] == TT_URLPATH ) continue;
// skip if empty basically, like if title was exact
// copy of root, then the whole thing got nuked and
// some empty string added, where a > b
if ( as[i] >= bs[i] ) continue;
// got one
max = scores[i];
// save it
winner = i;
}
//logf(LOG_DEBUG,"title: took11=%"INT64"",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// if we are a root, always pick the title tag as the title
if ( oldn == -2 && tti >= 0 ) winner = tti;
// if no winner, all done. no title
if ( winner == -1 ) return true;
// point to the words class of the winner
Words *w = cptrs[winner];
// skip if got nuked above
if ( ! w ) { char *xx=NULL;*xx=0; }
// make the Pos class of the winner, and point "pp" to it
Pos *pp = POS;
//Scores *ss = SS;
// need to make our own Pos class if title not from body
Pos tp;
if ( w != WW ) {
// use the temp Pos class, "tp"
pp = &tp;
// use no scores then
//ss = NULL;
// set "Scores" ptr to NULL. we assume all are positive scores
if ( ! tp.set ( w , NULL ) ) return false;
}
// the string ranges from word #a up to and including word #b
int32_t a = as[winner];
int32_t b = bs[winner];
// sanity check
if ( a < 0 || b > w->getNumWords() ) { char*xx=NULL;*xx=0; }
// save the title
if ( ! copyTitle ( w , pp , a , b , sections ) )
return false;
// save these
m_htmlEncoded = htmlEnc [winner];
//logf(LOG_DEBUG,"title: took12=%"INT64"",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// return now if no need to log this stuff
//SafeBuf sb;
//pbuf = &sb;
if ( ! pbuf ) return true;
//log("title: candidates for %s",xd->getFirstUrl()->getUrl() );
pbuf->safePrintf(""
"Title Generation"
" | "
" \n"
"# | "
"type | "
"parent | "
"base score | "
"format penalty | "
"query term boost | "
"candidate intersection boost | "
"body intersection boost | "
"FINAL SCORE | "
"title | "
" \n" );
// print out all candidates
for ( int32_t i = 0 ; i < n ; i++ ) {
char *ts = "unknown";
if ( types[i] == TT_LINKTEXTLOCAL ) ts = "local inlink text";
if ( types[i] == TT_LINKTEXTREMOTE ) ts = "remote inlink text";
if ( types[i] == TT_RSSITEMLOCAL ) ts = "local rss title";
if ( types[i] == TT_RSSITEMREMOTE ) ts = "remote rss title";
if ( types[i] == TT_BOLDTAG ) ts = "bold tag";
if ( types[i] == TT_HTAG ) ts = "header tag";
if ( types[i] == TT_TITLETAG ) ts = "title tag";
if ( types[i] == TT_DMOZ ) ts = "dmoz title";
if ( types[i] == TT_FIRSTLINE ) ts = "first line in text";
if ( types[i] == TT_FONTTAG ) ts = "font tag";
if ( types[i] == TT_ATAG ) ts = "anchor tag";
if ( types[i] == TT_DIVTAG ) ts = "div tag";
if ( types[i] == TT_TDTAG ) ts = "td tag";
if ( types[i] == TT_PTAG ) ts = "p tag";
if ( types[i] == TT_URLPATH ) ts = "url path";
if ( types[i] == TT_TITLEATT ) ts = "title attribute";
// get the title
pbuf->safePrintf(
""
"#%"INT32" | "
"%s | "
"%"INT32" | "
"%0.2f | " // baseScore
"%0.2f | "
"%0.2f | "
"%0.2f | "
"%0.2f | "
"%0.2f | "
"",
i,
ts ,
parent[i],
baseScore[i],
noCapsBoost[i],
qtermsBoost[i],
inCommonCandBoost[i],
inCommonBodyBoost[i],
scores[i]);
// ptrs
Words *w = cptrs[i];
int32_t a = as[i];
int32_t b = bs[i];
// skip if no words
if ( w->getNumWords() <= 0 ) continue;
// the word ptrs
char **wptrs = w->getWordPtrs();
// string ptrs
char *ptr = wptrs[a];//w->getWord(a);
int32_t size = w->getStringSize(a,b);
// it is utf8
pbuf->safeMemcpy ( ptr , size );
// end the line
pbuf->safePrintf(" | \n");
}
pbuf->safePrintf(" \n \n");
//logf(LOG_DEBUG,"title: took13=%"INT64"",gettimeofdayInMilliseconds()-x);
//x = gettimeofdayInMilliseconds();
// log these for now
//log("title: %s",sb.getBufStart());
return true;
}
// . returns a similarity score from 0.0 to 1.0, or -1.0 on error
// . measures what fraction of the rarity-weighted words (and adjacent-word
//   bigram "phrases") in w1's range [i0,i1) also occur in w2's range [t0,t1)
// . single words carry full weight, bigrams carry half weight
// . rarer words (lower popularity per the Pops class) contribute more
// . only hashes the first ~20 indexable words of w1 for speed (see maxCount)
// . NOTE(review): word and phrase ids are truncated to 32 bits when used as
//   hash-table keys, so distinct 64-bit ids can collide — presumably an
//   accepted approximation for scoring; confirm before relying on exactness
float Title::getSimilarity ( Words *w1 , int32_t i0 , int32_t i1 ,
Words *w2 , int32_t t0 , int32_t t1 ) {
// if either side is empty there is nothing in common
if ( w1->getNumWords() <= 0 ) return 0;
if ( w2->getNumWords() <= 0 ) return 0;
if ( i0 >= i1 ) return 0;
if ( t0 >= t1 ) return 0;
// invalid (negative) start offsets also mean no similarity
if ( i0 < 0 ) return 0;
if ( t0 < 0 ) return 0;
// . for this to be useful we must use idf
// . get the popularity of each word in w1
// . w1 should only be a few words since it is a title candidate
// . does not add pop for word #i if scores[i] <= 0
// . take this out for now since i removed the unified dict,
// we could use this if we added popularity to g_wiktionary
// but it would have to be language dependent
Pops pops1;
Pops pops2;
if ( ! pops1.set ( w1 , i0 , i1 ) ) return -1.0;
if ( ! pops2.set ( w2 , t0 , t1 ) ) return -1.0;
// clamp i1 so we never index past the end of w1's word arrays
int32_t nw1 = w1->getNumWords();
if ( i1 > nw1 ) i1 = nw1;
HashTable table;
//int32_t *ss1 = NULL;
//int32_t *ss2 = NULL;
//if ( scores1 ) ss1 = scores1->m_scores;
//if ( scores2 ) ss2 = scores2->m_scores;
// previous indexable word, used to form bigram "phrase" ids with hash64();
// -1 means no previous word yet
//int64_t lastWids[1024];
int64_t lastWid = -1;
float lastScore = 0.0;
// fixed 1024-slot table; the sanity check below asserts it never grew
if ( ! table.set ( 1024 ) ) return -1.0;
// the table auto-grows when 90% full, so cap the words we add to keep
// it at exactly 1024 slots
int32_t count = 0;
int32_t maxCount = 20; // (1024 * 90) / 100 - 1;
// "sum" accumulates the total weight of all words/phrases seen on BOTH
// sides; it is the denominator of the final ratio
float sum = 0.0;
// loop over all words in "w1" and hash them
for ( int32_t i = i0 ; i < i1 ; i++ ) {
// the word id, truncated to 32 bits (see NOTE above)
int64_t wid = (int32_t) w1->m_wordIds[i] ;
// skip if not indexable (punctuation/tags have wid 0)
if ( wid == 0 ) continue;
// or score is 0
//if ( ss && ss[i] <= 0 ) continue;
// stop adding once we hit the cap so the table cannot grow
if ( count++ > maxCount ) {
//logf(LOG_DEBUG, "query: Hash table for title "
// "generation too small. Truncating words from w1.");
break;
}
// . map pop to a score, "pscore"
// . the least popular something is the more points it is worth
//val = MAX_POP - pops.m_pops[i];
// . normalized pop ranges from 0.0 to 1.0
// . 1.0 means the word occurs in 100% of documents sampled
// . 0.0 means it occurs in none of them
// . so the score is the complement: rare words score near 1.0
float score = 1.0 - pops1.getNormalizedPop(i);
// accumulate into the denominator
sum += score;
// debug
//logf(LOG_DEBUG,"adding wid=%"INT32" score=%.02f sum=%.02f",
// (int32_t)wid,score,sum);
// accumulate for scoring phrases too! (adjacent words)
//psum += val;
// update the linked list
//if ( oldi < 1024 ) next[oldi] = i;
// prepare for next link, it may never come if we're last one!
//oldi = i;
// . add to table
// . NOTE(review): the value stored is the float score cast to
//   int32_t (0 or 1); it is never read back — only the key's
//   presence (getSlot) matters below
if ( ! table.addKey ( (int32_t)wid , (int32_t)score , NULL ) )
return -1.0;
// if no previous word yet, we cannot form a bigram; remember
// this word and continue
if ( lastWid == -1LL ) {lastWid=wid;lastScore=score;continue; }
// keep this 1-1 with the hash table slots
//lastWids [ slot ] = lastWid;
// . the "val" of the bigram phrase is the sum of its two
//   word scores...
float phrScore = score + lastScore;
// ...halved, so phrases count less than single words
phrScore *= 0.5;
// accumulate into the denominator
sum += phrScore;
// the phrase id combines this word id with the previous one
int64_t pid = hash64 ( wid , lastWid );
// debug
//logf(LOG_DEBUG,
// "adding pid=%"INT32" score=%.02f sum=%.02f",
// (int32_t)pid,phrScore,sum);
// hash the phrase too (value again unused, see NOTE above)
if ( ! table.addKey ( (int32_t)pid , (int32_t)phrScore , NULL ) )
return -1.0;
// we are now the last wid
lastWid = wid;
lastScore = score;
}
// sanity check: the cap above must have kept the table from growing
// (it can't grow cuz we keep lastWids[] 1-1 with it)
if ( table.getNumSlots() != 1024 ) {
log(LOG_LOGIC,"query: Title has logic bug.");
return -1.0;
}
// reset score sum to get "percent contained" functionality back
//sum = 0.0;
// "found" accumulates the weight of w2 words/phrases that also appear
// in w1's table; it is the numerator of the final ratio
float found = 0.0;
// reset the bigram chain for the second pass
lastWid = -1LL;
// now loop over all words in "w2" and look them up in the table
for ( int32_t i = t0 ; i < t1 ; i++ ) {
// the word id, truncated to 32 bits like above
int64_t wid = (int32_t) w2->m_wordIds[i] ;
// skip if not indexable
if ( wid == 0 ) continue;
// or score is 0
//if ( ss && ss[i] <= 0 ) continue;
// . rarity weight for this w2 word, same formula as the first
//   pass: 1.0 minus its normalized popularity
float score = 1.0 - pops2.getNormalizedPop(i);
// accumulate into the denominator
sum += score;
// is it in the table of w1's words?
int32_t slot = table.getSlot ( (int32_t)wid ) ;
// . if in table, add that up to "found"
// . we essentially find his wid AND our wid, so 2.0 times
if ( slot >= 0 ) found += 2.0 * score;
// use percent contained functionality now
//if ( slot >= 0 ) found += score;
// debug
//logf(LOG_DEBUG,"checking wid=%"INT32" score=%.02f sum=%.02f "
// "found=%.02f slot=%"INT32"", (int32_t)wid,score,sum,found,slot);
// now the phrase; need a previous word to form one
if ( lastWid == -1LL ) {lastWid=wid;lastScore=score;continue;}
// bigram score: sum of the two word scores...
float phrScore = score + lastScore;
// ...halved, same weighting as the first pass
phrScore *= 0.5;
// accumulate into the denominator
sum += phrScore;
// the phrase id, formed the same way as in the first pass
int64_t pid = hash64 ( wid , lastWid );
// is it in table?
slot = table.getSlot ( (int32_t)pid ) ;
// . accumulate if in there
// . we essentially find his wid AND our wid, so 2.0 times
if ( slot >= 0 ) found += 2.0 * phrScore;
// use percent contained functionality now
//if ( slot >= 0 ) found += score;
// we are now the last wid
lastWid = wid;
lastScore = score;
// debug
//logf(LOG_DEBUG,
// "checking pid=%"INT32" score=%.02f sum=%.02f found=%.02f "
// "slot=%"INT32"",
// (int32_t)pid,phrScore,sum,found,slot);
}
// do not divide by zero
if ( sum == 0.0 ) return 0.0;
// sanity check: both accumulators are sums of non-negative terms
//if ( found > sum ) { char *xx=NULL;*xx=0; }
if ( found < 0.0 || sum < 0.0 ) { char *xx=NULL;*xx=0; }
// . return the percentage matched
// . will range from 0.0 to 1.0 (the 2.0x credit above is balanced by
//   the fact that matched weight was added to "sum" on both passes)
return found / sum;
}
// . copies words [t0,t1) of "w" into m_title, html-encoding '<' and '>'
// . returns false on error and sets g_errno
// . truncates at m_maxTitleChars characters (or when the buffer fills),
//   backing up to the last punctuation boundary and appending "..."
// . uses the local buffer m_localBuf when the title fits, else mmalloc()s
// . BUGFIX: the '<' and '>' encodings were corrupted to 1-char literals
//   ("<", ">") while still being copied as 4 bytes — an out-of-bounds read
//   that also emitted raw markup. Restored the 4-byte entities "&lt;" and
//   "&gt;", which is what the 4-byte gbmemcpy and the dst+4 bound check
//   were written for.
bool Title::copyTitle ( Words *w , Pos *pos ,
int32_t t0 , int32_t t1 ,
Sections *sections ) {
// raw word pointers/lengths from the Words class
//int64_t *wids = w->m_wordIds;
//nodeid_t *tids = w->m_tagIds;
char **wp = w->m_words;
int32_t *wlens = w->m_wordLens;
int32_t nw = w->m_numWords;
// sanity check: range must not be inverted
if ( t1 < t0 ) { char *xx = NULL; *xx = 0; }
// don't breech number of words
if ( t1 > nw ) t1 = nw;
// no title? reset state and report success with an empty title
if ( nw == 0 || t0 == t1 ) { reset(); return true; }
// one past the last byte of the last word in the range
char *end = wp[t1-1] + wlens[t1-1] ;
// bytes spanned by the selected words (includes interior punct/tags)
int32_t need = end - wp[t0];
// . max bytes we'll need
// . no, all "chars" could be encoded so they take up like 5 bytes each
//int32_t max = m_maxTitleChars;
// truncate the bytes to allocate if we can, based on m_maxTitleChars
//if ( need > max ) need = max;
// add 3 bytes for "..." and 1 for \0 (plus 1 byte of slack)
need += 5;
// assume we can use our local buf
m_title = m_localBuf;
// if it is too small, then we must allocate
if ( need >= TITLE_LOCAL_SIZE ) {
m_title = (char *)mmalloc ( need , "Title" );
m_titleAllocSize = need;
}
// return false if could not alloc mem to hold the title
if ( ! m_title ) {
m_titleBytes = 0;
log("query: Could not alloc %"INT32" bytes for title.",need);
return false;
}
// . save for freeing later
// . NOTE(review): recorded even when m_title is the local buffer —
//   presumably the free path checks m_title != m_localBuf; confirm
m_titleAllocSize = need;
// point to the title to transcribe
char *src = wp[t0];
char *srcEnd = end;
// include a leading \" or \' immediately before the first word
if ( t0>0 &&
(src[-1] == '\'' || src[-1] == '\"' ) )
src--;
// trim trailing separator junk: | : - whitespace
for ( ;
srcEnd > src &&
(srcEnd[-1] == ':' ||
srcEnd[-1] == ' ' ||
srcEnd[-1] == '-' ||
srcEnd[-1] == '\n' ||
srcEnd[-1] == '\r' ||
srcEnd[-1] == '|' ) ;
srcEnd-- );
// store in here
char *dst = m_title;
// leave room for "...\0"
char *dstEnd = m_title + m_titleAllocSize - 4;
// size of current source character in bytes, usually 1
char cs ;
// . last punctuation position in the output, for clean truncation
// . initialized to the start of the output (not NULL), so the NULL
//   branch in the truncation code below is effectively dead
char *lastp = dst;//NULL;
// convert them always for now
bool convertHtmlEntities = true;
int32_t charCount = 0;
// copy [src,srcEnd) into "dst" one utf8 character at a time
for ( ; src < srcEnd ; src += cs , dst += cs ) {
// get src size
cs = getUtf8CharSize ( src );
// break if we are full!
if ( dst + cs >= dstEnd ) break;
// or hit our max char limit
if ( charCount++ >= m_maxTitleChars ) break;
// remember last punct for cutting purposes
if ( ! is_alnum_utf8 ( src ) ) lastp = dst;
// encode '<' as the 4-byte entity "&lt;" (net advance of 4
// after the loop's dst += cs)
if ( *src == '<' && convertHtmlEntities ) {
if ( dst + 4 >= dstEnd ) break;
gbmemcpy ( dst , "&lt;" , 4 );
dst += 4 - cs;
continue;
}
// encode '>' as the 4-byte entity "&gt;"
if ( *src == '>' && convertHtmlEntities ) {
if ( dst + 4 >= dstEnd ) break;
gbmemcpy ( dst , "&gt;" , 4 );
dst += 4 - cs;
continue;
}
// if more than 1 byte in char, use gbmemcpy
if ( cs == 1 ) *dst = *src;
else gbmemcpy ( dst , src , cs );
}
// null term always
*dst = '\0';
// . we truncated; do not split a word in the middle!
// . back up to the last punctuation position and overwrite from
//   there with "..."
if ( src < srcEnd ) {
if ( lastp ) {
gbmemcpy ( lastp , "...\0" , 4 );
dst = lastp + 3;
}
else {
gbmemcpy ( dst , "...\0" , 4 );
dst += 3;
}
}
// set size. does not include the terminating \0
m_titleBytes = dst - m_title;
return true;
}
| |