open-source-search-engine/Scores.cpp
2013-08-02 13:12:24 -07:00

758 lines
26 KiB
C++

#include "gb-include.h"
#include "Scores.h"
#include "Words.h"
// . explicit article body indicator tags:
//   <div class=blogbody,storycontent,body,article_body,story-body>
//   <div/td/span class=blogbody,storycontent,body,article_body,
//   story,body-content,entry,story-body,mainarttxt>
//   <td class=story>
//   <span class="body-content">
//   <div class="entry"> --- although "entry" is also used for ads, etc.
//   reuters: <!-- Article Text Begins --> and Ends -->
// . forbes has a bunch of <span class=mainartext> elements strewn together;
//   they are neighboring sections.
#define MAX_LEVELS 200
// Start out empty: no score array and no backing buffer until set() runs.
Scores::Scores () {
	m_scores  = NULL;
	m_bufSize = 0;
	m_buf     = NULL;
	// m_rerankScores = NULL;
}
// Tear-down just delegates to reset(), which releases the buffer if we own it.
Scores::~Scores() {
	this->reset();
}
// Drop the current score array. The backing buffer is only handed to mfree()
// when m_needsFree says so (set() raises that flag on its mmalloc() path);
// otherwise the pointer is simply forgotten.
void Scores::reset() {
	// m_needsFree is only consulted when a buffer is actually attached
	const bool releaseIt = ( m_buf != NULL ) && m_needsFree; // m_buf != m_localBuf
	if ( releaseIt ) {
		mfree ( m_buf , m_bufSize , "Scores" );
	}
	m_scores = NULL;
	m_buf    = NULL;
}
// . convenience front-end: picks one of two fixed presets of scoring knobs and
//   forwards to the fully-parameterized set() overload
// . words/sections/titleRecVersion are forwarded untouched
// . eliminateMenus selects the "menu elimination" preset: only the single
//   best-scoring content section is kept and minIndexableWords is forced
//   to 40 (the caller's minIndexableWords is ignored in that case)
// . buf/bufSize is an optional caller-owned buffer that is forwarded so the
//   score array can avoid a malloc
// . returns whatever the forwarded set() returns
bool Scores::set ( Words    *words             ,
		   Sections *sections          ,
		   long      titleRecVersion   ,
		   bool      eliminateMenus    ,
		   // provide it with a buffer to prevent a malloc
		   char     *buf               ,
		   long      bufSize           ,
		   long      minIndexableWords ) {
	//long defaultm = 40;
	//if ( titleRecVersion >= 56 ) defaultm = -1;

	// "scoreBySection" (default is true)
	//   Should gigablast break the document into sections and score the
	//   words in sections with mostly link text lower than words in
	//   sections without much link text? This helps to reduce the effects
	//   of menu spam.
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "indexContentSectionOnly" (default is false)
	//   Should gigablast attempt to isolate just the single most-relevant
	//   content section from the document and not index anything else?
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "minSectionScore" (default is -1000000000)
	//   The minimum score an entire section of the document needs to have
	//   its words indexed. Each word in a section counts as 128 points,
	//   but a word in a hyperlink counts as -256 points.
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "minIndexableWords" (default is -1)
	//   If the number of indexable words that have a positive average
	//   score is below this value, then no words will be indexed. Used
	//   to just index beefy news articles. -1 means to ignore this
	//   constraint.
	// "minAvgWordScore" (default is 0)
	//   Words have an average score of the 8 neighboring words on their
	//   left and the 8 neighboring words on their right, in the same
	//   section. These word scores are 128 points for a word not in a
	//   link, and only 21 points for a word in a link. What is the minimum
	//   average score a word needs to be indexed? (Before applying the top
	//   word weight, below)
	//   scoreBySection must be enabled for this to work.
	// "numTopWords" (default is 0)
	//   Weight the first X words higher.
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "topWordsWeight" (default is 1.0)
	//   Weight the first X words by this much, a rational number.
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "topSentenceWeight" (default is 1.0)
	//   Weight the first sentence by this much, a rational number.
	//   Only applies to documents that support western punctuation.
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "maxWordsInSentence" (default is 0)
	//   Do not weight more than this many words in the first sentence.
	//   Used for news articles.
	//   This only applies to the body of the document.

	// the two presets differ in exactly two knobs. if we are doing
	// "menu elimination technology" then zero out scores of terms not in
	// the single content section and require at least 40 indexable words.
	bool indexContentSectionOnly = false;
	if ( eliminateMenus ) {
		indexContentSectionOnly = true; // (DIFF)
		minIndexableWords       = 40;   // (DIFF)
	}
	// . forward to the real thing
	// . sections, buf and bufSize must be passed along: the full overload
	//   takes "sections" as its second argument, and dropping buf/bufSize
	//   would defeat the caller's attempt to avoid a malloc
	return set ( words                   ,
		     sections                ,
		     titleRecVersion         ,
		     true                    , // scoreBySection
		     indexContentSectionOnly ,
		     -1000000000             , // minSectionScore
		     0                       , // minAvgWordScore
		     minIndexableWords       , // -1 unless caller/preset says so
		     0                       , // numTopWords
		     3.0                     , // topWordsWeight
		     1.0                     , // topSentenceWeight
		     30                      , // maxWordsInSentence
		     buf                     ,
		     bufSize                 );
	// the old ruleset-driven ("sx") configuration is retained below for
	// reference only; it is not compiled
	/*
	// there should only by one <index> block in the ruleset file that has
	// these special config switches
	long n0 = 0;
	long n1 = 0x7fffffff;
	// this is used to decrease the scores of words in menu sections.
	// this means that words will be scored based on their neighboring
	// words in the same section of the document. the section of the
	// document is determined by <table><div><tr><td> tags and the like.
	// if the neighboring words are in links then the score is decreased.
	// this way we expect to score words in menus less. this is now
	// default scoring behaviour for newer documents.
	bool scoreBySection = true;
	if ( ! sx->getBool(n0,n1,"index.scoreBySection",true) )
		scoreBySection = false;
	// this is used to index newspaper articles.
	// indexContentSectionOnly means to only index the words in the top-
	// scoring section of the document. the section of the document
	// is determined by <table><div><tr><td> tags and the like. the score
	// of a section is based on how many words that are not in hyperlinks
	// are contained in that section. words in hyperlinks actually decrease
	// the score of the section.
	bool indexContentSectionOnly =
		sx->getBool(n0,n1,"index.indexContentSectionOnly",false);
	//log("REMOVE ME");
	//indexContentSectionOnly = true;
	// if the total score of a section is less than this then no words
	// in that section will get indexed. each word in a section is
	// counted as 128 points, but if the word is in a hyper link it is
	// counted as -256 points (-2*128)
	long minSectionScore = sx->getLong(n0,n1,"index.minSectionScore",
					   -1000000000);
	// count words in links as 21 points, words not in links as 128.
	// the average score of each word is its score plus the scores of
	// its 8 left and its 7 right neighbors divided by 16. if that
	// average score is below this value, the word is not indexed.
	// only valid if scoreBySection is true!
	long minAvgWordScore = sx->getLong(n0,n1,"index.minAvgWordScore",0);
	// if the whole document has less than this many words with positive
	// scores, do not index any of the words (set their scores to 0)
	long minIndexableWords =
		sx->getLong (n0,n1,"index.minIndexableWords",defaultm);//40);
	// . for weighting the top portion of the document more, use these.
	// . only applicable if using the new parser so we can use the new
	//   Scores class
	long numTopWords =
		sx->getLong (n0,n1,"index.numTopWords",0);
	float topWordsWeight =
		sx->getFloat(n0,n1,"index.topWordsWeight",3.0);
	float topSentenceWeight =
		sx->getFloat(n0,n1,"index.topSentenceWeight",1.0);
	long maxWordsInSentence =
		sx->getLong (n0,n1,"index.maxWordsInSentence",30);
	return set ( words ,
		     titleRecVersion ,
		     scoreBySection ,
		     indexContentSectionOnly ,
		     minSectionScore ,
		     minAvgWordScore ,
		     minIndexableWords ,
		     // these are for weighting top part
		     // of news articles
		     numTopWords ,
		     topWordsWeight ,
		     topSentenceWeight ,
		     maxWordsInSentence ) ;
	*/
}
// . returns false on error (buffer allocation or section scoring failed)
// . scores the words in the Words class, which is set from an Xml pointer
// . Words must contain tags cuz that's what we look at to divide the
//   words up into sections
// . most docs are divided up into sections based on div, and table/tr/td tags
// . look at each section independently and score words in each section based
//   on the density of words in hyperlinks in their vicinity.
// . if a particular section has a lot of hyperlinked text it should score
//   low, while a section of a lot of pure text should score high.
// . small sections with not much plain text, but no hyperlinks, will not score
//   very high either, usually they are like copyright notices and stuff,
//   although they could be a small message on a message board.
// . most sections really don't have many things embedded in them, with the
//   exception of the root section, so we can linearly scan each section,
//   skipping over the embedded sections, with decent speed and compute the
//   score of each word on an individual basis.
// . sets m_scores[i] to word #i's score:
//     -1 = inside <script>, <style>, <select> or <marquee>
//      0 = tag, or word that was zeroed out (not indexed)
//     >0 = indexable weight
// . if indexContentSectionOnly is true, words that are not in the
//   top-scoring section get a score of 0. this is used for just indexing
//   simple news articles which are mostly contained in a single section.
// . if we have less than minIndexableWords positive scoring words, then do
//   not index any words, set their scores to 0
// . buf/bufSize is an optional caller-owned buffer used for the score array
//   when it is too big for m_localBuf but fits in buf; otherwise we mmalloc
// . "sections" and "titleRecVersion" are accepted but not used in this body
bool Scores::set ( Words    *words                   ,
		   Sections *sections                ,
		   long      titleRecVersion         ,
		   bool      scoreBySection          ,
		   bool      indexContentSectionOnly ,
		   long      minSectionScore         ,
		   long      minAvgWordScore         ,
		   long      minIndexableWords       ,
		   // these are for weighting top part of news articles
		   long      numTopWords             ,
		   float     topWordsWeight          ,
		   float     topSentenceWeight       ,
		   long      maxWordsInSentence      ,
		   char     *buf                     ,
		   long      bufSize                 ) {
	// sanity check
	//if ( m_buf ) { char *xx = NULL; *xx = 0; }
	reset();
	// save for printing into g_pbuf in TermTable.cpp
	m_scoreBySection          = scoreBySection ;
	m_indexContentSectionOnly = indexContentSectionOnly ;
	m_minSectionScore         = minSectionScore ;
	m_minAvgWordScore         = minAvgWordScore ;
	m_minIndexableWords       = minIndexableWords ;
	m_numTopWords             = numTopWords ;
	m_topWordsWeight          = topWordsWeight ;
	m_topSentenceWeight       = topSentenceWeight ;
	m_maxWordsInSentence      = maxWordsInSentence ;
	// . allocate m_scores buffer, one "long" score per word
	// . size it with sizeof(long), not a hard-coded 4, so the array is
	//   not overrun on platforms where long is 8 bytes
	m_scores = NULL;
	long nw   = words->getNumWords();
	long need = nw * (long)sizeof(long);
	// assume no malloc
	m_needsFree = false;
	if ( need < SCORES_LOCALBUFSIZE )
		m_buf = m_localBuf;
	// only trust the caller's buffer if there really is one
	else if ( buf && need < bufSize )
		m_buf = buf;
	else {
		m_buf       = (char *)mmalloc ( need , "Scores" );
		m_needsFree = true;
	}
	m_bufSize = need;
	if ( ! m_buf ) return false;
	m_scores = (long *)m_buf;
	//m_rerankScores = (long *) p;
	// all words start with a default normal score, 128 as of right now
	for ( long i = 0 ; i < nw ; i++ ) m_scores[i] = NORM_WORD_SCORE;
	nodeid_t  *tids  = words->getTagIds ();
	long long *wids  = words->getWordIds ();
	char     **w     = words->m_words;
	long      *wlens = words->m_wordLens;
	// . tags get a score of 0
	// . anything inside <script>, <style>, <select> or <marquee> gets -1
	// . MATCHES.CPP check if the score is -1, and ignores it if so!!!!
	//   so if you modify this, keep that in mind
	// . without tag ids there is nothing more we can do
	if ( ! tids ) return true;
	char inScript  = 0;
	char inStyle   = 0;
	char inSelect  = 0;
	char inMarquee = 0;
	for ( long i = 0 ; i < nw ; i++ ) {
		// not a tag? then it only matters whether we are inside
		// one of the ignored containers
		if ( ! tids[i] ) {
			if ( inScript || inStyle || inSelect || inMarquee )
				m_scores[i] = -1;
			continue;
		}
		// give all tags score of 0 by default
		m_scores[i] = 0;
		nodeid_t tid    = tids[i] & BACKBITCOMP;
		// 1 for a front tag, 0 for a back tag
		char     isOpen = ( tids[i] & BACKBIT ) ? 0 : 1;
		if ( tid == TAG_SCRIPT  ) { inScript  = isOpen; continue; }
		if ( tid == TAG_STYLE   ) { inStyle   = isOpen; continue; }
		if ( tid == TAG_SELECT  ) { inSelect  = isOpen; continue; }
		if ( tid == TAG_MARQUEE ) { inMarquee = isOpen; continue; }
		// any other tag nested in an ignored container is ignored too
		if ( inScript || inStyle || inSelect || inMarquee )
			m_scores[i] = -1;
	}
	// . setScoresBySection() gives each word a pre-score that is lower
	//   when the word is in a link, then sets the word's score to the
	//   average pre-score over a window of neighbors in the same section
	// . it also zeroes out words below minAvgWordScore and, if
	//   indexContentSectionOnly, words outside the winning section
	if ( scoreBySection ) {
		if ( ! setScoresBySection ( words,
					    indexContentSectionOnly ,
					    minSectionScore ,
					    minAvgWordScore ) )
			return false;
	}
	// . give all indexable words a default normal score
	// . NOTE(review): the "else" that used to guard this loop is commented
	//   out, so it also runs after setScoresBySection() and flattens every
	//   positive averaged score back to NORM_WORD_SCORE; only the zero/-1
	//   scores survive. behavior kept as-is -- confirm this is intended.
	//else if (titleRecVersion >= 60){
	for ( long i = 0 ; i < nw ; i++ )
		if ( wids[i] && m_scores[i] > 0 )
			m_scores[i] = NORM_WORD_SCORE; // 128;
	//}
	//else{ // old version...unignores script/select/style words
	//	for ( long i = 0 ; i < nw ; i++ )
	//		if ( wids[i] )
	//			m_scores[i] = NORM_WORD_SCORE; // 128;
	//}
	// . we need at least this many positive scoring, indexable words
	// . this is -1 if unused
	if ( minIndexableWords > 0 ) {
		long numGood = 0;
		for ( long i = 0 ; i < nw ; i++ )
			if ( wids[i] && m_scores[i] > 1 ) numGood++;
		if ( numGood < minIndexableWords )
			for ( long i = 0 ; i < nw ; i++ ) m_scores[i] = 0 ;
	}
	// . now weight the words in the top of the document more
	// . news articles and other docs put the most important info first
	if ( numTopWords == 0 ) return true;
	// number of scored words seen so far
	long count = 0;
	for ( long i = 0 ; i < nw ; i++ ) {
		// . not an indexable word, so it is a tag or punctuation
		// . punctuation holding a '.' or '!' ends the first sentence.
		//   this test has to happen BEFORE non-words are skipped or
		//   it can never fire.
		if ( wids[i] == 0 ) {
			// first sentence already over?
			if ( maxWordsInSentence == 0 ) continue;
			// tag text (urls, etc.) never ends a sentence
			if ( tids[i] ) continue;
			// nor does anything in script/style/select/marquee
			if ( m_scores[i] < 0 ) continue;
			for ( long k = 0 ; k < wlens[i] ; k++ ) {
				if ( w[i][k] != '.' && w[i][k] != '!' )
					continue;
				maxWordsInSentence = 0;
				break;
			}
			continue;
		}
		// skip over words with a weight of 0 (ignored), 1 or -1
		if ( m_scores[i] <= 1 ) continue;
		if ( count < numTopWords )
			m_scores[i] =
				(long)((float)m_scores[i] * topWordsWeight);
		if ( count < maxWordsInSentence )
			m_scores[i] =
				(long)((float)m_scores[i] * topSentenceWeight);
		count++;
		// a limit that has been reached is switched off
		if ( count >= maxWordsInSentence ) maxWordsInSentence = 0;
		if ( count >= numTopWords        ) numTopWords        = 0;
		// keep going while either weighting is still active
		if ( maxWordsInSentence > 0 ) continue;
		if ( numTopWords        > 0 ) continue;
		break;
	}
	return true;
}
#define RADIUS 16
// . splits the word stream into nested sections delimited by <div>,
//   <textarea>, <tr>, <td> and <table> tags and rescoring m_scores[] per
//   section
// . every indexable word gets a pre-score: "plain" (NORM_WORD_SCORE) when not
//   in a link, "hyper" (plain/8) when in a link. words whose m_scores[] is
//   already -1 are left alone and are not part of any section's word list.
// . when a section closes, each of its words gets the average pre-score of
//   a RADIUS-word window of its neighbors IN THE SAME SECTION (words of
//   embedded sections are skipped). an average of exactly 1 is bumped to 2,
//   and an average below minAvgWordScore becomes 0.
// . each section also accumulates a section score: +plain per plain word,
//   -4*plain per link word and per non-<br>/<p> tag. the best section whose
//   score exceeds minSectionScore is the "winning" section.
// . if indexContentSectionOnly is true, the words of every section that is
//   not the current winner are set to 0
// . sections still open at the end of the document, and the root section,
//   are closed out after the main loop
// . returns false if the scratch buffer could not be allocated, else true
bool Scores::setScoresBySection ( Words *words,
				  bool   indexContentSectionOnly ,
				  long   minSectionScore ,
				  long   minAvgWordScore ) {
	long nw = words->getNumWords();
	// nothing to score. also keeps the close-out code at the bottom
	// from touching wids[0] of an empty document.
	if ( nw <= 0 ) return true;
	long long *wids = words->getWordIds ();
	nodeid_t  *tids = words->getTagIds ();
	bool inLink = false;
	// running score of the section we are currently in
	long score = 0;
	// number of open sections on the stack
	long level = 0;
	long i;
	nodeid_t ids    [ MAX_LEVELS ]; // tag ids on stack
	long     scores [ MAX_LEVELS ]; // scores on stack
	long     starts [ MAX_LEVELS ]; // section start positions on stack
	long     previs [ MAX_LEVELS ]; // linked list end
	// last node of the current section's linked list of words
	long previ = 0;
	// for storing the winning section
	long max  = -2000000000;
	long maxa = -1;
	// set once we have logged the "max levels" warning
	char flag = 0;
	// . get the vectors, 1-1 with the words
	// . wnext is one long per word, wscores is one short per word
	// . use sizeof so the two arrays do not overlap or overrun the
	//   buffer on platforms where long is not 4 bytes
	long need = nw * (long)( sizeof(long) + sizeof(short) );
	char *tmp = NULL;
	char tstack[1024*100];
	if ( need > 1024*100 )
		tmp = (char *)mmalloc(need,"Scoress");
	else
		tmp = tstack;
	// bail if alloc failed
	if ( ! tmp ) return log("build: Scores failed to alloc %li bytes.",
				need);
	char  *p       = (char *)tmp;
	// wnext[x] is the next word after word x in x's section, -1 if none
	long  *wnext   = (long  *)p ; p += sizeof(long ) * nw;
	// wscores[x] is the pre-score of word x
	short *wscores = (short *)p ; p += sizeof(short) * nw;
	// init
	wnext[0] = -1;
	// point to our score buffer
	long *fscores = m_scores;
	// convenience var
	//char *wscores = m_wscores;
	// -1 means score is unset
	//memset ( wscores , -1 , nw );
	// make this fixed for now: a word in a hyperlink gets 1/8th of the
	// pre-score of a plain text word. this needs to be an unchangeable
	// knob in the ruleset file.
	float ratio = 8.0;
	// how much to score a plain text word?
	long plain = NORM_WORD_SCORE; // 128;
	// how much to score a word in hypertext?
	long hyper = (long)((float)plain / ratio);
	// section score penalty for a link word or a tag
	long neg = plain * 4;
	// misc vars. all declared up here because the gotos below may not
	// jump over an initialization.
	long mid,k,j,sj,bscore,count,cumscore;
	nodeid_t tid;
	for ( i = 0 ; i < nw ; i++ ) {
		// get the tag id
		tid = tids[i] & BACKBITCOMP;
		// . we have to know what words are in hyperlinks
		// . 2 is presumably the anchor tag id (TAG_A) -- TODO confirm
		if ( tid == 2 ) {
			if ( tids[i] & BACKBIT ) inLink = false;
			else                     inLink = true;
			continue;
		}
		// . if score already set to 0 or 1, skip it
		// . probably in a <script>, <style> or <select> tag
		// . MDW: i commented this out because it causes
		//   menu elimination tech to falter, zak uncommented
		//   it because it screwed up summary generation,
		//   cuz we were taking summaries from scripts i guess
		//if ( m_scores[i] <= 1 ) {
		//	// set it to 0 in case it was 1 already
		//	if ( indexContentSectionOnly ) m_scores[i] = 0;
		//	continue;
		//}
		// did we have a non section delimiting word?
		if ( tid != TAG_DIV      && // <div>
		     tid != TAG_TEXTAREA && // <textarea>
		     tid != TAG_TR       && // <tr>
		     tid != TAG_TD       && // <td>
		     tid != TAG_TABLE    ) { // <table>
			// don't score tags, only text
			//if ( tid > 0 ) continue;
			// skip if punct or tag
			if ( wids[i] == 0 ) {
				// . punish if tag, except for <br> or <p>
				// . generally, taggy sections are not very
				//   good content.
				if ( ! tid ) continue;
				if ( tid == TAG_BR ) continue; // <br>
				if ( tid == TAG_P  ) continue; // <p>
				score -= neg;
				continue;
			}
			// in a <script>, <style> or <select> tag?
			//if ( indexContentSectionOnly && m_scores[i] == -1 )
			// hey we should always scores these as -1! we were
			// getting summaries with no spaces in them!
			if ( m_scores[i] == -1 )
				continue;
			// if we hit a comment section identifier, then stop. i
			// don't want to index comments right now
			//if ( wids[i] == WID_COMMENT ||
			//     wids[i] == WID_COMMENTS ||
			// subtract points if in link, otherwise add points
			if ( inLink ) {
				score -= neg   ; wscores[i] = hyper; }
			else {
				score += plain ; wscores[i] = plain; }
			// keep linked list up to date
			wnext[previ] = i;
			wnext[    i] = -1;
			previ        = i;
			continue;
		}
		// . we got a section delimiting tag
		// . is there an embedded section in this section?
		// . this should break any hyper link right?
		inLink = false;
		// front tag? did we start a new section?
		if ( !(tids[i] & BACKBIT) ) {
			// no more sections until we pop this one off
			if ( level >= MAX_LEVELS ) {
				// only log once
				if ( flag == 0 ) {
					log("build: Exceeded max levels.");
					flag = 1;
				}
				continue;
			}
			if ( g_conf.m_logDebugBuild )
				log(LOG_DEBUG,"build: Scored section %ld: %ld",
				    level, score);
			// push old info onto the stack
			ids    [level] = tids[i];
			scores [level] = score;
			starts [level] = i;
			previs [level] = previ;
			level++;
			score = 0;
			// start another linked list, headed by this tag
			previ = i;
			// assume no linked list of words for this section
			wnext[i] = -1;
			continue;
		}
		// . it's a back tag
		// . bail if no corresponding front tag on stack
		if ( level == 0 ) continue;
		// or if did not match what was on top of stack
		if ( tid != ids[level-1] ) continue;
		// . recycle code
		// . the close-out loop after this for-loop jumps in here too
	hookin:
		// pop stack
		level--;
		//
		// this part scores the individual words based on the scores of
		// their neighbors. it is just like a moving average of the
		// past and future.
		//
		// level is -1 when we are closing out the root section, whose
		// linked list is headed by word #0
		if ( level == -1 ) sj = 0;
		else               sj = starts[level];
		// often, the first in level is not an indexable word, but
		// just a start of the linked list
		if ( wids[sj] == 0 ) sj = wnext[sj];
		// now start with that
		j = sj;
		// bail if nothing in the list
		if ( j == -1 ) goto empty;
		// compute left Boundary score, bscore
		bscore = 0;
		// accumulate score of first RADIUS words
		for (count=0;count<RADIUS&&j>=0;count++,j=wnext[j]) {
			bscore += wscores[j];
			// show score of each word
			//char *s    = words->m_words   [j];
			//long  slen = words->m_wordLens[j];
			//printstring(s,slen);
			// then score of it
			//fprintf(stderr,"(%li) ",(long)wscores[j]);
		}
		// save accumulation, not average
		cumscore = bscore;
		// if section has less than 16 words then
		// grow bscore proportionately
		//if ( count < 16 ) bscore = (bscore *16)/count;
		// make it an average score. count is at least 1 here.
		bscore /= count;
		// 1 is reserved for <select> tag et al
		if ( bscore == 1 ) bscore = 2;
		// must be above this. all or nothing.
		if ( bscore < minAvgWordScore ) bscore = 0;
		// set score of first RADIUS words to that average
		j = sj; // j = starts[level];
		for (count=0;count<RADIUS&&j>=0;count++,j=wnext[j])
			fscores[j] = bscore;
		// bail if no more words in section
		if ( j == -1 ) goto skip;
		// . set up right/mid/left ptr info
		// . k is the first word to the right of the window, i.e. the
		//   next word to enter it. cumscore already holds the sum of
		//   the RADIUS words in the window.
		k = j;
		// advance j to the 8th word, almost exactly in the middle
		j = sj; // j = starts[level];
		for (count=0;count<(RADIUS/2-1)/*7*/;count++,j=wnext[j]);
		// score words in middle now
		mid = j;
		// j is the left most ptr
		j = sj; // j = starts[level];
	more:	// now set centroid's FINAL score, the average pre-score of
		// the RADIUS words in the window [j,k)
		//fscores[mid] = (rscore - lscore) / 16 ;
		//fscores[mid] = (rscore - lscore) >> 4;
		//fscores[mid] = cumscore >> 4;
		fscores[mid] = cumscore / RADIUS; // >> 4;
		// 1 is reserved for <select> tag et al
		if ( fscores[mid] == 1 ) fscores[mid] = 2;
		// must be above this. all or nothing.
		if ( fscores[mid] < minAvgWordScore ) fscores[mid] = 0;
		// debug point
		//if ( fscores[mid] == 0 ) {
		//	char *xx = NULL; *xx = 0; }
		// advance left end and its cumulative score
		cumscore -= wscores[j];
		j         = wnext  [j];
		// advance middle
		mid       = wnext  [mid];
		// . advance right end and its cumulative score
		// . word k ENTERS the window, so add its score before moving
		//   k on. doing it the other way around skips a word and
		//   reads wscores[-1] when the list runs out.
		cumscore += wscores[k];
		k         = wnext  [k];
		// loop if more left
		if ( k >= 0 ) goto more;
		// score words on right end, from mid to the end of the list
		bscore = 0;
		count  = 0;
		for (k=mid;k>=0;k=wnext[k],count++) bscore+=wscores[k];
		// get the average score of them
		bscore /= count;
		// 1 is reserved for <select> tag et al
		if ( bscore == 1 ) bscore = 2;
		// must be above this. all or nothing.
		if ( bscore < minAvgWordScore ) bscore = 0;
		// and set
		for (k=mid;k>=0;k=wnext[k]) fscores[k]=bscore;
		//
		// end neighbor-influenced scoring
		//
	skip:
		if ( (score > max || maxa==-1) && score > minSectionScore ) {
			// zero out the previous winning section
			if ( indexContentSectionOnly && maxa >= 0 ) {
				// zero out all in list
				for ( j = maxa; j >= 0 ; j = wnext[j] )
					fscores[j] = 0;
			}
			// this section is the new winning section
			log(LOG_DEBUG, "build: Winning section: %ld, "
			    "score: %ld", level, score);
			max  = score;
			// remember the first word of the winner's list
			maxa = sj; // starts[level];
		}
		// if we were not the winning section, zero ourselves out
		else if ( indexContentSectionOnly ) {
			for ( j = sj ; j >= 0 ; j = wnext[j] )
				fscores[j] = 0;
		}
	empty:
		// pop old score et al back to be resumed
		if ( level >= 0 ) {
			score = scores[level];
			previ = previs[level];
		}
		// get next node
	}
	// . set scores of anything still on the stack at completion, and of
	//   the root section (that is the pass where level goes to -1)
	// . each jump re-enters the loop body at "hookin" with i = nw-1, so
	//   the for-loop's i++ ends the loop again right after that one pass
	while ( level >= 0 ) {
		i = nw - 1;
		goto hookin;
	}
	/*
	for (long i = 0 ; i < nw ; i++ ) {
		// skip if no wid
		if ( words->m_wordIds[i] == 0LL ) continue;
		if ( m_scores[i] == 0 ) continue;
		// show score of each word
		char *s    = words->m_words   [i];
		long  slen = words->m_wordLens[i];
		printstring(s,slen);
		// then score of it
		fprintf(stderr,"(%li) ",(long)m_scores[i]);
	}
	*/
	// and scores of the main/base section
	//i = nw - 1;
	//if ( ! flag ) { flag = 1; level = 1; goto hookin; }
	// ok, now we have designated all the sections and assigned them a
	// score, so if we are just getting the top section, return that
	// give caller the article text in a nutshell if that's all they wanted
	//if ( n1 ) {
	//	// assume no top section
	//	*n1 = -1; *n2 = -1;
	//	if ( maxi >= 0 ) { *n1 = maxa; *n2 = maxb; }
	//	return true;
	//}
	// now set the individual word scores in each section
	//for ( long i = 0 ; i < nsecs ; i++ )
	//	setSectionScores ( i , secStarts , secEnds , wscores );
	// copy scores
	//for (long i = 0 ; i < nw ;i++) m_scores[i]=(unsigned char)fscores[i];
	// done. only free the scratch buffer if it did not fit on the stack.
	if ( tmp != tstack ) mfree ( tmp , need , "Scores" );
	// success
	return true;
}