#include "gb-include.h"
#include "Scores.h"
#include "Words.h"
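// . explicit article body indicator tags:
//   (NOTE: the example tags originally listed in this comment appear to have
//    been stripped out along with their markup; only the fragments below
//    remain)
// --- although has "entry" for ads, etc.
// reuters: and Ends -->
// . forbes has a bunch of [tag name missing] strewn together. they
//   are neighbor sections.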
// NOTE(review): MAX_LEVELS is not referenced in the portion of the file
// visible here; presumably it caps a nesting depth used while scoring
// sections -- confirm against its uses before changing the value.
#define MAX_LEVELS 200
// . puts the object into its empty state: no buffer and no scores
// . every member that reset() looks at is given a defined value here,
//   including m_needsFree, so no code path ever reads an indeterminate flag
Scores::Scores () {
	m_buf       = NULL;
	m_bufSize   = 0;
	m_scores    = NULL;
	// nothing has been malloc'd yet, so there is nothing to free
	m_needsFree = false;
	//m_rerankScores = NULL;
}
// . the destructor just delegates to reset(), which frees the score buffer
//   if this object malloc'd it
Scores::~Scores() {
	this->reset();
}
// . releases the score buffer and returns the object to the same empty
//   state the constructor establishes
// . safe to call repeatedly and on an object that was never set()
void Scores::reset() {
	// only free memory when m_needsFree is set; set() sets that flag only
	// for a buffer it got from mmalloc(), not for m_localBuf or for a
	// caller-provided buffer
	if ( m_buf && m_needsFree ) // m_buf != m_localBuf )
		mfree ( m_buf , m_bufSize , "Scores" );
	m_buf       = NULL;
	// do not leave the size/ownership bookkeeping describing a buffer we
	// no longer hold
	m_bufSize   = 0;
	m_needsFree = false;
	m_scores    = NULL;
}
// . convenience wrapper: fills in the standard scoring configuration and
//   forwards to the fully-parameterized set()
// . returns whatever that set() returns
// . "eliminateMenus" turns on "menu elimination technology": scores of terms
//   not in the single content section get zeroed out
//   (indexContentSectionOnly = true) and minIndexableWords is forced to 40,
//   overriding the caller-supplied "minIndexableWords"
// . "buf"/"bufSize" is an optional caller-provided buffer used to prevent a
//   malloc. it is forwarded to the full set(), which may keep pointing into
//   it, so it must stay valid for as long as the scores are in use
// . "sections" is forwarded unchanged to the full set()
bool Scores::set ( Words *words ,
		   Sections *sections ,
		   long titleRecVersion ,
		   bool eliminateMenus ,
		   // provide it with a buffer to prevent a malloc
		   char *buf ,
		   long bufSize ,
		   long minIndexableWords ) {

	//long defaultm = 40;
	//if ( titleRecVersion >= 56 ) defaultm = -1;

	// the configuration switches handed to the full set():
	//
	// "scoreBySection" (default is true)
	//   Should gigablast break the document into sections and score the
	//   words in sections with mostly link text lower than words in
	//   sections without much link text? This helps to reduce the effects
	//   of menu spam.
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "indexContentSectionOnly" (default is false)
	//   Should gigablast attempt to isolate just the single most-relevant
	//   content section from the document and not index anything else?
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "minSectionScore" (default is -1000000000)
	//   The minimum score an entire section of the document needs to have
	//   its words indexed. Each word in a section counts as 128 points,
	//   but a word in a hyperlink counts as -256 points (-2*128).
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "minIndexableWords" (default is -1)
	//   If the number of indexable words that have a positive average
	//   score is below this value, then no words will be indexed. Used
	//   to just index beefy news articles. -1 means to ignore this
	//   constraint.
	// "minAvgWordScore" (default is 0)
	//   Words have an average score of the 8 neighboring words on their
	//   left and the 8 neighboring words on their right, in the same
	//   section. (the disabled ruleset code that used to live here
	//   described it as the word's own score plus its 8 left and 7 right
	//   neighbors, divided by 16.) These word scores are 128 points for a
	//   word not in a link, and only 21 points for a word in a link. What
	//   is the minimum average score a word needs to be indexed? (Before
	//   applying the top word weight, below)
	//   scoreBySection must be enabled for this to work.
	// "numTopWords" (default is 0)
	//   Weight the first X words higher.
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "topWordsWeight" (default is 1.0)
	//   Weight the first X words by this much, a rational number.
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "topSentenceWeight" (default is 1.0)
	//   Weight the first sentence by this much, a rational number.
	//   Only applies to documents that support western punctuation.
	//   Used for news articles.
	//   This only applies to the body of the document.
	// "maxWordsInSentence" (default is 0)
	//   Do not weight more than this many words in the first sentence.
	//   Used for news articles.
	//   This only applies to the body of the document.
	//
	// NOTE: these switches used to be read per ruleset from a site rec
	// ("index.scoreBySection", "index.indexContentSectionOnly",
	// "index.minSectionScore", "index.minAvgWordScore",
	// "index.minIndexableWords", "index.numTopWords",
	// "index.topWordsWeight", "index.topSentenceWeight",
	// "index.maxWordsInSentence"). that lookup is disabled, so the values
	// below are hardcoded.

	// if we are doing "menu elimination technology" then zero out
	// scores of terms not in the single content section. the only two
	// settings that differ between the two modes are these:
	const bool indexContentSectionOnly = eliminateMenus;
	const long minWords = eliminateMenus ? 40 : minIndexableWords;

	// forward everything, including the sections and the caller's buffer,
	// so the "prevent a malloc" buffer is actually used
	return set ( words ,
		     sections ,
		     titleRecVersion ,
		     true , // scoreBySection
		     indexContentSectionOnly ,
		     -1000000000 , // minSectionScore
		     0 , // minAvgWordScore
		     minWords , // minIndexableWords
		     0 , // numTopWords
		     3.0 , // topWordsWeight
		     1.0 , // topSentenceWeight
		     30 , // maxWordsInSentence
		     buf ,
		     bufSize );
}
// . returns false and sets g_errno on error
// . scores the words in the Words.cpp class, which is set from an Xml pointer
// . Words.cpp must contain tags cuz that's what we look at to divide the
// words up into sections
// . most docs are divided up into sections based on div, and table/tr/td tags
// . look at each section independently and score words in each section based
// on the density of words in hyperlinks in their vicinity.
// . if a particular section has a lot of hyperlinked text it should score
// low, while a section of a lot of pure text should score high.
// . small sections with not much plain text, but no hyperlinks, will not score
// very high either, usually they are like copyright notices and stuff,
// although they could be a small message on a message board.
// . most sections really don't have many things embedded in them, with the
// exception of the root section, so we can linearly scan each section,
// skipping over the embedded sections, with decent speed and compute the
// score of each word on an individual basis.
// . sets m_scores[i] to word #i's score weight.
// . if indexContentSectionOnly is true (this comment used to refer to an
//   "n1" argument that no longer exists) we set the scores of all words that
//   are not in the top-scoring section to 0 or -1. this is used for just
//   indexing simple news articles which are mostly just contained in a
//   single section.
// . if we have less than minIndexableWords positive scoring words, then do
// not index any words, set their scores to 0
bool Scores::set ( Words *words ,
Sections *sections ,
long titleRecVersion ,
bool scoreBySection ,
bool indexContentSectionOnly ,
long minSectionScore ,
long minAvgWordScore ,
long minIndexableWords ,
// these are for weighting top part of news articles
long numTopWords ,
float topWordsWeight ,
float topSentenceWeight ,
long maxWordsInSentence ,
char *buf ,
long bufSize ) {
// sanity check
//if ( m_buf ) { char *xx = NULL; *xx = 0; }
reset();
// save for printing into g_pbuf in TermTable.cpp
m_scoreBySection = scoreBySection ;
m_indexContentSectionOnly = indexContentSectionOnly ;
m_minSectionScore = minSectionScore ;
m_minAvgWordScore = minAvgWordScore ;
m_minIndexableWords = minIndexableWords ;
m_numTopWords = numTopWords ;
m_topWordsWeight = topWordsWeight ;
m_topSentenceWeight = topSentenceWeight ;
m_maxWordsInSentence = maxWordsInSentence ;
// allocate m_scores buffer, one byte score per word
m_scores = NULL;
long nw = words->getNumWords();
long need = nw * 4;
// assume no malloc
m_needsFree = false;
if ( need < SCORES_LOCALBUFSIZE ) m_buf = m_localBuf;
else if ( need < bufSize ) m_buf = buf;
else {
m_buf = (char *)mmalloc ( need , "Scores" );
m_needsFree = true;
}
m_bufSize = need;
if ( ! m_buf ) return false;
char *p = m_buf;
m_scores = (long *)p;
p += nw * 4;
//m_rerankScores = (long *) p;
// all words start with a default normal score, 128 as of right now
for ( long i = 0 ; i < nw ; i++ ) m_scores[i] = NORM_WORD_SCORE;
nodeid_t *tids = words->getTagIds ();
long long *wids = words->getWordIds ();
char **w = words->m_words;
long *wlens = words->m_wordLens;
// . zero out scores of words in javascript and style tags
// . set scores to 1 if word in select or marquee tag
// . MATCHES.CPP check if the score is -1, and ignores it if so!!!!
// so if you modify this, keep that in mind
if ( ! tids ) return true;
char inScript = 0;
char inStyle = 0;
char inSelect = 0;
char inMarquee = 0;
for ( long i = 0 ; i < nw ; i++ ) {
// skip if not tag
if ( ! tids[i] ) {
if (inScript || inStyle) { m_scores[i] = -1; continue;}
if (inSelect||inMarquee) { m_scores[i] = -1; continue;}
continue;
}
// give all tags score of 0 by default
m_scores[i] = 0;
if ( (tids[i]&BACKBITCOMP) == TAG_SCRIPT ) { // |
|
|