#include "gb-include.h" #include "Scores.h" #include "Words.h" // . explicit article body indicator tags: //
// // //
--- although has "entry" for ads, etc. // reuters: and Ends --> // . forbes have a bunch of strewn together. they // are neighbor sections. #define MAX_LEVELS 200 Scores::Scores () { m_buf = NULL; m_bufSize = 0; m_scores = NULL; //m_rerankScores = NULL; } Scores::~Scores() { reset(); } void Scores::reset() { if ( m_buf && m_needsFree ) // m_buf != m_localBuf ) mfree ( m_buf , m_bufSize , "Scores" ); m_buf = NULL; m_scores = NULL; } bool Scores::set ( Words *words , Sections *sections , long titleRecVersion , bool eliminateMenus , // provide it with a buffer to prevent a malloc char *buf , long bufSize , long minIndexableWords ) { //long defaultm = 40; //if ( titleRecVersion >= 56 ) defaultm = -1; // "scoreBySection" (default is true) // Should gigablast break the document into sections and score the // words in sections with mostly link text lower than words in sections // without much link text? This helps to reduce the effects of menu // spam. // Used for news articles. // This only applies to the body of the document. // "indexContentSectionOnly" (default is false) // Should gigablast attempt to isolate just the single most-relevant // content section from the document and not index anything else? // Used for news articles. // This only applies to the body of the document. // "minSectionScore" (default is -1000000000) // The minimum score an entire section of the document needs to have // its words indexed. Each word in a section counts as 128 points, but // a word in a hyperlink counts as -256 points. // Used for news articles. // This only applies to the body of the document. // "minIndexableWords" (default is -1) // If the number of indexable words that have a positive average score // is below this value, then no words will be indexed. Used // to just index beefy news articles. -1 means to ignore this // constraint. // "minAvgWordScore" (default is 0) // Words have an average score of the 8 neighboring words on their left // and the 8 neighboring words on their right, in the same section. // These word scores are 128 points for a word not in a link, and only // 21 points for a word in a link. What is the minimum score average // score a word needs to be indexed? (Before applying the top word // weight, below) // scoreBySection must be enabled for this to work. // "numTopWords" (default is 0) // Weight the first X words higher. // Used for news articles. // This only applies to the body of the document. // "topWordsWeight" (default is 1.0) // Weight the first X words by this much, a rational number. // Used for news articles. // This only applies to the body of the document. // "topSentenceWeight" (default is 1.0) // Weight the first sentence by this much, a rational number. // Only applies to documents that support western punctuation. // Used for news articles. // This only applies to the body of the document. // "maxWordsInSentence" (default is 0) // Do not weight more than this words in the first sentence. // Used for news articles. // This only applies to the body of the document. // if we are doing "menu elimination technology" then zero out // scores of terms not in the single content section if ( eliminateMenus ) return set ( words , titleRecVersion, true , // scoreBySection true , // indexContentSectionOnly (DIFF) -1000000000 , // minSectionScore 0 , // minAvgWordScore 40 , // minIndexableWords (DIFF) 0 , // numTopWords 3.0 , // topWordsWeight 1.0 , // topSentenceWeight 30 );// maxWordsInSentence // use all defaults if no site rec //if ( ! sx ) return set ( words , titleRecVersion , true , // scoreBySection false , // indexContentSectionOnly -1000000000 , // minSectionScore 0 , // minAvgWordScore minIndexableWords , // defaults to -1 0 , // numTopWords 3.0 , // topWordsWeight 1.0 , // topSentenceWeight 30 );// maxWordsInSentence /* // there should only by one block in the ruleset file that has // these special config switches long n0 = 0; long n1 = 0x7fffffff; // this is used to decrease the scores of words in menu sections. // this means that words will be scored based on their neighboring // words in the same section of the document. the section of the // document is determined by
tags and the like. // if the neighboring words are in links then the score is decreased. // this way we expect to score words in menus less. this is now // default scoring behaviour for newer documents. bool scoreBySection = true; if ( ! sx->getBool(n0,n1,"index.scoreBySection",true) ) scoreBySection = false; // this is used to index newspaper articles. // indexContentSectionOnly means to only index the words in the top- // scoring section of the document. the section of the document // is determined by
tags and the like. the score // of a section is based on how many words that are not in hyperlinks // are contained in that section. words in hyperlinks actually decrease // the score of the section. bool indexContentSectionOnly = sx->getBool(n0,n1,"index.indexContentSectionOnly",false); //log("REMOVE ME"); //indexContentSectionOnly = true; // if the total score of a section is less than this then no words // in that section will get indexed. each word in a section is // counted as 128 points, but if the word is in a hyper link it is // counted as -256 points (-2*128) long minSectionScore = sx->getLong(n0,n1,"index.minSectionScore", -1000000000); // count words in links as 21 points, words not in links as 128. // the average score of each word is its score plus the scores of // its 8 left and its 7 right neighbors divided by 16. if that // average score is below this value, the word is not indexed. // only valid if scoreBySection is true! long minAvgWordScore = sx->getLong(n0,n1,"index.minAvgWordScore",0); // if the whole document has less than this many words with positive // scores, do not index any of the words (set their scores to 0) long minIndexableWords = sx->getLong (n0,n1,"index.minIndexableWords",defaultm);//40); // . for weighting the top portion of the document more, use these. // . only applicable if using the new parser so we can use the new // Scores class long numTopWords = sx->getLong (n0,n1,"index.numTopWords",0); float topWordsWeight = sx->getFloat(n0,n1,"index.topWordsWeight",3.0); float topSentenceWeight = sx->getFloat(n0,n1,"index.topSentenceWeight",1.0); long maxWordsInSentence = sx->getLong (n0,n1,"index.maxWordsInSentence",30); return set ( words , titleRecVersion , scoreBySection , indexContentSectionOnly , minSectionScore , minAvgWordScore , minIndexableWords , // these are for weighting top part // of news articles numTopWords , topWordsWeight , topSentenceWeight , maxWordsInSentence ) ; */ } // . returns false and sets g_errno on error // . scores the words in the Words.cpp class, which is set from an Xml pointer // . Words.cpp must contain tags cuz that's what we look at to divide the // words up into sections // . most docs are divided up into sections based on div, and table/tr/td tags // . look at each section independently and score words in each section based // on the density of words in hyperlinks in their vicinity. // . if a particular section has a lot of hyperlinked text it should score // low, while a section of a lot of pure text should score high. // . small sections with not much plain text, but no hyperlinks, will not score // very high either, usually they are like copyright notices and stuff, // although they could be a small message on a message board. // . most sections really don't have many things embedded in them, with the // exception of the root section, so we can linearly scan each section, // skipping over the embedded sections, with decent speed and compute the // score of each word on an individual basis. // . sets m_wscores[i] to word #i's score weight. // . if n1 is non-NULL we set the scores of all words that are not in the // top-scoring section to 0 or -1. this is used for just indexing simple // news articles which are mostly just contained in a single section. // . if we have less than minIndexableWords positive scoring words, then do // not index any words, set their scores to 0 bool Scores::set ( Words *words , Sections *sections , long titleRecVersion , bool scoreBySection , bool indexContentSectionOnly , long minSectionScore , long minAvgWordScore , long minIndexableWords , // these are for weighting top part of news articles long numTopWords , float topWordsWeight , float topSentenceWeight , long maxWordsInSentence , char *buf , long bufSize ) { // sanity check //if ( m_buf ) { char *xx = NULL; *xx = 0; } reset(); // save for printing into g_pbuf in TermTable.cpp m_scoreBySection = scoreBySection ; m_indexContentSectionOnly = indexContentSectionOnly ; m_minSectionScore = minSectionScore ; m_minAvgWordScore = minAvgWordScore ; m_minIndexableWords = minIndexableWords ; m_numTopWords = numTopWords ; m_topWordsWeight = topWordsWeight ; m_topSentenceWeight = topSentenceWeight ; m_maxWordsInSentence = maxWordsInSentence ; // allocate m_scores buffer, one byte score per word m_scores = NULL; long nw = words->getNumWords(); long need = nw * 4; // assume no malloc m_needsFree = false; if ( need < SCORES_LOCALBUFSIZE ) m_buf = m_localBuf; else if ( need < bufSize ) m_buf = buf; else { m_buf = (char *)mmalloc ( need , "Scores" ); m_needsFree = true; } m_bufSize = need; if ( ! m_buf ) return false; char *p = m_buf; m_scores = (long *)p; p += nw * 4; //m_rerankScores = (long *) p; // all words start with a default normal score, 128 as of right now for ( long i = 0 ; i < nw ; i++ ) m_scores[i] = NORM_WORD_SCORE; nodeid_t *tids = words->getTagIds (); long long *wids = words->getWordIds (); char **w = words->m_words; long *wlens = words->m_wordLens; // . zero out scores of words in javascript and style tags // . set scores to 1 if word in select or marquee tag // . MATCHES.CPP check if the score is -1, and ignores it if so!!!! // so if you modify this, keep that in mind if ( ! tids ) return true; char inScript = 0; char inStyle = 0; char inSelect = 0; char inMarquee = 0; for ( long i = 0 ; i < nw ; i++ ) { // skip if not tag if ( ! tids[i] ) { if (inScript || inStyle) { m_scores[i] = -1; continue;} if (inSelect||inMarquee) { m_scores[i] = -1; continue;} continue; } // give all tags score of 0 by default m_scores[i] = 0; if ( (tids[i]&BACKBITCOMP) == TAG_SCRIPT ) { //