// TODO: pass spam class to weight class and modify weights based on the spam // then we can just serialize the weight vector in the title rec along // with ptr offsets to the words that we index. carver can just scan // through the word ptrs rather than 1 char at a time. summary generator // can just use the weights to score each sample then. #include "gb-include.h" #include "Weights.h" #include "Words.h" #include "Bits.h" #include "Phrases.h" #include "Titledb.h" #include "HashTableX.h" #include "Abbreviations.h" #include "XmlNode.h" // g_nodes[] #include "HashTable.h" #include "Sections.h" static HashTable s_punctTable; bool initPunctWeights ( ) ; Weights::Weights () { m_buf = NULL; m_bufSize = 0; m_ww = NULL; m_pw = NULL; m_countTablePtr = NULL; } Weights::~Weights () { reset(); } void Weights::reset() { if ( m_buf && m_buf != m_localBuf ) mfree ( m_buf , m_bufSize , "Weights" ); m_buf = NULL; m_bufSize = 0; m_ww = NULL; m_pw = NULL; m_countTablePtr = NULL; } // RULE #1 (apply to meta tags, too) // these rules should be applied when indexing/hashing the incoming linktext, // title, document body and meta tags. // RULE #2 (promte rather than demote) // prefer to promote rather than demote because then we do not have to reindex // the low quality pages, just the good ones, which are fewer, and the good // ones will bubble to the top. // RULE #3 (in parentheses) // if word in ()'s []'s or {}'s then demote word and phrase score equally. // after the first 40 words in the parentheses, do not demote any more. // RULE #4 (next to bad punct) // if we have a sequence of punctuation that is not just spaces on the left // or the right of the word, demote it. Right now we just use // Bits::canPairAcross(), but we should get more accurate... // We compute a word and phrase weight for each punctuation "word", i.e. // sequence of punctuation, and we use initPunctWeights() to set that table. // Apply these weights differently to both the word and the phrase. The // phrase weight from this is averaged across the // RULE #5 (in hyperlink) // if word or phrase is in a hyperlink multiply score by .15. // RULE #6 (in content section) // if word is in a section with lots of plain text words that are not // in hyperlinks, boost it. If section is skimpy demote each word, but only // demote a word if NOT the first occurence of that word in that section. // That way we allow for headers in their own section, like a table row, . // Sections are delimited by "breaking tags" defined in XmlNode.cpp, g_nodes[]. // RULE #7 (in header, italic, bold or title tag) // boost words and phrases in header, italic, bold or title tags. // Only do this for the first 20 words in header tags to // prevent abuse. Italic and bold are times also have a combined limit of 40. // up to the first 20 alnum words in a title tag are boosted. // RULE #8 (in a ul list) // demote if in a list, under the