// Matt Wells, copyright Jul 2005 // . the Scores class is a vector to weight Words scores by // . this was originally made to extract the news article from a web page // and discard the words in menu sections and other cruft. // . words are weighted by the number of neighboring words in their "sections" // that are not in hyperlinks. // . "sections" are determined by table/tr/td/div/... etc tags // . m_scores is 1-1 with the words in the supplied "words" class #ifndef _SCORES_H_ #define _SCORES_H_ #include "Words.h" // if you change this you must also change the shift logic in Phrases.cpp // for setting the "minScore" #define NORM_WORD_SCORE 128 #define SCORES_LOCALBUFSIZE 20 class Scores { public: Scores(); ~Scores(); void reset(); // if indexContentSectionsOnly is true, only the words in the most // relevant scores will have positive scores, all other words are // discarded. //bool set ( class Words *words , bool indexContentSectionsOnly ); bool set ( class Words *words , class Sections *sections , int32_t titleRecVersion , // this is true to zero-out terms in the menus, otherwise // we assign them a minimal score of 1 bool eliminateMenus , // provide it with a buffer to prevent a malloc char *buf = NULL , int32_t bufSize = 0 , int32_t minIndexableWords = -1 ); //char m_localBuf [ MAX_WORDS*8*10 ]; char m_localBuf[SCORES_LOCALBUFSIZE]; char *m_buf; int32_t m_bufSize; bool m_needsFree; private: // returns false and sets g_errno on error bool set ( class Words *words , class Sections *sections , int32_t titleRecVersion , bool scoreBySection , bool indexContentSectionOnly , int32_t minSectionScore , int32_t minAvgWordScore , int32_t minIndexableWords , // these are for weighting top part of news articles int32_t numTopWords , float topWordsWeight , float topSentenceWeight , int32_t maxWordsInSentence , char *buf = NULL , int32_t bufSize = 0 ) ; public: int32_t getMemUsed () { return m_bufSize; }; int32_t getScore ( int32_t i ) { return m_scores[i]; }; // private: bool setScoresBySection ( class Words *words , bool indexContentSectionOnly , int32_t minSectionScore , int32_t minAvgWordScore ); // percent to weight word scores by... actually from 0 to 128 // for speed reasons int32_t *m_scores; //int32_t *m_rerankScores; // these are printed out by PageParser.cpp in TermTable.cpp bool m_scoreBySection ; bool m_indexContentSectionOnly ; int32_t m_minSectionScore ; int32_t m_minAvgWordScore ; int32_t m_minIndexableWords ; int32_t m_numTopWords ; float m_topWordsWeight ; float m_topSentenceWeight ; int32_t m_maxWordsInSentence ; }; #endif