// Matt Wells, copyright Jul 2001 // . gets xhtml filtered into plain text // . parses plain text into Words // . gets rawTermIds from the query // . uses Matches class to find words that match rawTermIds // . for each term in query, find the line with that term and the most // other matching terms, and print that line // . modifications... // . exclude title from the plain text (call xml->getText() twice?) // . find up to X lines // . find phrases by setting the Phrases class as well // . score lines by termfreqs of terms you highlight in the line // . highlight terms in order of their termFreq, lowest first! // . remove junk from start/end of summary (no back-to-back punct) // . stop summary line on space, not non-alnum (no breaking on apostrophe) // . don't highlight stop words???? #ifndef _SUMMARY_H_ #define _SUMMARY_H_ #include "gb-include.h" #include "Unicode.h" #include "matches2.h" #include "Query.h" #include "Xml.h" #include "Mem.h" //#include "LinkInfo.h" // BIG HACK support #include "Words.h" #include "Bits.h" #include "Pos.h" #include "Matches.h" #include "HashTableT.h" //#include "Places.h" #include "Domains.h" #include "CountryCode.h" #define MAX_SUMMARY_LEN 1024*20 #define MAX_SUMMARY_EXCERPTS 1024 #define MAX_SUMMARY_LOCS 16 class Summary { public: Summary(); ~Summary(); void reset(); // . like above but flattens the xml for you then calls the above // . returns false and sets errno on error bool set ( class Xml *xml , class Query *q , int64_t *termFreqs , bool doStemming , int32_t maxSummaryLen , int32_t maxNumLines , int32_t maxNumCharsPerLine , //int32_t bigSampleRadius , //int32_t bigSampleMaxLen , bool ratInSummary = false , class Url *f = NULL ); //bool excludeAnchText = false, //bool hackFixWords = false, //bool hackFixPhrases = false ) ; // this should eventually replace set() bool set2 ( class Xml *xml , class Words *words , class Bits *bits , class Sections *sections , class Pos *pos , class Query *q , int64_t *termFreqs , float *affWeights , // 1-1 with qterms //char *coll , //int32_t collLen , bool doStemming , int32_t maxSummaryLen , int32_t numDisplayLines , int32_t maxNumLines , int32_t maxNumCharsPerLine , //int32_t bigSampleRadius , //int32_t bigSampleMaxLen , bool ratInSummary , //TitleRec *tr , class Url *f , //bool allowPunctInPhrase = true, //bool excludeLinkText = false, //bool excludeMetaText = false, //bool hackFixWords = false, //bool hackFixPhrases = false, //float *queryProximityScore= NULL , class Matches *matches = NULL , char *titleBuf = NULL , int32_t titleBufLen = 0 ); // this is NULL terminated char *getSummary ( ) { return m_summary; }; int32_t getSummaryLen ( ) { return m_summaryLen; }; // me = "max excerpts". we truncate the summary if we need to. // XmlDoc.cpp::getSummary(), likes to request more excerpts than are // actually displayed so it has a bigger summary for deduping purposes. int32_t getSummaryLen ( int32_t me ) ; // for related topics.. sample surrounding the query terms //char *getBigSampleBuf ( ) { return m_buf; }; //int32_t getBigSampleLen ( ) { return m_bufLen; }; void truncateSummaryForExcerpts ( int32_t numExcerpts , int32_t maxSummaryLen, char *dmozSumms , int32_t *dmozSummLens , int32_t numCatids , bool *sumFromDmoz ); //float getDiversity() {return m_diversity;} //float getProximityScore() { return m_proximityScore; }; // for places in summary /* bool scanForLocations ( ); int32_t getNumSummaryLocs ( ) { return m_summaryLocs.length()/sizeof(uint64_t); }; int32_t getSummaryLocsSize ( ) { return m_summaryLocs.length(); } uint64_t *getSummaryLocs ( ) { return (uint64_t *)m_summaryLocs.getBufStart(); }; int32_t getSummaryLocsPopsSize( ) { return m_summaryLocsPops.length(); } int32_t *getSummaryLocsPops ( ) { return (int32_t *)m_summaryLocsPops.getBufStart(); }; */ // private: // . content is an html/xml doc // . we highlight "query" in "content" as best as we can // . returns false and sets errno on error // . CAUTION: this is destructive on "doc" // . stores bigSample into "doc" which should be "m_buf" // and sets bytes stored into *bigSampleLen /* bool set ( char *doc , int32_t docLen , Query *q , int64_t *termFreqs , bool doStemming , int32_t maxSummaryLen , int32_t maxNumLines , int32_t maxNumCharsPerLine , //int32_t bigSampleRadius , //int32_t bigSampleMaxLen , //int32_t *bigSampleLen , char *foundTermVector ); */ // BIG HACK support //bool allQTermsFound( Query *q, TitleRec *tr, Xml *xml, // class Matches *matches, // qvec_t reqMask, qvec_t negMask, // bool excludeLinkText, // bool excludeMetaText, // bool allowPunctInPhrase ); ////////////////////////////////////////////////////////////////// // // THE NEW SUMMARY GENERATOR routines below here // ////////////////////////////////////////////////////////////////// bool getDefaultSummary ( Xml *xml, Words *words, class Sections *sections , Pos *pos, //int32_t bigSampleRadius, int32_t maxSummaryLen ); void setSummaryScores ( class Matches *matches , //Words *words , //Scores *scores , //Pos *pos , //int32_t numNeedles, //Needle *needles , Query *q , float *phraseAffWeights , //int32_t *docSummaryScore, //int32_t *queryInSectionScore, int32_t commentStart ); int64_t getBestWindow ( class Matches *matches , int32_t mn , int32_t *lasta , int32_t *besta , int32_t *bestb , char *gotIt , char *retired , int32_t maxExcerptLen ); //int32_t numFindableQWords, //char *represented, //int32_t *foundNew ); void reduceQueryScores ( class Matches *matches, int32_t m, int32_t a, int32_t b ) ; void reduceScoreForWords ( class Matches *matches, int32_t qtn ) ; // a wrapper basically for the set0 below bool set0 ( char *doc, int32_t docLen, Query *q, class Msg20Request *mr); // . the old string based summary generator -- ULTRA FAST! // . resurrected from /gb/datil2-release.git/src/Summary.cpp // . returns false with g_errno set on error bool set1 ( char *doc , int32_t docLen , Query *q , int32_t maxSummaryLen , int32_t maxNumLines , int32_t maxNumCharsPerLine , int32_t bigSampleRadius , int32_t bigSampleMaxLen , int32_t *bigSampleLen , char *foundTermVector , int64_t *termFreqs ) ; // null terminate and store the summary here. char m_summary [ MAX_SUMMARY_LEN ]; int32_t m_summaryLen; int32_t m_summaryExcerptLen [ MAX_SUMMARY_EXCERPTS ]; int32_t m_numExcerpts; bool m_isNormalized; // hold the big sample here //char *m_buf; //int32_t m_bufMaxLen; //int32_t m_bufLen; //bool m_freeBuf; //char m_localBuf[10032]; // if getting more lines for deduping than we need for displaying, // how big is that part of the summary to display? int32_t m_numDisplayLines; int32_t m_displayLen; int32_t getSummaryDisplayLen() { return m_displayLen; } int32_t m_maxNumCharsPerLine; int32_t m_titleVersion; // ptr to the query Query *m_q; // query scores //int32_t *m_qscores; // pub date list offsets bool m_useDateLists; bool m_exclDateList; int32_t m_begPubDateList; int32_t m_endPubDateList; //float m_diversity; //float m_proximityScore; char *m_bitScoresBuf; int32_t m_bitScoresBufSize; float m_wordWeights[MAX_QUERY_WORDS]; char m_summaryLocBuf[MAX_SUMMARY_LOCS*sizeof(uint64_t)]; SafeBuf m_summaryLocs; char m_summaryLocPopsBuf[MAX_SUMMARY_LOCS*sizeof(int32_t)]; SafeBuf m_summaryLocsPops; }; #endif