mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
b0caf3eb00
knobs for summary gen working.
279 lines
8.8 KiB
C++
279 lines
8.8 KiB
C++
// Matt Wells, copyright Jul 2001

// . gets xhtml filtered into plain text
// . parses plain text into Words
// . gets rawTermIds from the query
// . uses Matches class to find words that match rawTermIds
// . for each term in query, find the line with that term and the most
//   other matching terms, and print that line

// . modifications...
// . exclude title from the plain text (call xml->getText() twice?)
// . find up to X lines
// . find phrases by setting the Phrases class as well
// . score lines by termfreqs of terms you highlight in the line
// . highlight terms in order of their termFreq, lowest first!
// . remove junk from start/end of summary (no back-to-back punct)
// . stop summary line on space, not non-alnum (no breaking on apostrophe)
// . don't highlight stop words????
|
|
|
|
#ifndef _SUMMARY_H_
|
|
#define _SUMMARY_H_
|
|
|
|
#include "gb-include.h"
|
|
#include "Unicode.h"
|
|
#include "matches2.h"
|
|
#include "Query.h"
|
|
#include "Xml.h"
|
|
#include "Mem.h"
|
|
//#include "LinkInfo.h" // BIG HACK support
|
|
#include "Words.h"
|
|
#include "Bits.h"
|
|
#include "Pos.h"
|
|
#include "Matches.h"
|
|
#include "HashTableT.h"
|
|
//#include "Places.h"
|
|
#include "Domains.h"
|
|
#include "CountryCode.h"
|
|
|
|
// Size limits used by the Summary class declared below.
//
// MAX_SUMMARY_LEN is the byte capacity of Summary::m_summary. The
// expression is parenthesized so the macro expands safely inside larger
// expressions (e.g. "x / MAX_SUMMARY_LEN" must not become "x/1024*20").
#define MAX_SUMMARY_LEN (1024*20)
// number of slots in Summary::m_summaryExcerptLen
#define MAX_SUMMARY_EXCERPTS 1024
// number of entries reserved in Summary::m_summaryLocBuf and
// Summary::m_summaryLocPopsBuf
#define MAX_SUMMARY_LOCS 16
|
|
|
|
class Summary {
|
|
|
|
public:
|
|
|
|
Summary();
|
|
~Summary();
|
|
void reset();
|
|
|
|
// . like above but flattens the xml for you then calls the above
|
|
// . returns false and sets errno on error
|
|
bool set ( class Xml *xml ,
|
|
class Query *q ,
|
|
long long *termFreqs ,
|
|
bool doStemming ,
|
|
long maxSummaryLen ,
|
|
long maxNumLines ,
|
|
long maxNumCharsPerLine ,
|
|
//long bigSampleRadius ,
|
|
//long bigSampleMaxLen ,
|
|
bool ratInSummary = false ,
|
|
class Url *f = NULL );
|
|
//bool excludeAnchText = false,
|
|
//bool hackFixWords = false,
|
|
//bool hackFixPhrases = false ) ;
|
|
|
|
// this should eventually replace set()
|
|
bool set2 ( class Xml *xml ,
|
|
class Words *words ,
|
|
class Bits *bits ,
|
|
class Sections *sections ,
|
|
class Pos *pos ,
|
|
class Query *q ,
|
|
long long *termFreqs ,
|
|
float *affWeights , // 1-1 with qterms
|
|
//char *coll ,
|
|
//long collLen ,
|
|
bool doStemming ,
|
|
long maxSummaryLen ,
|
|
long numDisplayLines ,
|
|
long maxNumLines ,
|
|
long maxNumCharsPerLine ,
|
|
//long bigSampleRadius ,
|
|
//long bigSampleMaxLen ,
|
|
bool ratInSummary ,
|
|
//TitleRec *tr ,
|
|
class Url *f ,
|
|
//bool allowPunctInPhrase = true,
|
|
//bool excludeLinkText = false,
|
|
//bool excludeMetaText = false,
|
|
//bool hackFixWords = false,
|
|
//bool hackFixPhrases = false,
|
|
//float *queryProximityScore= NULL ,
|
|
class Matches *matches = NULL ,
|
|
char *titleBuf = NULL ,
|
|
long titleBufLen = 0 );
|
|
|
|
|
|
// this is NULL terminated
|
|
char *getSummary ( ) { return m_summary; };
|
|
long getSummaryLen ( ) { return m_summaryLen; };
|
|
|
|
// me = "max excerpts". we truncate the summary if we need to.
|
|
// XmlDoc.cpp::getSummary(), likes to request more excerpts than are
|
|
// actually displayed so it has a bigger summary for deduping purposes.
|
|
long getSummaryLen ( long me ) ;
|
|
|
|
// for related topics.. sample surrounding the query terms
|
|
//char *getBigSampleBuf ( ) { return m_buf; };
|
|
//long getBigSampleLen ( ) { return m_bufLen; };
|
|
|
|
void truncateSummaryForExcerpts ( long numExcerpts ,
|
|
long maxSummaryLen,
|
|
char *dmozSumms ,
|
|
long *dmozSummLens ,
|
|
long numCatids ,
|
|
bool *sumFromDmoz );
|
|
|
|
//float getDiversity() {return m_diversity;}
|
|
//float getProximityScore() { return m_proximityScore; };
|
|
|
|
// for places in summary
|
|
/*
|
|
bool scanForLocations ( );
|
|
long getNumSummaryLocs ( ) {
|
|
return m_summaryLocs.length()/sizeof(uint64_t); };
|
|
long getSummaryLocsSize ( ) {
|
|
return m_summaryLocs.length(); }
|
|
uint64_t *getSummaryLocs ( ) {
|
|
return (uint64_t *)m_summaryLocs.getBufStart(); };
|
|
long getSummaryLocsPopsSize( ) {
|
|
return m_summaryLocsPops.length(); }
|
|
long *getSummaryLocsPops ( ) {
|
|
return (long *)m_summaryLocsPops.getBufStart(); };
|
|
*/
|
|
|
|
// private:
|
|
|
|
// . content is an html/xml doc
|
|
// . we highlight "query" in "content" as best as we can
|
|
// . returns false and sets errno on error
|
|
// . CAUTION: this is destructive on "doc"
|
|
// . stores bigSample into "doc" which should be "m_buf"
|
|
// and sets bytes stored into *bigSampleLen
|
|
/*
|
|
bool set ( char *doc ,
|
|
long docLen ,
|
|
Query *q ,
|
|
long long *termFreqs ,
|
|
bool doStemming ,
|
|
long maxSummaryLen ,
|
|
long maxNumLines ,
|
|
long maxNumCharsPerLine ,
|
|
//long bigSampleRadius ,
|
|
//long bigSampleMaxLen ,
|
|
//long *bigSampleLen ,
|
|
char *foundTermVector );
|
|
*/
|
|
|
|
// BIG HACK support
|
|
//bool allQTermsFound( Query *q, TitleRec *tr, Xml *xml,
|
|
// class Matches *matches,
|
|
// qvec_t reqMask, qvec_t negMask,
|
|
// bool excludeLinkText,
|
|
// bool excludeMetaText,
|
|
// bool allowPunctInPhrase );
|
|
|
|
//////////////////////////////////////////////////////////////////
|
|
//
|
|
// THE NEW SUMMARY GENERATOR routines below here
|
|
//
|
|
//////////////////////////////////////////////////////////////////
|
|
|
|
bool getDefaultSummary ( Xml *xml,
|
|
Words *words,
|
|
class Sections *sections ,
|
|
Pos *pos,
|
|
//long bigSampleRadius,
|
|
long maxSummaryLen );
|
|
|
|
void setSummaryScores ( class Matches *matches ,
|
|
//Words *words ,
|
|
//Scores *scores ,
|
|
//Pos *pos ,
|
|
//long numNeedles,
|
|
//Needle *needles ,
|
|
Query *q ,
|
|
float *phraseAffWeights ,
|
|
//long *docSummaryScore,
|
|
//long *queryInSectionScore,
|
|
long commentStart );
|
|
|
|
long long getBestWindow ( class Matches *matches ,
|
|
long mn ,
|
|
long *lasta ,
|
|
long *besta ,
|
|
long *bestb ,
|
|
char *gotIt ,
|
|
char *retired ,
|
|
long maxExcerptLen );
|
|
//long numFindableQWords,
|
|
//char *represented,
|
|
//long *foundNew );
|
|
|
|
void reduceQueryScores ( class Matches *matches,
|
|
long m, long a, long b ) ;
|
|
void reduceScoreForWords ( class Matches *matches, long qtn ) ;
|
|
|
|
// a wrapper basically for the set0 below
|
|
bool set0 ( char *doc, long docLen, Query *q, class Msg20Request *mr);
|
|
|
|
// . the old string based summary generator -- ULTRA FAST!
|
|
// . resurrected from /gb/datil2-release.git/src/Summary.cpp
|
|
// . returns false with g_errno set on error
|
|
bool set1 ( char *doc ,
|
|
long docLen ,
|
|
Query *q ,
|
|
long maxSummaryLen ,
|
|
long maxNumLines ,
|
|
long maxNumCharsPerLine ,
|
|
long bigSampleRadius ,
|
|
long bigSampleMaxLen ,
|
|
long *bigSampleLen ,
|
|
char *foundTermVector ,
|
|
long long *termFreqs ) ;
|
|
|
|
// null terminate and store the summary here.
|
|
char m_summary [ MAX_SUMMARY_LEN ];
|
|
long m_summaryLen;
|
|
long m_summaryExcerptLen [ MAX_SUMMARY_EXCERPTS ];
|
|
long m_numExcerpts;
|
|
bool m_isNormalized;
|
|
// hold the big sample here
|
|
//char *m_buf;
|
|
//long m_bufMaxLen;
|
|
//long m_bufLen;
|
|
//bool m_freeBuf;
|
|
//char m_localBuf[10032];
|
|
|
|
// if getting more lines for deduping than we need for displaying,
|
|
// how big is that part of the summary to display?
|
|
long m_numDisplayLines;
|
|
long m_displayLen;
|
|
long getSummaryDisplayLen() { return m_displayLen; }
|
|
|
|
long m_maxNumCharsPerLine;
|
|
|
|
long m_titleVersion;
|
|
|
|
// ptr to the query
|
|
Query *m_q;
|
|
|
|
// query scores
|
|
//long *m_qscores;
|
|
|
|
// pub date list offsets
|
|
bool m_useDateLists;
|
|
bool m_exclDateList;
|
|
long m_begPubDateList;
|
|
long m_endPubDateList;
|
|
|
|
//float m_diversity;
|
|
|
|
//float m_proximityScore;
|
|
|
|
char *m_bitScoresBuf;
|
|
long m_bitScoresBufSize;
|
|
float m_wordWeights[MAX_QUERY_WORDS];
|
|
|
|
char m_summaryLocBuf[MAX_SUMMARY_LOCS*sizeof(uint64_t)];
|
|
SafeBuf m_summaryLocs;
|
|
char m_summaryLocPopsBuf[MAX_SUMMARY_LOCS*sizeof(long)];
|
|
SafeBuf m_summaryLocsPops;
|
|
};
|
|
|
|
#endif
|
|
|