// mirror of https://github.com/gigablast/open-source-search-engine.git
// synced 2024-10-04 20:27:43 +03:00
// 271 lines, 8.7 KiB, C
|
// Matt Wells, copyright Jul 2001
|
||
|
|
||
|
#ifndef _MATCHES_H_
|
||
|
#define _MATCHES_H_
|
||
|
|
||
|
#include "Query.h"
|
||
|
#include "Words.h"
|
||
|
#include "Xml.h"
|
||
|
#include "HashTableT.h"
|
||
|
#include "Pos.h"
|
||
|
#include "Bits.h"
|
||
|
|
||
|
// cap on how many query words get hashed into the query-word hash table;
// the m_qtable* arrays in Matches are dimensioned at 3x this value
#define MAX_QUERY_WORDS_TO_MATCH 1000

// . capacity of the fixed-size match arrays: Matches::m_matches[] is
//   sized by MAX_MATCHES_FOR_BIG_HACK, the MatchOffsets arrays by
//   MAX_MATCHES
// . went from 500 to 3000 to better support the BIG HACK, and for the
//   huge UOR queries made in SearchInput.cpp from facebook interests
// . beware: getting 3000 matches slows down the summary generator a lot
#define MAX_MATCHES              3000
#define MAX_MATCHES_FOR_BIG_HACK 3000

// length of the per-"match group" arrays in Matches
#define MAX_MATCHGROUPS 300

// integer type that carries the MF_* bits defined below
typedef long mf_t;

// . values for Match::m_flags
// . each bit names the "match group" that the match belongs to
// . 0x0008 (MF_ALINK) and 0x0020 (MF_AHOOD) are disabled, their bit
//   values are left unused
#define MF_TITLEGEN   0x0001 // in generated title?
#define MF_TITLETAG   0x0002
#define MF_LINK       0x0004 // in non-anomalous link text
//#define MF_ALINK    0x0008 // in anomalous link text
#define MF_HOOD       0x0010 // in non-anomalous neighborhood
//#define MF_AHOOD    0x0020 // in anomalous neighborhood
#define MF_BODY       0x0040 // in body
#define MF_METASUMM   0x0080 // in meta summary
#define MF_METADESC   0x0100 // in meta description
#define MF_METAKEYW   0x0200 // in meta keywords
#define MF_DMOZTITLE  0x0400
#define MF_DMOZSUMM   0x0800
#define MF_RSSTITLE   0x1000
#define MF_RSSDESC    0x2000
#define MF_URL        0x4000 // in url
#define MF_SYNONYM    0x8000
|
||
|
|
||
|
class Match {
|
||
|
public:
|
||
|
// word # we match in the document using "m_words" below
|
||
|
long m_wordNum;
|
||
|
// # of words in this match, like if we match a phrase
|
||
|
// we have > 1 words in the match
|
||
|
long m_numWords;
|
||
|
// word # we match in the query
|
||
|
long m_qwordNum;
|
||
|
// # of query words we match if we are a phrase, otherwise
|
||
|
// this is 1
|
||
|
long m_numQWords;
|
||
|
// . used for highlighting under different colors (Highlight.cpp)
|
||
|
// . words in the same quote should use the same highlight color
|
||
|
long m_colorNum;
|
||
|
|
||
|
// . max score of all contained query terms in this match
|
||
|
// . uses provided m_tscores[] array to get scores of the query terms
|
||
|
//long m_score;
|
||
|
|
||
|
// "match group" or type of match. i.e. MF_TITLETAG, MF_METASUMM, ...
|
||
|
mf_t m_flags;
|
||
|
|
||
|
// improve summary generation parms
|
||
|
long m_dist;
|
||
|
//bool m_crossedSection;
|
||
|
|
||
|
// . for convenience, these four class ptrs are used by Summary.cpp
|
||
|
// . m_wordNum is relative to this "words" class (and scores,bits,pos)
|
||
|
class Words *m_words;
|
||
|
class Sections *m_sections;
|
||
|
class Bits *m_bits;
|
||
|
class Pos *m_pos;
|
||
|
};
|
||
|
|
||
|
class Matches {
|
||
|
|
||
|
public:
|
||
|
|
||
|
void setQuery ( Query *q );//, long *tscores = NULL ) ;
|
||
|
|
||
|
bool set ( class XmlDoc *xd ,
|
||
|
class Words *bodyWords ,
|
||
|
//class Synonyms *bodySynonyms ,
|
||
|
class Phrases *bodyPhrases ,
|
||
|
class Sections *bodySections ,
|
||
|
class Bits *bodyBits ,
|
||
|
class Pos *bodyPos ,
|
||
|
class Xml *xml ,
|
||
|
class Title *tt ,
|
||
|
long niceness ) ;
|
||
|
|
||
|
bool addMatches ( char *s ,
|
||
|
long slen ,
|
||
|
mf_t flags ,
|
||
|
long long docId ,
|
||
|
long niceness ) ;
|
||
|
|
||
|
// . this sets the m_matches[] array
|
||
|
// . m_matches[i] is -1 if it matches no term in the query
|
||
|
// . m_matches[i] is X if it matches term #X in the query
|
||
|
// . returns false and sets errno on error
|
||
|
bool addMatches ( Words *words ,
|
||
|
//class Synonyms *syn = NULL ,
|
||
|
class Phrases *phrases = NULL ,
|
||
|
Sections *sections = NULL ,
|
||
|
Bits *bits = NULL ,
|
||
|
Pos *pos = NULL ,
|
||
|
long fieldCode = 0 , // wrds,0=none
|
||
|
bool allowPunctInPhrase = true ,
|
||
|
bool exclQTOnlyInAnchTxt = false,
|
||
|
qvec_t reqMask = 0 ,
|
||
|
qvec_t negMask = 0 ,
|
||
|
long diversityWeight = 1 ,
|
||
|
long long docId = 0 ,
|
||
|
mf_t flags = 0 );
|
||
|
|
||
|
// this is NULL terminated
|
||
|
//char getTermNum ( long i ) { return m_termNums[i]; };
|
||
|
//char getTermNum2 ( long i ) { return m_termNums2[i]; };
|
||
|
|
||
|
// get the whole array
|
||
|
//char *getTermNums ( ) { return m_termNums; };
|
||
|
//char *getPhraseNums ( ) { return m_phraseNums; };
|
||
|
|
||
|
// how many words matched a rawTermId?
|
||
|
long getNumMatches ( ) { return m_numMatches; };
|
||
|
|
||
|
//void clearBitsMatched( void ) { m_explicitBitsMatched = 0x00; };
|
||
|
//void addBitMatched ( qvec_t bit ) { m_explicitBitsMatched |= bit; };
|
||
|
//qvec_t getBitsMatched( void ) { return m_explicitBitsMatched; };
|
||
|
|
||
|
// set m_termNums from a to b to 0
|
||
|
//void clearMatches ( long a , long b );
|
||
|
|
||
|
// janitorial stuff
|
||
|
Matches ( ) ;
|
||
|
~Matches( ) ;
|
||
|
void reset ( ) ;
|
||
|
|
||
|
// BIG HACK support
|
||
|
//long getTermsFound ( bool *hadPhrases , bool *hadWords );
|
||
|
unsigned long getTermsFound2(bool *hadPhrases, bool *hadWords);
|
||
|
//bool negTermsFound ( );
|
||
|
|
||
|
// used internally and by PageGet.cpp
|
||
|
bool isMatchableTerm ( class QueryTerm *qt );//, long i );
|
||
|
|
||
|
// used internally
|
||
|
long getNumWordsInMatch ( Words *words ,
|
||
|
long wn ,
|
||
|
long n ,
|
||
|
long *numQWords ,
|
||
|
long *qwn ,
|
||
|
bool allowPunctInPhrase = true ) ;
|
||
|
|
||
|
// how many words matched a rawTermId?
|
||
|
Match m_matches[MAX_MATCHES_FOR_BIG_HACK];
|
||
|
long m_numMatches;
|
||
|
//long m_numNegTerms;
|
||
|
|
||
|
// found term vector, 1-1 with m_q->m_qterms[]
|
||
|
//char m_foundTermVector [ MAX_QUERY_TERMS ];
|
||
|
//char m_foundNegTermVector [ MAX_QUERY_TERMS ];
|
||
|
qvec_t m_explicitBitsMatched;
|
||
|
|
||
|
// . hash query word ids into a small hash table
|
||
|
// . we use this to see what words in the document are query terms
|
||
|
long long m_qtableIds [ MAX_QUERY_WORDS_TO_MATCH * 3 ];
|
||
|
long m_qtableWordNums [ MAX_QUERY_WORDS_TO_MATCH * 3 ];
|
||
|
char m_qtableFlags [ MAX_QUERY_WORDS_TO_MATCH * 3 ];
|
||
|
//long long m_qtableNegIds [ MAX_QUERY_WORDS_TO_MATCH * 3 ];
|
||
|
long m_numSlots;
|
||
|
Query *m_q;
|
||
|
long m_maxNQW;
|
||
|
long *m_tscores;
|
||
|
long m_numAlnums;
|
||
|
|
||
|
// . 1-1 with Query::m_qwords[] array of QWords
|
||
|
// . shows the match flags for that query word
|
||
|
mf_t m_qwordFlags[MAX_QUERY_WORDS];
|
||
|
|
||
|
//stuff for detecting whether a match is part of a larger phrase
|
||
|
void setSubPhraseDetection();
|
||
|
void detectSubPhrase(Words* w,
|
||
|
long matchWordNum,
|
||
|
long numMatchedWords,
|
||
|
long queryWordNum,
|
||
|
long diversityWeight );
|
||
|
float getDiversity();
|
||
|
|
||
|
bool m_detectSubPhrases;
|
||
|
long m_leftDiversity;
|
||
|
long m_rightDiversity;
|
||
|
|
||
|
static const long m_htSize = 128 * (sizeof(long long) + sizeof(long));
|
||
|
char m_subPhraseBuf[2 * m_htSize];
|
||
|
HashTableT<long long, long> m_pre;
|
||
|
HashTableT<long long, long> m_post;
|
||
|
|
||
|
// . one words/scores/bits/pos/flags class per "match group"
|
||
|
// . match groups examples = body, a single link text, a meta tag, etc.
|
||
|
// . match groups are basically disjoint chunks of text information
|
||
|
// . the document body (web page) is considered a single match group
|
||
|
// . a single link text is considered a match group
|
||
|
// . a single meta summary tag is a match group, ...
|
||
|
Words *m_wordsPtr [MAX_MATCHGROUPS];
|
||
|
Sections *m_sectionsPtr [MAX_MATCHGROUPS];
|
||
|
Bits *m_bitsPtr [MAX_MATCHGROUPS];
|
||
|
Pos *m_posPtr [MAX_MATCHGROUPS];
|
||
|
mf_t m_flags [MAX_MATCHGROUPS];
|
||
|
long m_numMatchGroups;
|
||
|
|
||
|
Words m_wordsArray [MAX_MATCHGROUPS];
|
||
|
Bits m_bitsArray [MAX_MATCHGROUPS];
|
||
|
Pos m_posArray [MAX_MATCHGROUPS];
|
||
|
|
||
|
long long *m_pids2;
|
||
|
long long *m_pids3;
|
||
|
long long *m_pids4;
|
||
|
long long *m_pids5;
|
||
|
|
||
|
// MDW: i am hoping we do not need this, it is really only useful
|
||
|
// for the words in the body of the document!!
|
||
|
//Sections m_sectionsArray [MAX_MATCHGROUPS];
|
||
|
|
||
|
bool getMatchGroup ( mf_t matchFlag ,
|
||
|
Words **wp ,
|
||
|
Pos **pp ,
|
||
|
Sections **sp );
|
||
|
|
||
|
//qvec_t m_explicitsMatched;
|
||
|
//qvec_t m_matchableRequiredBits;
|
||
|
// this may be false and still be included in the results if rat=0
|
||
|
//bool m_hasAllQueryTerms;
|
||
|
// if this is false big hack will exclude it from results
|
||
|
//bool m_matchesQuery;
|
||
|
};
|
||
|
|
||
|
// offset kinds
// NOTE(review): presumably the values passed as "offsetType" to
// MatchOffsets::set() -- confirm against its callers
#define OFFSET_BYTES  1
#define OFFSET_WORDS  2
|
||
|
|
||
|
// smaller class for attaching to Msg20
|
||
|
class MatchOffsets{
|
||
|
public:
|
||
|
MatchOffsets();
|
||
|
~MatchOffsets();
|
||
|
void reset();
|
||
|
bool set(Xml *xml, Words *words, Matches *matches,
|
||
|
unsigned char offsetType);
|
||
|
|
||
|
long getStoredSize();
|
||
|
long serialize(char *buf, long buflen);
|
||
|
long deserialize(char *buf, long buflen);
|
||
|
|
||
|
long m_numMatches;
|
||
|
unsigned char m_queryWords[MAX_MATCHES];
|
||
|
long m_matchOffsets[MAX_MATCHES];
|
||
|
// keep track of breaks between matches
|
||
|
long m_numBreaks;
|
||
|
long m_breakOffsets[MAX_MATCHES];
|
||
|
// and total number of alnums in the document
|
||
|
long m_numAlnums;
|
||
|
};
|
||
|
#endif
|