open-source-search-engine/Query.h
mwells a3a0b52ee4 added the query reindex smoke test.
fixed a lot of related bugs.
2014-09-25 17:44:35 -07:00

1024 lines
36 KiB
C++

// Matt Wells, copyright Aug 2003
// Query is a class for parsing queries
#ifndef _QUERY_H_
#define _QUERY_H_
#include "SafeBuf.h"
#include "Mem.h"
// Compile-time limits for query parsing. Several of these size fixed arrays
// inside the Query class below, so raising them grows every Query object.
// keep these down to save memory
//#define MAX_QUERY_LEN 8000 // url:XXX can be quite long! (MAX_URL_LEN)
// size of Query::m_buf and Query::m_orig (Query::m_needleBuf is one larger)
#define MAX_QUERY_LEN 3200
// . words need to deal with long list of sites!
// . remember, words can be string of punctuation, too
//#define MAX_QUERY_WORDS 5000
//#define MAX_QUERY_WORDS 32000
// not any more!
//#define MAX_QUERY_WORDS 320
// raise for crazy bool query on diffbot
// seems like we alloc just enough to hold our words now so that this
// is really a performance capper but it is used in Summary.cpp
// and Matches.h so don't go too big just yet
#define MAX_QUERY_WORDS 800
// . how many IndexLists might we get/intersect
// . we now use a long long to hold the query term bits for non-boolean queries
// . sizes Query::m_qterms[], m_termIds[], m_termSigns[], m_componentCodes[]
//   and m_ignore[]
//#define MAX_QUERY_TERMS 216
//#define MAX_QUERY_TERMS 512
// seems like CTS is causing huge delay spiders in query processing so
// truncate for now...
//#define MAX_QUERY_TERMS 40
// we need more for zak's categories!
//#define MAX_QUERY_TERMS 1500
// nah, do 40 again
//#define MAX_QUERY_TERMS 40
// how to make a lock pick set loses synonyms from 40!
//#define MAX_QUERY_TERMS 80
#define MAX_QUERY_TERMS 160
// only allow up to 200 interests from facebook plus manually entered
// because we are limited by the query terms above so we can only
// UOR so many in SearchInput.cpp
// NOTE(review): 200 is larger than MAX_QUERY_TERMS (160), which contradicts
// the reasoning above -- confirm which limit actually applies
#define MAX_INTERESTS 200
// size in bytes of Query::m_gbuf, the scratch buffer m_qwords can point into
#define GBUF_SIZE (16*1024)
// not referenced anywhere else in this header
#define SYNBUF_SIZE (16*1024)
// score of highest-scoring query term in the QueryScore
//#define BASE_QUERY_SCORE 10000000
// let's support up to 64 query terms for now
// . qvec_t is a bit vector with one bit per query term
// . MAX_EXPLICIT_BITS is the number of bits in it
typedef unsigned long long qvec_t;
#define MAX_EXPLICIT_BITS (sizeof(qvec_t)*8)
// only used by the commented-out Operand class below (bytes in m_opBits[])
#define MAX_OVEC_SIZE 256
// only can use 16-bit since have to make a 64k truth table!
// NOTE(review): the value below is 16*8 = 128, not 16 as the line above
// implies -- confirm which one is intended
#define MAX_EXPLICIT_BITS_BOOLEAN (16*8)
// field codes
// . getFieldCode() below returns one of these as a char, and QueryWord and
//   QueryTerm store it in their char m_fieldCode members, so every value
//   must fit in a char
// . the numbering is not contiguous: 20 is the retired FIELD_RANGE, and
//   26-29 and 70-91 are not assigned in this header
#define FIELD_URL 1
#define FIELD_LINK 2
#define FIELD_SITE 3
#define FIELD_IP 4
#define FIELD_SUBURL 5
#define FIELD_TITLE 6
#define FIELD_TYPE 7
#define FIELD_EXT 21
#define FIELD_COLL 22
#define FIELD_ILINK 23
#define FIELD_LINKS 24
#define FIELD_SITELINK 25
// non-standard field codes
#define FIELD_ZIP 8
#define FIELD_CITY 9
#define FIELD_STREET 10
#define FIELD_AUTHOR 11
#define FIELD_LANG 12
#define FIELD_CLASS 13
#define FIELD_COUNTRY 14
#define FIELD_TAG 15
#define FIELD_STATE 16
#define FIELD_DATE 17
// returned by getFieldCode() when the name is not in the list
#define FIELD_GENERIC 18
#define FIELD_ISCLEAN 19 // we hash field="isclean:" val="1" if doc clean
//#define FIELD_RANGE 20 // date range OBSOLETE, was only for newspaperarchive
#define FIELD_CHARSET 30
#define FIELD_GBRSS 31
#define FIELD_URLHASH 32
#define FIELD_URLHASHDIV10 33
#define FIELD_URLHASHDIV100 34
#define FIELD_GBRULESET 35
#define FIELD_GBLANG 36
#define FIELD_GBQUALITY 37
#define FIELD_LINKTEXTIN 38
#define FIELD_LINKTEXTOUT 39
#define FIELD_KEYWORD 40
#define FIELD_QUOTA 41
#define FIELD_GBTAGVECTOR 42
#define FIELD_GBGIGABITVECTOR 43
#define FIELD_GBSAMPLEVECTOR 44
#define FIELD_SYNONYM 45
#define FIELD_GBCOUNTRY 46
#define FIELD_GBAD 47
#define FIELD_GBSUBMITURL 48
#define FIELD_GBPERMALINK 49
#define FIELD_GBCSENUM 50
#define FIELD_GBSECTIONHASH 51
#define FIELD_GBDOCID 52
#define FIELD_GBCONTENTHASH 53 // for deduping at spider time
#define FIELD_GBSORTBYFLOAT 54 // i.e. sortby:price -> numeric termlist
#define FIELD_GBREVSORTBYFLOAT 55 // i.e. sortby:price -> low to high
#define FIELD_GBNUMBERMIN 56
#define FIELD_GBNUMBERMAX 57
#define FIELD_GBPARENTURL 58
// the *INT codes below sit alongside the float-based codes above
#define FIELD_GBSORTBYINT 59
#define FIELD_GBREVSORTBYINT 60
#define FIELD_GBNUMBERMININT 61
#define FIELD_GBNUMBERMAXINT 62
#define FIELD_GBFACETSTR 63
#define FIELD_GBFACETINT 64
#define FIELD_GBFACETFLOAT 65
#define FIELD_GBNUMBEREQUALINT 66
#define FIELD_GBNUMBEREQUALFLOAT 67
#define FIELD_SUBURL2 68
#define FIELD_GBFIELDMATCH 69
#define FIELD_GBOTHER 92
// returns a FIELD_* code above, or FIELD_GENERIC if not in the list
// . "s" and "len" describe the text to classify
// . "hasColon" is optional (defaults to NULL)
// NOTE(review): presumably *hasColon receives QueryField::hasColon of the
// matching g_fields[] entry -- confirm against the definition
char getFieldCode ( char *s , long len , bool *hasColon = NULL ) ;
// same signature as getFieldCode(); how it differs is not visible from this
// header
char getFieldCode2 ( char *s , long len , bool *hasColon = NULL ) ;
// NOTE(review): "h64" is presumably a 64-bit hash of the field name -- confirm
char getFieldCode3 ( long long h64 ) ;
// NOTE(review): presumably the number of entries in g_fields[] -- confirm
long getNumFieldCodes ( );
// . values for QueryField::m_flag
// . each value is a distinct single bit
// . QTF_DUP means it is just for the help page in PageRoot.cpp to
// illustrate a second or third example
#define QTF_DUP 0x01
// NOTE(review): presumably hides the entry from the help page -- confirm
#define QTF_HIDE 0x02
// NOTE(review): presumably starts a new table on the help page -- confirm
#define QTF_BEGINNEWTABLE 0x04
// One entry of the g_fields[] table declared below. Field order matters if
// the table is built with positional aggregate initializers, so do not
// reorder members.
struct QueryField {
// NOTE(review): presumably the field name as typed in a query -- confirm
char *text;
// NOTE(review): presumably one of the FIELD_* codes above -- confirm
char field;
bool hasColon;
// example and description strings (see the QTF_DUP comment above about
// the help page)
char *example;
char *desc;
char *m_title;
// bitwise OR of the QTF_* values above
char m_flag;
};
// the table itself is defined in another translation unit
extern struct QueryField g_fields[];
// reasons why we ignore a particular QueryWord's word or phrase
// . stored in QueryWord::m_ignoreWord and QueryWord::m_ignorePhrase
#define IGNORE_DEFAULT 1 // punct
#define IGNORE_CONNECTED 2 // connected sequence (cd-rom)
#define IGNORE_QSTOP 3 // query stop word (come 'to' me)
#define IGNORE_REPEAT 4 // repeated term (time after time)
#define IGNORE_FIELDNAME 5 // word is a field name, like title:
#define IGNORE_BREECH 6 // query exceeded MAX_QUERY_TERMS so we ignored part
#define IGNORE_BOOLOP 7 // boolean operator (OR,AND,NOT)
#define IGNORE_QUOTED 8 // word in quotes is ignored. "the day"
// retired; its value 9 is now used by IGNORE_COMPONENT below
//#define IGNORE_SYNONYM 9 // part of a gbsynonym: field
// . reasons why we ignore a QueryTerm
// . we replace sequences of UOR'd terms with a compound term, which is
// created by merging the termlists of the UOR'd terms together. We store
// this compound termlist into a cache to avoid having to do the merge again.
#define IGNORE_COMPONENT 9 // if term was replaced by a compound term
// boolean query operators (m_opcode field in QueryWord)
#define OP_OR 1
#define OP_AND 2
#define OP_NOT 3
#define OP_LEFTPAREN 4
#define OP_RIGHTPAREN 5
// the "UOR" operator; comments elsewhere in this header describe UOR'd terms
// being merged into one compound term
#define OP_UOR 6
// the | (pipe) operator, see QueryWord::m_piped
#define OP_PIPE 7
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
////////// BEGIN BOOLEAN STUFF /////////////
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
/*
// . creating a QueryBoolean class was unnecessary since it was only functional
// and had nothing new it would store that the Query class doesn't store
// . the entry point is the Query::setBitScoresBoolean() function below
// . we can have as many operands (plain opds, not expressions) as query terms
// . no, not anymore, we boosted MAX_QUERY_TERMS so we can have UORs which
// essentially make a bunch of terms use the same explicit bit
//#define MAX_OPERANDS 16
#define MAX_OPERANDS (MAX_QUERY_TERMS)
class Operand {
public:
long set ( long a , long b , class QueryWord *qwords , long level ,
bool underNOT ) ;
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
// . Operand::m_opBits is the required bits for operand to be true
// . does not include signless phrases
//bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
bool isTruth ( unsigned char *bitVec , long vecSize ) {
// must always satisfy hard required terms (+ sign)
//if ( (bits & m_forcedBits) != m_forcedBits )
// return false;
//if (m_hasNOT) return (bits & m_opBits & mask) == 0;
//return ( (bits & m_opBits & mask) == (m_opBits & mask));
if ( m_hasNOT ) {
for ( long i = 0 ; i < vecSize ; i++ )
if ( m_opBits[i] & bitVec[i] ) return false;
return true;
}
for ( long i = 0 ; i < vecSize ; i++ )
if ( m_opBits[i] & bitVec[i] ) return true;
return false;
// . we are now back to good ol' default OR
// . m_opBits should have been masked with
// m_requiredBits so as not to include signless phrases
//return ( (bits & m_opBits) != 0 );
};
void print (SafeBuf *sbuf);
// we are a sequence of QueryWords
//long m_startWordNum;
//long m_lastWordNum;
// . doc just needs one of these bits for this op to be considered true
// . terms under the same QueryTermInfo class should have the same
// termbit here
unsigned char m_opBits[MAX_OVEC_SIZE];
//long m_vecSize;
// does the word NOT precede the operand?
bool m_hasNOT;
//class Expression *m_parent;
// we MUST have these for this OPERAND to be true
//unsigned short m_forcedBits;
};
*/
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
////////// END BOOLEAN STUFF /////////////
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
// capacity of each of the four m_facetRange* arrays in QueryWord
#define MAX_FACET_RANGES 256

// . QueryWord and QueryTerm are essentially plain records
// . QueryWord, like the Phrases class, is an extension on the Words class
// . Query::m_qwords[] holds the QueryWords, and Query::m_qterms[] is
//   computed from them
class QueryWord {

 public:

	// result of is_alnum_utf8() on the start of the word
	bool isAlphaWord ( ) { return is_alnum_utf8 ( m_word ); }

	// true if any utf8 character in [m_word,m_word+m_wordLen) is
	// whitespace according to is_wspace_utf8()
	bool hasWhiteSpace ( ) {
		char *end = m_word + m_wordLen;
		char *cur = m_word;
		while ( cur < end ) {
			if ( is_wspace_utf8 ( cur ) ) return true;
			// step over one whole utf8 character
			cur += getUtf8CharSize ( cur );
		}
		return false;
	}

	// points into the actual query text, not a private copy
	char *m_word ;
	long  m_wordLen ;

	// length of the phrase beginning at m_word, if there is one
	long  m_phraseLen;

	// term hashes that include the collection and field name. these
	// can be looked up directly in indexdb.
	long long m_wordId ;
	long long m_phraseId;

	// hash of field name then collection, used to hash termId
	long long m_prefixHash;

	long m_wordNum;
	long m_posNum;

	// set when we are inside a phrase that is a wikipedia title
	long m_wikiPhraseId;
	long m_wikiPhraseStart;
	long m_numWordsInWikiPhrase;

	// . hash of just the term text, used for highlighting, etc.
	// . it is 0 for terms in a field?
	long long m_rawWordId ;
	long long m_rawPhraseId ;

	// for a phrase, the raw id of its last word
	long long m_rightRawWordId;

	// numeric FIELD_* code of our field
	char m_fieldCode ;

	// . '-' excludes the term from the search results
	// . '+' requires it in every search result
	// . for a phrase term the sign distributes across the quotes
	char m_wordSign;
	char m_phraseSign;

	// 1 if the word is a valid query term whose m_explicitBit is 0.
	// explicit bits are saved for the terms that need them (like terms
	// in complicated nested boolean expressions); the rest are counted
	// with a hardCount. see IndexTable.cpp "hardCount"
	char m_hardCount;

	// parenthetical depth of this word in the boolean expression,
	// where 0 is the outermost level
	char m_level;

	// query stop word?
	bool m_isQueryStopWord ;

	// plain stop word?
	bool m_isStopWord ;

	bool m_isPunct;

	// one of the OP_* codes if this word is a boolean operator
	char m_opcode;

	// . IGNORE_* code saying why the word (or its phrase) is ignored
	// . see the #define'd IGNORE_* codes above
	char m_ignoreWord ;
	char m_ignorePhrase ;

	// so we ignore gbsortby:offerprice in bool expressions
	char m_ignoreWordInBoolQuery;

	// a single word in quotes?
	bool m_inQuotes ;

	// part of a quoted phrase?
	bool m_inQuotedPhrase;

	// word numbers of the first and last word of the quote we are in
	long m_quoteStart;
	long m_quoteEnd; // inclusive!

	// connected to the alnum word on our left/right?
	bool m_leftConnected;
	bool m_rightConnected;

	// where the phrase starts if we are in its middle or at its right end
	long m_leftPhraseStart;

	// . the QueryTerm our "phrase" maps to, NULL if none
	// . lets us OR extra bits for the single word constituents into
	//   that QueryTerm's m_bits member
	// . m_bits is a bit vector that represents the QueryTerms a
	//   document contains
	class QueryTerm *m_queryPhraseTerm;

	// . the QueryTerm our "word" maps to, NULL if none
	// . used by QueryBoolean since it uses QueryWords heavily
	class QueryTerm *m_queryWordTerm;

	// weights supplied by the user
	long m_userWeight;
	char m_userType;
	long m_userWeightPhrase;
	char m_userTypePhrase;

	bool m_queryOp;

	// follows a NOT operator? i.e. NOT ( x UOR y UOR ... )
	bool m_underNOT;

	// precedes a | (pipe) operator?
	bool m_piped;

	// lets Matches.cpp highlight different words in different colors
	long m_colorNum;

	// value for min/max score ranges like gbmin:price:1.99
	float m_float;

	// integer counterpart for gbminint:99 etc., for better resolution
	long m_int;

	// holds some synonyms
	SafeBuf m_synWordBuf;

	// the first m_numFacetRanges slots of these arrays are in use
	long  m_facetRangeIntA   [MAX_FACET_RANGES];
	long  m_facetRangeIntB   [MAX_FACET_RANGES];
	float m_facetRangeFloatA [MAX_FACET_RANGES];
	float m_facetRangeFloatB [MAX_FACET_RANGES];
	long  m_numFacetRanges;

	// set when the operand is itself an expression
	class Expression *m_expressionPtr;
};
// . we filter the QueryWords and turn them into QueryTerms
// . QueryTerms are the important parts of the QueryWords
// . a QueryTerm is either the word or the phrase of one QueryWord
class QueryTerm {

 public:

	// the query word we were derived from
	QueryWord *m_qword;

	// . are we a phrase termid or single word termid from that QueryWord?
	// . the QueryWord instance represents both, so we must choose
	bool m_isPhrase;

	// for compound phrases like, "cat dog fish" we do not want docs
	// with "cat dog" and "dog fish" to match, so we extended our hackfix
	// in Summary.cpp to use m_phrasePart to do this post-query filtering
	long m_phrasePart;

	// this is phraseId for phrases, and wordId for words
	long long m_termId;

	// used by Matches.cpp
	long long m_rawTermId;

	// . if we are a phrase these are the termids of the word that
	//   starts the phrase and the word that ends the phrase respectively
	long long m_rightRawWordId;
	long long m_leftRawWordId;

	// sign of the phrase or word we used
	char m_termSign;

	// our representative bit within a qvec_t
	qvec_t m_explicitBit;

	// usually this equals m_explicitBit, BUT if a word is repeated
	// in different areas of the doc, we union all the individual
	// explicit bits of that repeated word into this bit vec. it is
	// used by Matches.cpp only so far.
	qvec_t m_matchesExplicitBits;

	// this is 1 if the associated word is a valid query term but its
	// m_explicitBit is 0. we use this to save explicit bits for those
	// terms that need them (like those terms in complicated nested
	// boolean expressions) and just use a hardCount to see how many hard
	// required terms are contained by a document. see IndexTable.cpp
	// "hardCount"
	char m_hardCount;

	// the "number" of the query term used for evaluating boolean
	// expressions in Expression::isTruth(). Basically just the
	// QueryTermInfo to which this query term belongs. each QueryTermInfo
	// is like a single query term and all its synonyms, etc.
	long m_bitNum;

	// point to term, either m_word or m_phrase
	char *m_term;
	long  m_termLen;

	// point to the posdblist that represents us
	class RdbList *m_posdbListPtr;

	// the ()'s following an int/float facet term dictate the
	// ranges for clustering the numeric values. like
	// gbfacetfloat:price:(0-10,10-20,...)
	// values outside the ranges will be ignored
	char *m_parenList;
	long  m_parenListLen;

	// . our representative bits
	// . the bits in this bit vector are 1-1 with the QueryTerms
	// . if a doc has query term #i then bit #i will be set
	// . if a doc EXplicitly has phrase "A B" then it may have
	//   term A and term B implicitly
	// . therefore we also OR the bits for term A and B into
	//   m_implicitBits
	// . this should only be used in setBitScores()
	qvec_t m_implicitBits;

	// Summary.cpp and Matches.cpp use this one
	bool m_isQueryStopWord ;

	// IndexTable.cpp uses this one
	bool m_inQuotes;

	// . is this term under the influence of a boolean NOT operator?
	// . used in IndexReadInfo.cpp, if so we must read the WHOLE termlist
	bool m_underNOT;

	// is it a repeat?
	char m_repeat;

	// user defined weight for this term, be it phrase or word
	long m_userWeight;
	char m_userType;

	// . is this query term before a | (pipe) operator?
	// . if so we must read the whole termlist, like m_underNOT above
	bool m_piped;

	// . we ignore component terms unless their compound term is not
	//   cached
	// . now this is used to ignore low tf synonym terms only
	char m_ignored ;

	// is it part of a UOR chain?
	bool m_isUORed;
	QueryTerm *m_UORedTerm;

	// . if m_synonymOf is not NULL, then m_term points into m_synBuf,
	//   not m_buf
	QueryTerm *m_synonymOf;
	long long m_synWids0;
	long long m_synWids1;
	long m_numAlnumWordsInSynonym;

	// like if we are the "nj" syn of "new jersey", this will be 2 words
	// since "new jersey", our base, is 2 alnum words.
	long m_numAlnumWordsInBase;

	// copied from derived QueryWord
	char m_fieldCode ;

	// defined out of line; Query::isSplit(long) forwards to this
	bool isSplit();

	bool m_isRequired;

	// . true if we are a word IN a phrase
	// . used by IndexTable2's getWeightedScore()
	char m_inPhrase;

	// . a one-bit flag
	// . declared unsigned because the signedness of a plain "char"
	//   bit-field is implementation-defined; where it is signed, a
	//   one-bit field can only hold 0 and -1, so storing 1 or true and
	//   then comparing against 1 would fail
	// . size and position in the object are unchanged
	unsigned char m_isWikiHalfStopBigram:1;

	// if a single word term, what are the term #'s of the 2 phrases
	// we can be in? uses -1 to indicate none.
	long m_leftPhraseTermNum;
	long m_rightPhraseTermNum;

	// same as above basically
	class QueryTerm *m_leftPhraseTerm;
	class QueryTerm *m_rightPhraseTerm;

	// for scoring summary sentences from XmlDoc::getEventSummary()
	float m_score;

	// facet support in Posdb.cpp for compiling the data and we'll
	// send this back via Msg39Reply::ptr_facetHashList which will be
	// 1-1 with the query terms.
	HashTableX m_facetHashTable;

	// NOTE(review): presumably the key range of this term's list, with
	// m_ks the key size in bytes -- confirm against the users
	char m_startKey[MAX_KEY_BYTES];
	char m_endKey  [MAX_KEY_BYTES];
	char m_ks;

	// used by Msg40.cpp for gigabits generation
	long long m_hash64d;
	long m_popWeight;
};
//#define MAX_OPSLOTS 256
// capacity of Query::m_expressions[]
#define MAX_EXPRESSIONS 100
// One boolean (sub)expression of a query, e.g.
// operand1 AND operand2 OR ...
// operand1 OR operand2 AND ...
// It is described as a span of query words (m_expressionStartWord,
// m_numWordsInExpression) in the Query pointed to by m_q.
class Expression {
public:
// NOTE(review): "start"/"end" are presumably query word numbers and "level"
// the parenthetical depth (see QueryWord::m_level) -- confirm in Query.cpp
bool addExpression (long start,
long end,
class Query *q,
long level );
// . "bitVec" holds "vecSize" bytes
// . Query::matchesBoolQuery() takes the same arguments and describes bitVec
//   as the bits some docid contains
bool isTruth ( unsigned char *bitVec , long vecSize );
// NOTE(review): the three lines below describe a function that is no longer
// declared here; they do not describe print()
// . what QueryTerms are UNDER the influence of the NOT opcode?
// . we read in the WHOLE termlist of those that are (like '-' sign)
// . returned bit vector is 1-1 with m_qterms in Query class
void print (SafeBuf *sbuf);
// NOTE(review): m_operands and m_opcodes mentioned below are not members of
// this class any more; the comment is kept for history
// . a list of operands separated by op codes (a AND b OR c ...)
// . sometimes an operand is another expression: a AND (b OR c)
// . use NULL in m_operands slot if we got an expression and vice versa
// . m_opcodes[i] is the opcode after operand #i
//class Expression *m_parent;
//bool m_hasNOT;
//long m_start;
//long m_end;
bool m_hadOpCode;
// first query word of the expression and how many words it spans
long m_expressionStartWord;
long m_numWordsInExpression;
// the query we belong to
Query *m_q;
// . opSlots can be operands operators or expressions
// . m_opTypes tells which of the 3 they are
//long m_opSlots[MAX_OPSLOTS];
//char m_opTypes[MAX_OPSLOTS];
//long m_cc;
};
// . this is the main class for representing a query
// . it contains array of QueryWords (m_qwords[]) and QueryTerms (m_qterms[])
class Query {
public:
void reset();
Query();
~Query();
void constructor();
void destructor();
// . returns false and sets g_errno on error
// . after calling this you can call functions below
// . if boolFlag is 0 we ignore all boolean operators
// . if boolFlag is 1 we assume query is boolen
// . if boolFlag is 2 we attempt to detect if query is boolean or not
bool set2 ( char *query ,
//long queryLen ,
//char *coll ,
//long collLen ,
uint8_t langId ,
char queryExpansion ,
bool useQueryStopWords = true );
//char boolFlag = 2 , // auto-detect if boolean query
//bool keepAllSingles = false ,
//long maxQueryTerms = 0x7fffffff );
// serialize/deserialize ourselves so we don't have to pass the
// unmodified string around and reparse it every time
long getStoredSize();
long serialize(char *buf, long bufLen);
long deserialize(char *buf, long bufLen);
// . if a term is truncated in indexdb, change its '+' sign to a '*'
// . will recopmute m_bitScores to fix bit #7
//void softenTruncatedTerms ( );
bool setQueryTermScores ( long long *termFreqsArg ) ;
// about how hits for this query?
//long long getEstimatedTotalHits ( );
char *getQuery ( ) { return m_orig ; };
long getQueryLen ( ) { return m_origLen; };
//long getNumIgnored ( ) { return m_numIgnored; };
//long getNumNotIgnored ( ) { return m_numTerms ; };
long getNumTerms ( ) { return m_numTerms; };
char getTermSign ( long i ) { return m_qterms[i].m_termSign; };
bool isPhrase ( long i ) { return m_qterms[i].m_isPhrase; };
bool isInPhrase ( long i ) { return m_qterms[i].m_inPhrase; };
bool isInQuotes ( long i ) { return m_qterms[i].m_inQuotes; };
long long getTermId ( long i ) { return m_qterms[i].m_termId; };
char getFieldCode2( long i ) { return m_qterms[i].m_fieldCode; };
long long getRawTermId ( long i ) { return m_qterms[i].m_rawTermId; };
char *getTerm ( long i ) { return m_qterms[i].m_term; };
long getTermLen ( long i ) { return m_qterms[i].m_termLen; };
bool isQueryStopWord (long i ) {
return m_qterms[i].m_isQueryStopWord; };
// . not HARD required, but is term #i used for an EXACT match?
// . this includes negatives and phrases with signs in addition to
// the standard signless single word query term
bool isRequired ( long i ) {
if ( ! m_qterms[i].m_isPhrase ) return true;
if ( m_qterms[i].m_termSign ) return true;
return false;
};
//long getNumRequired ( ) ;
bool isSplit();
bool isSplit(long i) { return m_qterms[i].isSplit(); };
// . Msg39 calls this to get our vector so it can pass it to Msg37
// . the signs and ids are dupped in the QueryTerm classes, too
//long long *getTermFreqs ( ) { return m_termFreqs ; };
//long long getTermFreq ( long i ) { return m_termFreqs[i]; };
long long *getTermIds ( ) { return m_termIds ; };
char *getTermSigns ( ) { return m_termSigns ; };
long *getComponentCodes ( ) { return m_componentCodes; };
long long getRawWordId ( long i ) { return m_qwords[i].m_rawWordId;};
long getNumComponentTerms ( ) { return m_numComponents; };
// sets m_bmap[][] so getImplicits() works
void setBitMap ( );
bool testBoolean(unsigned char *bits,long vecSize);
// print to log
void printBooleanTree();
void printQueryTerms();
// the new way as of 3/12/2014. just determine if matches the bool
// query or not. let's try to offload the scoring logic to other places
// if possible.
// bitVec is all the QueryWord::m_opBits some docid contains, so
// does it match our boolean query or not?
bool matchesBoolQuery ( unsigned char *bitVec , long vecSize ) ;
// . call this before calling getBitScore() to set m_bitScores[] table
// . returns false and sets g_errno on error (ENOMEM usually)
//bool setBitScores (qvec_t bitMask = (qvec_t)-1);
// . m_bitScores[ BITS ] maps BITS to a bitScore
// . the BITS of a doc is 1-1 with m_qterms[] present in that doc
// . bitScore returns # of required terms implicitly in the doc
// . required terms do not include query terms from signless phrases
// . if bitScore 0x80 is set it matches all forced terms (plus signs)
// . if bitScore 0x40 is set it has all required terms EXplicitly
// . if bitScore 0x20 is set it has all required terms IMplicitly
// . example query: 'cat dog' --> "cat dog"=bit#0, cat=bit#1, dog=bit#2
// . if a doc does not explicitly have 'cat', but it has the phrase
// "cat dog" then it is said to have 'cat' implicitly... implied
// through the phrase
// . the greater the number of IMplicit SINGLE words a doc has the
// bigger its bit score
/*
uint8_t getBitScore ( qvec_t ebits ) {
// get implicit bits from explicit bits
qvec_t ibits = getImplicits ( ebits );
// . boolean queries are limited in the # of terms so that
// ebits should NOT be too big, under 10MB i think now
// . sets the usual 0x80,0x40,0x20 + require termcount.
// . for boolean queries, if we have too many
// explicits then when we alloc for "need4" above, it
// should return ENOMEM and we should never make it
// here! so "ibits" should not be too big and breach
// the array!
if ( m_isBoolean ) {
//return m_bitScores[(uint32_t)ibits];
uint8_t bscore = 0;
if( testBoolean(ibits)) bscore = 0x80|0x40|0x20;
return bscore;
}
// just get those required
ibits &= m_requiredBits;
// get the vector of required bits we implicitly have
uint8_t *iv = (uint8_t *)&ibits;
// set this
uint8_t bscore ;
// . how many terms we do have implicitly?
// . the g_a table is in Mem.cpp and maps a byte to the number
// of bits it has that are in the ON position
bscore = g_a[iv[0]] + g_a[iv[1]] + g_a[iv[2]] + g_a[iv[3]]
+ g_a[iv[4]] + g_a[iv[5]] + g_a[iv[6]] + g_a[iv[7]];
// if we have synonyms, then any implied bits a synonym has
// should be treated as explicit bits for "bit score" purposes
// so that if someone searches 'cincinnati, oh' a doc with
// 'cincinnati ohio' is treated no lesser than a doc with
// 'cincinnati oh'. BUT only do this for "stem" or
// "morphological form" synonyms, because it is allows 'bib'
// for the query 'michael bibby facts' to outscore docs that
// have all the original query terms explicitly... so limit
// it to just the "stem" synonyms. BUT, if the syn affinity
// is 0, do not include these at all...
//qvec_t ebits2=ebits | getImplicits(ebits&m_synonymBitsHiAff);
qvec_t ebits2 = ebits | getImplicits(ebits&m_synonymBits);
// then OR in some high bits
if ((ebits2 & m_forcedBits) == m_forcedBits ) bscore|=0x80;
if ((ebits2 & m_requiredBits) == m_requiredBits ) bscore|=0x40;
if (ibits == m_requiredBits ) bscore|=0x20;
return bscore;
};
*/
// return an implicit vector from an explicit which contains the explic
qvec_t getImplicits ( qvec_t ebits ) {
if ( ! m_bmapIsSet ) { char *xx=NULL;*xx=0; }
uint8_t *ev = (uint8_t *)&ebits;
return m_bmap[0][ev[0]] |
m_bmap[1][ev[1]] |
m_bmap[2][ev[2]] |
m_bmap[3][ev[3]] |
m_bmap[4][ev[4]] |
m_bmap[5][ev[5]] |
m_bmap[6][ev[6]] |
m_bmap[7][ev[7]] ;
};
//qvec_t getImplicitBitsFromTermNum ( long qtnum ) {
//};
// returns false if no truths possible
bool setBitScoresBoolean ( char *buf , long bufSize );
// ALWAYS call this after calling setBitScores(), it uses m_bitScores[]
//long long getEstimatedTotalHitsBoolean ( );
// sets m_qwords[] array, this function is the heart of the class
bool setQWords ( char boolFlag , bool keepAllSingles ,
class Words &words , class Phrases &phrases ) ;
// sets m_qterms[] array from the m_qwords[] array
bool setQTerms ( class Words &words , class Phrases &phrases ) ;
// . query expansion functions, the first gets all possible candidates
// (eliminated after Msg37 returns), the second actually modifies the
// query to include new terms
//long getCandidates(long *synMap, long long *synIds, long num);
//void fixTermFreqs ( long *synMap ,
// long numTermsAndSyns ,
// long long *termFreqs ) ;
//long filterCandidates ( long *synMap ,
// long long *synIds ,
// long numTermsAndSyns ,
// long long *termFreqs ,
// char *coll ) ;
//bool expandQuery (long *synMap, long long *synIds, long num,
// long long *termFreqs);
// set m_expressions[] and m_operands[] arrays and m_numOperands
// for boolean queries
bool setBooleanOperands ( );
// helper funcs for parsing query into m_qwords[]
//char getFieldCode ( char *s , long len , bool *hasColon ) ;
bool isConnection ( char *s , long len ) ;
// set the QueryTerm::m_hasNOT members
//void setHasNOTs();
// . used by IndexTable.cpp to make a ptr map of the query terms
// to make intersecting the termlists one at a time efficient
// . "imap" is a list of the termlist numbers, but especially sorted
// . 0 <= imap[i] < m_numTerms
// . sizes[i] is the total docids for query term #i (up to the current
// tier being examined in IndexTable.cpp)
// . we set blocksize[i] only when imap[i] is a termlist which is not
// a signless phrase. it is a number, N, such that
// imap[i], imap[i+1], ... imap[i+N-1] are a "block" that has all
// the signless phrase terms that contain query term # imap[i].
// we cluster them together like this because IndexTable needs to
// hash them all together since the phrase terms can imply the single
// terms.
// . it now returns the number of terms put into imap[]
// . it sets *retNumBlocks to the number of blocks put into
// blocksizes[]
// . "sizes" is the size of each list (all tiers combined). this is
// in query term num space, not IMAP space, and must be provided by
// the caller. it is the only arg that is input, the rest are output.
long getImap ( long *sizes , long *imap , long *blocksizes ,
long *retNumBlocks );
// . replace sequences of UOR'd terms with a single termid
// . the sequence of UOR'd terms are the component terms
// . the term that replaces that sequence is the compound term
// . the compound termlist will be a merge of the components' termlists
// . this sets the component terms QueryTerm::m_ignore char to true
// when they are replaced by a compound term
// . ensures compound term inherits the common QueryTerm::m_explicitBit
// from the component terms it replaced
// . ensures the compound term's m_termFreqs[i] is the sum of the
// components' termFreqs
// . sets QueryTerm::m_component and QueryTerm::m_compound respectively
//void addCompoundTerms ();
public:
// hash of all the query terms
long long getQueryHash();
bool isCompoundTerm ( long i ) ;
class QueryTerm *getQueryTermByTermId64 ( long long termId ) {
for ( long i = 0 ; i < m_numTerms ; i++ ) {
if ( m_qterms[i].m_termId == termId )
return &m_qterms[i];
}
return NULL;
};
// silly little functions that support the BIG HACK
//long getNumNonFieldedSingletonTerms() { return m_numTermsSpecial; };
//long getTermsFound ( Query *q , char *foundTermVector ) ;
// return -1 if does not exist in query, otherwise return the
// query word num
long getWordNum ( long long wordId );
// this is now just used for boolean queries to deteremine if a docid
// is a match or not
unsigned char *m_bitScores ;
long m_bitScoresSize;
// . map explicit bits vector to implied bits vector
// . like m_bitScores but simpler
//qvec_t *m_bmap ;
//long m_bmapSize;
// one bmap per byte of qvec_t
qvec_t m_bmap[sizeof(qvec_t)][256];
// . bit vector that is 1-1 with m_qterms[]
// . only has bits that we must have if we were default AND
//unsigned short m_requiredBits;
qvec_t m_requiredBits;
qvec_t m_matchRequiredBits;
qvec_t m_negativeBits;
qvec_t m_forcedBits;
// bit vector for terms that are synonyms
qvec_t m_synonymBits;
long m_numRequired;
// language of the query
uint8_t m_langId;
bool m_useQueryStopWords;
// use a generic buffer for m_qwords and m_expressions to point into
// so we don't have to malloc for them
char m_gbuf [ GBUF_SIZE ];
char *m_gnext;
QueryWord *m_qwords ; // [ MAX_QUERY_WORDS ];
long m_numWords;
long m_qwordsAllocSize;
// QueryWords are converted to QueryTerms
QueryTerm m_qterms [ MAX_QUERY_TERMS ];
long m_numTerms;
long m_numTermsSpecial;
// separate vectors for easier interfacing, 1-1 with m_qterms
//long long m_termFreqs [ MAX_QUERY_TERMS ];
long long m_termIds [ MAX_QUERY_TERMS ];
char m_termSigns [ MAX_QUERY_TERMS ];
long m_componentCodes [ MAX_QUERY_TERMS ];
char m_ignore [ MAX_QUERY_TERMS ]; // is term ignored?
long m_numComponents;
// how many bits in the full vector?
//long m_numExplicitBits;
// how many terms are we ignoring?
//long m_numIgnored;
// site: field will disable site clustering
// ip: field will disable ip clustering
// site:, ip: and url: queries will disable caching
bool m_hasPositiveSiteField;
bool m_hasIpField;
bool m_hasUrlField;
bool m_hasSubUrlField;
bool m_hasIlinkField;
bool m_hasGBLangField;
bool m_hasGBCountryField;
char m_hasQuotaField;
// query id set by Msg39.cpp
long m_qid;
// . we set this to true if it is a boolean query
// . when calling Query::set() above you can tell it explicitly
// if query is boolean or not, OR you can tell it to auto-detect
// by giving different values to the "boolFlag" parameter.
bool m_isBoolean;
long m_synTerm; // first term that's a synonym
class SynonymInfo *m_synInfo;
long m_synInfoAllocSize;
// if they got a gbdocid: in the query and it's not boolean, set these
long long m_docIdRestriction;
class Host *m_groupThatHasDocId;
// for holding the filtered query, in utf8
char m_buf [ MAX_QUERY_LEN ];
long m_bufLen;
// for holding the filtered/NULL-terminated query for doing
// matching. basically store phrases in here without punct
// so we can point a needle to them for matching in XmlDoc.cpp.
char m_needleBuf [ MAX_QUERY_LEN + 1 ];
long m_needleBufLen;
// the original query
char m_orig [ MAX_QUERY_LEN ];
long m_origLen;
// we just have a ptr to this so don't pull the rug out
//char *m_coll;
//long m_collLen;
// . we now contain the parsing components for boolean queries
// . m_expressions points into m_gbuf or is allocated
//class Expression *m_expressions; // [ MAX_OPERANDS ];
//long m_expressionsAllocSize;
Expression m_expressions[MAX_EXPRESSIONS];
long m_numExpressions;
//class Operand m_operands [ MAX_OPERANDS ];
//long m_numOperands ;
// does query contain the pipe operator
bool m_piped;
long m_maxQueryTerms ;
bool m_queryExpansion;
bool m_truncated;
bool m_hasDupWords;
bool m_hasUOR;
bool m_hasLinksOperator;
bool m_bmapIsSet ;
bool m_hasSynonyms;
SafeBuf m_debugBuf;
};
/*
class QueryScores {
public:
QueryScores(){};
~QueryScores(){};
bool set(Query *q);
void reset();
long getNumTerms ( ) { return m_numTerms; } ;
long getScore ( long i ) { return m_scores[i]; } ;
void setScore (long i, long score) {m_scores[i] = score; };
long long getTermId ( long i ) { return m_q->getTermId(i); } ;
//long long getWordId ( long i ) { return m_wordIds[i]; } ;
private:
Query *m_q;
long m_numTerms;
long long *m_freqs;
long m_termPtrs [ MAX_QUERY_TERMS ];
long m_scores [ MAX_QUERY_TERMS ];
//long m_wordIds [ MAX_QUERY_TERMS ];
};
*/
// Declared here and defined in another translation unit.
// NOTE(review): presumably a self-test of the query parser that returns true
// on success -- confirm against the definition
bool queryTest();
#endif