open-source-search-engine/Query.cpp
2014-10-30 13:36:39 -06:00

#include "gb-include.h"
#include "Query.h"
//#include "Indexdb.h" // g_indexdb.getTruncationLimit() g_indexdb.getTermId()
#include "Words.h"
#include "Bits.h"
#include "Phrases.h"
#include "Url.h"
#include "Clusterdb.h" // g_clusterdb.getNumGlobalRecs()
#include "StopWords.h" // isQueryStopWord()
#include "Sections.h"
#include "Msg1.h"
#include "Speller.h"
//#include "Thesaurus.h"
#include "Mem.h"
#include "Msg3a.h"
#include "HashTableX.h"
#include "Synonyms.h"
#include "Wiki.h"
Query::Query ( ) {
constructor();
}
void Query::constructor ( ) {
//m_bmap = NULL;
m_bitScores = NULL;
m_qwords = NULL;
//m_expressions = NULL;
m_qwordsAllocSize = 0;
//m_expressionsAllocSize = 0;
m_qwords = NULL;
//m_expressions = NULL;
reset ( );
}
void Query::destructor ( ) {
reset();
}
Query::~Query ( ) {
reset ( );
}
void Query::reset ( ) {
m_docIdRestriction = 0LL;
m_groupThatHasDocId = NULL;
m_bufLen = 0;
m_origLen = 0;
m_numWords = 0;
//m_numOperands = 0;
m_numTerms = 0;
m_synTerm = 0;
//m_numIgnored = 0;
//m_numRequired = -1;
m_numComponents = 0;
//if ( m_bmap && m_bmapSize ) // != m_bmbuf )
// mfree ( m_bmap , m_bmapSize , "Query1" );
//if ( m_bitScores && m_bitScoresSize ) // != m_bsbuf )
// mfree ( m_bitScores , m_bitScoresSize , "Query2" );
//m_bmap = NULL;
m_bitScores = NULL;
//m_bmapSize = 0;
m_bitScoresSize = 0;
//if ( m_expressionsAllocSize )
// mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
if ( m_qwordsAllocSize )
mfree ( m_qwords , m_qwordsAllocSize , "Query4" );
//m_expressionsAllocSize = 0;
m_qwordsAllocSize = 0;
m_qwords = NULL;
//m_expressions = NULL;
m_numExpressions = 0;
m_gnext = m_gbuf;
m_hasUOR = false;
m_bmapIsSet = false;
// the site: and ip: query terms will disable site clustering & caching
m_hasPositiveSiteField = false;
m_hasIpField = false;
m_hasUrlField = false;
m_hasSubUrlField = false;
m_hasIlinkField = false;
m_hasGBLangField = false;
m_hasGBCountryField = false;
m_hasQuotaField = false;
m_hasLinksOperator = false;
m_truncated = false;
m_hasSynonyms = false;
}
// . returns false and sets g_errno on error
// . "query" must be NULL terminated
// . if boolFlag is 0 we ignore all boolean operators
// . if boolFlag is 1 we assume query is boolean
// . if boolFlag is 2 we attempt to detect if query is boolean or not
// . if "keepAllSingles" is true we do not ignore any single word UNLESS
// it is a boolean operator (IGNORE_BOOLOP), fieldname (IGNORE_FIELDNAME)
// a punct word (IGNORE_DEFAULT) or part of one field value (IGNORE_DEFAULT)
// This is used for term highlighting (Highlight.cpp and Summary.cpp)
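// illustrative call (a sketch, not taken from this codebase's callers;
// the langId value of 0 is an assumption meaning "unknown language"):
//   Query q;
//   q.set2 ( "cat dog" , 0 , 1 , true );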
bool Query::set2 ( char *query ,
//long queryLen ,
//char *coll ,
//long collLen ,
//char boolFlag ,
//bool keepAllSingles ,
// need language for doing synonyms
uint8_t langId ,
char queryExpansion ,
bool useQueryStopWords ) {
//long maxQueryTerms ) {
m_langId = langId;
m_useQueryStopWords = useQueryStopWords;
// fix summary rerank and highlighting.
bool keepAllSingles = true;
// assume boolean auto-detect.
char boolFlag = 2;
// come back up here if we changed our boolean minds
// top:
reset();
if ( ! query ) return true;
// set to 256 for synonyms?
m_maxQueryTerms = 256;
m_queryExpansion = queryExpansion;
long queryLen = gbstrlen(query);
// override this to 32 at least for now
//if ( m_maxQueryTerms < 32 ) m_maxQueryTerms = 32;
// save collection info
//m_coll = coll;
//m_collLen = collLen;
// truncate query if too big
if ( queryLen >= MAX_QUERY_LEN ) {
log("query: Query length of %li must be less than %li. "
"Truncating.",queryLen,(long)MAX_QUERY_LEN);
queryLen = MAX_QUERY_LEN - 1;
m_truncated = true;
}
// save original query
m_origLen = queryLen;
memcpy ( m_orig , query , queryLen );
m_orig [ m_origLen ] = '\0';
log(LOG_DEBUG, "query: set called = %s", m_orig);
char *q = query;
// see if it should be boolean...
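// e.g. "cats AND dogs" or "beatles OR (stones NOT wings)" flips boolFlag
// to 1 here; lowercase "and"/"or" does not, since only the uppercase
// operators followed by a space or '(' are detected below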
for ( long i = 0 ; i < queryLen ; i++ ) {
// but if bool flag is 0 that means it is NOT boolean!
// it must be one for autodetection. so do not autodetect
// unless this is 2.
if ( boolFlag != 2 ) break;
if ( q[i]=='A' && q[i+1]=='N' && q[i+2]=='D' &&
(q[i+3]==' ' || q[i+3]=='(') )
boolFlag = 1;
if ( q[i]=='O' && q[i+1]=='R' &&
(q[i+2]==' ' || q[i+2]=='(') )
boolFlag = 1;
if ( q[i]=='N' && q[i+1]=='O' && q[i+2]=='T' &&
(q[i+3]==' ' || q[i+3]=='(') )
boolFlag = 1;
}
// if we did not set the flag to 1 set it to 0. force to non-bool
if ( boolFlag == 2 ) boolFlag = 0;
// come back up here if we find no bool operators but had ()'s
// top:
// reset anything that was allocated... in case we're being
// called from below... m_qwords may have been allocated in call
// to setQWords() below
// NO! this resets m_origLen to 0!!! not to mention other member vars
// that were set somewhere above!!! i moved top: label above!
//reset();
// convenience ptr
char *p = m_buf;
char *pend = m_buf + MAX_QUERY_LEN;
bool inQuotesFlag = false;
// . copy query into m_buf
// . translate ( and ) to special query operators so Words class
// can parse them as their own word to make parsing bool queries ez
// for parsing out the boolean operators in setBitScoresBoolean()
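// e.g. with boolFlag 1 the query "(cats OR dogs)" is rewritten here as
// " LeFtP cats OR dogs RiGhP ", and "a|b" becomes "a PiiPE b" regardless
// of boolFlag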
for ( long i = 0 ; i < queryLen ; i++ ) {
// gotta count quotes! we ignore operators in quotes
// so you can search for diffbotUri:"article|0|123456"
if ( query[i] == '\"' ) inQuotesFlag = !inQuotesFlag;
if ( inQuotesFlag ) {
*p = query [i];
p++;
continue;
}
// dst buf must be big enough
if ( p + 8 >= pend ) {
g_errno = EBUFTOOSMALL;
return log(LOG_LOGIC,"query: query: query too big.");
}
// translate ( and )
if ( boolFlag == 1 && query[i] == '(' ) {
memcpy ( p , " LeFtP " , 7 ); p += 7;
continue;
}
if ( boolFlag == 1 && query[i] == ')' ) {
memcpy ( p , " RiGhP " , 7 ); p += 7;
continue;
}
if ( query[i] == '|' ) {
memcpy ( p , " PiiPE " , 7 ); p += 7;
continue;
}
// translate [#a] [#r] [#ap] [#rp] [] [p] to operators
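// these appear to be user weight operators: e.g. "[5a]" is rewritten as
// " LeFtB 5 a RiGhB " so setQWords() can parse out the weight (5) and
// type (presumably 'a' = absolute, 'r' = relative, with a 'p' suffix for
// phrase weights; "[]" and "[p]" emit an empty bracket pair, presumably
// resetting to the defaults)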
if ( query[i] == '[' && is_digit(query[i+1])) {
long j = i+2;
long val = atol ( &query[i+1] );
while ( is_digit(query[j]) ) j++;
char c = query[j];
if ( (c == 'a' || c == 'r') && query[j+1]==']' ) {
sprintf ( p , " LeFtB %li %c RiGhB ",val,c);
p += gbstrlen(p);
i = j + 1;
continue;
}
else if ( (c == 'a' || c == 'r') &&
query[j+1]=='p' && query[j+2]==']') {
sprintf ( p , " LeFtB %li %cp RiGhB ",val,c);
p += gbstrlen(p);
i = j + 2;
continue;
}
}
if ( query[i] == '[' && query[i+1] == ']' ) {
sprintf ( p , " LeFtB RiGhB ");
p += gbstrlen(p);
i = i + 1;
continue;
}
if ( query[i] == '[' && query[i+1] == 'p' && query[i+2]==']') {
sprintf ( p , " LeFtB RiGhB ");
p += gbstrlen(p);
i = i + 2;
continue;
}
char *q = &(query[i]);
// Skip old buzz permalink keywords
if (*q == 'g' && *(q+1) == 'b'){
// do not skip anymore, Msg5e.cpp needs this
/*
if (*(q+2) == 'p' && *(q+3) == 'e' && *(q+4) == 'r'
&& *(q+5) == 'm' && *(q+6) == 'a' && *(q+7) == 'l'
&& *(q+8) == 'i' && *(q+9) == 'n' && *(q+10) == 'k'
&& *(q+11) == ':' && *(q+12) =='1'){
//i += 12;
static bool s_printed = false;
if ( ! s_printed )
logf(LOG_DEBUG,"query: skipping "
"gbpermalink term for buzz.");
if ( ! s_printed ) s_printed = true;
continue;
}
*/
if (*(q+2)=='k' && *(q+3)=='e' && *(q+4) == 'y'
&& *(q+5)=='w' && *(q+6)=='o' && *(q+7) == 'r'
&& *(q+8) == 'd' && *(q+9) == ':'
&& *(q+10)=='r' && *(q+11)=='3' && *(q+12)=='6'
&& *(q+13) == 'p' && *(q+14) == '1'){
//logf(LOG_DEBUG,"query: skipping funky "
// "keyword term for buzz.");
i += 14;
continue;
}
}
// TODO: copy altavista's operators here? & | !
// otherwise, just a plain copy
*p = query [i];
p++;
}
// NULL terminate
*p = '\0';
// debug statement
//log(LOG_DEBUG,"Query: Got new query=%s",tempBuf);
//printf("query: query: Got new query=%s\n",tempBuf);
// set length
m_bufLen = p - m_buf;
Words words;
Phrases phrases;
// set m_qwords[] array from m_buf
if ( ! setQWords ( boolFlag , keepAllSingles , words , phrases ) )
return false;
//log(LOG_DEBUG, "Query: QWords set");
// did we have any boolean operators
/*
char found = 0;
char parens = 0;
if ( boolFlag == 1 ) {
for ( long i = 0 ; i < m_numWords ; i++ ) {
char *w = m_qwords[i].m_word;
long wlen = m_qwords[i].m_wordLen;
if (wlen==2 &&w[0]=='O'&&w[1]=='R' )
found=1;
else if (wlen==3 &&w[0]=='A'&&w[1]=='N'&&w[2]=='D')
found=1;
else if (wlen==3 &&w[0]=='N'&&w[1]=='O'&&w[2]=='T')
found=1;
if (wlen==5 &&w[0]=='L' && w[1]=='e' &&
w[2]=='F' && w[3]=='t' && w[4]=='P' )
parens=1;
else if (wlen==5 &&w[0]=='R' && w[1]=='i' &&
w[2]=='G' && w[3]=='h' && w[4]=='P' )
parens=1;
}
// if we were told it was a bool query or to auto-detect
// and it has no operators, but had parens, re-do so parens
// do not get translated to LeFtP or RiGhP
if ( boolFlag >= 1 && found == 0 && parens == 1 ) {
boolFlag = 0; goto top; }
// if no bool operators, it's definitely not a boolean query
if ( found == 0 ) boolFlag = 0;
}
*/
// set m_qterms from m_qwords, always succeeds
setQTerms ( words , phrases );
// . now add in compound termlists
// . compound query terms replace lists of UOR'd query terms that
// share the same QueryTerm::m_exclusiveBit (ebit)
// . if it cannot get the compound termlist from a remote cache, then
// Msg2 should get its components
// . component termlists have their compound termlist number
// as their m_componentCode, compound termlists have a componentCode
// of -1, other termlists have a componentCode of -2.
// . Query::addCompoundTerms() will add one extra query term for every
// sequence of UOR'd query terms that share the same ebit.
// Furthermore, it sets the m_componentCodes[] array.
// . The compound term must have the same ebit as its component terms.
// . we use the termid of compound termlists (and NOT their components)
// when routing this query to the host that can use the least
// amount of bandwidth to download/get the termlists. if the compound
// termlist is not in the cache then it will not be on disk or
// in the tree since it is a virtual termlist, BUT we will still
// create it and store it in the cache, so assume it is in a cache,
// because the act of storing it in the cache may require sending
// it to another machine.
// . if m_compoundListMaxSize is 0, do not do compound lists
// . Query::addCompoundTerms() will set the termfreq of compound terms
// to the sum of the termfreqs of its component termlists
//if ( m_compoundListMaxSize > 0 ) addCompoundTerms( );
// . always add them for now
//addCompoundTerms( );
// if m_isBoolean was set and we only have OP_UOR then
// we should probably unset it here (mdw)
// set m_expressions[] and m_operands[] arrays and m_numOperands
// for boolean queries
//if ( m_isBoolean )
// if ( ! setBooleanOperands() ) return false;
// disable stuff for site:, ip: and url: queries
for ( long i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( qw->m_fieldCode == FIELD_SITE &&
qw->m_wordSign != '-' )
m_hasPositiveSiteField = true;
else if ( qw->m_fieldCode == FIELD_IP )
m_hasIpField = true;
else if ( qw->m_fieldCode == FIELD_URL )
m_hasUrlField = true;
else if ( qw->m_fieldCode == FIELD_ILINK )
m_hasIlinkField = true;
else if ( qw->m_fieldCode == FIELD_GBLANG )
m_hasGBLangField = true;
else if ( qw->m_fieldCode == FIELD_GBCOUNTRY )
m_hasGBCountryField = true;
else if ( qw->m_fieldCode == FIELD_QUOTA )
m_hasQuotaField = true;
else if ( qw->m_fieldCode == FIELD_SUBURL )
m_hasSubUrlField = true;
else if ( qw->m_fieldCode == FIELD_SUBURL2 )
m_hasSubUrlField = true;
}
// set m_docIdRestriction if a term is gbdocid:
for ( long i = 0 ; i < m_numTerms && ! m_isBoolean ; i++ ) {
// get it
QueryTerm *qt = &m_qterms[i];
// gbdocid:?
if ( qt->m_fieldCode != FIELD_GBDOCID ) continue;
// get docid
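// the +8 skips the 8-byte "gbdocid:" prefix so atoll() sees just the digits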
char *ds = m_qterms[i].m_term + 8;
m_docIdRestriction = atoll(ds);
//unsigned long gid;
unsigned long shard = getShardNumFromDocId(m_docIdRestriction);
//gid = g_hostdb.getGroupIdFromDocId(m_docIdRestriction);
//m_groupThatHasDocId = g_hostdb.getGroup(gid);
m_groupThatHasDocId = g_hostdb.getShard ( shard );
break;
}
// . keep it simple for now
// . we limit to MAX_EXPRESSIONS to like 10 now i guess
if ( m_isBoolean ) {
m_numExpressions = 1;
if ( ! m_expressions[0].addExpression ( 0 ,
m_numWords ,
this , // Query
0 ) ) // level
// return false with g_errno set on error
return false;
}
// . if it is not truncated, no need to use hard counts
// . comment this line and the next one out for testing hard counts
if ( ! m_truncated ) return true;
// if got truncated AND under the HARD max, nothing we can do, it
// got cut off due to m_maxQueryTerms limit in Parms.cpp
if ( m_numTerms < (long)MAX_EXPLICIT_BITS ) return true;
// if they just hit the admin's ceiling, there's nothing we can do
if ( m_numTerms >= m_maxQueryTerms ) return true;
// a temp log message
log(LOG_DEBUG,"query: Encountered %li query terms.",m_numTerms);
// otherwise, we're below m_maxQueryTerms BUT above MAX_QUERY_TERMS
// so we can use hard counts to get more power...
// . use the hard count for excessive query terms to save explicit bits
// . just look for operands on the first level that are not OR'ed
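// e.g. if a truncated non-boolean query has more terms than there are
// explicit bits but is still under the admin's m_maxQueryTerms limit,
// the top-level required words get m_hardCount set so they no longer
// consume explicit bits, and setQTerms() is re-run below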
char redo = 0;
for ( long i = 0 ; i < m_numWords ; i++ ) {
// get the ith word
QueryWord *qw = &m_qwords[i];
// mark him as NOT hard required
qw->m_hardCount = 0;
// skip if not on first level
if ( qw->m_level != 0 ) continue;
// stop at first OR on this level
if ( qw->m_opcode == OP_OR ) break;
// skip all punct
if ( qw->m_isPunct ) continue;
// if we are a boolean query,the next operator can NOT be OP_OR
// because we cannot use terms that are involved in an OR
// as a hard count term, because they are not required terms
for ( long j=i+1 ; m_isBoolean && j<m_numWords; j++ ) {
// stop at previous operator
char opcode = m_qwords[j].m_opcode;
if ( ! opcode ) continue;
if ( opcode != OP_OR ) break;
// otherwise, the next operator is an OR, so do not
// use a hard count for this term
goto stop;
}
// mark him as required, so he won't use an explicit bit now
qw->m_hardCount = 1;
// mark it so we can reduce our number of explicit bits used
redo = 1;
}
stop:
// if nothing changed, return now
if ( ! redo ) return true;
// . set the query terms again if we have a long query
// . if QueryWords has m_hardCount set, ensure the explicit bit is 0
// . non-quoted phrases that contain a "required" single word should
// themselves have 0 for their implicit bits, BUT 0x8000 for their
// explicit bit
if ( ! setQTerms ( words , phrases ) )
return false;
// a temp log message
//log(LOG_DEBUG,"query: Compressed to %li query terms, %li hard. "
// "(nt=%li)",
// m_numExplicitBits,m_numTerms-m_numExplicitBits,m_numTerms);
//if ( ! m_isBoolean ) return true;
// free cuz it was already set
//if ( m_expressionsAllocSize )
// mfree(m_expressions,m_expressionsAllocSize , "Query" );
//m_expressionsAllocSize = 0;
//m_expressions = NULL;
// also set the boolean stuff again too!
//if ( ! setBooleanOperands() ) return false;
return true;
}
/*
// count how many so PageResults will know if he should offer
// a default OR alternative search if no more results for
// the default AND (rat=1)
long Query::getNumRequired ( ) {
if ( m_numRequired >= 0 ) return m_numRequired;
m_numRequired = 0;
for ( long i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// don't require if negative
if ( qt->m_termSign == '-' ) continue;
// skip signless phrases
if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue;
if ( qt->m_synonymOf ) continue;
// count it up
m_numRequired++;
}
return m_numRequired;
}
*/
// returns false and sets g_errno on error
bool Query::setQTerms ( Words &words , Phrases &phrases ) {
//long shift = 0;
// . set m_qptrs/m_qtermIds/m_qbits
// . use one bit position for each phraseId and wordId
// . first set phrases
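// terms are filled in three passes below: bigram/phrase terms first,
// then single-word terms, then (if query expansion is on) synonym
// terms, all sharing the same m_qterms[]/m_termIds[] slots indexed by n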
long n = 0;
// what is the max value for "shift"?
long max = (long)MAX_EXPLICIT_BITS;
if ( max > m_maxQueryTerms ) max = m_maxQueryTerms;
//char u8Buf[256];
for ( long i = 0 ; i < m_numWords && n < MAX_QUERY_TERMS ; i++ ) {
// break out if no more explicit bits!
/*
if ( shift >= max ) {
log("query: Query1 has more than %li unique terms. "
"Truncating.",max);
m_truncated = true;
break;
}
*/
QueryWord *qw = &m_qwords[i];
// skip if ignored... mdw...
if ( ! qw->m_phraseId ) continue;
if ( qw->m_ignorePhrase ) continue; // could be a repeat
// ignore if weight is absolute zero
if ( qw->m_userWeightPhrase == 0 &&
qw->m_userTypePhrase == 'a' ) continue;
// stop breach
if ( n >= MAX_QUERY_TERMS ) {
log("query: lost query phrase terms to max term "
"limit of %li",(long)MAX_QUERY_TERMS );
break;
}
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw ;
qt->m_piped = qw->m_piped;
qt->m_isPhrase = true ;
qt->m_isUORed = false;
qt->m_UORedTerm = NULL;
qt->m_synonymOf = NULL;
qt->m_ignored = 0;
// assume not a repeat of another query term (set below)
qt->m_repeat = false;
// stop word? no, we're a phrase term
qt->m_isQueryStopWord = false;
// change in both places
qt->m_termId = qw->m_phraseId & TERMID_MASK;
m_termIds[n] = qw->m_phraseId & TERMID_MASK;
//log(LOG_DEBUG, "Setting query phrase term id %d: %lld", n, m_termIds[n]);
qt->m_rawTermId = qw->m_rawPhraseId;
// assume explicit bit is 0
qt->m_explicitBit = 0;
qt->m_matchesExplicitBits = 0;
// boolean queries are not allowed term signs for phrases
// UNLESS it is a '*' soft require sign which we need for
// phrases like: "cat dog" AND pig
if ( m_isBoolean && qw->m_phraseSign != '*' ) {
qt->m_termSign = '\0';
m_termSigns[n] = '\0';
}
// if not boolean, ensure to change signs in both places
else {
qt->m_termSign = qw->m_phraseSign;
m_termSigns[n] = qw->m_phraseSign;
}
//
// INSERT UOR LOGIC HERE
//
// long pw = i-1;
// // . back up until word that contains quote if in a quoted
// // phrase
// // . UOR can only support two word phrases really...
// if (m_qwords[i].m_quoteStart >= 0)
// pw = m_qwords[i].m_quoteStart - 1;
// if ( pw >= 0 && m_qwords[pw].m_quoteStart >= 0 )
// pw = m_qwords[pw].m_quoteStart - 1;
// // back two more if field
// //if ( pw >= 0 && m_qwords[pw].m_ignoreWord==IGNORE_FIELDNAME )
// // pw -= 2;
// while (pw>0 &&
// ((m_qwords[pw].m_ignoreWord == IGNORE_DEFAULT) ||
// (m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) pw--;
// // is UOR operator? if so, backup over it
// if ( pw >= 0 && m_qwords[pw].m_opcode == OP_UOR ) pw -= 2;
// else goto notUORPhrase;
// if ( pw < 0 ) goto notUORPhrase;
// // . if previous term is UOR'd with us then share the same ebit
// // . this allows us to use lots of UOR'd query terms
// // . the UOR'd lists may also be merged together into a single
// // list if "mergeListMaxSize" is positive
// // if ( n >= 1 &&
// // i >= 4 &&
// // //m_qterms[n-1].m_qword == &m_qwords[pw] &&
// // shift > 0 &&
// // qw->m_hardCount == 0 )
// // shift--;
// // set the UOR term sign
// qt->m_isUORed = true;
// notUORPhrase:
// do not use an explicit bit up if we have a hard count
qt->m_hardCount = qw->m_hardCount;
// if ( qw->m_hardCount == 0 ) {
// qt->m_explicitBit = 1 << shift ;
// shift++;
// }
qw->m_queryWordTerm = NULL;
// IndexTable.cpp uses this one
qt->m_inQuotes = qw->m_inQuotes;
// point to the string itself that is the phrase
qt->m_term = qw->m_word;
qt->m_termLen = qw->m_phraseLen;
// the QueryWord should have a direct link to the QueryTerm,
// at least for phrase, so we can OR in the bits of its
// constituents in the for loop below
qw->m_queryPhraseTerm = qt ;
// include ourselves in the implicit bits
// qt->m_implicitBits = qt->m_explicitBit;
// doh! gotta reset to 0
qt->m_implicitBits = 0;
// assume not under a NOT bool op
//qt->m_underNOT = false;
// assign score weight, we're a phrase here
qt->m_userWeight = qw->m_userWeightPhrase ;
qt->m_userType = qw->m_userTypePhrase ;
qt->m_fieldCode = qw->m_fieldCode ;
// stuff before a pipe always has a weight of 1
if ( qt->m_piped ) {
qt->m_userWeight = 1;
qt->m_userType = 'a';
}
// debug
//char tmp[1024];
//memcpy ( tmp , qt->m_term , qt->m_termLen );
//tmp [ qt->m_termLen ] = 0;
//logf(LOG_DEBUG,"got term %s (%li)",tmp,qt->m_termLen);
// otherwise, add it
n++;
}
// now if we have enough room, do the singles
for ( long i = 0 ; i < m_numWords && n < MAX_QUERY_TERMS ; i++ ) {
// break out if no more explicit bits!
/*
if ( shift >= max ) {
logf(LOG_DEBUG,
"query: Query2 has more than %li unique terms. "
"Truncating.",max);
m_truncated = true;
break;
}
*/
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord &&
qw->m_ignoreWord != IGNORE_QSTOP) continue;
// if ( qw->m_ignoreWord ) continue;
// ignore if in quotes
//if ( qw->m_quoteStart >= 0 ) continue;
// ignore if in quotes and part of phrase, watch out
// for things like "word", a single word in quotes.
if ( qw->m_quoteStart >= 0 && qw->m_phraseId ) continue;
// ignore if weight is absolute zero
if ( qw->m_userWeight == 0 &&
qw->m_userType == 'a' ) continue;
// stop breach
if ( n >= MAX_QUERY_TERMS ) {
log("query: lost query terms to max term "
"limit of %li",(long)MAX_QUERY_TERMS );
break;
}
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw ;
qt->m_piped = qw->m_piped;
qt->m_isPhrase = false ;
qt->m_isUORed = false;
qt->m_UORedTerm = NULL;
qt->m_synonymOf = NULL;
// ignore some synonym terms if tf is too low
qt->m_ignored = qw->m_ignoreWord;
// qt->m_ignored = 0;
// assume not a repeat of another query term (set below)
qt->m_repeat = false;
// carry over the word's query-stop-word flag
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
// change in both places
qt->m_termId = qw->m_wordId & TERMID_MASK;
m_termIds[n] = qw->m_wordId & TERMID_MASK;
qt->m_rawTermId = qw->m_rawWordId;
// assume explicit bit is 0
qt->m_explicitBit = 0;
qt->m_matchesExplicitBits = 0;
//log(LOG_DEBUG, "Setting query phrase term id %d: %lld raw: %lld", n, m_termIds[n], qt->m_rawTermId);
// boolean queries are not allowed term signs
if ( m_isBoolean ) {
qt->m_termSign = '\0';
m_termSigns[n] = '\0';
// boolean fix for "health OR +sports" because
// the + there means exact word match, no synonyms.
if ( qw->m_wordSign == '+' ) {
qt->m_termSign = qw->m_wordSign;
m_termSigns[n] = qw->m_wordSign;
}
}
// if not boolean, ensure to change signs in both places
else {
qt->m_termSign = qw->m_wordSign;
m_termSigns[n] = qw->m_wordSign;
}
// get previous text word
//long pw = i - 2;
long pw = i-1;
// // . back up until word that contains quote if in a quoted
// // phrase
// // . UOR can only support two word phrases really...
if (m_qwords[i].m_quoteStart >= 0)
pw = m_qwords[i].m_quoteStart ;
if ( pw > 0 ) pw--;
// back two more if field
long fieldStart=-1;
long fieldLen=0;
if ( pw == 0 && m_qwords[pw].m_ignoreWord==IGNORE_FIELDNAME)
fieldStart = pw;
if ( pw > 0&& m_qwords[pw-1].m_ignoreWord==IGNORE_FIELDNAME ){
pw -= 1;
fieldStart = pw;
}
while (pw>0 &&
((m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) {
pw--;
fieldStart = pw;
}
if (fieldStart > -1) {
pw = i;
while (pw < m_numWords && m_qwords[pw].m_fieldCode)
pw++;
fieldLen = m_qwords[pw-1].m_word +
m_qwords[pw-1].m_wordLen -
m_qwords[fieldStart].m_word;
}
// // is UOR operator? if so, backup over it
// if ( pw >= 0 && m_qwords[pw].m_opcode == OP_UOR ){
// pw -= 2;
// }
// else goto notUOR;
// if ( pw < 0 ) goto notUOR;
// // . if previous term is UOR'd with us then share the same ebit
// // . this allows us to use lots of UOR'd query terms
// // . the UOR'd lists may also be merged together into a single
// // list if "mergeListMaxSize" is positive
// // if ( n >= 1 &&
// // i >= 4 &&
// // //m_qterms[n-1].m_qword == &m_qwords[pw] &&
// // shift > 0 &&
// // qw->m_hardCount == 0 )
// // shift--;
// // set the UOR term sign
// qt->m_isUORed = true;
// if (m_qwords[pw].m_queryWordTerm)
// m_qwords[pw].m_queryWordTerm->m_isUORed = true;
// if (m_qwords[pw].m_queryPhraseTerm)
// m_qwords[pw].m_queryPhraseTerm->m_isUORed = true;
// notUOR:
// do not use an explicit bit up if we have a hard count
qt->m_hardCount = qw->m_hardCount;
// if ( qw->m_hardCount == 0 ) {
// qt->m_explicitBit = 1 << shift ;
// shift++;
// }
qw->m_queryWordTerm = qt;
// IndexTable.cpp uses this one
qt->m_inQuotes = qw->m_inQuotes;
// point to the string itself that is the word
if (fieldLen > 0) {
qt->m_term = m_qwords[fieldStart].m_word;
qt->m_termLen = fieldLen;
// skip past the end of the field value
i = pw-1;
}
else {
qt->m_termLen = qw->m_wordLen;
qt->m_term = qw->m_word;
//log(LOG_DEBUG, "query: *** term \"%s\"", u8Buf);
}
// reset our implicit bits to 0
qt->m_implicitBits = 0;
// // . OR ourselves into our parent phrase's m_implicitBits
// // . this makes setting m_bitScores[] easy because if a
// // doc contains this prhase then it IMPLICITLY contains us
// // which will make it easier to satisfy requiredBits
// if ( qw->m_queryPhraseTerm )
// qw->m_queryPhraseTerm->m_implicitBits |=
// qt->m_explicitBit;
// // if we're in the middle of the phrase
// long pn = qw->m_leftPhraseStart;
// // convert word to its phrase QueryTerm ptr, if any
// QueryTerm *tt = NULL;
// if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm;
// if ( tt ) tt->m_implicitBits |= qt->m_explicitBit;
// // . there might be some phrase term that actually contains
// // the same word as we are, but a different occurence
// // . like '"knowledge management" AND NOT management' query
// for ( long j = 0 ; j < i ; j++ ) {
// // must be our same wordId (same word, different occ.)
// QueryWord *qw2 = &m_qwords[j];
// if ( qw2->m_wordId != qw->m_wordId ) continue;
// // get first word in the phrase that jth word is in
// long pn2 = qw2->m_leftPhraseStart;
// if ( pn2 < 0 ) continue;
// // he implies us!
// QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm;
// if ( tt2 ) tt2->m_implicitBits |= qt->m_explicitBit;
// break;
// }
// assume not under a NOT bool op
//qt->m_underNOT = false;
// assign score weight, we're a single word here
qt->m_userWeight = qw->m_userWeight ;
qt->m_userType = qw->m_userType ;
qt->m_fieldCode = qw->m_fieldCode ;
// stuff before a pipe always has a weight of 1
if ( qt->m_piped ) {
qt->m_userWeight = 1;
qt->m_userType = 'a';
}
// debug
//char tmp[1024];
//memcpy ( tmp , qt->m_term , qt->m_termLen );
//tmp [ qt->m_termLen ] = 0;
//logf(LOG_DEBUG,"got term %s (%li)",tmp,qt->m_termLen);
n++;
}
// now handle the explicit bits
// moved out of separate phrase and singleton loops
// for phrase UOR support
/*
for ( long i = 0 ; i < m_numWords ; i++ ) {
// break out if no more explicit bits!
// if ( shift >= max ) {
// log("query: Query has more than %li unique terms. "
// "Truncating.",max);
// m_truncated = true;
// break;
// }
long pw;
QueryWord *qw = &m_qwords[i];
if (!qw->m_queryWordTerm && !qw->m_queryPhraseTerm)
continue;
QueryTerm *qt = qw->m_queryPhraseTerm?
qw->m_queryPhraseTerm :
qw->m_queryWordTerm;
if (!qt) continue;
doAgain:
pw = i-1;
// . back up until word that contains quote if in a quoted
// phrase
// . UOR can only support two word phrases really...
//if (m_qwords[i].m_quoteStart >= 0)
// pw = m_qwords[i].m_quoteStart - 1;
while (pw>0 &&
((m_qwords[pw].m_ignoreWord == IGNORE_DEFAULT) ||
(m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) pw--;
// is UOR operator? if so, backup over it
if ( pw < 0 || m_qwords[pw].m_opcode != OP_UOR )
goto notUOR;
pw--;
while (pw>0 &&
((m_qwords[pw].m_ignoreWord == IGNORE_DEFAULT) ||
(m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) pw--;
if ( pw >= 0 && m_qwords[pw].m_quoteStart >= 0 )
//pw = m_qwords[pw].m_quoteStart + 1;
pw = m_qwords[pw].m_quoteStart;
if (pw < 0) goto notUOR;
// . if previous term is UOR'd with us then share the same ebit
// . this allows us to use lots of UOR'd query terms
// . the UOR'd lists may also be merged together into a single
// list if "mergeListMaxSize" is positive
qt->m_isUORed = true;
// set uor flag on all words in phrase
if (qw->m_queryPhraseTerm && m_qwords[i].m_quoteStart >= 0){
long quoteStart = m_qwords[i].m_quoteStart;
for (long j=quoteStart;j<m_numWords;j++){
if (m_qwords[j].m_ignoreWord) continue;
if (m_qwords[j].m_quoteStart != quoteStart)
break;
QueryTerm *qtp = m_qwords[j].m_queryWordTerm;
if (qtp) {
qtp->m_isUORed = true;
qtp->m_UORedTerm =
m_qwords[pw].m_queryPhraseTerm;
}
}
}
//QueryTerm *pqt = NULL;
if (m_qwords[pw].m_queryWordTerm){
m_qwords[pw].m_queryWordTerm->m_isUORed = true;
qt->m_UORedTerm = m_qwords[pw].m_queryWordTerm;
}
//pqt = m_qwords[pw].m_queryWordTerm;
// set uor flag on all words in previous phrase
if (m_qwords[pw].m_queryPhraseTerm &&
m_qwords[pw].m_quoteStart >= 0) {
m_qwords[pw].m_queryPhraseTerm->m_isUORed = true;
qt->m_UORedTerm = m_qwords[pw].m_queryPhraseTerm;
long quoteStart = m_qwords[pw].m_quoteStart;
for (long j=quoteStart;j<m_numWords;j++){
if (m_qwords[j].m_ignoreWord) continue;
if (m_qwords[j].m_quoteStart != quoteStart)
break;
QueryTerm *qtp = m_qwords[j].m_queryWordTerm;
if (qtp) {
qtp->m_isUORed = true;
qtp->m_UORedTerm =
m_qwords[pw].m_queryPhraseTerm;
}
}
}
// if ( n >= 1 &&
// i >= 4 &&
// //m_qterms[n-1].m_qword == &m_qwords[pw] &&
// shift > 0 &&
// qw->m_hardCount == 0 ) {
// shift--;
// }
notUOR:
// if ( qt->m_hardCount == 0 ) {
// // qt->m_explicitBit = 1 << shift ;
// qt->m_explicitBit = shift ;
// shift++;
// }
// // . OR ourselves into our parent phrase's m_implicitBits
// // . this makes setting m_bitScores[] easy because if a
// // doc contains this prhase then it IMPLICITLY contains us
// // which will make it easier to satisfy requiredBits
// if ( qw->m_queryPhraseTerm )
// qw->m_queryPhraseTerm->m_implicitBits |=
// qt->m_explicitBit;
// // if we're in the middle of the phrase
// long pn = qw->m_leftPhraseStart;
// // convert word to its phrase QueryTerm ptr, if any
// QueryTerm *tt = NULL;
// if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm;
// if ( tt ) tt->m_implicitBits |= qt->m_explicitBit;
// // . there might be some phrase term that actually contains
// // the same word as we are, but a different occurence
// // . like '"knowledge management" AND NOT management' query
// for ( long j = 0 ; j < i ; j++ ) {
// // must be our same wordId (same word, different occ.)
// //QueryWord *qw2 = m_qterms[j].m_qword;
// QueryWord *qw2 = &m_qwords[j];
// if ( qw2->m_wordId != qw->m_wordId ) continue;
// // get first word in the phrase that jth word is in
// long pn2 = qw2->m_leftPhraseStart;
// if ( pn2 < 0 ) continue;
// // he implies us!
// QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm;
// if ( tt2 ) tt2->m_implicitBits |= qt->m_explicitBit;
// break;
// }
if (qt == qw->m_queryPhraseTerm){
if ( qw->m_queryWordTerm){
qt = qw->m_queryWordTerm;
goto doAgain;
}
}
}
*/
/*
// Handle exclusive explicit bits only
shift = 0;
int n2 = 0;
for ( long i = 0; i < n ; i++ ){
// break out if no more explicit bits!
if ( shift >= max ) {
logf(LOG_DEBUG,
"query: Query4 has more than %li unique terms. "
"Truncating.",max);
m_truncated = true;
break;
}
QueryTerm *qt = &m_qterms[i];
if (qt->m_UORedTerm) continue;
// sometims UORedTerm is NULL i guess because of IGNORE_BREECH
if ( qt->m_isUORed && qt->m_qword && qt->m_qword->m_ignoreWord )
continue;
// Skip duplicate terms before we waste an explicit bit
bool skip=false;
for (long j=0;j<i;j++){
if ( qt->m_termId != m_qterms[j].m_termId ||
qt->m_termSign != m_qterms[j].m_termSign){
continue;
}
skip = true;
qt->m_explicitBit = m_qterms[j].m_explicitBit;
break;
}
n2++;
if (skip) continue;
if ( qt->m_hardCount == 0 ) {
qt->m_explicitBit = 1 << shift++;
}
}
// count them for doing number of combos
m_numExplicitBits = shift;
*/
// Handle shared explicit bits
for ( long i = 0; i < n ; i++ ){
QueryTerm *qt = &m_qterms[i];
// assume not in a phrase
qt->m_inPhrase = 0;
qt->m_rightPhraseTermNum = -1;
qt->m_leftPhraseTermNum = -1;
qt->m_rightPhraseTerm = NULL;
qt->m_leftPhraseTerm = NULL;
QueryTerm *qt2 = qt->m_UORedTerm;
if (!qt2) continue;
// chase down first term in UOR chain
while (qt2->m_UORedTerm) qt2 = qt2->m_UORedTerm;
//if (!qt2->m_explicitBit) continue;
//qt->m_explicitBit = qt2->m_explicitBit;
//n2++;
}
//m_numTerms = n2;
// . set implicit bits, m_implicitBits
// . set m_inPhrase
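// e.g. for the query 'cat dog' the single-word terms 'cat' and 'dog'
// get m_inPhrase set and their m_rightPhraseTerm/m_leftPhraseTerm
// pointers aimed at the "cat dog" bigram term they belong to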
for (long i = 0; i < m_numWords ; i++ ){
QueryWord *qw = &m_qwords[i];
QueryTerm *qt = qw->m_queryWordTerm;
if (!qt) continue;
if ( qw->m_queryPhraseTerm )
qw->m_queryPhraseTerm->m_implicitBits |=
qt->m_explicitBit;
// set flag if in a phrase, and set phrase term num
if ( qw->m_queryPhraseTerm ) {
qt->m_inPhrase = 1;
QueryTerm *pt = qw->m_queryPhraseTerm;
qt->m_rightPhraseTermNum = pt - m_qterms;
qt->m_rightPhraseTerm = pt;
}
// if we're in the middle of the phrase
long pn = qw->m_leftPhraseStart;
// convert word to its phrase QueryTerm ptr, if any
QueryTerm *tt = NULL;
if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm;
if ( tt ) tt->m_implicitBits |= qt->m_explicitBit;
if ( tt ) {
qt->m_inPhrase = 1;
qt->m_leftPhraseTermNum = tt - m_qterms;
qt->m_leftPhraseTerm = tt;
}
// . there might be some phrase term that actually contains
// the same word as we are, but a different occurrence
// . like '"knowledge management" AND NOT management' query
// . made it from "j < i" into "j < m_numWords" because
// 'test "test bed"' was not working but '"test bed" test'
// was working.
for ( long j = 0 ; j < m_numWords ; j++ ) {
// must be our same wordId (same word, different occ.)
QueryWord *qw2 = &m_qwords[j];
if ( qw2->m_wordId != qw->m_wordId ) continue;
// get first word in the phrase that jth word is in
long pn2 = qw2->m_leftPhraseStart;
// we might be the guy that starts it!
if ( pn2 < 0 && qw2->m_quoteStart != -1 ) pn2 = j;
// if neither is the case, skip this query word
if ( pn2 < 0 ) continue;
// he implies us!
QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm;
if ( tt2 ) tt2->m_implicitBits |= qt->m_explicitBit;
if ( tt2 ) {
qt->m_inPhrase = 1;
qt->m_leftPhraseTermNum = tt2 - m_qterms;
qt->m_leftPhraseTerm = tt2;
}
break;
}
}
/*
// synonym terms should have copy all the implicit/explicit bits
// into their implicit bits field
for (long i = 0; i < m_numTerms; i++) {
QueryTerm *qt = &m_qterms[i];
QueryTerm *st = qt->m_synonymOf;
if (!st) continue;
// also, if we are "auto insurance", a synonymOf
// "car insurance", we should also imply "car insurance"'s
// terms, 'car' and 'insurance' for purposes of
// IndexTable2.cpp::getWeightScore()'s calculation of "min".
// Because when finding the "max" score of a word, we also
// allow its phrase and synonyms' scores to compete.
qt->m_implicitBits = st->m_implicitBits | st->m_explicitBit;
// now skip if not a phrase synonym
if ( ! qt->m_isPhrase ) continue;
// . we also imply the two words bookending this phrase, if any
// . so see if the leftSynHash is in the syn list
for ( long k = m_synTerm ; k < m_numTerms ; k++ ) {
// get term
QueryTerm *tt = &m_qterms[k];
// skip if phrase
if ( tt->m_isPhrase ) continue;
// must be synonym
if ( ! tt->m_synonymOf ) continue;
// must match one of our ids
if ( tt->m_qword->m_rawWordId != qt->m_leftRawWordId &&
tt->m_qword->m_rawWordId != qt->m_rightRawWordId )
continue;
// we imply it now!
qt->m_implicitBits |= tt->m_explicitBit;
}
}
*/
////////////
//
// . add synonym query terms now
// . skip this part if language is unknown i guess
//
////////////
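// each synonym becomes its own QueryTerm whose m_synonymOf points back
// at the original word's term; e.g. with query expansion on, 'run' may
// also add a term for 'runs', sharing the original word's field code
// and user weight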
long sn = 0;
Synonyms syn;
// loop over all words in query and process its synonyms list
//if ( m_langId != langUnknown && m_queryExpansion )
// if lang is "xx" (unknown) we still do synonyms; it just loops over
// all languages starting with english
if ( m_queryExpansion )
sn = m_numWords;
int64_t to = hash64n("to",0LL);
for ( long i = 0 ; i < sn ; i++ ) {
// get query word
QueryWord *qw = &m_qwords[i];
// skip if in quotes, we will not get synonyms for it
if ( qw->m_inQuotes ) continue;
// skip if has plus sign in front
if ( qw->m_wordSign == '+' ) continue;
// no url: stuff, maybe only title
if ( qw->m_fieldCode &&
qw->m_fieldCode != FIELD_TITLE &&
qw->m_fieldCode != FIELD_GENERIC )
continue;
// skip if ignored like a stopword (stop to->too)
//if ( qw->m_ignoreWord ) continue;
// no, hurts 'Greencastle IN economic development'
if ( qw->m_wordId == to ) continue;
// single letters...
if ( qw->m_wordLen == 1 ) continue;
// set the synonyms for this word
char tmpBuf [ TMPSYNBUFSIZE ];
long naids = syn.getSynonyms ( &words ,
i ,
m_langId ,
tmpBuf ,
0 ); // m_niceness );
// if no synonyms, all done
if ( naids <= 0 ) continue;
// sanity
if ( naids > MAX_SYNS ) { char *xx=NULL;*xx=0; }
// now make the buffer to hold them for us
qw->m_synWordBuf.safeMemcpy ( &syn.m_synWordBuf );
// get the term for this word
QueryTerm *origTerm = qw->m_queryWordTerm;
// loop over synonyms for word #i now
for ( long j = 0 ; j < naids ; j++ ) {
// stop breach
if ( n >= MAX_QUERY_TERMS ) {
log("query: lost synonyms due to max term "
"limit of %li",(long)MAX_QUERY_TERMS );
break;
}
// this happens for 'da da da'
if ( ! origTerm ) continue;
// add that query term
QueryTerm *qt = &m_qterms[n];
qt->m_qword = qw; // NULL;
qt->m_piped = qw->m_piped;
qt->m_isPhrase = false ;
qt->m_isUORed = false;
qt->m_UORedTerm = NULL;
// synonym of this term...
qt->m_synonymOf = origTerm;
// nuke this crap since it was done above and we
// missed out!
qt->m_inPhrase = 0;
qt->m_rightPhraseTermNum = -1;
qt->m_leftPhraseTermNum = -1;
qt->m_rightPhraseTerm = NULL;
qt->m_leftPhraseTerm = NULL;
// need this for Matches.cpp
qt->m_synWids0 = syn.m_wids0[j];
qt->m_synWids1 = syn.m_wids1[j];
long na = syn.m_numAlnumWords[j];
// how many words were in the base we used to
// get the synonym. i.e. if the base is "new jersey"
// then it's 2! and the synonym "nj" has one alnum
// word.
long ba = syn.m_numAlnumWordsInBase[j];
qt->m_numAlnumWordsInSynonym = na;
qt->m_numAlnumWordsInBase = ba;
// crap, "nj" is a synonym of the PHRASE TERM
// bigram "new jersey" not of the single word term
// "new" so fix that.
if ( ba == 2 && origTerm->m_rightPhraseTerm )
qt->m_synonymOf = origTerm->m_rightPhraseTerm;
// ignore some synonym terms if tf is too low
qt->m_ignored = qw->m_ignoreWord;
// assume not a repeat of another query term(set below)
qt->m_repeat = false;
// carry over the word's query-stop-word flag
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
// change in both places
int64_t wid = syn.m_aids[j];
// might be in a title: field or something
if ( qw->m_prefixHash ) {
int64_t ph = qw->m_prefixHash;
wid= hash64h(wid,ph);
}
qt->m_termId = wid & TERMID_MASK;
m_termIds[n] = wid & TERMID_MASK;
qt->m_rawTermId = syn.m_aids[j];
// assume explicit bit is 0
qt->m_explicitBit = 0;
qt->m_matchesExplicitBits = 0;
// boolean queries are not allowed term signs
if ( m_isBoolean ) {
qt->m_termSign = '\0';
m_termSigns[n] = '\0';
// boolean fix for "health OR +sports" because
// the + there means exact word match, no syns
if ( qw->m_wordSign == '+' ) {
qt->m_termSign = qw->m_wordSign;
m_termSigns[n] = qw->m_wordSign;
}
}
// if not bool, ensure to change signs in both places
else {
qt->m_termSign = qw->m_wordSign;
m_termSigns[n] = qw->m_wordSign;
}
// do not use an explicit bit up if we got a hard count
qt->m_hardCount = qw->m_hardCount;
//qw->m_queryWordTerm = qt;
// IndexTable.cpp uses this one
qt->m_inQuotes = qw->m_inQuotes;
// usually this is right
char *ptr = syn.m_termPtrs[j];
// but if it is NULL that means we transformed the
// word (e.g. by removing accent marks) and stored
// it in m_synWordBuf, as opposed to just pointing
// to a line in memory of wiktionary-buf.txt.
if ( ! ptr ) {
long off = syn.m_termOffs[j];
if ( off < 0 ) {
char *xx=NULL;*xx=0; }
if ( off > qw->m_synWordBuf.length() ) {
char *xx=NULL;*xx=0; }
// use QueryWord::m_synWordBuf which should
// be persistent and not disappear like
// syn.m_synWordBuf.
ptr = qw->m_synWordBuf.getBufStart() + off;
}
// point to the string itself that is the word
qt->m_term = ptr;
qt->m_termLen = syn.m_termLens[j];
// qt->m_term = syn.m_termPtrs[j];
// reset our implicit bits to 0
qt->m_implicitBits = 0;
// assume not under a NOT bool op
//qt->m_underNOT = false;
// assign score weight, we're a synonym here
qt->m_userWeight = qw->m_userWeight ;
qt->m_userType = qw->m_userType ;
qt->m_fieldCode = qw->m_fieldCode ;
// stuff before a pipe always has a weight of 1
if ( qt->m_piped ) {
qt->m_userWeight = 1;
qt->m_userType = 'a';
}
// otherwise, add it
n++;
}
}
m_numTerms = n;
if ( n > MAX_QUERY_TERMS ) { char *xx=NULL;*xx=0; }
// count them for doing number of combos
//m_numExplicitBits = shift;
// . repeated terms have the same termbits!!
// . this is only for bool queries since regular queries ignore
// repeated terms in setWords()
// . we need to support: "trains AND (perl OR python) NOT python"
for ( long i = 0 ; i < n ; i++ ) {
// BUT NOT IF in a UOR'd list!!! Metalincs bug...
if ( m_qterms[i].m_isUORed ) continue;
// that didn't seem to fix it right, for dup terms that
// are the FIRST term in a UOR sequence... they don't seem
// to have m_isUORed set
if ( m_hasUOR ) continue;
for ( long j = 0 ; j < i ; j++ ) {
// skip if not a termid match
if(m_qterms[i].m_termId!=m_qterms[j].m_termId)continue;
m_qterms[i].m_explicitBit = m_qterms[j].m_explicitBit;
// if doing phrases, ignore the unrequired phrase
if ( m_qterms[i].m_isPhrase ) {
if ( m_qterms[j].m_implicitBits )
m_qterms[j].m_repeat = true;
else
m_qterms[i].m_repeat = true;
continue;
}
// if not doing phrases, just ignore term #i
m_qterms[i].m_repeat = true;
}
}
// if we're a special range: term and a doc has us, then
// assume it has our associates too because we are all
// essentially the same term. we don't want this to be a
// factor in the ranking. since gigablast usually puts docs
// with all the terms (between OR operators) above docs that do
// not have all their terms. that is not a good thing for these terms.
/*
long nw = m_numWords;
for ( long i = 0 ; i < nw ; i++ ) {
// skip if not a range: query term
if ( m_qwords[i].m_fieldCode != FIELD_RANGE ) continue;
// loop over all our associates (in same parens level) to
// get the OR of all the explicit bits
qvec_t allBits = 0;
for ( long j=i;j<nw &&m_qwords[j].m_opcode!=OP_RIGHTPAREN;j++){
if ( m_qwords[j].m_ignoreWord ) continue;
// get the jth word's term
QueryTerm *qt = m_qwords[j].m_queryWordTerm;
// this can be NULL if we already got 16 query terms!
if ( ! qt ) continue;
// skip if no value
if ( ! qt->m_explicitBit ) continue;
// grab it
allBits |= qt->m_explicitBit ;
}
// now make everyone use just one of those bits
for ( long j=i;j<nw &&m_qwords[j].m_opcode!=OP_RIGHTPAREN;j++){
if ( m_qwords[j].m_ignoreWord ) continue;
// get the jth word's term
QueryTerm *qt = m_qwords[j].m_queryWordTerm;
// this can be NULL if we already got 16 query terms!
if ( ! qt ) continue;
// skip if no value
if ( ! qt->m_explicitBit ) continue;
// force it to use the common bit
qt->m_explicitBit = allBits;
qt->m_implicitBits = allBits;
}
}
*/
// . if only have one term and it is a signless phrase, make it signed
// . don't forget to set m_termSigns too!
if ( n == 1 && m_qterms[0].m_isPhrase && ! m_qterms[0].m_termSign ) {
m_qterms[0].m_termSign = '*';
m_termSigns[0] = '*';
}
// . or bits into the m_implicitBits member of phrase QueryTerms that
// represent the constituent words
// . loop over each
//m_numTerms = n2;
// . how many of the terms are non fielded singletons?
// . this is just for support of the BIG HACK in Summary.cpp
/*
m_numTermsSpecial = 0;
for ( long i = 0 ; i < n ; i++ ) {
if ( m_qterms[i].m_isPhrase ) continue;
if ( m_qterms[i].m_fieldCode ) continue;
if ( m_qterms[i].m_isUORed ) continue;
// only skip query stop words if in quotes, if it is in
// quotes then we gotta have it...
if ( m_qterms[i].m_isQueryStopWord && !
m_qterms[i].m_inQuotes ) continue;
if ( m_qterms[i].m_underNOT ) continue;
if ( m_qterms[i].m_termSign == '-' ) continue;
m_numTermsSpecial++;
}
*/
// . set m_componentCodes all to -2
// . addCompoundTerms() will set these appropriately
// . see Msg2.cpp for more info on componentCodes
// . -2 means unset, neither a compound term nor a component term at
// this time
for ( long i = 0 ; i < m_numTerms ; i++ ) m_componentCodes[i] = -2;
m_numComponents = 0;
// . now set m_phrasePart for Summary.cpp's hackfix filter
// . only set this for the non-phrase terms, since keepAllSingles is
// set to true when setting the Query for Summary.cpp::set in order
// to match the singles
for ( long i = 0 ; i < m_numTerms ; i++ ) {
// assume not in a phrase
m_qterms[i].m_phrasePart = -1;
//if ( ! m_qterms[i].m_isPhrase ) continue;
// skip cd-rom too, if not in quotes
if ( ! m_qterms[i].m_inQuotes ) continue;
// is the previous term also in a quoted phrase?
if ( i - 1 < 0 ) continue;
//if ( ! m_qterms[i+1].m_isPhrase ) continue;
if ( ! m_qterms[i-1].m_inQuotes ) continue;
// are we in the same quoted phrase?
if ( m_qterms[i+0].m_qword->m_quoteStart !=
m_qterms[i-1].m_qword->m_quoteStart ) continue;
// ok, we're in the same quoted phrase
m_qterms[i+0].m_phrasePart=m_qterms[i+0].m_qword->m_quoteStart;
m_qterms[i-1].m_phrasePart=m_qterms[i+0].m_qword->m_quoteStart;
}
// . set m_requiredBits
// . these are 1-1 with m_qterms (QueryTerms)
// . required terms have no - sign and have no signless phrases
// . these are what terms doc would NEED to have if we were default AND
// BUT for boolean queries that doesn't apply
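// e.g. for the non-boolean query 'cat -dog +bird': 'cat' and '+bird'
// are treated as required, '-dog' as negative and '+bird' as forced;
// note the explicit bits themselves get reassigned again further below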
m_requiredBits = 0; // no - signs, no signless phrases
m_negativeBits = 0; // terms with - signs
m_forcedBits = 0; // terms with + signs
m_synonymBits = 0;
for ( long i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// don't require if negative
if ( qt->m_termSign == '-' ) {
m_negativeBits |= qt->m_explicitBit; // (1 << i );
continue;
}
// forced bits
if ( qt->m_termSign == '+' && ! m_isBoolean )
m_forcedBits |= qt->m_explicitBit; //(1 << i);
// skip signless phrases
if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue;
if ( qt->m_synonymOf ) {
m_synonymBits |= qt->m_explicitBit;
continue;
}
// fix gbhastitleindicator:1 where "1" is a stop word
if ( qt->m_isQueryStopWord && ! m_qterms[i].m_fieldCode )
continue;
// OR it all up
m_requiredBits |= qt->m_explicitBit; // (1 << i);
}
// set m_matchRequiredBits which we use for Matches.cpp
m_matchRequiredBits = 0;
for ( long i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// don't require if negative
if ( qt->m_termSign == '-' ) continue;
// skip all phrase terms
if ( qt->m_isPhrase ) continue;
// OR it all up
m_matchRequiredBits |= qt->m_explicitBit;
}
// if we have '+test -test':
if ( m_negativeBits & m_requiredBits )
m_numTerms = 0;
// we need to remember this now for tier integration in IndexTable.cpp
//m_requiredBits = requiredBits;
// now set m_matches,ExplicitBits, used only by Matches.cpp so far
for ( long i = 0 ; i < m_numTerms ; i++ ) {
// set it up
m_qterms[i].m_matchesExplicitBits = m_qterms[i].m_explicitBit;
// or in the repeats
for ( long j = 0 ; j < m_numTerms ; j++ ) {
// skip if termid mismatch
if ( m_qterms[i].m_termId != m_qterms[j].m_termId )
continue;
// i guess signs do not have to match
//m_qterms[i].m_termSign == m_qterms[j].m_termSign){
m_qterms[i].m_matchesExplicitBits |=
m_qterms[j].m_explicitBit;
}
}
m_numRequired = 0;
for ( long i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// assume not required
qt->m_isRequired = false;
// don't require if negative
// no, consider required, but NEGATIVE required...
//if ( qt->m_termSign == '-' ) continue;
// skip signless phrases
if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue;
if ( qt->m_isPhrase && qt->m_termSign == '*' ) continue;
if ( qt->m_synonymOf ) continue;
// IGNORE_QSTOP?
if ( qt->m_ignored ) continue;
// mark it
qt->m_isRequired = true;
// count them
m_numRequired++;
}
// required quoted phrase terms
for ( long i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// quoted phrase?
if ( ! qt->m_isPhrase ) continue;
if ( ! qt->m_inQuotes ) continue;
// mark it
qt->m_isRequired = true;
// count them
m_numRequired++;
}
// . for query 'to be or not to be shakespeare'
// require 'tobe' 'beor' 'tobe' because
// they are bigrams in the wikipedia phrase 'to be or not to be'
// and they all consist solely of query stop words. as of
// 8/20/2012 i took 'not' off the query stop word list.
// . require bigrams that consist of 2 query stop words and
// are in a wikipedia phrase. set termSign to '+' i guess?
// . for 'in the nick' , a wiki phrase, make "in the" required
// and give a big bonus for "the nick" below.
for ( long i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// don't require if negative
if ( qt->m_termSign == '-' ) continue;
// only check bigrams here
if ( ! qt->m_isPhrase ) continue;
// get the query word that starts this phrase
QueryWord *qw1 = qt->m_qword;
// must be in a wikiphrase
if ( qw1->m_wikiPhraseId <= 0 ) continue;
// what query word # is that?
long qwn = qw1 - m_qwords;
// get the next alnum word after that
// assume it's the last word in our bigram phrase
QueryWord *qw2 = &m_qwords[qwn+2];
// must be in same wikiphrase
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
// must be two stop words
if ( ! qw1->m_isQueryStopWord ) continue;
if ( ! qw2->m_isQueryStopWord ) continue;
// mark it
qt->m_isRequired = true;
// count them
m_numRequired++;
}
//
// new logic for XmlDoc::setRelatedDocIdWeight() to use
//
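// e.g. for 'cat dog -bird' the single-word terms 'cat' and 'dog' each
// get their own explicit bit (0x1, 0x2) and go into m_requiredBits,
// while the negative term and the signless "cat dog" bigram get no bit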
long shift = 0;
m_requiredBits = 0;
for ( long i = 0; i < n ; i++ ){
QueryTerm *qt = &m_qterms[i];
qt->m_explicitBit = 0;
if ( ! qt->m_isRequired ) continue;
// negative terms are "negative required", but we ignore here
if ( qt->m_termSign == '-' ) continue;
qt->m_explicitBit = 1<<shift;
m_requiredBits |= qt->m_explicitBit;
shift++;
if ( shift >= (long)(sizeof(qvec_t)*8) ) break;
}
// now implicit bits
for ( long i = 0; i < n ; i++ ){
QueryTerm *qt = &m_qterms[i];
// make it explicit bit at least
qt->m_implicitBits = qt->m_explicitBit;
if ( qt->m_isRequired ) continue;
// synonym?
if ( qt->m_synonymOf )
qt->m_implicitBits |= qt->m_synonymOf->m_explicitBit;
// skip if not bigram
if ( ! qt->m_isPhrase ) continue;
// get sides
QueryTerm *t1 = qt->m_leftPhraseTerm;
QueryTerm *t2 = qt->m_rightPhraseTerm;
if ( ! t1 || ! t2 ) continue;
qt->m_implicitBits |= t1->m_explicitBit;
qt->m_implicitBits |= t2->m_explicitBit;
}
// . for query 'to be or not to be shakespeare'
// give big bonus for 'ornot' and 'notto' bigram terms because
// the single terms 'or' and 'to' are ignored and because
// 'to be or not to be' is a wikipedia phrase
// . on 8/20/2012 i took 'not' off the query stop word list.
// . now give a big bonus for bigrams whose two terms are in the
// same wikipedia phrase and one and only one of the terms in
// the bigram is a query stop word
// . in general 'ornot' is considered a "synonym" of 'not' and
// gets hit with a .90 score factor, but that should never
// happen, it should be 1.00 and in this special case it should
// be 1.20
// . so for 'time enough for love' the phrase term "enough for"
// gets its m_isWikiHalfStopBigram set AND that phrase term
// is a synonym term of the single word term "enough" and is treated
// as such in the Posdb.cpp logic.
for ( long i = 0 ; i < m_numTerms ; i++ ) {
// QueryTerms are derived from QueryWords
QueryTerm *qt = &m_qterms[i];
// assume not!
qt->m_isWikiHalfStopBigram = 0;
// don't require if negative
if ( qt->m_termSign == '-' ) continue;
// only check bigrams here
if ( ! qt->m_isPhrase ) continue;
// get the query word that starts this phrase
QueryWord *qw1 = qt->m_qword;
// must be in a wikiphrase
if ( qw1->m_wikiPhraseId <= 0 ) continue;
// what query word # is that?
long qwn = qw1 - m_qwords;
// get the next alnum word after that
// assume it's the last word in our bigram phrase
QueryWord *qw2 = &m_qwords[qwn+2];
// must be in same wikiphrase
if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue;
// if both query stop words, should have been handled above
// we need one to be a query stop word and the other not
// for this algo
if ( qw1->m_isQueryStopWord && qw2->m_isQueryStopWord )
continue;
// skip if neither is a query stop word; exactly one of the two must be
// a stop word for this algo, so for 'the time machine' we do not count
// 'time machine' as a halfstopwikibigram
if ( ! qw1->m_isQueryStopWord && ! qw2->m_isQueryStopWord )
continue;
// don't require it, if query is 'the tigers' accept
// just 'tigers' but give a bonus for 'the tigers' in
// the document.
//qt->m_isRequired = true;
// count them
//m_numRequired++;
// special flag
qt->m_isWikiHalfStopBigram = true;
}
return true;
}
/*
// . add in compound terms
// . set m_componentCodes appropriately
void Query::addCompoundTerms ( ) {
// loop through possible starting points of sequences of the same ebit
for (long i = 0 ; i < m_numTerms - 1 ; i++ ) {
// break if too many already
if ( m_numTerms >= MAX_QUERY_TERMS ) break;
// if already processed, skip it
if ( m_componentCodes[i] != -2 ) continue;
// get ebit of the ith query term
qvec_t ebit = m_qterms[i].m_explicitBit;
// skip if 0, it is ignored because it breeched limit of 15
if ( ebit == 0 ) continue;
// skip if next term's ebit is different
//if ( ebit != m_qterms[i+1].m_explicitBit ) continue;
// skip if not UOR'd because it could just be a repeat term
//if ( ! m_qterms[i+1].m_isUORed ) continue;
// all UORed terms have m_isOURed set now
// because UORed terms are not necessarily in order
// (first phrases, then words)
if ( ! m_qterms[i].m_isUORed ) continue;
// the termid of the compound list
int64_t id = 0LL;
// store compound terms last
long n = m_numTerms;
// sum of termfreqs
//int64_t sum = 0;
// we got a UOR'd list, see whose involved
long j ;
long numUORComponents = 0;
char *beg = NULL;
char *end = NULL;
for ( j = 0; j < m_numTerms ; j++ ) {
// if term does not have our ebit, break out
if ( ebit != m_qterms[j].m_explicitBit ) continue;
// otherwise, make this term point to the compound term
m_componentCodes[j] = n;
// an integrate its termid into the compound termid
id = hash64 ( m_qterms[j].m_termId , id ) &TERMID_MASK;
// add in the term frequency (aka popularity)
//sum += m_termFreqs[j];
// keep track so IndexTable::alloc() can get it
m_numComponents++;
numUORComponents++;
// get phrase UOR term right
long a = j;
long b = j;
// if (m_qterms[j].m_qword->m_leftPhraseStart >= 0){
// a = m_qterms[j].m_qword->m_leftPhraseStart;
// b++;
// }
char *newBeg = m_qterms[a].m_term;
// had to add check for newBeg being null
// (because of -O2 ???)
if (!beg || (newBeg && newBeg < beg))
beg = newBeg;
char *newEnd = m_qterms[b].m_term
+ m_qterms[b].m_termLen;
if (!end || newEnd > end)
end = newEnd;
}
if (!numUORComponents) continue;
// copy it
memcpy ( &m_qterms[n] , &m_qterms[i] , sizeof(QueryTerm) );
// get term's length
//char *beg = m_qterms[i].m_term;
//char *end = m_qterms[j-1].m_term + m_qterms[j-1].m_termLen;
m_qterms[n].m_term = beg;
m_qterms[n].m_termLen = end - beg;
// set its id
m_qterms[n].m_termId = id;
// this array too!
m_termIds[n] = id;
m_qterms[n].m_rawTermId = 0LL;
m_qterms[n].m_isQueryStopWord = false;
m_componentCodes[n] = -1; // code for a compound termid is -1
//m_termFreqs [n] = sum;
m_termSigns [n] = '\0';
// inc the total term count
m_numTerms++;
}
}
*/
// -1 means compound, -2 means unset, >= 0 means component
bool Query::isCompoundTerm ( long i ) {
return ( m_componentCodes[i] == -1 );
}
bool Query::setQWords ( char boolFlag ,
bool keepAllSingles ,
Words &words ,
Phrases &phrases ) {
// . break query up into Words and phrases
// . because we now deal with boolean queries, we make parentheses
// their own separate Word, so tell "words" we're setting a query
//Words words;
if ( ! words.set ( m_buf , m_bufLen,
TITLEREC_CURRENT_VERSION, true, true ) )
return log("query: Had error parsing query: %s.",
mstrerror(g_errno));
long numWords = words.getNumWords();
// truncate it
if ( numWords > MAX_QUERY_WORDS ) {
log("query: Had %li words. Max is %li. Truncating.",
numWords,(long)MAX_QUERY_WORDS);
numWords = MAX_QUERY_WORDS;
m_truncated = true;
}
m_numWords = numWords;
// alloc the mem if we need to (mdw left off here)
long need = m_numWords * sizeof(QueryWord);
// sanity check
if ( m_qwords || m_qwordsAllocSize ) { char *xx = NULL; *xx = 0; }
// point m_qwords to our generic buffer if it will fit
// if ( need < GBUF_SIZE ) {
if ( m_gnext + need < m_gbuf + GBUF_SIZE &&
// it can wrap so watch out with this:
need < GBUF_SIZE ) {
m_qwords = (QueryWord *)m_gnext;
m_gnext += need;
}
// otherwise, we must allocate memory for it
else {
m_qwords = (QueryWord *)mmalloc ( need , "Query4" );
if ( ! m_qwords )
return log("query: Could not allocate mem for query.");
m_qwordsAllocSize = need;
}
// is all alpha chars in query in upper case? caps lock on?
bool allUpper = true;
char *p = m_buf;
char *pend = m_buf + m_bufLen;
for ( ; p < pend ; p += getUtf8CharSize(p) )
if ( is_alpha_utf8 ( p ) && ! is_upper_utf8 ( p ) ) {
allUpper = false; break; }
// . come back here from below when we detect that the query is not boolean
// . we need to redo the bits cuz they may have been messed with below
// redo:
// field code we are in
char fieldCode = 0;
char fieldSign = 0;
char *field = NULL;
long fieldLen = 0;
// keep track of the start of different chunks of quotes
long quoteStart = -1;
bool inQuotes = false;
//bool inVQuotes = false;
char quoteSign = 0;
// the current little sign
char wordSign = 0;
// when reading first word in link: ... field we skip the following
// words until we hit a space because we hash them all together
bool ignoreTilSpace = false;
// assume we're NOT a boolean query
m_isBoolean = false;
// used to not respect the bool operator if it is the first word
bool firstWord = true;
// the query processing is broken into 3 stages.
// . STAGE #1
// . reset all query words to default
// set all m_ignoreWord and m_ignorePhrase to IGNORE_DEFAULT
// . set m_isFieldName, m_fieldCode and m_quoteStart for query words.
// no field names in quotes. +title:"hey there".
// set m_quoteStart to -1 if not in quotes.
// . if quotes immediately follow field code's ':' then distribute
// the field code to all words in the quotes
// . distribute +/- signs across quotes and fields to m_wordSigns.
// support -title:"hey there".
// . set m_quoteStart to -1 if only one alnum word is
// in quotes, what's the point of that?
// . set boolean op codes (m_opcode). cannot be in quotes.
// cannot have a field code. cannot have a word sign (+/-).
// . set m_wordId of FIELD_LINK, _URL, _SITE, _IP fields.
// m_wordId of first should be hash of the whole field value.
	// only set its m_ignoreWord to 0, keep its m_ignorePhrase at DEF.
// . set m_ignore of non-op codes, non-fieldname, alnum words to 0.
// . set m_wordId of each non-ignored alnum word.
// . STAGE #2
// . customize Bits class:
// first alnum word can start phrase.
// first alnum word in quotes (m_quoteStart >= 0 ) can start phrase.
// connected on the right but not on the left.. can start phrase.
// no pair across any double quote
// no pair across ".." --- UNLESS in quotes!
// no pair across any change of field code.
// field names may not be part of any phrase or paired across.
// boolean ops may not be part of any phrase or paired across.
// ignored words may not be part of any phrase or paired across.
// . STAGE #3
// . set phrases class w/ custom Bits class mods.
// . set m_phraseId and m_rawPhraseId of all QueryWords. if phraseId
// is not 0 (phrase exists) then set m_ignorePhrase to 0.
// . set m_leftConnected, m_rightConnected. word you are connecting
// to must not be ignored. (no field names or op codes).
// ensure you are in a phrase with the connected word, too, to
// really be connected.
// . set m_leftPhraseStart and m_rightPhraseEnd for all
// m_inQuotePhrase is not needed since if m_quoteStart is >= 0
// we MUST be in a quoted phrase!
// . if word is Connected then set m_ignoreWord to IGNORE_CONNECTED.
// set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0).
// m_wordSign may have inherited quote or field sign.
// . if word's m_quoteStart is >= 0 set m_ignoreWord to IGNORE_QUOTED
// set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0)
// m_wordSign may have inherited quote or field sign.
// . if one word in a phrase is negative, then set m_phraseSign to '-'
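	// . illustrative example of the three stages (an approximation,
	//   not a spec): for the query
	//       -title:"hey there" cd-rom
	//   stage #1 marks "title" as IGNORE_FIELDNAME and gives "hey" and
	//   "there" the title field code, a shared m_quoteStart and an
	//   inherited '-' m_wordSign; stage #2 keeps phrases from pairing
	//   across the closing quote; stage #3 builds the "hey there"
	//   phrase with a '-' phrase sign and, since "cd" and "rom" are
	//   hyphen-connected, soft-requires the "cd-rom" phrase with a
	//   '*' sign.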
// set the Bits used for making phrases from the Words class
Bits bits;
if ( ! bits.set ( &words, TITLEREC_CURRENT_VERSION , 0 ))
return log("query: Had error processing query: %s.",
mstrerror(g_errno));
long userWeight = 1;
char userType = 'r';
long userWeightPhrase = 1;
char userTypePhrase = 'r';
long ignorei = -1;
// assume we contain no pipe operator
long pi = -1;
long posNum = 0;
char *ignoreTill = NULL;
// loop over all words, these QueryWords are 1-1 with "words"
for ( long i = 0 ; i < numWords && i < MAX_QUERY_WORDS ; i++ ) {
// convenience var, these are 1-1 with "words"
QueryWord *qw = &m_qwords[i];
// set to defaults?
memset ( qw , 0 , sizeof(QueryWord) );
// but quotestart should be -1
qw->m_quoteStart = -1;
qw->m_leftPhraseStart = -1;
// assume QueryWord is ignored by default
qw->m_ignoreWord = IGNORE_DEFAULT;
qw->m_ignorePhrase = IGNORE_DEFAULT;
qw->m_ignoreWordInBoolQuery = false;
qw->m_wordNum = i;
// get word as a string
//char *w = words.getWord(i);
//long wlen = words.getWordLen(i);
qw->m_word = words.getWord(i);
qw->m_wordLen = words.getWordLen(i);
qw->m_isPunct = words.isPunct(i);
qw->m_posNum = posNum;
// count 1 unit for it
posNum++;
// we ignore the facet value range list...
if ( ignoreTill && qw->m_word < ignoreTill )
continue;
// . we duplicated this code from XmlDoc.cpp's
// getWordPosVec() function
if ( qw->m_isPunct ) { // ! wids[i] ) {
char *wp = qw->m_word;
long wplen = qw->m_wordLen;
// simple space or sequence of just white space
if ( words.isSpaces(i) )
posNum += 0;
// 'cd-rom'
else if ( wp[0]=='-' && wplen==1 )
posNum += 0;
// 'mr. x'
else if ( wp[0]=='.' && words.isSpaces2(i,1))
posNum += 0;
// animal (dog)
else
posNum++;
}
char *w = words.getWord(i);
long wlen = words.getWordLen(i);
// assume it is a query weight operator
qw->m_queryOp = true;
// ignore it? (this is for query weight operators)
if ( i <= ignorei ) continue;
// deal with pipe operators
if ( wlen == 5 &&
w[0]=='P'&&w[1]=='i'&&w[2]=='i'&&w[3]=='P'&&w[4]=='E') {
pi = i;
qw->m_opcode = OP_PIPE;
continue;
}
// is it the bracket operator?
// " LeFtB 113 rp RiGhB "
if ( wlen == 5 &&
w[0]=='L'&&w[1]=='e'&&w[2]=='F'&&w[3]=='t'&&w[4]=='B'&&
i+4 < numWords ) {
// s MUST point to a number
char *s = words.getWord(i+2);
long slen = words.getWordLen(i+2);
// if no number, it must be
// " leFtB RiGhB " or " leFtB p RiGhB "
if ( ! is_digit(s[0]) ) {
// phrase weight reset
if ( s[0] == 'p' ) {
userWeightPhrase = 1;
userTypePhrase = 'r';
ignorei = i + 4;
}
// word reset
else {
userWeight = 1;
userType = 'r';
ignorei = i + 2;
}
continue;
}
// get the number
long val = atol2 (s, slen);
// s2 MUST point to the a,r,ap,rp string
char *s2 = words.getWord(i+4);
// is it a phrase?
if ( s2[1] == 'p' ) {
userWeightPhrase = val;
userTypePhrase = s2[0]; // a or r
}
else {
userWeight = val;
userType = s2[0]; // a or r
}
// ignore all following words up and inc. i+6
ignorei = i + 6;
continue;
}
// assign score weight, if any for this guy
qw->m_userWeight = userWeight ;
qw->m_userType = userType ;
qw->m_userWeightPhrase = userWeightPhrase ;
qw->m_userTypePhrase = userTypePhrase ;
qw->m_queryOp = false;
// does word #i have a space in it? that will cancel fieldCode
// if we were in a field
bool endField = false;
if ( words.hasSpace(i) && ! inQuotes ) endField = true;
// TODO: fix title:" hey there" (space in quotes is ok)
// if there's a quote before the first space then
// it's ok!!!
if ( endField ) {
char *s = words.m_words[i];
char *send = s + words.m_wordLens[i];
for ( ; s < send ; s++ ) {
// if the space is inside the quotes then it
// doesn't count!
if ( *s == '\"' ) { endField = false; break;}
if ( is_wspace_a(*s) ) break;
}
}
// cancel the field if we hit a space (not in quotes)
if ( endField ) {
// cancel the field
fieldCode = 0;
fieldLen = 0;
field = NULL;
// we no longer have to ignore for link: et al
ignoreTilSpace = false;
}
// . maintain inQuotes and quoteStart
// . quoteStart is the word # that starts the current quote
long nq = words.getNumQuotes(i) ;
if ( nq > 0 ) { // && ! ignoreQuotes ) {
// toggle quotes if we need to
if ( nq & 0x01 ) inQuotes = ! inQuotes;
// set quote sign to sign before the quote
if ( inQuotes ) {
quoteSign = '\0';
for ( char *p = w + wlen - 1 ; p > w ; p--){
if ( *p != '\"' ) continue;
if ( *(p-1) == '-' ) quoteSign = '-';
if ( *(p-1) == '+' ) quoteSign = '+';
break;
}
}
// . quoteStart is the word # the quotes started at
// . it is -1 if not in quotes
// . now we set it to the alnum word AFTER us!!
if ( inQuotes && i+1< numWords ) quoteStart = i+1;
else quoteStart = -1;
}
//log(LOG_DEBUG, "Query: nq: %ld inQuotes: %d,quoteStart: %ld",
// nq, inQuotes, quoteStart);
// does word #i have a space in it? that will cancel fieldCode
// if we were in a field
// TODO: fix title:" hey there" (space in quotes is ok)
bool cancelField = false;
if ( words.hasSpace(i) && ! inQuotes ) cancelField = true;
// fix title:"foo bar" "another quote" so "another quote"
// is not in the title: field
if ( words.hasSpace(i) && inQuotes && nq>= 2 )
cancelField = true;
// BUT if we have a quote, and they just got turned off,
// and the space is not after the quote, do not cancel field!
if ( nq == 1 && cancelField ) {
// if we hit the space BEFORE the quote, do NOT cancel
// the field
for ( char *p = w + wlen - 1 ; p > w ; p--) {
// hey, we got the quote first, keep field
if ( *p == '\"' ) {cancelField = false; break;}
// otherwise, we got space first? cancel it!
if ( is_wspace_a(*p) ) break;
}
}
if ( cancelField ) {
// cancel the field
fieldCode = 0;
fieldLen = 0;
field = NULL;
// we no longer have to ignore for link: et al
ignoreTilSpace = false;
}
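		// illustrative example: for 'title:"foo bar" baz' the space
		// inside the quotes does not cancel the title: field, but the
		// space after the closing quote does, so "baz" is matched
		// outside the title: field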
// skip if we should
if ( ignoreTilSpace ){
if (m_qwords[i-1].m_fieldCode){
qw->m_fieldCode = m_qwords[i-1].m_fieldCode;
}
continue;
}
// . is this word potentially a field?
// . it cannot be another field name in a field
if ( i < (m_numWords-2) &&
w[wlen] == ':' && ! is_wspace_utf8(w+wlen+1) &&
//w[wlen+1] != '/' && // as in http://
(! is_punct_utf8(w+wlen+1) || w[wlen+1]=='\"' ||
// for gblatrange2:-106.940994to-106.361282
w[wlen+1]=='-') &&
! fieldCode && ! inQuotes ) {
// field name may have started before though if it
// was a compound field name containing hyphens,
// underscores or periods
long j = i-1 ;
while ( j > 0 &&
((m_qwords[j].m_rawWordId != 0) ||
( m_qwords[j].m_wordLen ==1 &&
((m_qwords[j].m_word)[0]=='-' ||
(m_qwords[j].m_word)[0]=='_' ||
(m_qwords[j].m_word)[0]=='.'))))
{
j--;
}
if ( j < 0 ) {
//log(LOG_LOGIC,"query: query: bad "
//"engineer.");
j = 0; }
// advance j to a non-punct word
while (words.isPunct(j)) j++;
// ignore all of these words then,
// they're part of field name
long tlen = 0;
for ( long k = j ; k <= i ; k++ )
tlen += words.getWordLen(k);
// set field name to the compound name if it is
field = words.getWord (j);
fieldLen = tlen;
if ( j == i ) fieldSign = wordSign;
else fieldSign = m_qwords[j].m_wordSign;
// debug msg
//char ttt[128];
//memcpy ( ttt , field , fieldLen );
//ttt[fieldLen] = '\0';
//log("field name = %s", ttt);
// . is it recognized field name,like "title" or "url"?
// . does it officially end in a colon? incl. in hash?
bool hasColon;
fieldCode = getFieldCode (field, fieldLen, &hasColon) ;
			// only url,link,site,ip and suburl field names will
			// end in a colon, due to a historical fuck up
//if ( hasColon ){
// fieldLen++;
//}
// reassign alias fields
//Why??? -p
//if ( fieldCode == FIELD_TYPE ) {
// field = "type" ; fieldLen = 4; }
// if so, it does NOT get its own QueryWord,
// but its sign can be inherited by its members
if ( fieldCode ) {
for ( long k = j ; k <= i ; k++ )
m_qwords[k].m_ignoreWord =
IGNORE_FIELDNAME;
continue;
}
}
// what quote chunk are we in? this is 0 if we're not in quotes
if ( inQuotes ) qw->m_quoteStart = quoteStart ;
else qw->m_quoteStart = -1;
qw->m_inQuotes = inQuotes;
// ptr to field, if any
qw->m_fieldCode = fieldCode;
// if we are a punct word, see if we end in a sign that can
// be applied to the next word, a non-punct word
if ( words.isPunct(i) ) {
wordSign = w[wlen-1];
if ( wordSign != '-' && wordSign != '+') wordSign = 0;
if ( wlen>1 &&!is_wspace_a (w[wlen-2]) ) wordSign = 0;
if ( i > 0 && wlen == 1 ) wordSign = 0;
}
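		// illustrative example: in 'cats -dogs' the punct word " -"
		// ends in '-' preceded by whitespace, so wordSign becomes '-'
		// and is inherited by "dogs" below; the lone '-' in 'cd-rom'
		// does not set a sign because of the wlen==1 check above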
// assign quoteSign to wordSign if we just got into quotes
//if ( nq > 0 && inQuotes ) quoteSign = wordSign;
// don't add any QueryWord for a punctuation word
if ( words.isPunct(i) ) continue;
// what is the sign of our term? +, -, *, ...
char mysign;
if ( fieldCode ) mysign = fieldSign;
else if ( inQuotes ) mysign = quoteSign;
else mysign = wordSign;
// are we doing default AND?
//if ( forcePlus && ! *mysign ) mysign = '+';
// store the sign
qw->m_wordSign = mysign;
// what quote chunk are we in? this is 0 if we're not in quotes
if ( inQuotes ) qw->m_quoteStart = quoteStart ;
else qw->m_quoteStart = -1;
// if we're the first alnum in this quote and
// the next word has a quote, then we're just a single word
// in quotes which is silly, so undo it. But we should
// still inherit any quoteSign, however. Be sure to also
// set m_inQuotes to false so Matches.cpp::matchWord() works.
if ( i == quoteStart ) { // + 1 ) {
if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) {
qw->m_quoteStart = -1;
qw->m_inQuotes = false;
}
}
// . get prefix hash of collection name and field
// . but first convert field to lower case
uint64_t ph;
long fflen = fieldLen;
if ( fflen > 62 ) fflen = 62;
char ff[64];
to_lower3_a ( field , fflen , ff );
//unsigned longlongph=getPrefixHash(m_coll,m_collLen,ff,fflen);
//ph=getPrefixHash(NULL,0,ff,fflen);
ph = hash64 ( ff , fflen );
// map "intitle" map to "title"
if ( fieldCode == FIELD_TITLE )
ph = hash64 ( "title", 5 );
// make "suburl" map to "inurl"
if ( fieldCode == FIELD_SUBURL )
ph = hash64 ( "inurl", 5 );
// fix for filetype:pdf queries
if ( fieldCode == FIELD_TYPE )
ph = hash64 ("type",4);
// these are range constraints on the gbsortby: termlist
// which sorts numbers in a field from low to high
if ( fieldCode == FIELD_GBNUMBERMIN )
ph = hash64 ("gbsortby", 8);
if ( fieldCode == FIELD_GBNUMBERMAX )
ph = hash64 ("gbsortby", 8);
if ( fieldCode == FIELD_GBNUMBEREQUALFLOAT )
ph = hash64 ("gbsortby", 8);
// fix for gbsortbyfloat:product.price
if ( fieldCode == FIELD_GBSORTBYFLOAT )
ph = hash64 ("gbsortby", 8);
if ( fieldCode == FIELD_GBNUMBERMININT )
ph = hash64 ("gbsortbyint", 11);
if ( fieldCode == FIELD_GBNUMBERMAXINT )
ph = hash64 ("gbsortbyint", 11);
if ( fieldCode == FIELD_GBNUMBEREQUALINT )
ph = hash64 ("gbsortbyint", 11);
// really just like the gbfacetstr operator but we do not
// display the facets, instead we try to match the provided
		// facet value exactly, case sensitively
if ( fieldCode == FIELD_GBFIELDMATCH )
ph = hash64 ("gbfacetstr", 10);
if ( fieldCode == FIELD_GBFACETFLOAT )
ph = hash64 ("gbsortby",8);
if ( fieldCode == FIELD_GBFACETINT )
ph = hash64 ("gbsortbyint",11);
// ptr to field, if any
qw->m_fieldCode = fieldCode;
// prefix hash
qw->m_prefixHash = ph;
// set this flag
if ( fieldCode == FIELD_LINKS ) m_hasLinksOperator = true;
if ( fieldCode == FIELD_SITELINK ) m_hasLinksOperator = true;
// if we're hashing a url:, link:, site: or ip: term,
// then we need to hash ALL up to the first space
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_GBPARENTURL ||
fieldCode == FIELD_EXT ||
fieldCode == FIELD_LINK ||
fieldCode == FIELD_ILINK||
fieldCode == FIELD_SITELINK||
fieldCode == FIELD_LINKS||
fieldCode == FIELD_SITE ||
fieldCode == FIELD_IP ||
fieldCode == FIELD_ISCLEAN ||
fieldCode == FIELD_QUOTA ||
fieldCode == FIELD_GBSORTBYFLOAT ||
fieldCode == FIELD_GBREVSORTBYFLOAT ||
// gbmin:price:1.23
fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX ||
fieldCode == FIELD_GBNUMBEREQUALFLOAT ||
fieldCode == FIELD_GBSORTBYINT ||
fieldCode == FIELD_GBREVSORTBYINT ||
fieldCode == FIELD_GBNUMBERMININT ||
fieldCode == FIELD_GBNUMBERMAXINT ||
fieldCode == FIELD_GBNUMBEREQUALINT ||
fieldCode == FIELD_GBFACETSTR ||
fieldCode == FIELD_GBFACETINT ||
fieldCode == FIELD_GBFACETFLOAT ||
fieldCode == FIELD_GBFIELDMATCH ||
fieldCode == FIELD_GBAD ) {
// . find 1st space -- that terminates the field value
// . make "end" point to the end of the entire query
char *end =
(words.m_words[words.m_numWords-1] +
words.m_wordLens[words.m_numWords-1]);
// use this for gbmin:price:1.99 etc.
long firstColonLen = -1;
long lastColonLen = -1;
long colonCount = 0;
long firstComma = -1;
// "w" points to the first alnumword after the field,
// so for site:xyz.com "w" points to the 'x' and wlen
// would be 3 in that case sinze xyz is a word of 3
// chars. so advance
// wlen until we hit a space.
while ( w + wlen < end ) {
// stop at first white space
if ( is_wspace_utf8(w+wlen) ) break;
// in case of gbmin:price:1.99 record first ':'
if ( w[wlen]==':' ) {
lastColonLen = wlen;
if ( firstColonLen == -1 )
firstColonLen = wlen;
colonCount++;
}
// hit a comma in something like
// gbfacetfloat:price,0-1,1-2.5,2.5-10
if ( w[wlen]==',' && firstComma == -1 )
firstComma = wlen;
wlen++;
}
// ignore following words until we hit a space
ignoreTilSpace = true;
// the hash
uint64_t wid = hash64 ( w , wlen, 0LL );
//
// BEGIN FACET RANGE LISTS
//
qw->m_numFacetRanges = 0;
// for gbfacetfloat:price,0-1,1-2.5,... just hash price
if ( firstComma > 0 &&
( fieldCode == FIELD_GBFACETINT ||
fieldCode == FIELD_GBFACETFLOAT ) )
// hash the "price" not the following range lst
wid = hash64 ( w , firstComma );
// now store the range list so we can
// fill up the buckets below
char *s = w + firstComma + 1;
char *send = w + wlen;
long nr = 0;
for ( ; s <send && fieldCode == FIELD_GBFACETINT;){
// must be a digit or . or -
if ( ! is_digit(s[0]) &&
s[0] != '.' &&
s[0] != '-' )
break;
char *sav = s;
// skip to hyphen
for ( ; s < send && *s != '-' ; s++ );
// stop if not hyphen
if ( *s != '-' ) break;
// skip hyphen
s++;
// must be a digit or . or -
if ( ! is_digit(s[0]) &&
s[0] != '.' &&
s[0] != '-' )
break;
// if under max, add it
if ( nr < MAX_FACET_RANGES ) {
qw->m_facetRangeIntA [nr] = atoll(sav);
qw->m_facetRangeIntB [nr] = atoll(s);
qw->m_numFacetRanges = ++nr;
}
// skip to comma or end
for ( ; s < send && *s != ',' ; s++ );
// skip that
if ( *s != ',' ) break;
// SKIP COMMA
s++;
				// ignore till. does not include s
ignoreTill = s;
}
for ( ; s <send && fieldCode==FIELD_GBFACETFLOAT;){
// must be a digit or . or -
if ( ! is_digit(s[0]) &&
s[0] != '.' &&
s[0] != '-' )
break;
char *sav = s;
// skip to hyphen
for ( ; s < send && *s != '-' ; s++ );
// stop if not hyphen
if ( *s != '-' ) break;
// skip hyphen
s++;
// must be a digit or . or -
if ( ! is_digit(s[0]) &&
s[0] != '.' &&
s[0] != '-' )
break;
// if under max, add it
if ( nr < MAX_FACET_RANGES ) {
qw->m_facetRangeFloatA [nr] =atof(sav);
qw->m_facetRangeFloatB [nr] =atof(s);
qw->m_numFacetRanges = ++nr;
}
// skip to comma or end
for ( ; s < send && *s != ',' ; s++ );
// skip that
if ( *s != ',' ) break;
// SKIP COMMA
s++;
				// ignore till. does not include s
ignoreTill = s;
}
//
// END FACET RANGE LISTS
//
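			// illustrative example: for
			//   gbfacetint:size,0-10,10-20
			// wid above hashes just "size" and the loops leave
			// qw->m_numFacetRanges = 2 with
			//   m_facetRangeIntA[0]=0,  m_facetRangeIntB[0]=10
			//   m_facetRangeIntA[1]=10, m_facetRangeIntB[1]=20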
// i've decided not to make
// gbsortby:products.offerPrice
// gbmin:price:1.23 case insensitive
if ( fieldCode == FIELD_GBSORTBYFLOAT ||
fieldCode == FIELD_GBREVSORTBYFLOAT ||
fieldCode == FIELD_GBSORTBYINT ||
fieldCode == FIELD_GBREVSORTBYINT ) {
wid = hash64Lower_utf8 ( w , wlen , 0LL );
// do not include this word as part of
// any boolean expression, so
// Expression::isTruth() will ignore it and we
				// fix '(A OR B) gbsortby:offerprice' query
qw->m_ignoreWordInBoolQuery = true;
}
// gbmin:price:1.23
if ( lastColonLen>0 &&
( fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX ||
fieldCode == FIELD_GBNUMBEREQUALFLOAT ||
fieldCode == FIELD_GBNUMBEREQUALINT ||
fieldCode == FIELD_GBNUMBERMININT ||
fieldCode == FIELD_GBNUMBERMAXINT ||
fieldCode == FIELD_GBFIELDMATCH ) ) {
// record the field
wid = hash64Lower_utf8(w,lastColonLen , 0LL );
// fix gbminint:gbfacetstr:gbxpath...:165004297
if ( colonCount == 2 ) {
int64_t wid1;
int64_t wid2;
char *a = w;
char *b = w + firstColonLen;
wid1 = hash64Lower_utf8(a,b-a);
a = w + firstColonLen+1;
b = w + lastColonLen;
wid2 = hash64Lower_utf8(a,b-a);
// keep prefix as 2nd arg to this
wid = hash64 ( wid2 , wid1 );
// we need this for it to work
ph = 0LL;
}
// and also the floating point after that
qw->m_float = atof ( w + lastColonLen + 1 );
qw->m_int = (long)atoll( w + lastColonLen+1);
// if it is like
// gbfieldmatch:tag.uri:"http://xyz.com/poo"
// then we should hash the string into
// an int just like how the field value would
// be hashed when adding gbfacetstr: terms
// in XmlDoc.cpp:hashFacet2(). the hash of
// the tag.uri field, for example, is set
// in hashFacet1() and set to "val32". so
// hash it just like that does here.
if ( colonCount >= 1 &&
fieldCode == FIELD_GBFIELDMATCH &&
firstColonLen > 0 ) {
char *a = w + firstColonLen + 1;
// . skip over colon at start
if ( a[0] == ':' ) a++;
// . skip over quotes at start/end
bool inQuotes = false;
if ( a[0] == '\"' ) {
inQuotes = true;
a++;
}
// end of field
char *b = a;
// if not in quotes advance until
// we hit whitespace
char cs;
for ( ; ! inQuotes && *b ; b += cs ) {
cs = getUtf8CharSize(b);
if ( is_wspace_utf8(b) ) break;
}
					// if in quotes, go until we hit the
					// closing quote (or end of buffer)
					for ( ; inQuotes && *b && *b != '\"' ; b++ );
// now hash the value
qw->m_int = hash32 ( a , b - a );
qw->m_float = (float)qw->m_int;
//
// hash it like
// gbfacetstr:object.price
// even though its
// gbfieldmatch:object.title:"some foo"
//
/*
int64_t wid1;
int64_t wid2;
a = w;
b = w + firstColonLen;
wid1 = hash64Lower_utf8(a,b-a);
a = w + firstColonLen+1;
b = w + lastColonLen;
wid2 = hash64Lower_utf8(a,b-a);
// keep prefix as 2nd arg to this
wid = hash64 ( wid2 , wid1 );
// we need this for it to work
ph = 0LL;
*/
}
}
// should we have normalized before hashing?
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_GBPARENTURL ||
fieldCode == FIELD_LINK ||
fieldCode == FIELD_ILINK ||
fieldCode == FIELD_SITELINK ||
fieldCode == FIELD_LINKS ||
fieldCode == FIELD_SITE ) {
Url url;
// do we add www?
bool addwww = false;
if ( fieldCode == FIELD_LINK ) addwww = true;
if ( fieldCode == FIELD_ILINK) addwww = true;
if ( fieldCode == FIELD_LINKS) addwww = true;
if ( fieldCode == FIELD_URL ) addwww = true;
if ( fieldCode == FIELD_GBPARENTURL )
addwww = true;
if ( fieldCode == FIELD_SITELINK)
addwww = true;
url.set ( w , wlen , addwww );
char *site = url.getHost();
long siteLen = url.getHostLen();
if (fieldCode == FIELD_SITELINK)
wid = hash64 ( site , siteLen );
else
wid = hash64 ( url.getUrl(),
url.getUrlLen() );
}
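			// illustrative example: for 'link:gigablast.com/x.html'
			// the url is normalized (adding www. where needed) and
			// the whole url is hashed, while 'sitelink:abc.foobar.com'
			// hashes only the host so any page on that site can match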
//qw->m_wordId = g_indexdb.getTermId ( ph , wid );
// like we do it in XmlDoc.cpp's hashString()
if ( ph ) qw->m_wordId = hash64h ( wid , ph );
else qw->m_wordId = wid;
qw->m_rawWordId = 0LL; // only for highlighting?
qw->m_phraseId = 0LL;
qw->m_rawPhraseId = 0LL;
qw->m_opcode = 0;
// definitely not a query stop word
qw->m_isQueryStopWord = false;
// do not ignore the wordId
qw->m_ignoreWord = 0;
// override the word length
//qw->m_wordLen = ulen * 2;
// we are the first word?
firstWord = false;
// we're done with this one
continue;
}
char opcode = 0;
// if query is all in upper case and we're doing boolean
// DETECT, then assume not boolean
if ( allUpper && boolFlag == 2 ) boolFlag = 0;
// . having the UOR opcode does not mean we are boolean because
// we want to keep it fast.
// . we need to set this opcode so the UOR logic in setQTerms()
// works, because it checks the m_opcode value. otherwise
// Msg20 won't think we are a boolean query and set boolFlag
// to 0 when setting the query for summary generation and
// will not recognize the UOR word as being an operator
if ( wlen==3 && w[0]=='U' && w[1]=='O' && w[2]=='R' &&
! firstWord ) {
opcode = OP_UOR; m_hasUOR = true; goto skipin; }
// . is this word a boolean operator?
// . cannot be in quotes or field
if ( boolFlag >= 1 && ! inQuotes && ! fieldCode ) {
// are we an operator?
if ( ! firstWord && wlen==2 &&
w[0]=='O' && w[1]=='R')
opcode = OP_OR;
else if ( ! firstWord && wlen==3 &&
w[0]=='A' && w[1]=='N' && w[2]=='D')
opcode = OP_AND;
else if ( ! firstWord && wlen==3 &&
w[0]=='N' && w[1]=='O' && w[2]=='T')
opcode = OP_NOT;
else if ( wlen==5 && w[0]=='L' && w[1]=='e' &&
w[2]=='F' && w[3]=='t' && w[4]=='P' )
opcode = OP_LEFTPAREN;
else if ( wlen==5 && w[0]=='R' && w[1]=='i' &&
w[2]=='G' && w[3]=='h' && w[4]=='P' )
opcode = OP_RIGHTPAREN;
skipin:
// if we are detecting if query is boolean or not AND
// if we are not an operator and have more than 1 cap
			// char then turn off boolean
//if ( boolFlag==2 &&!opcode &&wlen>1&&is_upper(w[1])){
// // turn boolean stuff off
// boolFlag = 0;
// // start again from the top with NO boolean
// goto redo;
//}
// no pair across or even include any boolean op phrs
if ( opcode ) {
bits.m_bits[i] &= ~D_CAN_START_PHRASE;
bits.m_bits[i] &= ~D_CAN_PAIR_ACROSS;
bits.m_bits[i] &= ~D_CAN_BE_IN_PHRASE;
qw->m_ignoreWord = IGNORE_BOOLOP;
qw->m_opcode = opcode;
if ( opcode == OP_LEFTPAREN ) continue;
if ( opcode == OP_RIGHTPAREN ) continue;
// if this is uncommented all of our operators
// become actual query terms (mdw)
if ( opcode == OP_UOR ) continue;
// if you just have ANDs and ()'s that does
// not make you a boolean query! we are bool
// by default!!
if ( opcode == OP_AND ) continue;
m_isBoolean = true;
continue;
}
}
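		// illustrative example: in 'cats AND dogs' the AND gets
		// OP_AND but does not flip m_isBoolean (AND is the default),
		// while 'cats OR dogs' and 'cats NOT dogs' do set m_isBoolean;
		// an operator as the very first word is not honored because
		// of the firstWord check above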
// . add single-word term id
// . this is computed by hash64AsciiLower()
// . but only hash64Lower_a if _HASHWITHACCENTS_ is true
uint64_t wid = 0LL;
if (fieldCode == FIELD_CHARSET){
// find first space -- that terminates the field value
char* end =
(words.m_words[words.m_numWords-1] +
words.m_wordLens[words.m_numWords-1]);
while ( w+wlen<end &&
! is_wspace_utf8(w+wlen) ) wlen++;
// ignore following words until we hit a space
ignoreTilSpace = true;
// the hash
//wid = hash64 ( uw , ulen, 0LL );
// convert to enum value
short csenum = get_iana_charset(w,wlen);
// convert back to string
char astr[128];
long alen = sprintf(astr, "%d", csenum);
wid = hash64(astr, alen, 0LL);
}
else{
wid = words.getWordId(i);
}
qw->m_rawWordId = wid;
// we now have a first word already set
firstWord = false;
// . are we a QUERY stop word?
		// . NEVER count as a stop word if it's in all CAPS and
		//   the whole query is NOT in all CAPS
// . It's probably an acronym
if ( words.isUpper(i) && words.getWordLen(i)>1 && ! allUpper ){
qw->m_isQueryStopWord = false;
qw->m_isStopWord = false;
}
else {
qw->m_isQueryStopWord =::isQueryStopWord (w,wlen,wid);
// . BUT, if it is a single letter contraction thing
// . ninad: make this == 1 if in utf8! TODO!! it is!
if ( wlen == 1 && w[-1] == '\'' )
qw->m_isQueryStopWord = true;
qw->m_isStopWord =::isStopWord (w,wlen,wid);
}
// . do not count as query stop word if it is the last in query
// . like the query: 'baby names that start with j'
if ( i + 2 > numWords ) qw->m_isQueryStopWord = false;
// hash the termid
//qw->m_wordId = g_indexdb.getTermId ( ph , wid );
// like we do it in XmlDoc.cpp's hashString()
if ( ph ) qw->m_wordId = hash64 ( wid , ph );
else qw->m_wordId = wid;
// do not ignore the word
qw->m_ignoreWord = 0;
}
// pipe those that should be piped
for ( long i = 0 ; i < pi ; i++ ) m_qwords[i].m_piped = true;
if ( pi >= 0 ) m_piped = true;
// . set m_leftConnected and m_rightConnected
// . we are connected to the first non-punct word on our left
	//   if we are separated by a small amount of defined punctuation
// . see getIsConnection() for that definition
// . this allows us to just lookup the phrase for things like
// "cd-rom" rather than lookup "cd" , "rom" and "cd-rom"
// . skip if prev word is IGNORE_BOOLOP, IGNORE_FIELDNAME or
// IGNORE_DEFAULT
// . we have to set outside the main loop above since we check
// the m_ignoreWord member of the i+2nd word
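	// illustrative example: for 'cd-rom drivers' the word "cd" gets
	// m_rightConnected set and "rom" gets m_leftConnected set because
	// the single '-' between them is a connector, while the space
	// before "drivers" is not; the connected pair is later treated
	// like the single term "cd-rom"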
for ( long i = 0 ; i < numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( i + 2 < numWords && ! m_qwords[i+2].m_ignoreWord&&
isConnection(words.getWord(i+1),words.getWordLen(i+1)) )
qw->m_rightConnected = true;
if ( i - 2 >= 0 && ! m_qwords[i-2].m_ignoreWord &&
isConnection(words.getWord(i-1),words.getWordLen(i-1) ) )
qw->m_leftConnected = true;
}
// now modify the Bits class before generating phrases
for ( long i = 0 ; i < numWords ; i++ ) {
// get default bits
unsigned char b = bits.m_bits[i];
// allow pairing across anything by default
b |= D_CAN_PAIR_ACROSS;
// get Query Word
QueryWord *qw = &m_qwords[i];
// . skip if part of a query weight operator
// . cannot be in a phrase, or anything
if ( qw->m_queryOp && !qw->m_opcode) {
b = D_CAN_PAIR_ACROSS;
}
// is this word a sequence of punctuation and spaces?
else if ( words.isPunct(i) ) {
// pair across ANY punct, even double spaces by default
b |= D_CAN_PAIR_ACROSS;
// but do not pair across anything with a quote in it
if ( words.getNumQuotes(i) >0) b &= ~D_CAN_PAIR_ACROSS;
// continue if we're in quotes
else if ( qw->m_quoteStart >= 0 ) goto next;
// continue if we're in a field
else if ( qw->m_fieldCode > 0 ) goto next;
// if guy on left is in field, do not pair across
if ( i > 0 && m_qwords[i-1].m_fieldCode > 0 )
b &= ~D_CAN_PAIR_ACROSS;
// or if guy on right in field
if ( i +1 < numWords && m_qwords[i+1].m_fieldCode > 0 )
b &= ~D_CAN_PAIR_ACROSS;
// do not pair across ".." when not in quotes/field
char *w = words.getWord (i);
long wlen = words.getWordLen(i);
for ( long j = 0 ; j < wlen-1 ; j++ ) {
if ( w[j ]!='.' ) continue;
if ( w[j+1]!='.' ) continue;
b &= ~D_CAN_PAIR_ACROSS;
break;
}
}
else {
// . not even capped query stop words can start phrase
// . 'Mice And Men' is just one phrase then
// . TODO: "12345678 it was rainy"
// ("it" should start a phrase)
//if ( qw->m_isQueryStopWord) b &= ~D_CAN_START_PHRASE;
if ( qw->m_isStopWord ) b &= ~D_CAN_START_PHRASE;
// . first alnum word can start phrase.
// . example: 'the tigers'
if ( i <= 1 ) b |= D_CAN_START_PHRASE;
// first alnum word in quotes can start phrase.
if ( qw->m_quoteStart == i ) // + 1 )
b |= D_CAN_START_PHRASE;
// . right connected but not left can start phrase
// . example: 'buy a-rom' , 'buy i-phone'
if ( qw->m_rightConnected && ! qw->m_leftConnected )
b |= D_CAN_START_PHRASE;
// . no field names, bool operators, cruft in fields
// can be any part of a phrase
// . no pair across any change of field code
// . 'girl title:boy' --> no "girl title" phrase!
if ( qw->m_ignoreWord ) { //== IGNORE_FIELDNAME ) {
b &= ~D_CAN_PAIR_ACROSS;
b &= ~D_CAN_BE_IN_PHRASE;
b &= ~D_CAN_START_PHRASE;
}
// . no boolean ops
// . 'this OR that' --> no "this OR that" phrase
if ( qw->m_opcode ) {
b &= ~D_CAN_PAIR_ACROSS;
b &= ~D_CAN_BE_IN_PHRASE;
}
if ( qw->m_wordSign == '-' && qw->m_quoteStart < 0) {
b &= ~D_CAN_PAIR_ACROSS;
b &= ~D_CAN_BE_IN_PHRASE;
}
}
next:
// set it back all tweaked
bits.m_bits[i] = b;
}
// . now since we may have prevented pairing across certain things
// we need to set D_CAN_START_PHRASE for stop words whose left
// punct word can no longer be paired across
// . "dancing in the rain" is fun --> will include phrase "is fun".
// . title:"is it right"? --> will include phrase "is it"
for ( long i = 1 ; i < numWords ; i++ ) {
// no punct, alnum only
if ( words.isPunct(i) ) continue;
// skip if not a stop word
		if ( ! (bits.m_bits[i] & D_IS_STOPWORD) ) continue;
// continue if you can still pair across prev punct word
if ( bits.m_bits[i-1] & D_CAN_PAIR_ACROSS ) continue;
// otherwise, we can now start a phrase
bits.m_bits[i] |= D_CAN_START_PHRASE;
}
// a bogus spam class, all words have 0 for their spam probability
//Spam spam;
//spam.reset ( words.getNumWords() );
// treat strongly connected phrases like cd-rom and 3.2.0.3 as being
// in quotes for the most part, therefore, set m_quoteStart for them
long j;
long qs = -1;
for ( j = 0 ; j < numWords ; j++ ) {
// skip all but strongly connected words
if ( m_qwords[j].m_ignoreWord != IGNORE_CONNECTED &&
// must also be non punct word OR a space
( !words.isPunct(j) || words.m_words[j][0] == ' ' ) ) {
// break the "quote", if any
qs = -1; continue; }
// if he is punctuation and qs is -1, skip him,
// punctuation words can no longer start a quote
if ( words.isPunct(j) && qs == -1 ) continue;
// uningore him if we should
if ( keepAllSingles ) m_qwords[j].m_ignoreWord = 0;
// if already in quotes, don't bother!
if ( m_qwords[j].m_quoteStart >= 0 ) continue;
// remember him
if ( qs == -1 ) qs = j;
// he starts the phrase
m_qwords[j].m_quoteStart = qs;
// force him into a quoted phrase
m_qwords[j].m_inQuotes = true;
//m_qwords[j].m_inQuotedPhrase = true;
}
// fix for tags.uri:http://foo.com/bar so it works like
// tags.uri:"http://foo.com/bar" like it should
long first = -1;
for ( j = 0 ; j < numWords ; j++ ) {
// stop when we hit spaces
if ( words.hasSpace(j) ) {
first = -1;
continue;
}
// skip if not in field
if ( ! m_qwords[j].m_fieldCode ) continue;
// must be in a generic field, the other fields like site:
// will be messed up by this logic
if ( m_qwords[j].m_fieldCode != FIELD_GENERIC ) continue;
// first alnumword in field?
if ( first == -1 ) {
// must be alnum
if ( m_qwords[j].m_isPunct ) continue;
// must have punct then another alnum word
if ( j+2 >= numWords ) break;
// spaces screw it up
if ( words.hasSpace(j+1) ) continue;
// then an alnum word after
first = j;
}
// we are in fake quoted phrase
m_qwords[j].m_inQuotes = true;
m_qwords[j].m_quoteStart = first;
}
// make the phrases from the words and the tweaked Bits class
//Phrases phrases;
if ( ! phrases.set ( &words ,
&bits ,
//NULL ,
true , // use stop words?
false , // use stems?
TITLEREC_CURRENT_VERSION,
0 /*niceness*/))//disallows HUGE phrases
return false;
int64_t *wids = words.getWordIds();
// do phrases stuff
for ( long i = 0 ; i < numWords ; i++ ) {
// get the ith QueryWord
QueryWord *qw = &m_qwords[i];
// if word is ignored because it is opcode, or whatever,
// it cannot start a phrase
// THIS IS BROKEN
//if ( qw->m_queryOp && qw->m_opcode == OP_PIPE){
// for (long j = i-1;j>=0;j--){
// if (!m_qwords[j].m_phraseId) continue;
// m_qwords[j].m_ignorePhrase = IGNORE_BOOLOP;
// break;
// }
//
//}
if ( qw->m_ignoreWord ) continue;
if ( qw->m_fieldCode && qw->m_quoteStart < 0) continue;
// get the first word # to our left that starts a phrase
// of which we are a member
qw->m_leftPhraseStart = -1;
//int64_t tmp;
for ( long j = i - 1 ; j >= 0 ; j-- ) {
//if ( ! bits.isIndexable(j) ) continue;
if ( ! bits.canPairAcross(j+1) ) break;
//if ( ! bits.canStartPhrase(j) ) continue;
if ( ! wids[j] ) continue;
// phrases.getNumWordsInPhrase()
//if( j + phrases.getMaxWordsInPhrase(j,&tmp)<i) break;
qw->m_leftPhraseStart = j;
// we can't pair across alnum words now, we just want bigrams
if ( wids[j] ) break;
//break;
// now we do bigrams so only allow two words even
// if they are stop words
break;
}
// . is this word in a quoted phrase?
// . the whole phrase must be in the same set of quotes
// . if we're in a left phrase, he must be in our quotes
if ( qw->m_leftPhraseStart >= 0 &&
qw->m_quoteStart >= 0 &&
qw->m_leftPhraseStart >= qw->m_quoteStart )
qw->m_inQuotedPhrase = true;
// if we start a phrase, ensure next guy is in our quote
if ( ! qw->m_ignorePhrase && i+1 < numWords &&
m_qwords[i+1].m_quoteStart >= 0 &&
m_qwords[i+1].m_quoteStart <= i )
qw->m_inQuotedPhrase = true;
// are we the first word in the quote?
if ( i-1>=0 && qw->m_quoteStart == i )
qw->m_inQuotedPhrase = true;
// ignore single words that are in a quoted phrase
if ( ! keepAllSingles && qw->m_inQuotedPhrase )
qw->m_ignoreWord = IGNORE_QUOTED;
// . get phrase info for this term
// . a pid (phraseId)of 0 indicates it does not start a phrase
// . raw phrase termId
//uint64_t pid = phrases.getPhraseId(i);
uint64_t pid = 0LL;
// nwp is a REGULAR WORD COUNT!!
long nwp = 0;
if ( qw->m_inQuotedPhrase )
// keep at a bigram for now... i'm not sure if we
// will be indexing trigrams
nwp = phrases.getMinWordsInPhrase(i,(int64_t *)&pid);
// just get a two-word phrase term if not in quotes
else
nwp = phrases.getMinWordsInPhrase(i,(int64_t *)&pid);
// store it
qw->m_rawPhraseId = pid;
// does word #i start a phrase?
if ( pid != 0 ) {
uint64_t ph = qw->m_prefixHash ;
// store the phrase id with coll/prefix
//qw->m_phraseId = g_indexdb.getTermId ( ph , pid );
// like we do it in XmlDoc.cpp's hashString()
if ( ph ) qw->m_phraseId = hash64 ( pid , ph );
else qw->m_phraseId = pid;
// how many regular words long is the bigram?
long plen2; phrases.getPhrase ( i , &plen2 ,2);
// the trigram?
long plen3; phrases.getPhrase ( i , &plen3 ,3);
// get just the bigram for now
qw->m_phraseLen = plen2;
// do not ignore the phrase, it's valid
qw->m_ignorePhrase = 0;
// set our rightPhraseEnd point
//qw->m_rightPhraseEnd = i + phrases.getNumWords(i);
// leave it as 0 if it got truncated i guess by the
// MAX_QUERY_WORDS of 320
qw->m_rightRawWordId = 0LL;
// store left and right raw word ids
long ni = i + nwp - 1;
if ( ni < m_numWords )
qw->m_rightRawWordId=m_qwords[ni].m_rawWordId;
}
// . phrase sign is inherited from word's sign if it's a minus
// . word sign is inherited from field, quote or right before
// the word
// . that is, all words in -"to be or not" will have a '-' sign
// . phraseId may or may not be 0 at this point
if ( qw->m_wordSign == '-' ) qw->m_phraseSign = '-';
// . dist word signs to others in the same connected string
// . use "-cd-rom x-box" w/ no connector in between
// . test queries:
// . +cd-rom +x-box
// . -cd-rom +x-box
// . -m-o-n
// . who was the first (was is a query stop word)
// . www.xxx.com
// . welcome to har.com
// . hezekiah walker the love family affair ii live at radio
// city music hall
// . fotostudio +m-o-n-a-r-t
// . fotostudio -m-o-n-a-r-t
// . i'm home
if ( qw->m_leftConnected && qw->m_leftPhraseStart >= 0 )
qw->m_wordSign = m_qwords[i-2].m_wordSign;
// . if we connected to the alnum word on our right then
// soft require the phrase (i.e. treat like a single term)
// . example: cd-rom or www.xxx.com
// . 'welcome to har.com' should get a '*' for "har.com" sign
if ( qw->m_rightConnected ) {
if ( qw->m_wordSign) qw->m_phraseSign = qw->m_wordSign;
else qw->m_phraseSign = '*';
}
// . if we're in quotes then any phrase we have should be
// soft required (i.e. treated like a single term)
// . we do not allow phrases in queries to pair across
// quotes. See where we tweak the Bits class above.
if ( qw->m_quoteStart >= 0 ) {
//if (qw->m_wordSign)qw->m_phraseSign = qw->m_wordSign;
//else qw->m_phraseSign = '*';
qw->m_phraseSign = '*';
}
// . if we are the last word in a phrase that consists of all
// PLAIN stop words then make the phrase have a '*'
// . 'to be or not to be .. test' (cannot pair across "..")
// . don't use QUERY stop words cuz of "who was the first?" qry
if ( pid ) {
long nw = phrases.getNumWordsInPhrase2(i);
long j;
// search up to this far
long maxj = i + nw;
// but not past our truncated limit
if ( maxj > MAX_QUERY_WORDS ) maxj = MAX_QUERY_WORDS;
for ( j = i ; j < maxj ; j++ ) {
// skip punct
if ( words.isPunct(j) ) continue;
// break out if not a stop word
if ( ! bits.isStopWord(j) ) break;
// break out if has a term sign
if ( m_qwords[j].m_wordSign ) break;
}
// if everybody in phrase #i was a signless stopword
// and the phrase was signless, make it have a '*' sign
if ( j >= maxj && m_qwords[i].m_phraseSign == '\0' )
m_qwords[i].m_phraseSign = '*';
// . if a constituent has a - sign, then the whole
// phrase becomes negative, too
// . fixes 'apple -computer' truncation problem
for ( long j = i ; j < maxj ; j++ )
if ( m_qwords[j].m_wordSign == '-' )
qw->m_phraseSign = '-';
}
// . ignore unsigned QUERY stop words that are not yet ignored
// and are in unignored phrases
// . 'who was the first taiwanese president' should not get
// "who was" term sign changed to '*' because "was" is a
// QUERY stop word. So ignore singles query stop words
// in phrases now
if ( //! keepAllSingles &&
(qw->m_isQueryStopWord && !m_isBoolean) &&
m_useQueryStopWords &&
! qw->m_fieldCode &&
// fix 'the tigers'
//(qw->m_leftPhraseStart >= 0 || qw->m_phraseId > 0 ) &&
! qw->m_wordSign &&
! qw->m_ignoreWord )
qw->m_ignoreWord = IGNORE_QSTOP;
// . ignore word if connected to right or left alnum word
// . we will be replaced by a phrase(s)
// . do not worry about keepAllSingles because we turn
// this into a phrase below!
// . if ( ! keepAllSingles &&
if ( ( qw->m_leftConnected || qw->m_rightConnected ) )
qw->m_ignoreWord = IGNORE_CONNECTED;
// . ignore and/or between quoted phrases, save user from
// themselves (they meant AND/OR)
if ( ! keepAllSingles && qw->m_isQueryStopWord &&
! qw->m_fieldCode &&
m_useQueryStopWords &&
! qw->m_phraseId && ! qw->m_inQuotes &&
((qw->m_wordId == 255176654160863LL) ||
(qw->m_wordId == 46196171999655LL)) )
qw->m_ignoreWord = IGNORE_QSTOP;
// . ignore repeated single words and phrases
// . look at the old termIds for this, too
// . should ignore 2nd 'time' in 'time after time' then
// . but boolean queries often need to repeat terms
		// . NEW - words must be same sign and not in different
// . quoted phrases to be ignored -partap
m_hasDupWords = false;
if ( ! m_isBoolean && !qw->m_ignoreWord ) {
for ( long j = 0 ; j < i ; j++ ) {
if ( m_qwords[j].m_ignoreWord ) continue;
if ( m_qwords[j].m_wordId == qw->m_wordId &&
m_qwords[j].m_wordSign ==qw->m_wordSign &&
(!keepAllSingles ||
(m_qwords[j].m_quoteStart
== qw->m_quoteStart))){
qw->m_ignoreWord = IGNORE_REPEAT;
m_hasDupWords = true;
}
}
}
if ( ! m_isBoolean && !qw->m_ignorePhrase ) {
// ignore repeated phrases too!
for ( long j = 0 ; j < i ; j++ ) {
if ( m_qwords[j].m_ignorePhrase ) continue;
if ( m_qwords[j].m_phraseId == qw->m_phraseId &&
m_qwords[j].m_phraseSign
== qw->m_phraseSign)
qw->m_ignorePhrase = IGNORE_REPEAT;
}
}
}
// . if we only have one quoted query then force its sign to be '+'
// . '"get the phrase" the' --> +"get the phrase" (last the is ignored)
// . "time enough for love" --> +"time enough" +"enough for love"
// . if all unignored words are in the same set of quotes then change
// all '*' (soft-required) phrase signs to '+'
for ( j= 0 ; j < numWords ; j++ ) {
if ( words.isPunct(j)) continue;
if ( m_qwords[j].m_quoteStart < 0 ) break;
if ( m_qwords[j].m_ignoreWord ) continue;
if ( j < 2 ) continue;
if ( m_qwords[j-2].m_quoteStart != m_qwords[j].m_quoteStart )
break;
}
if ( j >= numWords ) {
for ( j= 0 ; j < numWords ; j++ ) {
if ( m_qwords[j].m_phraseSign == '*' )
m_qwords[j].m_phraseSign = '+';
}
}
// . force a plus on any site: or ip: query terms
// . also disable site clustering if we have either of these terms
for ( long i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( qw->m_wordSign ) continue;
if ( qw->m_fieldCode != FIELD_SITE &&
qw->m_fieldCode != FIELD_IP ) continue;
qw->m_wordSign = '+';
}
// now check phrase terms. if you do a search in quotes like
// "directions and nearby" it will now generate two phrases:
// "directions and nearby" and "and nearby", so stop "and nearby"
/*
for ( long i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignorePhrase ) continue;
if ( ! qw->m_phraseId ) continue;
// skip if we start this phrase
if ( qw->m_quoteStart == i ) continue;
// . skip if are not a phrase stop word that is paired across
// . not now, we support 3,4 and 5 word phrases...
//if ( ! qw->m_isStopWord ) continue;
		// however, some quoted phrases are more than 5 words
// TODO: fix this!!!
// ok, nuke this term otherwise
qw->m_ignorePhrase = IGNORE_DEFAULT;
}
*/
// . if one or more of a phrase's constituent terms exceeded
// term #MAX_QUERY_TERMS then we should also soft require that phrase
// . fixes 'hezekiah walker the love family affair ii live at
// radio city music hall'
// . how many non-ignored phrases?
long count = 0;
for ( long i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignorePhrase ) continue;
if ( ! qw->m_phraseId ) continue;
count++;
}
for ( long i = 0 ; i < numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
// count non-ignored words
if ( qw->m_ignoreWord ) continue;
// if under limit, continue
if ( count++ < MAX_QUERY_TERMS ) continue;
// . otherwise, ignore
// . if we set this for our UOR'ed terms from SearchInput.cpp's
// UOR'ed facebook interests then it causes us to get no results!
// so make sure that MAX_QUERY_TERMS is big enough with respect to
// the opCount in SearchInput.cpp
qw->m_ignoreWord = IGNORE_BREECH;
// left phrase should get a '*'
long left = qw->m_leftPhraseStart;
if ( left >= 0 && ! m_qwords[left].m_phraseSign )
m_qwords[left].m_phraseSign = '*';
// our phrase should get a '*'
if ( qw->m_phraseId && ! qw->m_phraseSign )
qw->m_phraseSign = '*';
}
// . fix the 'x -50a' query so it returns results
// . how many non-negative, non-ignored words/phrases do we have?
count = 0;
for ( long i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord ) continue;
if ( qw->m_wordSign == '-' ) continue;
count++;
}
for ( long i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignorePhrase ) continue;
if ( qw->m_phraseSign == '-' ) continue;
if ( qw->m_phraseId == 0LL ) continue;
count++;
}
// if everybody is ignored or negative UNignore first query stop word
if ( count == 0 ) {
for ( long i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
if ( qw->m_ignoreWord != IGNORE_QSTOP ) continue;
qw->m_ignoreWord = 0;
count++;
break;
}
}
// . count ignored WORDS for logging stats
// . do not IGNORE_DEFAULT though, that doesn't really count
//m_numIgnored = 0;
//for ( long i = 0 ; i < m_numWords ; i++ ) {
// if ( ! m_qwords[i].m_ignoreWord ) continue;
// if ( m_qwords[i].m_ignoreWord == IGNORE_DEFAULT ) continue;
// m_numIgnored++;
//}
quoteStart = -1;
long quoteEnd = -1;
	// set m_quoteEnd
for ( long i = m_numWords - 1 ; i >= 0 ; i-- ) {
// get ith word
QueryWord *qw = &m_qwords[i];
// skip if ignored
if ( qw->m_ignoreWord ) continue;
// skip if not in quotes
if ( qw->m_quoteStart < 0 ) continue;
// if match previous guy...
if ( qw->m_quoteStart == quoteStart ) {
// inherit the end
qw->m_quoteEnd = quoteEnd;
// all done
continue;
}
// ok, we are the end then
quoteEnd = i;
quoteStart = qw->m_quoteStart;
}
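	// illustrative example: for '"cat food" dog' both "cat" and "food"
	// share the same m_quoteStart, so (when they are not ignored) the
	// backwards walk above gives them the same m_quoteEnd (the word
	// "food"), while "dog" is left alone since it is not in quotes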
long wkid = 0;
long upTo = -1;
long wk_start;
long wk_nwk;
//int64_t *wids = words.getWordIds();
//
// set the wiki phrase ids
//
for ( long i = 0 ; i < m_numWords ; i++ ) {
// get ith word
QueryWord *qw = &m_qwords[i];
// in a phrase from before?
if ( i < upTo ) {
qw->m_wikiPhraseId = wkid;
qw->m_wikiPhraseStart = wk_start;
qw->m_numWordsInWikiPhrase = wk_nwk;
continue;
}
// assume none
qw->m_wikiPhraseId = 0;
// skip if punct
if ( ! wids[i] ) continue;
// get word
long nwk ;
nwk = g_wiki.getNumWordsInWikiPhrase ( i , &words );
// bail if none
if ( nwk <= 1 ) continue;
// save these too
wk_start = i;
wk_nwk = nwk;
// inc it
wkid++;
// store it
qw->m_wikiPhraseId = wkid;
qw->m_wikiPhraseStart = wk_start;
qw->m_numWordsInWikiPhrase = wk_nwk;
// set loop parm
upTo = i + nwk;
}
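	// illustrative example: for 'empire state building tours', if
	// "empire state building" is a wiki phrase in g_wiki, those three
	// words all get m_wikiPhraseId = 1 and m_wikiPhraseStart pointing
	// at "empire", while "tours" keeps m_wikiPhraseId = 0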
// all done
return true;
}
// return -1 if it does not exist in the query, otherwise return the query word num
long Query::getWordNum ( int64_t wordId ) {
// skip if punct or whatever
if ( wordId == 0LL || wordId == -1LL ) return -1;
for ( long i = 0 ; i < m_numWords ; i++ ) {
QueryWord *qw = &m_qwords[i];
// the non-raw word id includes a hash with "0", which
// signifies an empty field term
if ( qw->m_rawWordId == wordId ) return i;
}
// otherwise, not found
return -1;
}
//static TermTable s_table;
static HashTableX s_table;
static bool s_isInitialized = false;
// 3rd field = m_hasColon
struct QueryField g_fields[] = {
{"gbfieldmatch",
FIELD_GBFIELDMATCH,
true,
"gbfieldmatch:strings.vendor:\"My Vendor Inc.\"",
"Matches all the meta tag or JSON or XML fields that have "
"the name \"strings.vendor\" and contain the exactly provided "
"value, in this case, <i>My Vendor Inc.</i>. This is case "
"sensitive and includes punctuation, so it's exact match.",
"Advanced Query Operators",
QTF_BEGINNEWTABLE },
{"url",
FIELD_URL,
true,
"url:www.abc.com/page.html",
"Matches the page with that exact url. Uses the first url, not "
"the url it redirects to, if any." ,
NULL,
0 },
{"ext",
FIELD_EXT,
true,
"ext:doc",
"Match documents whose url ends in the <i>.doc</i> file extension.",
NULL,
0 },
{"link",
FIELD_LINK,
true,
"link:www.gigablast.com/foo.html",
"Matches all the documents that have a link to "
"http://www.gigablast.com/foobar.html",
NULL,
0 },
//{"links", FIELD_LINKS, true,"Same as link:."},
//{"ilink", FIELD_ILINK, true,"Similar to above."},
{"sitelink",
FIELD_SITELINK,
true,
"sitelink:abc.foobar.com",
"Matches all documents that link to any page on the "
"<i>abc.foobar.com</i> site.",
NULL,
0 },
{"site",
FIELD_SITE,
true,
"site:mysite.com",
"Matches all documents on the mysite.com domain.",
NULL,
0 },
{"site",
FIELD_SITE,
true,
"site:www.mysite.com/dir1/dir2/",
"Matches all documents whose url starts with "
"www.mysite.com/dir1/dir2/",
NULL,
QTF_DUP },
//{"coll", FIELD_COLL, true,"Not sure if this works."},
{"ip",
FIELD_IP,
true,
"ip:1.2.3.4",
"Matches all documents whose IP is 1.2.3.4.",
NULL,
0 },
{"ip",
FIELD_IP,
true,
"ip:1.2.3",
"Matches all documents whose IP STARTS with 1.2.3.",
NULL,
QTF_DUP },
{"inurl",
FIELD_SUBURL,
true,
"inurl:dog",
"Matches all documents that have the word dog in their url, like "
"http://www.mysite.com/dog/food.html. However will not match "
"http://www.mysite.com/dogfood.html because it is not an "
"individual word. It must be delineated by punctuation.",
NULL,
0 },
{"suburl",
FIELD_SUBURL,
true,
"suburl:dog",
"Same as inurl.",
NULL,
0},
{"intitle",
FIELD_TITLE,
false,
"title:cat",
"Matches all the documents that have the word cat in their "
"title.",
NULL,
0 },
{"intitle",
FIELD_TITLE,
false,
"title:\"cat food\"",
"Matches all the documents that have the phrase \"cat food\" "
"in their title.",
NULL,
QTF_DUP },
{"title",
FIELD_TITLE,
false,
"title:cat",
"Same as intitle:",
NULL,
0},
//{"isclean", FIELD_ISCLEAN, true,"Matches all pages that are deemed non-offensive and safe for children."},
{"gbinrss",
FIELD_GBRSS,
true,
"gbinrss:1",
"Matches all documents that are in RSS feeds. Likewise, use "
"<i>gbinrss:0</i> to match all documents that are NOT in RSS feeds.",
NULL,
0},
{"type",
FIELD_TYPE,
false,
"type:pdf",
"Matches all documents that are PDFs. Other possible types include "
"<i>html, text, xml, pdf, doc, xls, ppt, ps, css, json, status.</i> "
"<i>status</i> matches special documents that are stored every time "
"a url is spidered so you can see all the spider attempts and when "
"they occurred as well as the outcome.",
NULL,
0},
{"filetype",
FIELD_TYPE,
false,
"filetype:pdf",
"Same as type: above.",
NULL,
0},
{"gbisadult",
FIELD_GENERIC,
false,
"gbisadult:1",
"Matches all documents that have been detected as adult documents "
"and may be unsuitable for children. Likewise, use "
"<i>gbisadult:0</i> to match all documents that were NOT detected "
"as adult documents.",
NULL,
0},
{"gbimage",
FIELD_URL,
false,
"gbimage:site.com/image.jpg",
"Matches all documents that contain the specified image.",
NULL,
0},
{"gbhasthumbnail",
FIELD_GENERIC,
false,
"gbhasthumbnail:1",
"Matches all documents for which Gigablast detected a thumbnail. "
"Likewise use <i>gbhasthumbnail:0</i> to match all documents that "
"do not have thumbnails.",
NULL,
0},
{"gbtag*",
FIELD_TAG,
false,
"gbtag*",
"Matches all documents whose tag named * have the specified value "
"in the tagdb entry for the url. Example: gbtagsitenuminlinks:2 "
"matches all documents that have 2 qualified "
"inlinks pointing to their site "
"based on the tagdb record. You can also provide your own "
"tags in addition to the tags already present. See the <i>tagdb</i> "
"menu for more information.",
NULL,
0},
{"gbzipcode",
FIELD_ZIP,
false,
"gbzip:90210",
"Matches all documents that have the specified zip code "
"in their meta zip code tag.",
NULL,
0},
{"gbcharset",
FIELD_CHARSET,
false,
"gbcharset:utf-8",
"Matches all documents originally in the Utf-8 charset. "
"Available character sets are listed in the <i>iana_charset.cpp</i> "
"file in the open source distribution. There are a lot. Some "
"more popular ones are: <i>us, latin1, iso-8859-1, csascii, ascii, "
"latin2, latin3, latin4, greek, shift_jis.",
NULL,
0},
// this just complicates things for now, so comment out
//{"urlhash",FIELD_URLHASH, false,""},
//{"urlhashdiv10",FIELD_URLHASHDIV10, false,""},
//{"urlhashdiv100",FIELD_URLHASHDIV100, false,""},
{"gblang",
FIELD_GBLANG,
false,
"gblang:de",
"Matches all documents in german. "
"The supported language abbreviations "
"are at the bottom of the <a href=/admin/filters>url filters</a> "
"page. Some more "
"common ones are <i>gblang:en, gblang:es, gblang:fr, "
// need quotes for this one!!
"gblang:\"zh_cn\"</i> (note the quotes for zh_cn!).",
NULL,
0},
//{"gbquality",FIELD_GBQUALITY,true,""},
//{"gblinktextin",FIELD_LINKTEXTIN,true,""},
//{"gblinktextout",FIELD_LINKTEXTOUT,true,""},
//{"gbkeyword",FIELD_KEYWORD,true,""},
//{"gbcharset", FIELD_CHARSET, false,""},
{"gbpathdepth",
FIELD_GBOTHER,
false,
"gbpathdepth:3",
"Matches all documents whose url has 3 path components to it like "
"http://somedomain.com/dir1/dir2/dir3/foo.html",
NULL,
0},
{"gbhopcount",
FIELD_GBOTHER,
false,
"gbhopcount:2",
"Matches all documents that are a minimum of two link hops away "
"from a root url.",
NULL,
0},
{"gbhasfilename",
FIELD_GBOTHER,
false,
"gbhasfilename:1",
"Matches all documents whose url ends in a filename like "
"<i>http://somedomain.com/dir1/myfile</i> and not "
"<i>http://somedomain.com/dir1/dir2/</i>. Likewise, use "
"<i>gbhasfilename:0</i> to match all the documents that do not "
"have a filename in their url.",
NULL,
0},
{"gbiscgi",
FIELD_GBOTHER,
false,
"gbiscgi:1",
"Matches all documents that have a question mark in their url. "
"Likewise gbiscgi:0 matches all documents that do not.",
NULL,
0},
{"gbhasext",
FIELD_GBOTHER,
false,
"gbhasext:1",
"Matches all documents that have a file extension in their url. "
"Likewise, <i>gbhasext:0</i> matches all documents that do not have "
"a file extension in their url.",
NULL,
0},
{"gbsubmiturl",
FIELD_GBOTHER,
false,
"gbsubmiturl:domain.com/process.php",
"Matches all documents that have a form that submits to the "
"specified url.",
NULL,
0},
// diffbot only
{"gbparenturl",
FIELD_GBPARENTURL,
true,
"gbparenturl:www.xyz.com/abc.html",
"Diffbot only. Match the json urls that "
"were extract from this parent url. Example: "
"gbparenturl:www.gigablast.com/addurl.htm",
NULL,
0},
//
// for content type CT_STATUS documents (Spider status docs)
//
//{"qdom", FIELD_QUOTA, false,""},
//{"qhost", FIELD_QUOTA, false,""},
{"gbsortbyfloat",
FIELD_GBSORTBYFLOAT,
false,
"cameras gbsortbyfloat:price",
"Sort all documents that "
"contain 'camera' by price. <i>price</i> can be a root JSON field or "
"in a meta tag, or in an xml &lt;price&gt; tag.",
"Numeric Field Query Operators",
QTF_BEGINNEWTABLE },
{"gbsortbyfloat",
FIELD_GBSORTBYFLOAT,
false,
"cameras gbsortbyfloat:product.price",
"Sort all documents that "
"contain 'camera' by price. <i>price</i> can be in a JSON document "
"like "
"<i>{ \"product\":{\"price\":1500.00}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;price&gt;1500.00&lt;/price&gt;&lt;/product&gt;"
"</i>",
NULL,
0 },
{"gbrevsortbyfloat",
FIELD_GBREVSORTBYFLOAT,
false,
"cameras gbrevsortbyfloat:product.price",
"Like above example but sorted with highest prices on top.",
NULL,
0 },
{"gbsortby",
FIELD_GBSORTBYFLOAT,
false,
"dog gbsortbyint:gbspiderdate",
"Sort the documents that contain 'dog' by "
"the date they were last spidered, with the newest "
"on top.",
NULL,
QTF_HIDE},
{"gbrevsortby",
FIELD_GBREVSORTBYFLOAT,
false,
"dog gbrevsortbyint:gbspiderdate",
"Sort the documents that contain 'dog' by "
"the date they were last spidered, but with the "
"oldest on top.",
NULL,
QTF_HIDE},
{"gbsortbyint",
FIELD_GBSORTBYINT,
false,
"pilots gbsortbyint:employees",
"Sort all documents that "
"contain 'pilots' by employees. "
"<i>employees</i> can be a root JSON field or "
"in a meta tag, or in an xml &lt;price&gt; tag. The value it "
"contains is interpreted as a 32-bit integer.",
NULL,
0 },
{"gbsortbyint",
FIELD_GBSORTBYINT,
false,
"gbsortbyint:gbspiderdate",
"Sort all documents by the date they were spidered/downloaded.",
NULL,
0},
{"gbsortbyint",
FIELD_GBSORTBYINT,
false,
"gbsortbyint:company.employees",
"Sort all documents by employees. Documents can contain "
"<i>employees</i> in a JSON document "
"like "
"<i>{ \"product\":{\"price\":1500.00}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;price&gt;1500.00&lt;/price&gt;&lt;/product&gt;"
"</i>",
NULL,
0 },
{"gbsortbyint",
FIELD_GBSORTBYINT,
false,
"gbsortbyint:gbsitenuminlinks",
"Sort all documents by the number of distinct inlinks the "
"document's site has.",
NULL,
0 },
{"gbrevsortbyint",
FIELD_GBREVSORTBYINT,
false,
"gbrevsortbyint:gbspiderdate",
"Sort all documents by the date they were spidered/downloaded "
"but with the oldest on top.",
NULL,
0},
// gbmin:price:1.23
{"gbminfloat",
FIELD_GBNUMBERMIN,
false,
"cameras gbminfloat:price:109.99",
"Matches all documents that "
"contain 'camera' or 'cameras' and have a price of at least 109.99. "
"<i>price</i> can be a root JSON field or "
"in a meta tag name <i>price</i>, or in an xml &lt;price&gt; tag.",
NULL,
0 },
{"gbminfloat",
FIELD_GBNUMBERMIN,
false,
"cameras gbminfloat:product.price:109.99",
"Matches all documents that "
"contain 'camera' or 'cameras' and have a price of at least 109.99 "
"in a JSON document like "
"<i>{ \"product\":{\"price\":1500.00}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;price&gt;1500.00&lt;/price&gt;&lt;/product&gt;"
"</i>",
NULL,
0 },
// alias we need to bury
{"gbmin",
FIELD_GBNUMBERMIN,
false,
"",
"",
NULL,
QTF_HIDE},
{"gbmaxfloat",
FIELD_GBNUMBERMAX,
false,
"cameras gbmaxfloat:price:109.99",
"Like the gbminfloat examples above, but is an upper bound.",
NULL,
0 },
{"gbequalfloat",
FIELD_GBNUMBEREQUALFLOAT,
false,
"gbequalfloat:product.price:1.23",
"Similar to gbminfloat and gbmaxfloat but is an equality constraint.",
NULL,
0 },
{"gbmax",
FIELD_GBNUMBERMAX,
false,
"",
"",
NULL,
QTF_HIDE},
{"gbminint",
FIELD_GBNUMBERMININT,
false,
"gbminint:gbspiderdate:1391749680",
"Matches all documents with a spider timestamp of at least "
"1391749680. Use this as opposed th gbminfloat when you need "
"32 bits of integer precision.",
NULL,
0},
{"gbmaxint",
FIELD_GBNUMBERMAXINT,
false,
"gbmaxint:company.employees:20",
"Matches all companies with 20 or less employees "
"in a JSON document like "
"<i>{ \"company\":{\"employees\":13}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;company&gt;&lt;employees&gt;13&lt;/employees&gt;"
"&lt;/company&gt;"
"</i>",
NULL,
0},
{"gbequalint",
FIELD_GBNUMBEREQUALINT,
false,
"gbequalint:company.employees:13",
"Similar to gbminint and gbmaxint but is an equality constraint.",
NULL,
0},
{"gbdocspiderdate",
FIELD_GENERIC,
false,
"gbspiderdate:1400081479",
"Matches documents that have "
"that spider date timestamp (UTC). Does not include the "
"special spider status documents. This is the time the document "
"completed downloading.",
"Date Related Query Operators",
QTF_BEGINNEWTABLE},
{"gbspiderdate",
FIELD_GENERIC,
false,
"gbspiderdate:1400081479",
"Like above, but DOES include the special spider status documents.",
NULL,
0},
{"gbdocindexdate",
FIELD_GENERIC,
false,
"gbdocindexdate:1400081479",
"Like above, but is the time the document was last indexed. "
"This time is "
"slightly greater than or equal to the spider date. Does not "
"include the special spider status documents.",
NULL,
0},
{"gbindexdate",
FIELD_GENERIC,
false,
"gbindexdate:1400081479",
"Like above, but it does include the special spider status "
"documents.",
NULL,
0},
// {"gbreplyspiderdate",FIELD_GENERIC,false,
// "Example: gbspiderdate:1400081479 will return spider log "
// "results that have "
// "that spider date timestamp (UTC)"},
{"gbfacetstr",
FIELD_GBFACETSTR,
false,
"gbfacetstr:color",
"Returns facets in "
"the search results "
"by their color field.",
"Facet Related Query Operators",
QTF_BEGINNEWTABLE},
{"gbfacetstr",
FIELD_GBFACETSTR,
false,
"gbfacetstr:product.color",
"Returns facets in "
"of the color field in a JSON document like "
"<i>{ \"product\":{\"color\":\"red\"}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;color&gt;red&lt;/price&gt;&lt;/product&gt;"
"</i>",
NULL,
0},
{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:product.cores",
"Returns facets in "
"of the <i>cores</i> field in a JSON document like "
"<i>{ \"product\":{\"cores\":10}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;cores&gt;10&lt;/price&gt;&lt;/product&gt;"
"</i>",
NULL,
0},
{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:gbhopcount",
"Returns facets in "
"of the <i>gbhopcount</i> field over the documents so you can "
"search the distribution of hopcounts over the index.",
NULL,
0},
{"gbfacetint", FIELD_GBFACETINT, false,
"gbfacetint:size,0-10,10-20,30-100,100-200,200-1000,1000-10000",
"Returns facets in "
"of the <i>size</i> field (either in json, field or a meta tag) "
"and cluster the results into the specified ranges.",
NULL,
0},
{"gbfacetfloat", FIELD_GBFACETFLOAT, false,
"gbfacetfloat:product.weight",
"Returns facets "
"of the <i>weight</i> field in a JSON document like "
"<i>{ \"product\":{\"weight\":1.45}} "
"</i> or, alternatively, an XML document like <i>"
"&lt;product&gt;&lt;weight&gt;1.45&lt;/price&gt;&lt;/product&gt;"
"</i>",
NULL,
0},
{"gbfacetfloat", FIELD_GBFACETFLOAT, false,
"gbfacetfloat:product.price,0-1.5,1.5-5,5.0-20,20-100.0",
"Similar to above but cluster the pricess into the specified ranges.",
NULL,
0},
{"gbcountry",
FIELD_GBCOUNTRY,
false,
"gbcountry:us",
"Matches documents determined by Gigablast to be from the United "
"States. See the country abbreviations in the CountryCode.cpp "
"open source distribution. Some more popular examples include: "
"de, fr, uk, ca, cn.",
NULL,
0} ,
// mdw
{"gbpermalink",
FIELD_GBPERMALINK,
false,
"gbpermalink:1",
"Matches documents that are permalinks. Use <i>gbpermalink:0</i> "
"to match documents that are NOT permalinks.",
NULL,
0},
{"gbdocid",
FIELD_GBDOCID,
false,
"gbdocid:123456",
"Matches the document with the docid 123456",
NULL,
0},
//
// spider status docs queries
//
{"gbstatus",
FIELD_GENERIC,
false,
"gbstatus:0",
"Matches all special spider status documents that spidered "
"their url successfully. Replace <i>0</i> with other numeric error "
"codes to get the other outcomes.",
"Spider Status Documents", // title
QTF_BEGINNEWTABLE},
{"gbstatusmsg",
FIELD_GENERIC,
false,
"gbstatusmsg:tcp",
"Matches all special spider status documents that had a status "
"message containing the word <i>tcp</i> like in "
"<i>TCP Timed Out</i>. Similarly, gbstatus:success, "
"gbstatus:\"robots.txt\" are other possibilities.",
NULL,
0},
{"url2",
FIELD_URL,
true,
"url2:www.abc.com/page.html",
"Matches the <i>Spider Status</i> documents for the specified url. "
"These special documents "
"let you know exactly when the url was attempted to be "
"spidered and the outcome.",
NULL,
0 },
{"site2",
FIELD_SITE,
true,
"site2:mysite.com",
"Matches all the special spider status documents on the "
"mysite.com domain.",
NULL,
0 },
{"ip2",
FIELD_IP,
true,
"ip2:1.2.3.4",
"Matches all the special spider status "
"documents whose IP is 1.2.3.4.",
NULL,
0 },
{"inurl2",
FIELD_SUBURL2,
true,
"inurl2:dog",
"Matches all the special spider status "
"documents that have the word dog in their url, like "
"http://www.mysite.com/dog/food.html. However will not match "
"http://www.mysite.com/dogfood.html because it is not an "
"individual word. It must be delineated by punctuation.",
NULL,
0 },
{"gbpathdepth2",
FIELD_GBOTHER,
false,
"gbpathdepth2:2",
"Similar to gbpathdepth: described above but for special "
"spider status documents.",
NULL,
0},
{"gbhopcount2",
FIELD_GBOTHER,
false,
"gbhopcount2:3",
"Similar to gbhopcount: described above but for special "
"spider status documents.",
NULL,
0},
{"gbhasfilename2",
FIELD_GBOTHER,
false,
"gbhasfilename2:1",
"Similar to gbhasfilename: described above but for special "
"spider status documents.",
NULL,
0},
{"gbiscgi2",
FIELD_GBOTHER,
false,
"gbiscgi2:1",
"Similar to gbiscgi: described above but for special "
"spider status documents.",
NULL,
0},
{"gbhasext2",
FIELD_GBOTHER,
false,
"gbhasext2:1",
"Similar to gbhasext: described above but for special "
"spider status documents.",
NULL,
0},
// they don't need to know about this
{"gbad",FIELD_GBAD,false,"","",NULL,QTF_HIDE},
{"gbtagvector", FIELD_GBTAGVECTOR, false,"","",NULL,QTF_HIDE},
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,"","",NULL,QTF_HIDE},
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,"","",NULL,QTF_HIDE},
{"gbcontenthash", FIELD_GBCONTENTHASH, false,"","",NULL,QTF_HIDE},
{"gbduphash" ,FIELD_GBOTHER,false,"","",NULL,QTF_HIDE},
// call it field url to hash all up to the first space
{"gbsitetemplate" ,FIELD_URL,false,"","",NULL,QTF_HIDE}
//{"gbcsenum",FIELD_GBCSENUM,false,""},
//{"gboutlinkedtitle" ,FIELD_GBOTHER,false,"gboutlinkedtitle:0 and gboutlinkedtitle:1 matches events whose title is not in and in a hyperlink, respectively."},
//{"gbisaggregator" ,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."},
//{"gbdeduped" ,FIELD_GBOTHER,false,""},
//{"gbinjected", FIELD_GBOTHER,false,"Was the document injected?."},
};
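// Illustrative sketch only (not used by the code): the operators documented
// in g_fields[] above can be combined in a single query string. For example,
// assuming a collection of JSON product documents like the ones shown in the
// descriptions above, a query such as
//
//   cameras gbminfloat:product.price:100 gbmaxfloat:product.price:500
//   gbsortbyfloat:product.price gbfacetstr:product.color
//
// would restrict results to documents containing 'cameras' priced between
// 100 and 500, sort them by price, and return facets over the color field.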
void resetQuery ( ) {
s_table.reset();
}
long getNumFieldCodes ( ) {
return (long)sizeof(g_fields) / (long)sizeof(QueryField);
}
static bool initFieldTable(){
if ( ! s_isInitialized ) {
// set up the hash table
if ( ! s_table.set ( 8 , 4 , 255,NULL,0,false,0,"qryfldtbl" ) )
return log("build: Could not init table of "
"query fields.");
// now add in all the query field names
long n = getNumFieldCodes();
for ( long i = 0 ; i < n ; i++ ) {
// skip if dup
//if ( g_fields[i].m_flag & QTF_DUP ) continue;
int64_t h = hash64b ( g_fields[i].text );
// if already in there it is a dup
if ( s_table.isInTable ( &h ) ) continue;
// store the field's index+1 in the hash table as the score
if ( ! s_table.addTerm ( &h, i+1 ) ) return false;
}
s_isInitialized = true;
}
return true;
}
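// Note on the resulting mapping: s_table maps hash64b(field name) to the
// field's index in g_fields[] plus one, so a getScore() of 0 below means
// "not a recognized field" and falls back to FIELD_GENERIC.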
char getFieldCode ( char *s , long len , bool *hasColon ) {
// default
if (hasColon) *hasColon = false;
if (!initFieldTable()) return FIELD_GENERIC;
int64_t h = hash64Lower_a(s, len );//>> 1) ;
long i = (long) s_table.getScore ( &h ) ;
if (i==0) return FIELD_GENERIC;
//if (hasColon) *hasColon = g_fields[i-1].hasColon ;
return g_fields[i-1].field;
}
char getFieldCode2 ( char *s , long len , bool *hasColon ) {
// default
if (hasColon) *hasColon = false;
if (!initFieldTable()) return FIELD_GENERIC;
// subtract the colon for matching
if ( s[len-1]==':') len--;
int64_t h = hash64 (s , len , 0LL );
long i = (long) s_table.getScore ( &h ) ;
if (i==0) return FIELD_GENERIC;
//if (hasColon) *hasColon = g_fields[i-1].hasColon ;
return g_fields[i-1].field;
}
char getFieldCode3 ( int64_t h64 ) {
if (!initFieldTable()) return FIELD_GENERIC;
// h64 is a precomputed hash of the field name
long i = (long) s_table.getScore ( &h64 ) ;
if (i==0) return FIELD_GENERIC;
//if (hasColon) *hasColon = g_fields[i-1].hasColon ;
return g_fields[i-1].field;
}
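// Minimal usage sketch (illustrative only; the return values assume the
// hashes used here agree with the hash64b() used to build s_table above):
//
//   char tok[] = "gbdocid:123456";
//   char fc  = getFieldCode  ( tok , 7 , NULL ); // name only, excludes ':'
//   char fc2 = getFieldCode2 ( tok , 8 , NULL ); // length may include ':'
//   // both are expected to yield FIELD_GBDOCID; an unknown field name
//   // yields FIELD_GENERIC. getFieldCode3() performs the same lookup but
//   // takes a precomputed 64-bit hash of the field name.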
// guaranteed to be punctuation
bool Query::isConnection ( char *s , long len ) {
if ( len == 1 ) {
switch (*s) {
// . only allow apostrophe if it's NOT a 's
// . so contractions are ok, and names too
case '\'':
// no, i think we should require it. google seems to,
// and msn and yahoo do. 'john's room -"john's" gives
// no results on yahoo and msn.
return true;
// unreachable now that we always return true above:
//if ( *(s+1) !='s' ) return true;
//return false;
case ':': return true;
case '-': return true;
case '.': return true;
case '@': return true;
case '#': return true;
case '/': return true;
case '_': return true;
case '&': return true;
case '=': return true;
case '\\': return true;
default: return false;
}
return false;
}
//if ( len == 3 && s[0]==' ' && s[1]=='&' && s[2]==' ' ) return true;
if ( len == 3 && s[0]==':' && s[1]=='/' && s[2]=='/' ) return true;
return false;
}
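// Usage sketch (illustrative only, assuming a constructed Query q): this is
// what lets punctuation glue adjacent words into a single term.
//
//   char dash[] = "-" , proto[] = "://" , comma[] = ", ";
//   q.isConnection ( dash  , 1 ); // true: "cd-rom" stays one term
//   q.isConnection ( proto , 3 ); // true: "http://xyz.com" stays glued
//   q.isConnection ( comma , 2 ); // false: comma plus space separates words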
void Query::printQueryTerms(){
for (long i=0;i<m_numTerms;i++){
char c = getTermSign(i);
char tt[512];
long ttlen = getTermLen(i);
if ( ttlen > 254 ) ttlen = 254;
if ( ttlen < 0 ) ttlen = 0;
// this is utf8
memcpy ( tt , getTerm(i) , ttlen );
tt[ttlen]='\0';
if ( c == '\0' ) c = ' ';
logf(LOG_DEBUG, "query: Query Term #%ld "
"phr=%li termId=%llu rawTermId=%llu"
" sign=%c "
"ebit=0x%0llx "
"impBits=0x%0llx "
"hc=%li "
"component=%li "
"otermLen=%li "
"term=%s ",
i,
(long)isPhrase (i) ,
getTermId (i) ,
getRawTermId (i) ,
c ,
(int64_t)m_qterms[i].m_explicitBit ,
(int64_t)m_qterms[i].m_implicitBits ,
(long) m_qterms[i].m_hardCount ,
m_componentCodes[i],
getTermLen(i),
tt );
}
}
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
////////// ONLY BOOLEAN STUFF BELOW HERE /////////////
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
bool Query::testBoolean( unsigned char *bits ,long vecSize){//qvec_t bitmask){
if (!m_isBoolean) return false;
Expression *e = &m_expressions [ 0 ];
// find top-level expression
//while (e->m_parent && e != e->m_parent) e = e->m_parent;
return e->isTruth(bits,vecSize);//, bitmask);
}
void Query::printBooleanTree(){
if (!m_isBoolean) return;
//Expression *e = &m_expressions [ 0 ];
// find top-level expression
//while (e->m_parent && e != e->m_parent) e = e->m_parent;
//SafeBuf sbuf(1024,"botree");
//e->print(&sbuf);
//logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
}
/*
// . also sets the m_underNOT member of each QueryTerm, too!!
// . returns false and sets g_errno on error, true otherwise
bool Query::setBooleanOperands ( ) {
// we're done if we're not boolean
if ( ! m_isBoolean ) return true;
if ( m_truncated ) {
g_errno = ETOOMANYOPERANDS;
return log("query: Maximum number of bool operands "
"exceeded (%ld).",m_numTerms);
}
// set the QueryWord::m_opBit member of each query word.
// so if you have a query like 'A B OR C' then you need
// to have both A and B if you don't have C. so every word
// unless its an operator needs its own bit. quoted phrases
// may present a problem down the road we'll have to deal with.
long opNum = 0;
for ( long i = 0 ; i < m_numWords ; i++ ) {
// skip if field, opcode, punct. etc.
if ( m_qwords[i].m_ignoreWord ) continue;
// assign it a # i guess
m_qwords[i].m_opNum = opNum++;
}
// alloc the mem if we need to (mdw left off here)
//long need = (m_numWords/3) * sizeof(Expression);
// illegitimate bool expressions breach the buffer
long need = (m_numWords) * sizeof(Expression);
// sanity check
if ( m_expressions || m_expressionsAllocSize ) {
char *xx = NULL; *xx = 0; }
// point m_qwords to our generic buffer if it will fit
if ( m_gnext + need < m_gbuf + GBUF_SIZE ) {
m_expressions = (Expression *)m_gnext;
m_gnext += need;
}
// otherwise, we must allocate memory for it
else {
m_expressions = (Expression *)mmalloc ( need , "Query3" );
if ( ! m_expressions ) return log("query: Could not allocate "
"expressions for query.");
m_expressionsAllocSize = need;
}
// otherwise, we need to set the boolean Expression classes now
// so we can determine which terms are UNDER the influence of
// NOT operators so IndexReadInfo.cpp can read in the WHOLE termlist
// for those terms. (like it would if they had a '-' m_termSign)
Expression *e = &m_expressions [ 0 ];
m_numExpressions = 1;
// . set the expression recursively
// . just setting this will not set the m_hasNOT members of each
// QueryTerm
long status = e->add ( 0 , // first word #
m_numWords , // last word #
this , // array of QueryWords
0 ,// level
false ); // has NOT?
if ( status < 0 ) {
g_errno = ETOOMANYOPERANDS;
return log("query: Maximum number of bool operands "
"(%li) exceeded.",(long)MAX_OPERANDS);
}
while (e->m_parent) {
if (e == e->m_parent) {
g_errno = EBADREQUEST;
return log(LOG_WARN, "query: expression is own parent: "
"%s", m_orig);
}
e = e->m_parent;
}
//log(LOG_DEBUG, "query: set %li operands",
// m_numOperands);
if (g_conf.m_logDebugQuery) {
SafeBuf sbuf(1024);
e->print(&sbuf);
log(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
}
// . get all the terms that are UNDER a NOT operator in some fashion
// . these bits are 1-1 with m_qterms[]
*/
/*
qvec_t notBits = e->getNOTBits( false );
for ( long i = 0 ; i < m_numTerms ; i++ ) {
if ( m_qterms[i].m_explicitBit & notBits )
m_qterms[i].m_underNOT = true;
else
m_qterms[i].m_underNOT = false;
}
*/
/*
return true;
}
*/
/*
// . returns -1 on bad query error
// . returns word AFTER the last word in our operand
long Operand::set ( long a , long b , QueryWord *qwords , long level ,
bool underNOT ) {
// clear these
//m_termBits = 0;
memset(m_opBits,0,MAX_OVEC_SIZE);
m_hasNOT = false;
//m_hardRequiredBits = 0;
// . parse out the operands and OR in their term bits
// . the boy AND girl --> (the AND boy) AND girl
// . "the boy toy" AND girl --> "the boy" AND "boy toy" AND girl
// . cd-rom AND buy --> "cd-rom" AND buy
// . phraseSign will not be 0 if its important (in quotes, cd-rom,...)
for ( long i = a ; i < b ; i++ ) {
// get the QUERY word
QueryWord *qw = &qwords[i];
// set the parenthetical level of the word
qw->m_level = level;
// set this
//qw->m_underNOT = underNOT;
// skip punct
if ( ! qw->isAlphaWord() ) {
// if it is a parens, bail!
if ( qw->m_opcode == OP_LEFTPAREN ) return i ;
if ( qw->m_opcode == OP_RIGHTPAREN ) return i ;
// otherwise, skip this punct and get next word
else continue;
}
// bail if op code, return PUNCT word # before it
if ( qw->m_opcode ) return i ;
if ( qw->m_wordSign == '-' || qw->m_phraseSign == '-'){
if (i == a) {
m_hasNOT = true;
}
else {
if (!m_hasNOT) return i;
}
}
else if (i>a && m_hasNOT) return i;
// . does it have an unsigned phrase? or in phrase term bits
// . might have a phrase that's not a QueryTerm because
// query is too long
if ( qw->m_phraseId && qw->m_queryPhraseTerm &&
qw->m_phraseSign ) {
//qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
//m_termBits |= e;
long byte = qw->m_opNum / 8;
long mask = 1<<(qw->m_opNum % 8);
if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
}
// why would it be ignored? oh... if like cd-rom or in quotes
if ( qw->m_ignoreWord ) continue;
// . OR in the word term bits
// . might be a word that's not a QueryTerm because
// query is too long
if ( qw->m_queryWordTerm ) {
//qvec_t e = qw->m_queryWordTerm->m_explicitBit;
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
//m_termBits |= e;
long byte = qw->m_opNum / 8;
long mask = 1<<(qw->m_opNum % 8);
if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
}
}
return b;
}
*/
// . returns -1 on bad query error
// . returns next word to parse (after expression) on success
// . "*globalNumOperands" is how many expressions/operands are being used
// in the global "expressions" and "operands" array
// . new: organize query into sum of products normal form, ie:
// . (a) OR (b AND c AND d) OR (e AND f)
/*
unsigned char precedence[] = {
0, // term
4, // OR
3, // AND
2, // NOT
1, // LEFTP
1, // RIGHTP
3, // UOR
5, // PIPE
};
*/
//#define TYPE_OPERAND 1
//#define TYPE_OPCODE 2
//#define TYPE_EXPRESSION 3
// return false and set g_errno on error
// returns how many words expression was
bool Expression::addExpression (long start,
long end,
class Query *q,
long level
) {
if ( level >= MAX_EXPRESSIONS ) {
g_errno = ETOOMANYPARENS;
return false;
}
// the # of the first alnumpunct word in the expression
m_expressionStartWord = start;
// and the last one
//m_end = end;
//m_hasNOT = hasNOT;
m_q = q;
//m_cc = 0;
long i = m_expressionStartWord;
// try to fix
// type:html AND ((site:xyz.com OR site:abc.com))
// query where there are double parens
m_hadOpCode = false;
// "start" is the current alnumpunct word we are parsing out
for ( ; i<end ; i++ ) {
QueryWord *qwords = q->m_qwords;
QueryWord * qw = &qwords[i];
// set this
//qw->m_underNOT = underNOT;
// set leaf node if not an opcode like "AND" and not punct.
if ( ! qw->m_opcode && qw->isAlphaWord()){
//m_opSlots[m_cc] = i;
//m_opTypes[m_cc] = TYPE_OPERAND;
//qw->m_opBitNum = m_cc;
continue;//goto endExpr; mdw
}
if (qw->m_opcode == OP_NOT){
//hasNOT = !hasNOT;
//underNOT = hasNOT;
continue;
}
else if (qw->m_opcode == OP_LEFTPAREN){
// this is expression
// . it should advance "i" to end of expression
// point to next...
q->m_numExpressions++;
// make a new one:
Expression *e=&q->m_expressions[q->m_numExpressions-1];
// now set it
if ( ! e->addExpression ( i+1, // skip over (
end ,
q ,
level + 1) )
return false;
// skip over it. pt to ')'
i += e->m_numWordsInExpression;
qw->m_expressionPtr = e;
//m_opSlots[m_cc] = (long)e;
//m_opTypes[m_cc] = TYPE_EXPRESSION;
//qw->m_opBitNum = m_cc;
}
else if (qw->m_opcode == OP_RIGHTPAREN){
// return size i guess, include )
m_numWordsInExpression = i - m_expressionStartWord+1;
return true;
}
else if (qw->m_opcode) {
// add that mdw
//m_opSlots[m_cc] = qw->m_opcode;
//m_opTypes[m_cc] = TYPE_OPCODE;
//qw->m_opBitNum = m_cc;
//m_cc++;
m_hadOpCode = true;
continue;
}
// white space?
continue;
}
m_numWordsInExpression = i - m_expressionStartWord;
return true;
}
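// Illustrative walkthrough (a sketch, not extra parsing logic): for the
// double-paren query mentioned above,
//
//   type:html AND ((site:xyz.com OR site:abc.com))
//
// m_expressions[0] covers the whole query; on the first '(' it recurses
// into m_expressions[1], which recurses into m_expressions[2] for the
// inner parens. Each '(' word gets its m_expressionPtr set to the child
// expression so isTruth() below can descend into it, and
// m_numWordsInExpression lets the parent skip the words the child consumed.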
// each bit is 1-1 with the explicit terms in the boolean query
bool Query::matchesBoolQuery ( unsigned char *bitVec , long vecSize ) {
return m_expressions[0].isTruth ( bitVec , vecSize );
}
bool isBitNumSet ( long opBitNum, unsigned char *bitVec, long vecSize ) {
long byte = opBitNum / 8;
long mask = 1<<(opBitNum % 8);
if ( byte >= vecSize ) { char *xx=NULL;*xx=0; }
return bitVec[byte] & mask;
}
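// Worked example: for opBitNum = 10 the byte index is 10/8 = 1 and the mask
// is 1 << (10 % 8) = 0x04, so this tests (bitVec[1] & 0x04). Assuming
// Posdb.cpp assigned m_bitNum values 0, 1 and 2 to a query's three terms
// (see the comment in isTruth() below), a docid containing all three would
// have the low three bits of bitVec[0] set (0x07).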
// . "bits" are 1-1 with the query words in Query::m_qwords[] array
// including ignored words and spaces i guess since Expression::add()
// seems to do that.
bool Expression::isTruth ( unsigned char *bitVec ,long vecSize ) {
//
// operand1 operand2 operator1 operand3 operator2 ....
//
// result: -1 means unknown at this point
long result = -1;
char prevOpCode = 0;
long prevResult = -1; // -1 means no prev result yet
// result of current operand
long opResult = -1;
long i = m_expressionStartWord;
long iend = i + m_numWordsInExpression;
bool hasNot = false;
for ( ; i < iend ; i++ ) {
QueryWord *qw = &m_q->m_qwords[i];
// ignore parentheses, they aren't real opcodes.
// we just want OP_AND/OP_OR/OP_NOT
long opcode = qw->m_opcode;
if ( opcode != OP_AND &&
opcode != OP_OR &&
opcode != OP_NOT )
opcode = 0;
if ( opcode == OP_NOT ) {
hasNot = true;
continue;
}
// so operands are expressions as well
Expression *e = (Expression *)qw->m_expressionPtr;
if ( e ) {
// save prev one. -1 means no prev.
prevResult = opResult;
// set new one
opResult = e->isTruth ( bitVec , vecSize );
// skip over that expression. point to ')'
i += e->m_numWordsInExpression;
// flip?
if ( hasNot ) {
if ( opResult == 1 ) opResult = 0;
else opResult = 1;
hasNot = false;
}
}
if ( opcode && ! e ) {
prevOpCode = opcode;//m_opSlots[i];
continue;
}
// simple operand
if ( ! opcode && ! e ) {
// for regular word operands
// ignore it like a space?
if ( qw->m_ignoreWord ) continue;
// ignore gbsortby:offerprice in bool queries
// at least for evaluating them
if ( qw->m_ignoreWordInBoolQuery ) continue;
// save old one
prevResult = opResult;
// convert word to term #
QueryTerm *qt = qw->m_queryWordTerm;
// fix title:"notre dame" AND NOT irish
if ( ! qt ) qt = qw->m_queryPhraseTerm;
if ( ! qt ) continue;
// . m_bitNum is set in Posdb.cpp when it sets its
// QueryTermInfo array
// . it is basically the query term #
// . see if that bit is set in this docid's vec
opResult = isBitNumSet ( qt->m_bitNum,bitVec,vecSize );
// flip?
if ( hasNot ) {
if ( opResult == 1 ) opResult = 0;
else opResult = 1;
hasNot = false;
}
}
// need two to tango. i.e. (true OR false)
if ( prevResult == -1 ) continue;
// if this is not the first time... we got two
if ( prevOpCode == OP_AND ) {
// if the first operation we encounter is A AND B then
// default result to on. only allow an AND operation
// to turn it off.
if ( result == -1 ) result = true;
if ( ! prevResult ) result = false;
if ( ! opResult ) result = false;
}
else if ( prevOpCode == OP_OR ) {
// if the first operation we encounter is A OR B then
// default result to off
if ( result == -1 ) result = false;
if ( prevResult ) result = true;
if ( opResult ) result = true;
}
}
// if we never set result, then it was probably a single
// argument expression like something in double parens like
// ((site:xyz.com OR site:abc.com)). so set it to value of
// first operand, opResult.
if ( prevOpCode == 0 && result == -1 ) result = opResult;
if ( result == -1 ) return true;
if ( result == 0 ) return false;
return true;
}
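// Worked example (a sketch; assumes the two terms received m_bitNum 0 and 1
// from Posdb.cpp): for the boolean query
//
//   jaguar AND NOT car
//
// a docid containing both terms has its low two bits set (bitVec[0] = 0x03):
// 'jaguar' evaluates to 1, OP_NOT flips 'car' from 1 to 0, and the pending
// OP_AND yields false, so the docid is rejected. A docid with only 'jaguar'
// (bitVec[0] = 0x01) evaluates to 1 AND (NOT 0) and is accepted.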
/*
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
// . hasNOT is true if there's a NOT just to the left of this WHOLE expression
// outside the parens
qvec_t Expression::getNOTBits ( bool hasNOT ) {
qvec_t notBits = 0;
// for ( long i = 0 ; i < m_numOperands ; i++ ) {
// // get value of the ith operand, be it plain or an expression
// if ( m_operands[i] ) {
// if ( m_hasNOT[i] || hasNOT )
// notBits |= m_operands[i]->m_termBits;
// }
// else
// notBits |= m_expressions[i]->getNOTBits (m_hasNOT[i]);
// }
// success, all operand pairs were true
return notBits;
}
*/
// print boolean expression for debug purposes
void Expression::print(SafeBuf *sbuf) {
/*
if (m_hasNOT) sbuf->safePrintf("NOT ");
if (m_operand){
m_operand->print(sbuf);
return;
}
sbuf->safePrintf("(");
for (long i=0; i < m_numChildren ; i++) {
m_children[i]->print(sbuf);
if (i >= m_numChildren-1) break;
switch (m_opcode) {
case OP_OR: sbuf->safePrintf(" OR " ); break;
case OP_AND: sbuf->safePrintf(" AND " ); break;
case OP_UOR: sbuf->safePrintf(" UOR " ); break;
case OP_PIPE: sbuf->safePrintf(" PIPE "); break;
}
}
sbuf->safePrintf(")");
*/
}
/*
void Operand::print(SafeBuf *sbuf) {
// long shift = 0;
// while (m_termBits >> shift) shift++;
// sbuf->safePrintf("%i", 1<<(shift-1));
if (m_hasNOT) sbuf->safePrintf("NOT 0x%llx",*(int64_t *)m_opBits);
else sbuf->safePrintf("0x%llx", *(int64_t *)m_opBits);
}
*/
// if any one query term is split, msg3a has to split the query
bool Query::isSplit() {
for(long i = 0; i < m_numTerms; i++)
if(m_qterms[i].isSplit()) return true;
return false;
}
bool QueryTerm::isSplit() {
if(!m_fieldCode) return true;
if(m_fieldCode == FIELD_QUOTA) return false;
if(m_fieldCode == FIELD_GBTAGVECTOR) return false;
if(m_fieldCode == FIELD_GBGIGABITVECTOR) return false;
if(m_fieldCode == FIELD_GBSAMPLEVECTOR) return false;
if(m_fieldCode == FIELD_GBSECTIONHASH) return false;
if(m_fieldCode == FIELD_GBCONTENTHASH) return false;
return true;
}
// hash of all the query terms
int64_t Query::getQueryHash() {
int64_t qh = 0LL;
for ( long i = 0 ; i < m_numTerms ; i++ )
qh = hash64 ( m_termIds[i] , qh );
return qh;
}
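// Minimal sketch of the chaining above (illustrative only): each term id is
// folded in using the running hash as the seed,
//
//   int64_t qh = 0LL;
//   qh = hash64 ( termIdA , qh );
//   qh = hash64 ( termIdB , qh );
//
// so queries with the same term ids in the same order get the same hash,
// while reordering the terms generally changes it.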