#include "gb-include.h"

#include "Query.h"
//#include "Indexdb.h" // g_indexdb.getTruncationLimit() g_indexdb.getTermId()
#include "Words.h"
#include "Bits.h"
#include "Phrases.h"
#include "Url.h"
#include "Clusterdb.h" // g_clusterdb.getNumGlobalRecs()
#include "StopWords.h" // isQueryStopWord()
#include "Speller.h"
//#include "Thesaurus.h"
#include "Mem.h"
#include "Msg3a.h"
#include "HashTableX.h"
#include "Synonyms.h"
#include "Wiki.h"

// . the C++ constructor just forwards to constructor() so the same
//   initialization can also be invoked explicitly on an existing object
Query::Query ( ) {
	constructor();
}

// . put the object into a known "owns nothing" state, then reset() it
// . the alloc sizes are zeroed BEFORE calling reset() because reset()
//   frees m_expressions/m_qwords whenever their alloc size is non-zero
void Query::constructor ( ) {
	//m_bmap = NULL; m_bitScores = NULL;
	m_qwordsAllocSize      = 0;
	m_expressionsAllocSize = 0;
	m_qwords               = NULL;
	m_expressions          = NULL;
	reset ( );
}

// explicit counterpart of constructor(): releases whatever reset() releases
void Query::destructor ( ) {
	reset();
}

// the C++ destructor releases the same buffers via reset()
Query::~Query ( ) {
	reset ( );
}

// . free the buffers this object allocated and clear all per-query state
// . safe to call repeatedly: the alloc sizes are zeroed after freeing so a
//   second call will not free again
void Query::reset ( ) {

	// release owned memory first; a non-zero alloc size is what marks
	// the corresponding buffer as ours to free
	//if ( m_bmap && m_bmapSize ) // != m_bmbuf )
	//	mfree ( m_bmap , m_bmapSize , "Query1" );
	//if ( m_bitScores && m_bitScoresSize ) // != m_bsbuf )
	//	mfree ( m_bitScores , m_bitScoresSize , "Query2" );
	//m_bmap = NULL; m_bitScores = NULL;
	//m_bmapSize = 0; m_bitScoresSize = 0;
	if ( m_expressionsAllocSize )
		mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
	if ( m_qwordsAllocSize )
		mfree ( m_qwords , m_qwordsAllocSize , "Query4" );
	m_expressions          = NULL;
	m_expressionsAllocSize = 0;
	m_numExpressions       = 0;
	m_qwords               = NULL;
	m_qwordsAllocSize      = 0;

	// no docid restriction until a gbdocid: term says otherwise
	m_docIdRestriction  = 0LL;
	m_groupThatHasDocId = NULL;

	// lengths and counts
	m_bufLen        = 0;
	m_origLen       = 0;
	m_numWords      = 0;
	m_numOperands   = 0;
	m_numTerms      = 0;
	m_synTerm       = 0;
	//m_numIgnored = 0;
	//m_numRequired = -1;
	m_numComponents = 0;

	// rewind the cursor to the start of m_gbuf
	m_gnext = m_gbuf;

	m_hasUOR     = false;
	m_bmapIsSet  = false;
	m_truncated  = false;
	m_hasSynonyms = false;

	// the site: and ip: query terms will disable site clustering & caching
	m_hasPositiveSiteField = false;
	m_hasIpField           = false;
	m_hasUrlField          = false;
	m_hasSubUrlField       = false;
	m_hasIlinkField        = false;
	m_hasGBLangField       = false;
	m_hasGBCountryField    = false;
	m_hasQuotaField        = false;
	m_hasLinksOperator     = false;
}
returns false and sets g_errno on error // . "query" must be NULL terminated // . if boolFlag is 0 we ignore all boolean operators // . if boolFlag is 1 we assume query is boolen // . if boolFlag is 2 we attempt to detect if query is boolean or not // . if "keepAllSingles" is true we do not ignore any single word UNLESS // it is a boolean operator (IGNORE_BOOLOP), fieldname (IGNORE_FIELDNAME) // a punct word (IGNORE_DEFAULT) or part of one field value (IGNORE_DEFAULT) // This is used for term highlighting (Highlight.cpp and Summary.cpp) bool Query::set2 ( char *query , //long queryLen , //char *coll , //long collLen , //char boolFlag , //bool keepAllSingles , // need language for doing synonyms uint8_t langId , char queryExpansion , bool useQueryStopWords ) { //long maxQueryTerms ) { m_langId = langId; m_useQueryStopWords = useQueryStopWords; // fix summary rerank and highlighting. bool keepAllSingles = true; // assume not boolean char boolFlag = 0; // come back up here if we changed our boolean minds top: reset(); if ( ! query ) return true; // set to 256 for synonyms? m_maxQueryTerms = 256; m_queryExpansion = queryExpansion; long queryLen = gbstrlen(query); // override this to 32 at least for now //if ( m_maxQueryTerms < 32 ) m_maxQueryTerms = 32; // save collection info //m_coll = coll; //m_collLen = collLen; // truncate query if too big if ( queryLen >= MAX_QUERY_LEN ) { log("query: Query length of %li must be less than %li. " "Truncating.",queryLen,(long)MAX_QUERY_LEN); queryLen = MAX_QUERY_LEN - 1; m_truncated = true; } // save original query m_origLen = queryLen; memcpy ( m_orig , query , queryLen ); m_orig [ m_origLen ] = '\0'; log(LOG_DEBUG, "query: set called = %s", m_orig); char *q = query; // see if it should be boolean... 
for ( long i = 0 ; boolFlag && i < queryLen ; i++ ) { if ( q[i]=='A' && q[i+1]=='N' && q[i+2]=='D' && (q[i+3]==' ' || q[i+3]=='(') ) boolFlag = 1; if ( q[i]=='O' && q[i+1]=='R' && (q[i+2]==' ' || q[i+2]=='(') ) boolFlag = 1; if ( q[i]=='N' && q[i+1]=='O' && q[i+2]=='T' && (q[i+3]==' ' || q[i+3]=='(') ) boolFlag = 1; } // come back up here if we find no bool operators but had ()'s // top: // reset anything that was allocated... in case we're being // called from below... m_qwords may have been allocated in call // to setQWords() below // NO! this resets m_origLen to 0!!! not to mention other member vars // that were set somewhere above!!! i moved top: label above! //reset(); // convenience ptr char *p = m_buf; char *pend = m_buf + MAX_QUERY_LEN; // . copy query into m_buf // . translate ( and ) to special query operators so Words class // can parse them as their own word to make parsing bool queries ez // for parsing out the boolean operators in setBitScoresBoolean() for ( long i = 0 ; i < queryLen ; i++ ) { // dst buf must be big enough if ( p + 8 >= pend ) { g_errno = EBUFTOOSMALL; return log(LOG_LOGIC,"query: query: query too big."); } // translate ( and ) if ( boolFlag != 0 && query[i] == '(' ) { memcpy ( p , " LeFtP " , 7 ); p += 7; continue; } if ( boolFlag != 0 && query[i] == ')' ) { memcpy ( p , " RiGhP " , 7 ); p += 7; continue; } if ( query[i] == '|' ) { memcpy ( p , " PiiPE " , 7 ); p += 7; continue; } // translate [#a] [#r] [#ap] [#rp] [] [p] to operators if ( query[i] == '[' && is_digit(query[i+1])) { long j = i+2; long val = atol ( &query[i+1] ); while ( is_digit(query[j]) ) j++; char c = query[j]; if ( (c == 'a' || c == 'r') && query[j+1]==']' ) { sprintf ( p , " LeFtB %li %c RiGhB ",val,c); p += gbstrlen(p); i = j + 1; continue; } else if ( (c == 'a' || c == 'r') && query[j+1]=='p' && query[j+2]==']') { sprintf ( p , " LeFtB %li %cp RiGhB ",val,c); p += gbstrlen(p); i = j + 2; continue; } } if ( query[i] == '[' && query[i+1] == ']' ) { sprintf ( p , 
" LeFtB RiGhB "); p += gbstrlen(p); i = i + 1; continue; } if ( query[i] == '[' && query[i+1] == 'p' && query[i+2]==']') { sprintf ( p , " LeFtB RiGhB "); p += gbstrlen(p); i = i + 2; continue; } char *q = &(query[i]); // Skip old buzz permalink keywords if (*q == 'g' && *(q+1) == 'b'){ // do not skip anymore, Msg5e.cpp needs this /* if (*(q+2) == 'p' && *(q+3) == 'e' && *(q+4) == 'r' && *(q+5) == 'm' && *(q+6) == 'a' && *(q+7) == 'l' && *(q+8) == 'i' && *(q+9) == 'n' && *(q+10) == 'k' && *(q+11) == ':' && *(q+12) =='1'){ //i += 12; static bool s_printed = false; if ( ! s_printed ) logf(LOG_DEBUG,"query: skipping " "gbpermalink term for buzz."); if ( ! s_printed ) s_printed = true; continue; } */ if (*(q+2)=='k' && *(q+3)=='e' && *(q+4) == 'y' && *(q+5)=='w' && *(q+6)=='o' && *(q+7) == 'r' && *(q+8) == 'd' && *(q+9) == ':' && *(q+10)=='r' && *(q+11)=='3' && *(q+12)=='6' && *(q+13) == 'p' && *(q+14) == '1'){ //logf(LOG_DEBUG,"query: skipping funky " // "keyword term for buzz."); i += 14; continue; } } // TODO: copy altavista's operators here? & | ! // otherwise, just a plain copy *p = query [i]; p++; } // NULL terminate *p = '\0'; // debug statement //log(LOG_DEBUG,"Query: Got new query=%s",tempBuf); //printf("query: query: Got new query=%s\n",tempBuf); // set length m_bufLen = p - m_buf; Words words; Phrases phrases; // set m_qwords[] array from m_buf if ( ! 
setQWords ( boolFlag , keepAllSingles , words , phrases ) ) return false; //log(LOG_DEBUG, "Query: QWords set"); // did we have any boolean operators char found = 0; char parens = 0; if ( boolFlag >= 1 ) { for ( long i = 0 ; i < m_numWords ; i++ ) { char *w = m_qwords[i].m_word; long wlen = m_qwords[i].m_wordLen; if (wlen==2 &&w[0]=='O'&&w[1]=='R' ) found=1; else if (wlen==3 &&w[0]=='A'&&w[1]=='N'&&w[2]=='D') found=1; else if (wlen==3 &&w[0]=='N'&&w[1]=='O'&&w[2]=='T') found=1; if (wlen==5 &&w[0]=='L' && w[1]=='e' && w[2]=='F' && w[3]=='t' && w[4]=='P' ) parens=1; else if (wlen==5 &&w[0]=='R' && w[1]=='i' && w[2]=='G' && w[3]=='h' && w[4]=='P' ) parens=1; } // if we were told it was a bool query or to auto-detect // and it has no operators, but had parens, re-do so parens // do not get translated to LeFtP or RiGhP if ( boolFlag >= 1 && found == 0 && parens == 1 ) { boolFlag = 0; goto top; } // if no bool operators, it's definitely not a boolean query if ( found == 0 ) boolFlag = 0; } // set m_qterms from m_qwords, always succeeds setQTerms ( words , phrases ); // . now add in compound termlists // . compound query terms replace lists of UOR'd query terms that // share the same QueryTerm::m_exclusiveBit (ebit) // . if it cannot get the compound termlist from a remote cache, then // Msg2 should get its components // . component termlists have their compound termlist number // as their m_componentCode, compound termlists have a componentCode // of -1, other termlists have a componentCode of -2. // . Query::addCompoundTerms() will add one extra query term for every // sequence of UOR'd query terms that share the same ebit. // Furthermore, it sets the m_componentCodes[] array. // . The compound term must have the same ebit as its component terms. // . we use the termid of compound termlists (and NOT their components) // when routing this query to the host that can use the least // amount of bandwidth to download/get the termlists. 
if the compound // termlist is not in the cache then it will not be on disk or // in the tree since it is a vitual termlist, BUT we will still // create it and store it in the cache, so assume it is in a cache, // because the act of storing it in the cache may require sending // it to another machine. // . if m_compoundListMaxSize is 0, do not do compound lists // . Query::addCompoundTerms() will set the termfreq of compound terms // to the sum of the termfreqs of its component termlists //if ( m_compoundListMaxSize > 0 ) addCompoundTerms( ); // . always add them for now //addCompoundTerms( ); // if m_isBoolean was set and we only have OP_UOR then // we should probably unset it here (mdw) // set m_expressions[] and m_operands[] arrays and m_numOperands // for boolean queries if ( m_isBoolean ) if ( ! setBooleanOperands() ) return false; // disable stuff for site:, ip: and url: queries for ( long i = 0 ; i < m_numWords ; i++ ) { QueryWord *qw = &m_qwords[i]; if ( qw->m_ignoreWord ) continue; if ( qw->m_fieldCode == FIELD_SITE && qw->m_wordSign != '-' ) m_hasPositiveSiteField = true; else if ( qw->m_fieldCode == FIELD_IP ) m_hasIpField = true; else if ( qw->m_fieldCode == FIELD_URL ) m_hasUrlField = true; else if ( qw->m_fieldCode == FIELD_ILINK ) m_hasIlinkField = true; else if ( qw->m_fieldCode == FIELD_GBLANG ) m_hasGBLangField = true; else if ( qw->m_fieldCode == FIELD_GBCOUNTRY ) m_hasGBCountryField = true; else if ( qw->m_fieldCode == FIELD_QUOTA ) m_hasQuotaField = true; else if ( qw->m_fieldCode == FIELD_SUBURL ) m_hasSubUrlField = true; } // set m_docIdRestriction if a term is gbdocid: for ( long i = 0 ; i < m_numTerms && ! m_isBoolean ; i++ ) { // get it QueryTerm *qt = &m_qterms[i]; // gbdocid:? 
if ( qt->m_fieldCode != FIELD_GBDOCID ) continue; // get docid char *ds = m_qterms[i].m_term + 8; m_docIdRestriction = atoll(ds); //unsigned long gid; unsigned long shard = getShardNumFromDocId(m_docIdRestriction); //gid = g_hostdb.getGroupIdFromDocId(m_docIdRestriction); //m_groupThatHasDocId = g_hostdb.getGroup(gid); m_groupThatHasDocId = g_hostdb.getShard ( shard ); break; } // . if it is not truncated, no need to use hard counts // . comment this line and the next one out for testing hard counts if ( ! m_truncated ) return true; // if got truncated AND under the HARD max, nothing we can do, it // got cut off due to m_maxQueryTerms limit in Parms.cpp if ( m_numTerms < (long)MAX_EXPLICIT_BITS ) return true; // if they just hit the admin's ceiling, there's nothing we can do if ( m_numTerms >= m_maxQueryTerms ) return true; // a temp log message log(LOG_DEBUG,"query: Encountered %li query terms.",m_numTerms); // otherwise, we're below m_maxQueryTerms BUT above MAX_QUERY_TERMS // so we can use hard counts to get more power... // . use the hard count for excessive query terms to save explicit bits // . just look for operands on the first level that are not OR'ed char redo = 0; for ( long i = 0 ; i < m_numWords ; i++ ) { // get the ith word QueryWord *qw = &m_qwords[i]; // mark him as NOT hard required qw->m_hardCount = 0; // skip if not on first level if ( qw->m_level != 0 ) continue; // stop at first OR on this level if ( qw->m_opcode == OP_OR ) break; // skip all punct if ( qw->m_isPunct ) continue; // if we are a boolean query,the next operator can NOT be OP_OR // because we can not used terms that are involved in an OR // as a hard count term, because they are not required terms for ( long j=i+1 ; m_isBoolean && jm_hardCount = 1; // mark it so we can reduce our number of explicit bits used redo = 1; } stop: // if nothing changed, return now if ( ! redo ) return true; // . set the query terms again if we have a long query // . 
if QueryWords has m_hardCount set, ensure the explicit bit is 0 // . non-quoted phrases that contain a "required" single word should // themselves have 0 for their implicit bits, BUT 0x8000 for their // explicit bit if ( ! setQTerms ( words , phrases ) ) return false; // a temp log message //log(LOG_DEBUG,"query: Compressed to %li query terms, %li hard. " // "(nt=%li)", // m_numExplicitBits,m_numTerms-m_numExplicitBits,m_numTerms); if ( ! m_isBoolean ) return true; // free cuz it was already set if ( m_expressionsAllocSize ) mfree(m_expressions,m_expressionsAllocSize , "Query" ); m_expressionsAllocSize = 0; m_expressions = NULL; // also set the boolean stuff again too! if ( ! setBooleanOperands() ) return false; return true; } /* // count how many so PageResults will know if he should offer // a default OR alternative search if no more results for // the default AND (rat=1) long Query::getNumRequired ( ) { if ( m_numRequired >= 0 ) return m_numRequired; m_numRequired = 0; for ( long i = 0 ; i < m_numTerms ; i++ ) { // QueryTerms are derived from QueryWords QueryTerm *qt = &m_qterms[i]; // don't require if negative if ( qt->m_termSign == '-' ) continue; // skip signless phrases if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue; if ( qt->m_synonymOf ) continue; // count it up m_numRequired++; } return m_numRequired; } */ // returns false and sets g_errno on error bool Query::setQTerms ( Words &words , Phrases &phrases ) { //long shift = 0; // . set m_qptrs/m_qtermIds/m_qbits // . use one bit position for each phraseId and wordId // . first set phrases long n = 0; // what is the max value for "shift"? long max = (long)MAX_EXPLICIT_BITS; if ( max > m_maxQueryTerms ) max = m_maxQueryTerms; //char u8Buf[256]; for ( long i = 0 ; i < m_numWords && n < MAX_QUERY_TERMS ; i++ ) { // break out if no more explicit bits! /* if ( shift >= max ) { log("query: Query1 has more than %li unique terms. 
" "Truncating.",max); m_truncated = true; break; } */ QueryWord *qw = &m_qwords[i]; // skip if ignored... mdw... if ( ! qw->m_phraseId ) continue; if ( qw->m_ignorePhrase ) continue; // could be a repeat // none if weight is absolute zero if ( qw->m_userWeightPhrase == 0 && qw->m_userTypePhrase == 'a' ) continue; // stop breach if ( n >= MAX_QUERY_TERMS ) { log("query: lost query phrase terms to max term " "limit of %li",(long)MAX_QUERY_TERMS ); break; } QueryTerm *qt = &m_qterms[n]; qt->m_qword = qw ; qt->m_piped = qw->m_piped; qt->m_isPhrase = true ; qt->m_isUORed = false; qt->m_UORedTerm = NULL; qt->m_synonymOf = NULL; qt->m_ignored = 0; // assume not a repeat of another query term (set below) qt->m_repeat = false; // stop word? no, we're a phrase term qt->m_isQueryStopWord = false; // change in both places qt->m_termId = qw->m_phraseId & TERMID_MASK; m_termIds[n] = qw->m_phraseId & TERMID_MASK; //log(LOG_DEBUG, "Setting query phrase term id %d: %lld", n, m_termIds[n]); qt->m_rawTermId = qw->m_rawPhraseId; // assume explicit bit is 0 qt->m_explicitBit = 0; qt->m_matchesExplicitBits = 0; // boolean queries are not allowed term signs for phrases // UNLESS it is a '*' soft require sign which we need for // phrases like: "cat dog" AND pig if ( m_isBoolean && qw->m_phraseSign != '*' ) { qt->m_termSign = '\0'; m_termSigns[n] = '\0'; } // if not boolean, ensure to change signs in both places else { qt->m_termSign = qw->m_phraseSign; m_termSigns[n] = qw->m_phraseSign; } // // INSERT UOR LOGIC HERE // // long pw = i-1; // // . back up until word that contains quote if in a quoted // // phrase // // . UOR can only support two word phrases really... 
// if (m_qwords[i].m_quoteStart >= 0) // pw = m_qwords[i].m_quoteStart - 1; // if ( pw >= 0 && m_qwords[pw].m_quoteStart >= 0 ) // pw = m_qwords[pw].m_quoteStart - 1; // // back two more if field // //if ( pw >= 0 && m_qwords[pw].m_ignoreWord==IGNORE_FIELDNAME ) // // pw -= 2; // while (pw>0 && // ((m_qwords[pw].m_ignoreWord == IGNORE_DEFAULT) || // (m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) pw--; // // is UOR operator? if so, backup over it // if ( pw >= 0 && m_qwords[pw].m_opcode == OP_UOR ) pw -= 2; // else goto notUORPhrase; // if ( pw < 0 ) goto notUORPhrase; // // . if previous term is UOR'd with us then share the same ebit // // . this allows us to use lots of UOR'd query terms // // . the UOR'd lists may also be merged together into a single // // list if "mergeListMaxSize" is positive // // if ( n >= 1 && // // i >= 4 && // // //m_qterms[n-1].m_qword == &m_qwords[pw] && // // shift > 0 && // // qw->m_hardCount == 0 ) // // shift--; // // set the UOR term sign // qt->m_isUORed = true; // notUORPhrase: // do not use an explicit bit up if we have a hard count qt->m_hardCount = qw->m_hardCount; // if ( qw->m_hardCount == 0 ) { // qt->m_explicitBit = 1 << shift ; // shift++; // } qw->m_queryWordTerm = NULL; // IndexTable.cpp uses this one qt->m_inQuotes = qw->m_inQuotes; // point to the string itself that is the phrase qt->m_term = qw->m_word; qt->m_termLen = qw->m_phraseLen; // the QueryWord should have a direct link to the QueryTerm, // at least for phrase, so we can OR in the bits of its // constituents in the for loop below qw->m_queryPhraseTerm = qt ; // include ourselves in the implicit bits // qt->m_implicitBits = qt->m_explicitBit; // doh! 
gotta reset to 0 qt->m_implicitBits = 0; // assume not under a NOT bool op qt->m_underNOT = false; // assign score weight, we're a phrase here qt->m_userWeight = qw->m_userWeightPhrase ; qt->m_userType = qw->m_userTypePhrase ; qt->m_fieldCode = qw->m_fieldCode ; // stuff before a pipe always has a weight of 1 if ( qt->m_piped ) { qt->m_userWeight = 1; qt->m_userType = 'a'; } // debug //char tmp[1024]; //memcpy ( tmp , qt->m_term , qt->m_termLen ); //tmp [ qt->m_termLen ] = 0; //logf(LOG_DEBUG,"got term %s (%li)",tmp,qt->m_termLen); // otherwise, add it n++; } // now if we have enough room, do the singles for ( long i = 0 ; i < m_numWords && n < MAX_QUERY_TERMS ; i++ ) { // break out if no more explicit bits! /* if ( shift >= max ) { logf(LOG_DEBUG, "query: Query2 has more than %li unique terms. " "Truncating.",max); m_truncated = true; break; } */ QueryWord *qw = &m_qwords[i]; if ( qw->m_ignoreWord && qw->m_ignoreWord != IGNORE_QSTOP) continue; // if ( qw->m_ignoreWord ) continue; // ignore if in quotes if ( qw->m_quoteStart >= 0 ) continue; // if nore if weight is absolute zero if ( qw->m_userWeight == 0 && qw->m_userType == 'a' ) continue; // stop breach if ( n >= MAX_QUERY_TERMS ) { log("query: lost query terms to max term " "limit of %li",(long)MAX_QUERY_TERMS ); break; } QueryTerm *qt = &m_qterms[n]; qt->m_qword = qw ; qt->m_piped = qw->m_piped; qt->m_isPhrase = false ; qt->m_isUORed = false; qt->m_UORedTerm = NULL; qt->m_synonymOf = NULL; // ignore some synonym terms if tf is too low qt->m_ignored = qw->m_ignoreWord; // qt->m_ignored = 0; // assume not a repeat of another query term (set below) qt->m_repeat = false; // stop word? 
no, we're a phrase term qt->m_isQueryStopWord = qw->m_isQueryStopWord; // change in both places qt->m_termId = qw->m_wordId & TERMID_MASK; m_termIds[n] = qw->m_wordId & TERMID_MASK; qt->m_rawTermId = qw->m_rawWordId; // assume explicit bit is 0 qt->m_explicitBit = 0; qt->m_matchesExplicitBits = 0; //log(LOG_DEBUG, "Setting query phrase term id %d: %lld raw: %lld", n, m_termIds[n], qt->m_rawTermId); // boolean queries are not allowed term signs if ( m_isBoolean ) { qt->m_termSign = '\0'; m_termSigns[n] = '\0'; // boolean fix for "health OR +sports" because // the + there means exact word match, no synonyms. if ( qw->m_wordSign == '+' ) { qt->m_termSign = qw->m_wordSign; m_termSigns[n] = qw->m_wordSign; } } // if not boolean, ensure to change signs in both places else { qt->m_termSign = qw->m_wordSign; m_termSigns[n] = qw->m_wordSign; } // get previous text word //long pw = i - 2; long pw = i-1; // // . back up until word that contains quote if in a quoted // // phrase // // . UOR can only support two word phrases really... if (m_qwords[i].m_quoteStart >= 0) pw = m_qwords[i].m_quoteStart ; if ( pw > 0 ) pw--; // back two more if field long fieldStart=-1; long fieldLen=0; if ( pw == 0 && m_qwords[pw].m_ignoreWord==IGNORE_FIELDNAME) fieldStart = pw; if ( pw > 0&& m_qwords[pw-1].m_ignoreWord==IGNORE_FIELDNAME ){ pw -= 1; fieldStart = pw; } while (pw>0 && ((m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) { pw--; fieldStart = pw; } if (fieldStart > -1) { pw = i; while (pw < m_numWords && m_qwords[pw].m_fieldCode) pw++; fieldLen = m_qwords[pw-1].m_word + m_qwords[pw-1].m_wordLen - m_qwords[fieldStart].m_word; } // // is UOR operator? if so, backup over it // if ( pw >= 0 && m_qwords[pw].m_opcode == OP_UOR ){ // pw -= 2; // } // else goto notUOR; // if ( pw < 0 ) goto notUOR; // // . if previous term is UOR'd with us then share the same ebit // // . this allows us to use lots of UOR'd query terms // // . 
the UOR'd lists may also be merged together into a single // // list if "mergeListMaxSize" is positive // // if ( n >= 1 && // // i >= 4 && // // //m_qterms[n-1].m_qword == &m_qwords[pw] && // // shift > 0 && // // qw->m_hardCount == 0 ) // // shift--; // // set the UOR term sign // qt->m_isUORed = true; // if (m_qwords[pw].m_queryWordTerm) // m_qwords[pw].m_queryWordTerm->m_isUORed = true; // if (m_qwords[pw].m_queryPhraseTerm) // m_qwords[pw].m_queryPhraseTerm->m_isUORed = true; // notUOR: // do not use an explicit bit up if we have a hard count qt->m_hardCount = qw->m_hardCount; // if ( qw->m_hardCount == 0 ) { // qt->m_explicitBit = 1 << shift ; // shift++; // } qw->m_queryWordTerm = qt; // IndexTable.cpp uses this one qt->m_inQuotes = qw->m_inQuotes; // point to the string itself that is the word if (fieldLen > 0) { qt->m_term = m_qwords[fieldStart].m_word; qt->m_termLen = fieldLen; // skip past the end of the field value i = pw-1; } else { qt->m_termLen = qw->m_wordLen; qt->m_term = qw->m_word; //log(LOG_DEBUG, "query: *** term \"%s\"", u8Buf); } // reset our implicit bits to 0 qt->m_implicitBits = 0; // // . OR ourselves into our parent phrase's m_implicitBits // // . this makes setting m_bitScores[] easy because if a // // doc contains this prhase then it IMPLICITLY contains us // // which will make it easier to satisfy requiredBits // if ( qw->m_queryPhraseTerm ) // qw->m_queryPhraseTerm->m_implicitBits |= // qt->m_explicitBit; // // if we're in the middle of the phrase // long pn = qw->m_leftPhraseStart; // // convert word to its phrase QueryTerm ptr, if any // QueryTerm *tt = NULL; // if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm; // if ( tt ) tt->m_implicitBits |= qt->m_explicitBit; // // . there might be some phrase term that actually contains // // the same word as we are, but a different occurence // // . 
like '"knowledge management" AND NOT management' query // for ( long j = 0 ; j < i ; j++ ) { // // must be our same wordId (same word, different occ.) // QueryWord *qw2 = &m_qwords[j]; // if ( qw2->m_wordId != qw->m_wordId ) continue; // // get first word in the phrase that jth word is in // long pn2 = qw2->m_leftPhraseStart; // if ( pn2 < 0 ) continue; // // he implies us! // QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm; // if ( tt2 ) tt2->m_implicitBits |= qt->m_explicitBit; // break; // } // assume not under a NOT bool op qt->m_underNOT = false; // assign score weight, we're a phrase here qt->m_userWeight = qw->m_userWeight ; qt->m_userType = qw->m_userType ; qt->m_fieldCode = qw->m_fieldCode ; // stuff before a pipe always has a weight of 1 if ( qt->m_piped ) { qt->m_userWeight = 1; qt->m_userType = 'a'; } // debug //char tmp[1024]; //memcpy ( tmp , qt->m_term , qt->m_termLen ); //tmp [ qt->m_termLen ] = 0; //logf(LOG_DEBUG,"got term %s (%li)",tmp,qt->m_termLen); n++; } // now handle the explicit bits // moved out of separate phrase and singleton loops // for phrase UOR support /* for ( long i = 0 ; i < m_numWords ; i++ ) { // break out if no more explicit bits! // if ( shift >= max ) { // log("query: Query has more than %li unique terms. " // "Truncating.",max); // m_truncated = true; // break; // } long pw; QueryWord *qw = &m_qwords[i]; if (!qw->m_queryWordTerm && !qw->m_queryPhraseTerm) continue; QueryTerm *qt = qw->m_queryPhraseTerm? qw->m_queryPhraseTerm : qw->m_queryWordTerm; if (!qt) continue; doAgain: pw = i-1; // . back up until word that contains quote if in a quoted // phrase // . UOR can only support two word phrases really... //if (m_qwords[i].m_quoteStart >= 0) // pw = m_qwords[i].m_quoteStart - 1; while (pw>0 && ((m_qwords[pw].m_ignoreWord == IGNORE_DEFAULT) || (m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) pw--; // is UOR operator? 
if so, backup over it if ( pw < 0 || m_qwords[pw].m_opcode != OP_UOR ) goto notUOR; pw--; while (pw>0 && ((m_qwords[pw].m_ignoreWord == IGNORE_DEFAULT) || (m_qwords[pw].m_ignoreWord == IGNORE_FIELDNAME))) pw--; if ( pw >= 0 && m_qwords[pw].m_quoteStart >= 0 ) //pw = m_qwords[pw].m_quoteStart + 1; pw = m_qwords[pw].m_quoteStart; if (pw < 0) goto notUOR; // . if previous term is UOR'd with us then share the same ebit // . this allows us to use lots of UOR'd query terms // . the UOR'd lists may also be merged together into a single // list if "mergeListMaxSize" is positive qt->m_isUORed = true; // set uor flag on all words in phrase if (qw->m_queryPhraseTerm && m_qwords[i].m_quoteStart >= 0){ long quoteStart = m_qwords[i].m_quoteStart; for (long j=quoteStart;jm_isUORed = true; qtp->m_UORedTerm = m_qwords[pw].m_queryPhraseTerm; } } } //QueryTerm *pqt = NULL; if (m_qwords[pw].m_queryWordTerm){ m_qwords[pw].m_queryWordTerm->m_isUORed = true; qt->m_UORedTerm = m_qwords[pw].m_queryWordTerm; } //pqt = m_qwords[pw].m_queryWordTerm; // set uor flag on all words in previous phrase if (m_qwords[pw].m_queryPhraseTerm && m_qwords[pw].m_quoteStart >= 0) { m_qwords[pw].m_queryPhraseTerm->m_isUORed = true; qt->m_UORedTerm = m_qwords[pw].m_queryPhraseTerm; long quoteStart = m_qwords[pw].m_quoteStart; for (long j=quoteStart;jm_isUORed = true; qtp->m_UORedTerm = m_qwords[pw].m_queryPhraseTerm; } } } // if ( n >= 1 && // i >= 4 && // //m_qterms[n-1].m_qword == &m_qwords[pw] && // shift > 0 && // qw->m_hardCount == 0 ) { // shift--; // } notUOR: // if ( qt->m_hardCount == 0 ) { // // qt->m_explicitBit = 1 << shift ; // qt->m_explicitBit = shift ; // shift++; // } // // . OR ourselves into our parent phrase's m_implicitBits // // . 
this makes setting m_bitScores[] easy because if a // // doc contains this prhase then it IMPLICITLY contains us // // which will make it easier to satisfy requiredBits // if ( qw->m_queryPhraseTerm ) // qw->m_queryPhraseTerm->m_implicitBits |= // qt->m_explicitBit; // // if we're in the middle of the phrase // long pn = qw->m_leftPhraseStart; // // convert word to its phrase QueryTerm ptr, if any // QueryTerm *tt = NULL; // if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm; // if ( tt ) tt->m_implicitBits |= qt->m_explicitBit; // // . there might be some phrase term that actually contains // // the same word as we are, but a different occurence // // . like '"knowledge management" AND NOT management' query // for ( long j = 0 ; j < i ; j++ ) { // // must be our same wordId (same word, different occ.) // //QueryWord *qw2 = m_qterms[j].m_qword; // QueryWord *qw2 = &m_qwords[j]; // if ( qw2->m_wordId != qw->m_wordId ) continue; // // get first word in the phrase that jth word is in // long pn2 = qw2->m_leftPhraseStart; // if ( pn2 < 0 ) continue; // // he implies us! // QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm; // if ( tt2 ) tt2->m_implicitBits |= qt->m_explicitBit; // break; // } if (qt == qw->m_queryPhraseTerm){ if ( qw->m_queryWordTerm){ qt = qw->m_queryWordTerm; goto doAgain; } } } */ /* // Handle exclusive explicit bits only shift = 0; int n2 = 0; for ( long i = 0; i < n ; i++ ){ // break out if no more explicit bits! if ( shift >= max ) { logf(LOG_DEBUG, "query: Query4 has more than %li unique terms. 
" "Truncating.",max); m_truncated = true; break; } QueryTerm *qt = &m_qterms[i]; if (qt->m_UORedTerm) continue; // sometims UORedTerm is NULL i guess because of IGNORE_BREECH if ( qt->m_isUORed && qt->m_qword && qt->m_qword->m_ignoreWord ) continue; // Skip duplicate terms before we waste an explicit bit bool skip=false; for (long j=0;jm_termId != m_qterms[j].m_termId || qt->m_termSign != m_qterms[j].m_termSign){ continue; } skip = true; qt->m_explicitBit = m_qterms[j].m_explicitBit; break; } n2++; if (skip) continue; if ( qt->m_hardCount == 0 ) { qt->m_explicitBit = 1 << shift++; } } // count them for doing number of combos m_numExplicitBits = shift; */ // Handle shared explicit bits for ( long i = 0; i < n ; i++ ){ QueryTerm *qt = &m_qterms[i]; // assume not in a phrase qt->m_inPhrase = 0; qt->m_rightPhraseTermNum = -1; qt->m_leftPhraseTermNum = -1; qt->m_rightPhraseTerm = NULL; qt->m_leftPhraseTerm = NULL; QueryTerm *qt2 = qt->m_UORedTerm; if (!qt2) continue; // chase down first term in UOR chain while (qt2->m_UORedTerm) qt2 = qt2->m_UORedTerm; //if (!qt2->m_explicitBit) continue; //qt->m_explicitBit = qt2->m_explicitBit; //n2++; } //m_numTerms = n2; // . set implicit bits, m_implicitBits // . 
set m_inPhrase for (long i = 0; i < m_numWords ; i++ ){ QueryWord *qw = &m_qwords[i]; QueryTerm *qt = qw->m_queryWordTerm; if (!qt) continue; if ( qw->m_queryPhraseTerm ) qw->m_queryPhraseTerm->m_implicitBits |= qt->m_explicitBit; // set flag if in a a phrase, and set phrase term num if ( qw->m_queryPhraseTerm ) { qt->m_inPhrase = 1; QueryTerm *pt = qw->m_queryPhraseTerm; qt->m_rightPhraseTermNum = pt - m_qterms; qt->m_rightPhraseTerm = pt; } // if we're in the middle of the phrase long pn = qw->m_leftPhraseStart; // convert word to its phrase QueryTerm ptr, if any QueryTerm *tt = NULL; if ( pn >= 0 ) tt = m_qwords[pn].m_queryPhraseTerm; if ( tt ) tt->m_implicitBits |= qt->m_explicitBit; if ( tt ) { qt->m_inPhrase = 1; qt->m_leftPhraseTermNum = tt - m_qterms; qt->m_leftPhraseTerm = tt; } // . there might be some phrase term that actually contains // the same word as we are, but a different occurence // . like '"knowledge management" AND NOT management' query // . made it from "j < i" into "j < m_numWords" because // 'test "test bed"' was not working but '"test bed" test' // was working. for ( long j = 0 ; j < m_numWords ; j++ ) { // must be our same wordId (same word, different occ.) QueryWord *qw2 = &m_qwords[j]; if ( qw2->m_wordId != qw->m_wordId ) continue; // get first word in the phrase that jth word is in long pn2 = qw2->m_leftPhraseStart; // we might be the guy that starts it! if ( pn2 < 0 && qw2->m_quoteStart != -1 ) pn2 = j; // if neither is the case, skip this query word if ( pn2 < 0 ) continue; // he implies us! 
QueryTerm *tt2 = m_qwords[pn2].m_queryPhraseTerm; if ( tt2 ) tt2->m_implicitBits |= qt->m_explicitBit; if ( tt2 ) { qt->m_inPhrase = 1; qt->m_leftPhraseTermNum = tt2 - m_qterms; qt->m_leftPhraseTerm = tt2; } break; } } /* // synonym terms should have copy all the implicit/explicit bits // into their implicit bits field for (long i = 0; i < m_numTerms; i++) { QueryTerm *qt = &m_qterms[i]; QueryTerm *st = qt->m_synonymOf; if (!st) continue; // also, if we are "auto insurance", a synonymOf // "car insurance", we should also imply "car insurance"'s // terms, 'car' and 'insurance' for purposes of // IndexTable2.cpp::getWeightScore()'s calculation of "min". // Because when finding the "max" score of a word, we also // allow its phrase and synonyms' scores to compete. qt->m_implicitBits = st->m_implicitBits | st->m_explicitBit; // now skip if not a phrase synonym if ( ! qt->m_isPhrase ) continue; // . we also imply the two words bookending this phrase, if any // . so see if the leftSynHash is in the syn list for ( long k = m_synTerm ; k < m_numTerms ; k++ ) { // get term QueryTerm *tt = &m_qterms[k]; // skip if phrase if ( tt->m_isPhrase ) continue; // must be synonym if ( ! tt->m_synonymOf ) continue; // must match one of our ids if ( tt->m_qword->m_rawWordId != qt->m_leftRawWordId && tt->m_qword->m_rawWordId != qt->m_rightRawWordId ) continue; // we imply it now! qt->m_implicitBits |= tt->m_explicitBit; } } */ //////////// // // . add synonym query terms now // . 
skip this part if language is unknown i guess // //////////// long sn = 0; Synonyms syn; // loop over all words in query and process its synonyms list if ( m_langId != langUnknown && m_queryExpansion ) sn = m_numWords; long long to = hash64n("to",0LL); for ( long i = 0 ; i < sn ; i++ ) { // get query word QueryWord *qw = &m_qwords[i]; // skip if in quotes, we will not get synonyms for it if ( qw->m_inQuotes ) continue; // skip if has plus sign in front if ( qw->m_wordSign == '+' ) continue; // no url: stuff, maybe only title if ( qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE ) continue; // skip if ignored like a stopword (stop to->too) //if ( qw->m_ignoreWord ) continue; // no, hurts 'Greencastle IN economic development' if ( qw->m_wordId == to ) continue; // single letters... if ( qw->m_wordLen == 1 ) continue; // set the synonyms for this word char tmpBuf [ TMPSYNBUFSIZE ]; long naids = syn.getSynonyms ( &words , i , m_langId , tmpBuf , 0 ); // m_niceness ); // if no synonyms, all done if ( naids <= 0 ) continue; // get the term for this word QueryTerm *origTerm = qw->m_queryWordTerm; // loop over synonyms for word #i now for ( long j = 0 ; j < naids ; j++ ) { // stop breach if ( n >= MAX_QUERY_TERMS ) { log("query: lost synonyms due to max term " "limit of %li",(long)MAX_QUERY_TERMS ); break; } // this happens for 'da da da' if ( ! origTerm ) continue; // add that query term QueryTerm *qt = &m_qterms[n]; qt->m_qword = qw; // NULL; qt->m_piped = qw->m_piped; qt->m_isPhrase = false ; qt->m_isUORed = false; qt->m_UORedTerm = NULL; // synonym of this term... qt->m_synonymOf = origTerm; // nuke this crap since it was done above and we // missed out! 
qt->m_inPhrase = 0; qt->m_rightPhraseTermNum = -1; qt->m_leftPhraseTermNum = -1; qt->m_rightPhraseTerm = NULL; qt->m_leftPhraseTerm = NULL; // need this for Matches.cpp qt->m_synWids0 = syn.m_wids0[j]; qt->m_synWids1 = syn.m_wids1[j]; long na = syn.m_numAlnumWords[j]; // how many words were in the base we used to // get the synonym. i.e. if the base is "new jersey" // then it's 2! and the synonym "nj" has one alnum // word. long ba = syn.m_numAlnumWordsInBase[j]; qt->m_numAlnumWordsInSynonym = na; qt->m_numAlnumWordsInBase = ba; // crap, "nj" is a synonym of the PHRASE TERM // bigram "new jersey" not of the single word term // "new" so fix that. if ( ba == 2 && origTerm->m_rightPhraseTerm ) qt->m_synonymOf = origTerm->m_rightPhraseTerm; // ignore some synonym terms if tf is too low qt->m_ignored = qw->m_ignoreWord; // assume not a repeat of another query term(set below) qt->m_repeat = false; // stop word? no, we're a phrase term qt->m_isQueryStopWord = qw->m_isQueryStopWord; // change in both places qt->m_termId = syn.m_aids[j] & TERMID_MASK; m_termIds[n] = syn.m_aids[j] & TERMID_MASK; qt->m_rawTermId = syn.m_aids[j]; // assume explicit bit is 0 qt->m_explicitBit = 0; qt->m_matchesExplicitBits = 0; // boolean queries are not allowed term signs if ( m_isBoolean ) { qt->m_termSign = '\0'; m_termSigns[n] = '\0'; // boolean fix for "health OR +sports" because // the + there means exact word match, no syns if ( qw->m_wordSign == '+' ) { qt->m_termSign = qw->m_wordSign; m_termSigns[n] = qw->m_wordSign; } } // if not bool, ensure to change signs in both places else { qt->m_termSign = qw->m_wordSign; m_termSigns[n] = qw->m_wordSign; } // do not use an explicit bit up if we got a hard count qt->m_hardCount = qw->m_hardCount; //qw->m_queryWordTerm = qt; // IndexTable.cpp uses this one qt->m_inQuotes = qw->m_inQuotes; // point to the string itself that is the word qt->m_term = syn.m_termPtrs[j]; qt->m_termLen = syn.m_termLens[j]; // reset our implicit bits to 0 
qt->m_implicitBits = 0; // assume not under a NOT bool op qt->m_underNOT = false; // assign score weight, we're a phrase here qt->m_userWeight = qw->m_userWeight ; qt->m_userType = qw->m_userType ; qt->m_fieldCode = qw->m_fieldCode ; // stuff before a pipe always has a weight of 1 if ( qt->m_piped ) { qt->m_userWeight = 1; qt->m_userType = 'a'; } // otherwise, add it n++; } } m_numTerms = n; if ( n > MAX_QUERY_TERMS ) { char *xx=NULL;*xx=0; } // count them for doing number of combos //m_numExplicitBits = shift; // . repeated terms have the same termbits!! // . this is only for bool queries since regular queries ignore // repeated terms in setWords() // . we need to support: "trains AND (perl OR python) NOT python" for ( long i = 0 ; i < n ; i++ ) { // BUT NOT IF in a UOR'd list!!! Metalincs bug... if ( m_qterms[i].m_isUORed ) continue; // that didn't seem to fix it right, for dup terms that // are the FIRST term in a UOR sequence... they don't seem // to have m_isUORed set if ( m_hasUOR ) continue; for ( long j = 0 ; j < i ; j++ ) { // skip if not a termid match if(m_qterms[i].m_termId!=m_qterms[j].m_termId)continue; m_qterms[i].m_explicitBit = m_qterms[j].m_explicitBit; // if doing phrases, ignore the unrequired phrase if ( m_qterms[i].m_isPhrase ) { if ( m_qterms[j].m_implicitBits ) m_qterms[j].m_repeat = true; else m_qterms[i].m_repeat = true; continue; } // if not doing phrases, just ignore term #i m_qterms[i].m_repeat = true; } } // if we're a special range: term and a doc has us, then // assume it has our associates too because we are all // essentially the same term. we don't want this to be a // factor in the ranking. since gigablast usually puts docs // with all the terms (between OR operators) above terms that do // not have all ther terms. that is not a good thing for these terms. 
/* long nw = m_numWords; for ( long i = 0 ; i < nw ; i++ ) { // skip if not a range: query term if ( m_qwords[i].m_fieldCode != FIELD_RANGE ) continue; // loop over all our associates (in same parens level) to // get the OR of all the explicit bits qvec_t allBits = 0; for ( long j=i;jm_explicitBit ) continue; // grab it allBits |= qt->m_explicitBit ; } // now make everyone use just one of those bits for ( long j=i;jm_explicitBit ) continue; // force it to use the common bit qt->m_explicitBit = allBits; qt->m_implicitBits = allBits; } } */ // . if only have one term and it is a signless phrase, make it signed // . don't forget to set m_termSigns too! if ( n == 1 && m_qterms[0].m_isPhrase && ! m_qterms[0].m_termSign ) { m_qterms[0].m_termSign = '*'; m_termSigns[0] = '*'; } // . or bits into the m_implicitBits member of phrase QueryTerms that // represent the consitutent words // . loop over each //m_numTerms = n2; // . how many of the terms are non fielded singletons? // . this is just for support of the BIG HACK in Summary.cpp /* m_numTermsSpecial = 0; for ( long i = 0 ; i < n ; i++ ) { if ( m_qterms[i].m_isPhrase ) continue; if ( m_qterms[i].m_fieldCode ) continue; if ( m_qterms[i].m_isUORed ) continue; // only skip query stop words if in quotes, if it is in // quotes then we gotta have it... if ( m_qterms[i].m_isQueryStopWord && ! m_qterms[i].m_inQuotes ) continue; if ( m_qterms[i].m_underNOT ) continue; if ( m_qterms[i].m_termSign == '-' ) continue; m_numTermsSpecial++; } */ // . set m_componentCodes all to -2 // . addCompoundTerms() will set these appropriately // . see Msg2.cpp for more info on componentCodes // . -2 means unset, neither a compound term nor a component term at // this time for ( long i = 0 ; i < m_numTerms ; i++ ) m_componentCodes[i] = -2; m_numComponents = 0; // . now set m_phrasePart for Summary.cpp's hackfix filter // . 
only set this for the non-phrase terms, since keepAllSingles is // set to true when setting the Query for Summary.cpp::set in order // to match the singles for ( long i = 0 ; i < m_numTerms ; i++ ) { // assume not in a phrase m_qterms[i].m_phrasePart = -1; //if ( ! m_qterms[i].m_isPhrase ) continue; // skip cd-rom too, if not in quotes if ( ! m_qterms[i].m_inQuotes ) continue; // is next term also in a quoted phrase? if ( i - 1 < 0 ) continue; //if ( ! m_qterms[i+1].m_isPhrase ) continue; if ( ! m_qterms[i-1].m_inQuotes ) continue; // are we in the same quoted phrase? if ( m_qterms[i+0].m_qword->m_quoteStart != m_qterms[i-1].m_qword->m_quoteStart ) continue; // ok, we're in the same quoted phrase m_qterms[i+0].m_phrasePart=m_qterms[i+0].m_qword->m_quoteStart; m_qterms[i-1].m_phrasePart=m_qterms[i+0].m_qword->m_quoteStart; } // . set m_requiredBits // . these are 1-1 with m_qterms (QueryTerms) // . required terms have no - sign and have no signless phrases // . these are what terms doc would NEED to have if we were default AND // BUT for boolean queries that doesn't apply m_requiredBits = 0; // no - signs, no signless phrases m_negativeBits = 0; // terms with - signs m_forcedBits = 0; // terms with + signs m_synonymBits = 0; for ( long i = 0 ; i < m_numTerms ; i++ ) { // QueryTerms are derived from QueryWords QueryTerm *qt = &m_qterms[i]; // don't require if negative if ( qt->m_termSign == '-' ) { m_negativeBits |= qt->m_explicitBit; // (1 << i ); continue; } // forced bits if ( qt->m_termSign == '+' && ! m_isBoolean ) m_forcedBits |= qt->m_explicitBit; //(1 << i); // skip signless phrases if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue; if ( qt->m_synonymOf ) { m_synonymBits |= qt->m_explicitBit; continue; } // fix gbhastitleindicator:1 where "1" is a stop word if ( qt->m_isQueryStopWord && ! 
m_qterms[i].m_fieldCode ) continue; // OR it all up m_requiredBits |= qt->m_explicitBit; // (1 << i); } // set m_matchRequiredBits which we use for Matches.cpp m_matchRequiredBits = 0; for ( long i = 0 ; i < m_numTerms ; i++ ) { // QueryTerms are derived from QueryWords QueryTerm *qt = &m_qterms[i]; // don't require if negative if ( qt->m_termSign == '-' ) continue; // skip all phrase terms if ( qt->m_isPhrase ) continue; // OR it all up m_matchRequiredBits |= qt->m_explicitBit; } // if we have '+test -test': if ( m_negativeBits & m_requiredBits ) m_numTerms = 0; // we need to remember this now for tier integration in IndexTable.cpp //m_requiredBits = requiredBits; // now set m_matches,ExplicitBits, used only by Matches.cpp so far for ( long i = 0 ; i < m_numTerms ; i++ ) { // set it up m_qterms[i].m_matchesExplicitBits = m_qterms[i].m_explicitBit; // or in the repeats for ( long j = 0 ; j < m_numTerms ; j++ ) { // skip if termid mismatch if ( m_qterms[i].m_termId != m_qterms[j].m_termId ) continue; // i guess signs do not have to match //m_qterms[i].m_termSign == m_qterms[j].m_termSign){ m_qterms[i].m_matchesExplicitBits |= m_qterms[j].m_explicitBit; } } m_numRequired = 0; for ( long i = 0 ; i < m_numTerms ; i++ ) { // QueryTerms are derived from QueryWords QueryTerm *qt = &m_qterms[i]; // assume not required qt->m_isRequired = false; // don't require if negative // no, consider required, but NEGATIVE required... //if ( qt->m_termSign == '-' ) continue; // skip signless phrases if ( qt->m_isPhrase && qt->m_termSign == '\0' ) continue; if ( qt->m_isPhrase && qt->m_termSign == '*' ) continue; if ( qt->m_synonymOf ) continue; // IGNORE_QSTOP? if ( qt->m_ignored ) continue; // mark it qt->m_isRequired = true; // count them m_numRequired++; } // required quoted phrase terms for ( long i = 0 ; i < m_numTerms ; i++ ) { // QueryTerms are derived from QueryWords QueryTerm *qt = &m_qterms[i]; // quoted phrase? if ( ! qt->m_isPhrase ) continue; if ( ! 
qt->m_inQuotes ) continue; // mark it qt->m_isRequired = true; // count them m_numRequired++; } // . for query 'to be or not to be shakespeare' // require 'tobe' 'beor' 'tobe' because // they are bigrams in the wikipedia phrase 'to be or not to be' // and they all consist solely of query stop words. as of // 8/20/2012 i took 'not' off the query stop word list. // . require bigrams that consist of 2 query stop words and // are in a wikipedia phrase. set termSign to '+' i guess? // . for 'in the nick' , a wiki phrase, make "in the" required // and give a big bonus for "the nick" below. for ( long i = 0 ; i < m_numTerms ; i++ ) { // QueryTerms are derived from QueryWords QueryTerm *qt = &m_qterms[i]; // don't require if negative if ( qt->m_termSign == '-' ) continue; // only check bigrams here if ( ! qt->m_isPhrase ) continue; // get the query word that starts this phrase QueryWord *qw1 = qt->m_qword; // must be in a wikiphrase if ( qw1->m_wikiPhraseId <= 0 ) continue; // what query word # is that? long qwn = qw1 - m_qwords; // get the next alnum word after that // assume its the last word in our bigram phrase QueryWord *qw2 = &m_qwords[qwn+2]; // must be in same wikiphrase if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue; // must be two stop words if ( ! qw1->m_isQueryStopWord ) continue; if ( ! qw2->m_isQueryStopWord ) continue; // mark it qt->m_isRequired = true; // count them m_numRequired++; } // // new logic for XmlDoc::setRelatedDocIdWeight() to use // long shift = 0; m_requiredBits = 0; for ( long i = 0; i < n ; i++ ){ QueryTerm *qt = &m_qterms[i]; qt->m_explicitBit = 0; if ( ! 
qt->m_isRequired ) continue; // negative terms are "negative required", but we ignore here if ( qt->m_termSign == '-' ) continue; qt->m_explicitBit = 1<m_explicitBit; shift++; if ( shift >= (long)(sizeof(qvec_t)*8) ) break; } // now implicit bits for ( long i = 0; i < n ; i++ ){ QueryTerm *qt = &m_qterms[i]; // make it explicit bit at least qt->m_implicitBits = qt->m_explicitBit; if ( qt->m_isRequired ) continue; // synonym? if ( qt->m_synonymOf ) qt->m_implicitBits |= qt->m_synonymOf->m_explicitBit; // skip if not bigram if ( ! qt->m_isPhrase ) continue; // get sides QueryTerm *t1 = qt->m_leftPhraseTerm; QueryTerm *t2 = qt->m_rightPhraseTerm; if ( ! t1 || ! t2 ) continue; qt->m_implicitBits |= t1->m_explicitBit; qt->m_implicitBits |= t2->m_explicitBit; } // . for query 'to be or not to be shakespeare' // give big bonus for 'ornot' and 'notto' bigram terms because // the single terms 'or' and 'to' are ignored and because // 'to be or not to be' is a wikipedia phrase // . on 8/20/2012 i took 'not' off the query stop word list. // . now give a big bonus for bigrams whose two terms are in the // same wikipedia phrase and one and only one of the terms in // the bigram is a query stop word // . in general 'ornot' is considered a "synonym" of 'not' and // gets hit with a .90 score factor, but that should never // happen, it should be 1.00 and in this special case it should // be 1.20 // . so for 'time enough for love' the phrase term "enough for" // gets its m_isWikiHalfStopBigram set AND that phrase term // is a synonym term of the single word term "enough" and is treated // as such in the Posdb.cpp logic. for ( long i = 0 ; i < m_numTerms ; i++ ) { // QueryTerms are derived from QueryWords QueryTerm *qt = &m_qterms[i]; // assume not! qt->m_isWikiHalfStopBigram = 0; // don't require if negative if ( qt->m_termSign == '-' ) continue; // only check bigrams here if ( ! 
qt->m_isPhrase ) continue; // get the query word that starts this phrase QueryWord *qw1 = qt->m_qword; // must be in a wikiphrase if ( qw1->m_wikiPhraseId <= 0 ) continue; // what query word # is that? long qwn = qw1 - m_qwords; // get the next alnum word after that // assume its the last word in our bigram phrase QueryWord *qw2 = &m_qwords[qwn+2]; // must be in same wikiphrase if ( qw2->m_wikiPhraseId != qw1->m_wikiPhraseId ) continue; // if both query stop words, should have been handled above // we need one to be a query stop word and the other not // for this algo if ( qw1->m_isQueryStopWord && qw2->m_isQueryStopWord ) continue; // skip if neither is a query stop word if ( ! qw1->m_isQueryStopWord&& ! qw2->m_isQueryStopWord ) continue; // one must be a stop word i guess // so for 'the time machine' we do not count 'time machine' // as a halfstopwikibigram if ( ! qw1->m_isQueryStopWord && ! qw2->m_isQueryStopWord ) continue; // don't require it, if query is 'the tigers' accept // just 'tigers' but give a bonus for 'the tigers' in // the document. //qt->m_isRequired = true; // count them //m_numRequired++; // special flag qt->m_isWikiHalfStopBigram = true; } return true; } /* // . add in compound terms // . set m_componentCodes appropriately void Query::addCompoundTerms ( ) { // loop through possible starting points of sequences of the same ebit for (long i = 0 ; i < m_numTerms - 1 ; i++ ) { // break if too many already if ( m_numTerms >= MAX_QUERY_TERMS ) break; // if already processed, skip it if ( m_componentCodes[i] != -2 ) continue; // get ebit of the ith query term qvec_t ebit = m_qterms[i].m_explicitBit; // skip if 0, it is ignored because it breeched limit of 15 if ( ebit == 0 ) continue; // skip if next term's ebit is different //if ( ebit != m_qterms[i+1].m_explicitBit ) continue; // skip if not UOR'd because it could just be a repeat term //if ( ! 
m_qterms[i+1].m_isUORed ) continue; // all UORed terms have m_isOURed set now // because UORed terms are not necessarily in order // (first phrases, then words) if ( ! m_qterms[i].m_isUORed ) continue; // the termid of the compound list long long id = 0LL; // store compound terms last long n = m_numTerms; // sum of termfreqs //long long sum = 0; // we got a UOR'd list, see whose involved long j ; long numUORComponents = 0; char *beg = NULL; char *end = NULL; for ( j = 0; j < m_numTerms ; j++ ) { // if term does not have our ebit, break out if ( ebit != m_qterms[j].m_explicitBit ) continue; // otherwise, make this term point to the compound term m_componentCodes[j] = n; // an integrate its termid into the compound termid id = hash64 ( m_qterms[j].m_termId , id ) &TERMID_MASK; // add in the term frequency (aka popularity) //sum += m_termFreqs[j]; // keep track so IndexTable::alloc() can get it m_numComponents++; numUORComponents++; // get phrase UOR term right long a = j; long b = j; // if (m_qterms[j].m_qword->m_leftPhraseStart >= 0){ // a = m_qterms[j].m_qword->m_leftPhraseStart; // b++; // } char *newBeg = m_qterms[a].m_term; // had to add check for newBeg being null // (because of -O2 ???) if (!beg || (newBeg && newBeg < beg)) beg = newBeg; char *newEnd = m_qterms[b].m_term + m_qterms[b].m_termLen; if (!end || newEnd > end) end = newEnd; } if (!numUORComponents) continue; // copy it memcpy ( &m_qterms[n] , &m_qterms[i] , sizeof(QueryTerm) ); // get term's length //char *beg = m_qterms[i].m_term; //char *end = m_qterms[j-1].m_term + m_qterms[j-1].m_termLen; m_qterms[n].m_term = beg; m_qterms[n].m_termLen = end - beg; // set its id m_qterms[n].m_termId = id; // this array too! 
m_termIds[n] = id; m_qterms[n].m_rawTermId = 0LL; m_qterms[n].m_isQueryStopWord = false; m_componentCodes[n] = -1; // code for a compound termid is -1 //m_termFreqs [n] = sum; m_termSigns [n] = '\0'; // inc the total term count m_numTerms++; } } */ // -1 means compound, -2 means unset, >= 0 means component bool Query::isCompoundTerm ( long i ) { return ( m_componentCodes[i] == -1 ); } bool Query::setQWords ( char boolFlag , bool keepAllSingles , Words &words , Phrases &phrases ) { // . break query up into Words and phrases // . because we now deal with boolean queries, we make parentheses // their own separate Word, so tell "words" we're setting a query //Words words; if ( ! words.set ( m_buf , m_bufLen, TITLEREC_CURRENT_VERSION, true, true ) ) return log("query: Had error parsing query: %s.", mstrerror(g_errno)); long numWords = words.getNumWords(); // truncate it if ( numWords > MAX_QUERY_WORDS ) { log("query: Had %li words. Max is %li. Truncating.", numWords,(long)MAX_QUERY_WORDS); numWords = MAX_QUERY_WORDS; m_truncated = true; } m_numWords = numWords; // alloc the mem if we need to (mdw left off here) long need = m_numWords * sizeof(QueryWord); // sanity check if ( m_qwords || m_qwordsAllocSize ) { char *xx = NULL; *xx = 0; } // point m_qwords to our generic buffer if it will fit // if ( need < GBUF_SIZE ) { if ( m_gnext + need < m_gbuf + GBUF_SIZE ) { m_qwords = (QueryWord *)m_gnext; m_gnext += need; } // otherwise, we must allocate memory for it else { m_qwords = (QueryWord *)mmalloc ( need , "Query4" ); if ( ! m_qwords ) return log("query: Could not allocate mem for query."); m_qwordsAllocSize = need; } // is all alpha chars in query in upper case? caps lock on? bool allUpper = true; char *p = m_buf; char *pend = m_buf + m_bufLen; for ( ; p < pend ; p += getUtf8CharSize(p) ) if ( is_alpha_utf8 ( p ) && ! is_upper_utf8 ( p ) ) { allUpper = false; break; } // . come back here from below when we detect dat query is not boolean // . 
we need to redo the bits cuz they may have been messed with below // redo: // field code we are in char fieldCode = 0; char fieldSign = 0; char *field = NULL; long fieldLen = 0; // keep track of the start of different chunks of quotes long quoteStart = -1; bool inQuotes = false; //bool inVQuotes = false; char quoteSign = 0; // the current little sign char wordSign = 0; // when reading first word in link: ... field we skip the following // words until we hit a space because we hash them all together bool ignoreTilSpace = false; // assume we're NOT a boolean query m_isBoolean = false; // used to not respect the bool operator if it is the first word bool firstWord = true; // the query processing is broken into 3 stages. // . STAGE #1 // . reset all query words to default // set all m_ignoreWord and m_ignorePhrase to IGNORE_DEFAULT // . set m_isFieldName, m_fieldCode and m_quoteStart for query words. // no field names in quotes. +title:"hey there". // set m_quoteStart to -1 if not in quotes. // . if quotes immediately follow field code's ':' then distribute // the field code to all words in the quotes // . distribute +/- signs across quotes and fields to m_wordSigns. // support -title:"hey there". // . set m_quoteStart to -1 if only one alnum word is // in quotes, what's the point of that? // . set boolean op codes (m_opcode). cannot be in quotes. // cannot have a field code. cannot have a word sign (+/-). // . set m_wordId of FIELD_LINK, _URL, _SITE, _IP fields. // m_wordId of first should be hash of the whole field value. // only set its m_ignoreWord to 0, keep it's m_ignorePhrase to DEF. // . set m_ignore of non-op codes, non-fieldname, alnum words to 0. // . set m_wordId of each non-ignored alnum word. // . STAGE #2 // . customize Bits class: // first alnum word can start phrase. // first alnum word in quotes (m_quoteStart >= 0 ) can start phrase. // connected on the right but not on the left.. can start phrase. 
// no pair across any double quote // no pair across ".." --- UNLESS in quotes! // no pair across any change of field code. // field names may not be part of any phrase or paired across. // boolean ops may not be part of any phrase or paired across. // ignored words may not be part of any phrase or paired across. // . STAGE #3 // . set phrases class w/ custom Bits class mods. // . set m_phraseId and m_rawPhraseId of all QueryWords. if phraseId // is not 0 (phrase exists) then set m_ignorePhrase to 0. // . set m_leftConnected, m_rightConnected. word you are connecting // to must not be ignored. (no field names or op codes). // ensure you are in a phrase with the connected word, too, to // really be connected. // . set m_leftPhraseStart and m_rightPhraseEnd for all // m_inQuotePhrase is not needed since if m_quoteStart is >= 0 // we MUST be in a quoted phrase! // . if word is Connected then set m_ignoreWord to IGNORE_CONNECTED. // set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0). // m_wordSign may have inherited quote or field sign. // . if word's m_quoteStart is >= 0 set m_ignoreWord to IGNORE_QUOTED // set his m_phraseSign to m_wordSign (if not 0) or '*' (if it is 0) // m_wordSign may have inherited quote or field sign. // . if one word in a phrase is negative, then set m_phraseSign to '-' // set the Bits used for making phrases from the Words class Bits bits; if ( ! bits.set ( &words, TITLEREC_CURRENT_VERSION , 0 )) return log("query: Had error processing query: %s.", mstrerror(g_errno)); long userWeight = 1; char userType = 'r'; long userWeightPhrase = 1; char userTypePhrase = 'r'; long ignorei = -1; // assume we contain no pipe operator long pi = -1; // loop over all words, these QueryWords are 1-1 with "words" for ( long i = 0 ; i < numWords && i < MAX_QUERY_WORDS ; i++ ) { // convenience var, these are 1-1 with "words" QueryWord *qw = &m_qwords[i]; // set to defaults? 
memset ( qw , 0 , sizeof(QueryWord) ); // but quotestart should be -1 qw->m_quoteStart = -1; qw->m_leftPhraseStart = -1; // assume QueryWord is ignored by default qw->m_ignoreWord = IGNORE_DEFAULT; qw->m_ignorePhrase = IGNORE_DEFAULT; // get word as a string //char *w = words.getWord(i); //long wlen = words.getWordLen(i); qw->m_word = words.getWord(i); qw->m_wordLen = words.getWordLen(i); qw->m_isPunct = words.isPunct(i); char *w = words.getWord(i); long wlen = words.getWordLen(i); // assume it is a query weight operator qw->m_queryOp = true; // ignore it? (this is for query weight operators) if ( i <= ignorei ) continue; // deal with pipe operators if ( wlen == 5 && w[0]=='P'&&w[1]=='i'&&w[2]=='i'&&w[3]=='P'&&w[4]=='E') { pi = i; qw->m_opcode = OP_PIPE; continue; } // is it the bracket operator? // " LeFtB 113 rp RiGhB " if ( wlen == 5 && w[0]=='L'&&w[1]=='e'&&w[2]=='F'&&w[3]=='t'&&w[4]=='B'&& i+4 < numWords ) { // s MUST point to a number char *s = words.getWord(i+2); long slen = words.getWordLen(i+2); // if no number, it must be // " leFtB RiGhB " or " leFtB p RiGhB " if ( ! is_digit(s[0]) ) { // phrase weight reset if ( s[0] == 'p' ) { userWeightPhrase = 1; userTypePhrase = 'r'; ignorei = i + 4; } // word reset else { userWeight = 1; userType = 'r'; ignorei = i + 2; } continue; } // get the number long val = atol2 (s, slen); // s2 MUST point to the a,r,ap,rp string char *s2 = words.getWord(i+4); // is it a phrase? if ( s2[1] == 'p' ) { userWeightPhrase = val; userTypePhrase = s2[0]; // a or r } else { userWeight = val; userType = s2[0]; // a or r } // ignore all following words up and inc. i+6 ignorei = i + 6; continue; } // assign score weight, if any for this guy qw->m_userWeight = userWeight ; qw->m_userType = userType ; qw->m_userWeightPhrase = userWeightPhrase ; qw->m_userTypePhrase = userTypePhrase ; qw->m_queryOp = false; // does word #i have a space in it? 
that will cancel fieldCode // if we were in a field bool endField = false; if ( words.hasSpace(i) && ! inQuotes ) endField = true; // TODO: fix title:" hey there" (space in quotes is ok) // if there's a quote before the first space then // it's ok!!! if ( endField ) { char *s = words.m_words[i]; char *send = s + words.m_wordLens[i]; for ( ; s < send ; s++ ) { // if the space is inside the quotes then it // doesn't count! if ( *s == '\"' ) { endField = false; break;} if ( is_wspace_a(*s) ) break; } } // cancel the field if we hit a space (not in quotes) if ( endField ) { // cancel the field fieldCode = 0; fieldLen = 0; field = NULL; // we no longer have to ignore for link: et al ignoreTilSpace = false; } // . maintain inQuotes and quoteStart // . quoteStart is the word # that starts the current quote long nq = words.getNumQuotes(i) ; if ( nq > 0 ) { // && ! ignoreQuotes ) { // toggle quotes if we need to if ( nq & 0x01 ) inQuotes = ! inQuotes; // set quote sign to sign before the quote if ( inQuotes ) { quoteSign = '\0'; for ( char *p = w + wlen - 1 ; p > w ; p--){ if ( *p != '\"' ) continue; if ( *(p-1) == '-' ) quoteSign = '-'; if ( *(p-1) == '+' ) quoteSign = '+'; break; } } // . quoteStart is the word # the quotes started at // . it is -1 if not in quotes // . now we set it to the alnum word AFTER us!! if ( inQuotes && i+1< numWords ) quoteStart = i+1; else quoteStart = -1; } //log(LOG_DEBUG, "Query: nq: %ld inQuotes: %d,quoteStart: %ld", // nq, inQuotes, quoteStart); // does word #i have a space in it? that will cancel fieldCode // if we were in a field // TODO: fix title:" hey there" (space in quotes is ok) bool cancelField = false; if ( words.hasSpace(i) && ! inQuotes ) cancelField = true; // BUT if we have a quote, and they just got turned off, // and the space is not after the quote, do not cancel field! 
if ( nq == 1 && cancelField ) { // if we hit the space BEFORE the quote, do NOT cancel // the field for ( char *p = w + wlen - 1 ; p > w ; p--) { // hey, we got the quote first, keep field if ( *p == '\"' ) {cancelField = false; break;} // otherwise, we got space first? cancel it! if ( is_wspace_a(*p) ) break; } } if ( cancelField ) { // cancel the field fieldCode = 0; fieldLen = 0; field = NULL; // we no longer have to ignore for link: et al ignoreTilSpace = false; } // skip if we should if ( ignoreTilSpace ){ if (m_qwords[i-1].m_fieldCode){ qw->m_fieldCode = m_qwords[i-1].m_fieldCode; } continue; } // . is this word potentially a field? // . it cannot be another field name in a field if ( i < (m_numWords-2) && w[wlen] == ':' && ! is_wspace_utf8(w+wlen+1) && //w[wlen+1] != '/' && // as in http:// (! is_punct_utf8(w+wlen+1) || w[wlen+1]=='\"' || // for gblatrange2:-106.940994to-106.361282 w[wlen+1]=='-') && ! fieldCode && ! inQuotes ) { // field name may have started before though if it // was a compound field name containing hyphens, // underscores or periods long j = i-1 ; while ( j > 0 && ((m_qwords[j].m_rawWordId != 0) || ( m_qwords[j].m_wordLen ==1 && ((m_qwords[j].m_word)[0]=='-' || (m_qwords[j].m_word)[0]=='_' || (m_qwords[j].m_word)[0]=='.')))) { j--; } if ( j < 0 ) { //log(LOG_LOGIC,"query: query: bad " //"engineer."); j = 0; } // advance j to a non-punct word while (words.isPunct(j)) j++; // ignore all of these words then, // they're part of field name long tlen = 0; for ( long k = j ; k <= i ; k++ ) tlen += words.getWordLen(k); // set field name to the compound name if it is field = words.getWord (j); fieldLen = tlen; if ( j == i ) fieldSign = wordSign; else fieldSign = m_qwords[j].m_wordSign; // debug msg //char ttt[128]; //memcpy ( ttt , field , fieldLen ); //ttt[fieldLen] = '\0'; //log("field name = %s", ttt); // . is it recognized field name,like "title" or "url"? // . does it officially end in a colon? incl. in hash? 
bool hasColon; fieldCode = getFieldCode (field, fieldLen, &hasColon) ; // only url,link,site,ip and suburl field names will // end a colon, due to historical fuck up //if ( hasColon ){ // fieldLen++; //} // reassign alias fields //Why??? -p //if ( fieldCode == FIELD_TYPE ) { // field = "type" ; fieldLen = 4; } // if so, it does NOT get its own QueryWord, // but its sign can be inherited by its members if ( fieldCode ) { for ( long k = j ; k <= i ; k++ ) m_qwords[k].m_ignoreWord = IGNORE_FIELDNAME; continue; } } // what quote chunk are we in? this is 0 if we're not in quotes if ( inQuotes ) qw->m_quoteStart = quoteStart ; else qw->m_quoteStart = -1; qw->m_inQuotes = inQuotes; // ptr to field, if any qw->m_fieldCode = fieldCode; // if we are a punct word, see if we end in a sign that can // be applied to the next word, a non-punct word if ( words.isPunct(i) ) { wordSign = w[wlen-1]; if ( wordSign != '-' && wordSign != '+') wordSign = 0; if ( wlen>1 &&!is_wspace_a (w[wlen-2]) ) wordSign = 0; if ( i > 0 && wlen == 1 ) wordSign = 0; } // assign quoteSign to wordSign if we just got into quotes //if ( nq > 0 && inQuotes ) quoteSign = wordSign; // don't add any QueryWord for a punctuation word if ( words.isPunct(i) ) continue; // what is the sign of our term? +, -, *, ... char mysign; if ( fieldCode ) mysign = fieldSign; else if ( inQuotes ) mysign = quoteSign; else mysign = wordSign; // are we doing default AND? //if ( forcePlus && ! *mysign ) mysign = '+'; // store the sign qw->m_wordSign = mysign; // what quote chunk are we in? this is 0 if we're not in quotes if ( inQuotes ) qw->m_quoteStart = quoteStart ; else qw->m_quoteStart = -1; // if we're the first alnum in this quote and // the next word has a quote, then we're just a single word // in quotes which is silly, so undo it. But we should // still inherit any quoteSign, however. Be sure to also // set m_inQuotes to false so Matches.cpp::matchWord() works. 
if ( i == quoteStart ) { // + 1 ) { if ( i + 1 >= numWords || words.getNumQuotes(i+1)>0 ) { qw->m_quoteStart = -1; qw->m_inQuotes = false; } } // . get prefix hash of collection name and field // . but first convert field to lower case unsigned long long ph; long fflen = fieldLen; if ( fflen > 62 ) fflen = 62; char ff[64]; to_lower3_a ( field , fflen , ff ); //unsigned longlongph=getPrefixHash(m_coll,m_collLen,ff,fflen); //ph=getPrefixHash(NULL,0,ff,fflen); ph = hash64 ( ff , fflen ); // map "intitle" map to "title" if ( fieldCode == FIELD_TITLE ) ph = hash64 ( "title", 5 ); // make "suburl" map to "inurl" if ( fieldCode == FIELD_SUBURL ) ph = hash64 ( "inurl", 5 ); // ptr to field, if any qw->m_fieldCode = fieldCode; // prefix hash qw->m_prefixHash = ph; // set this flag if ( fieldCode == FIELD_LINKS ) m_hasLinksOperator = true; if ( fieldCode == FIELD_SITELINK ) m_hasLinksOperator = true; // if we're hashing a url:, link:, site: or ip: term, // then we need to hash ALL up to the first space if ( fieldCode == FIELD_URL || fieldCode == FIELD_EXT || fieldCode == FIELD_LINK || fieldCode == FIELD_ILINK|| fieldCode == FIELD_SITELINK|| fieldCode == FIELD_LINKS|| fieldCode == FIELD_SITE || fieldCode == FIELD_IP || fieldCode == FIELD_ISCLEAN || fieldCode == FIELD_QUOTA || fieldCode == FIELD_GBSORTBY || fieldCode == FIELD_GBREVSORTBY || fieldCode == FIELD_GBNUMBERMIN || fieldCode == FIELD_GBNUMBERMAX || fieldCode == FIELD_GBAD ) { // find first space -- that terminates the field value char *end = (words.m_words[words.m_numWords-1] + words.m_wordLens[words.m_numWords-1]); while ( w+wlen < end && ! 
is_wspace_utf8(w+wlen) ) wlen++; // ignore following words until we hit a space ignoreTilSpace = true; // the hash unsigned long long wid = hash64 ( w , wlen, 0LL ); // i've decided not to make // gbsortby:products.offerPrice case sensitive if ( fieldCode == FIELD_GBSORTBY || fieldCode == FIELD_GBREVSORTBY || fieldCode == FIELD_GBNUMBERMIN || fieldCode == FIELD_GBNUMBERMAX ) wid = hash64Lower_utf8 ( w , wlen , 0LL ); // should we have normalized before hashing? if ( fieldCode == FIELD_URL || fieldCode == FIELD_LINK || fieldCode == FIELD_ILINK || fieldCode == FIELD_SITELINK || fieldCode == FIELD_LINKS || fieldCode == FIELD_SITE ) { Url url; // do we add www? bool addwww = false; if ( fieldCode == FIELD_LINK ) addwww = true; if ( fieldCode == FIELD_ILINK) addwww = true; if ( fieldCode == FIELD_LINKS) addwww = true; if ( fieldCode == FIELD_URL ) addwww = true; if ( fieldCode == FIELD_SITELINK) addwww = true; url.set ( w , wlen , addwww ); char *site = url.getHost(); long siteLen = url.getHostLen(); if (fieldCode == FIELD_SITELINK) wid = hash64 ( site , siteLen ); else wid = hash64 ( url.getUrl(), url.getUrlLen() ); } //qw->m_wordId = g_indexdb.getTermId ( ph , wid ); // like we do it in XmlDoc.cpp's hashString() if ( ph ) qw->m_wordId = hash64h ( wid , ph ); else qw->m_wordId = wid; qw->m_rawWordId = 0LL; // only for highlighting? qw->m_phraseId = 0LL; qw->m_rawPhraseId = 0LL; qw->m_opcode = 0; // definitely not a query stop word qw->m_isQueryStopWord = false; // do not ignore the wordId qw->m_ignoreWord = 0; // override the word length //qw->m_wordLen = ulen * 2; // we are the first word? firstWord = false; // we're done with this one continue; } char opcode = 0; // if query is all in upper case and we're doing boolean // DETECT, then assume not boolean if ( allUpper && boolFlag == 2 ) boolFlag = 0; // . having the UOR opcode does not mean we are boolean because // we want to keep it fast. // . 
we need to set this opcode so the UOR logic in setQTerms() // works, because it checks the m_opcode value. otherwise // Msg20 won't think we are a boolean query and set boolFlag // to 0 when setting the query for summary generation and // will not recognize the UOR word as being an operator if ( wlen==3 && w[0]=='U' && w[1]=='O' && w[2]=='R' && ! firstWord ) { opcode = OP_UOR; m_hasUOR = true; goto skipin; } // . is this word a boolean operator? // . cannot be in quotes or field if ( boolFlag >= 1 && ! inQuotes && ! fieldCode ) { // are we an operator? if ( ! firstWord && wlen==2 && w[0]=='O' && w[1]=='R') opcode = OP_OR; else if ( ! firstWord && wlen==3 && w[0]=='A' && w[1]=='N' && w[2]=='D') opcode = OP_AND; else if ( ! firstWord && wlen==3 && w[0]=='N' && w[1]=='O' && w[2]=='T') opcode = OP_NOT; else if ( wlen==5 && w[0]=='L' && w[1]=='e' && w[2]=='F' && w[3]=='t' && w[4]=='P' ) opcode = OP_LEFTPAREN; else if ( wlen==5 && w[0]=='R' && w[1]=='i' && w[2]=='G' && w[3]=='h' && w[4]=='P' ) opcode = OP_RIGHTPAREN; skipin: // if we are detecting if query is boolean or not AND // if we are not an operator and have more than 1 cap // char then the turn off boolean //if ( boolFlag==2 &&!opcode &&wlen>1&&is_upper(w[1])){ // // turn boolean stuff off // boolFlag = 0; // // start again from the top with NO boolean // goto redo; //} // no pair across or even include any boolean op phrs if ( opcode ) { bits.m_bits[i] &= ~D_CAN_START_PHRASE; bits.m_bits[i] &= ~D_CAN_PAIR_ACROSS; bits.m_bits[i] &= ~D_CAN_BE_IN_PHRASE; qw->m_ignoreWord = IGNORE_BOOLOP; qw->m_opcode = opcode; if ( opcode == OP_LEFTPAREN ) continue; if ( opcode == OP_RIGHTPAREN ) continue; // if this is uncommented all of our operators // become actual query terms (mdw) if ( opcode == OP_UOR ) continue; // if you just have ANDs and ()'s that does // not make you a boolean query! we are bool // by default!! if ( opcode == OP_AND ) continue; m_isBoolean = true; continue; } } // . add single-word term id // . 
this is computed by hash64AsciiLower() // . but only hash64Lower_a if _HASHWITHACCENTS_ is true unsigned long long wid = 0LL; if (fieldCode == FIELD_CHARSET){ // find first space -- that terminates the field value char* end = (words.m_words[words.m_numWords-1] + words.m_wordLens[words.m_numWords-1]); while ( w+wlenm_rawWordId = wid; // we now have a first word already set firstWord = false; // . are we a QUERY stop word? // . NEVER count as stop word if it's in all CAPS and // not all letters in the whole query is NOT in all CAPS // . It's probably an acronym if ( words.isUpper(i) && words.getWordLen(i)>1 && ! allUpper ){ qw->m_isQueryStopWord = false; qw->m_isStopWord = false; } else { qw->m_isQueryStopWord =::isQueryStopWord (w,wlen,wid); // . BUT, if it is a single letter contraction thing // . ninad: make this == 1 if in utf8! TODO!! it is! if ( wlen == 1 && w[-1] == '\'' ) qw->m_isQueryStopWord = true; qw->m_isStopWord =::isStopWord (w,wlen,wid); } // . do not count as query stop word if it is the last in query // . like the query: 'baby names that start with j' if ( i + 2 > numWords ) qw->m_isQueryStopWord = false; // hash the termid //qw->m_wordId = g_indexdb.getTermId ( ph , wid ); // like we do it in XmlDoc.cpp's hashString() if ( ph ) qw->m_wordId = hash64 ( wid , ph ); else qw->m_wordId = wid; // do not ignore the word qw->m_ignoreWord = 0; } // pipe those that should be piped for ( long i = 0 ; i < pi ; i++ ) m_qwords[i].m_piped = true; if ( pi >= 0 ) m_piped = true; // . set m_leftConnected and m_rightConnected // . we are connected to the first non-punct word on our left // if we are separated by a small $ of defined punctuation // . see getIsConnection() for that definition // . this allows us to just lookup the phrase for things like // "cd-rom" rather than lookup "cd" , "rom" and "cd-rom" // . skip if prev word is IGNORE_BOOLOP, IGNORE_FIELDNAME or // IGNORE_DEFAULT // . 
we have to set outside the main loop above since we check // the m_ignoreWord member of the i+2nd word for ( long i = 0 ; i < numWords ; i++ ) { QueryWord *qw = &m_qwords[i]; if ( qw->m_ignoreWord ) continue; if ( i + 2 < numWords && ! m_qwords[i+2].m_ignoreWord&& isConnection(words.getWord(i+1),words.getWordLen(i+1)) ) qw->m_rightConnected = true; if ( i - 2 >= 0 && ! m_qwords[i-2].m_ignoreWord && isConnection(words.getWord(i-1),words.getWordLen(i-1) ) ) qw->m_leftConnected = true; } // now modify the Bits class before generating phrases for ( long i = 0 ; i < numWords ; i++ ) { // get default bits unsigned char b = bits.m_bits[i]; // allow pairing across anything by default b |= D_CAN_PAIR_ACROSS; // get Query Word QueryWord *qw = &m_qwords[i]; // . skip if part of a query weight operator // . cannot be in a phrase, or anything if ( qw->m_queryOp && !qw->m_opcode) { b = D_CAN_PAIR_ACROSS; } // is this word a sequence of punctuation and spaces? else if ( words.isPunct(i) ) { // pair across ANY punct, even double spaces by default b |= D_CAN_PAIR_ACROSS; // but do not pair across anything with a quote in it if ( words.getNumQuotes(i) >0) b &= ~D_CAN_PAIR_ACROSS; // continue if we're in quotes else if ( qw->m_quoteStart >= 0 ) goto next; // continue if we're in a field else if ( qw->m_fieldCode > 0 ) goto next; // if guy on left is in field, do not pair across if ( i > 0 && m_qwords[i-1].m_fieldCode > 0 ) b &= ~D_CAN_PAIR_ACROSS; // or if guy on right in field if ( i +1 < numWords && m_qwords[i+1].m_fieldCode > 0 ) b &= ~D_CAN_PAIR_ACROSS; // do not pair across ".." when not in quotes/field char *w = words.getWord (i); long wlen = words.getWordLen(i); for ( long j = 0 ; j < wlen-1 ; j++ ) { if ( w[j ]!='.' ) continue; if ( w[j+1]!='.' ) continue; b &= ~D_CAN_PAIR_ACROSS; break; } } else { // . not even capped query stop words can start phrase // . 'Mice And Men' is just one phrase then // . 
TODO: "12345678 it was rainy" // ("it" should start a phrase) //if ( qw->m_isQueryStopWord) b &= ~D_CAN_START_PHRASE; if ( qw->m_isStopWord ) b &= ~D_CAN_START_PHRASE; // . first alnum word can start phrase. // . example: 'the tigers' if ( i <= 1 ) b |= D_CAN_START_PHRASE; // first alnum word in quotes can start phrase. if ( qw->m_quoteStart == i ) // + 1 ) b |= D_CAN_START_PHRASE; // . right connected but not left can start phrase // . example: 'buy a-rom' , 'buy i-phone' if ( qw->m_rightConnected && ! qw->m_leftConnected ) b |= D_CAN_START_PHRASE; // . no field names, bool operators, cruft in fields // can be any part of a phrase // . no pair across any change of field code // . 'girl title:boy' --> no "girl title" phrase! if ( qw->m_ignoreWord ) { //== IGNORE_FIELDNAME ) { b &= ~D_CAN_PAIR_ACROSS; b &= ~D_CAN_BE_IN_PHRASE; b &= ~D_CAN_START_PHRASE; } // . no boolean ops // . 'this OR that' --> no "this OR that" phrase if ( qw->m_opcode ) { b &= ~D_CAN_PAIR_ACROSS; b &= ~D_CAN_BE_IN_PHRASE; } if ( qw->m_wordSign == '-' && qw->m_quoteStart < 0) { b &= ~D_CAN_PAIR_ACROSS; b &= ~D_CAN_BE_IN_PHRASE; } } next: // set it back all tweaked bits.m_bits[i] = b; } // . now since we may have prevented pairing across certain things // we need to set D_CAN_START_PHRASE for stop words whose left // punct word can no longer be paired across // . "dancing in the rain" is fun --> will include phrase "is fun". // . title:"is it right"? --> will include phrase "is it" for ( long i = 1 ; i < numWords ; i++ ) { // no punct, alnum only if ( words.isPunct(i) ) continue; // skip if not a stop word if ( ! 
bits.m_bits[i] & D_IS_STOPWORD ) continue; // continue if you can still pair across prev punct word if ( bits.m_bits[i-1] & D_CAN_PAIR_ACROSS ) continue; // otherwise, we can now start a phrase bits.m_bits[i] |= D_CAN_START_PHRASE; } // a bogus spam class, all words have 0 for their spam probability //Spam spam; //spam.reset ( words.getNumWords() ); // make the phrases from the words and the tweaked Bits class //Phrases phrases; if ( ! phrases.set ( &words , &bits , //NULL , true , // use stop words? false , // use stems? TITLEREC_CURRENT_VERSION, 0 /*niceness*/))//disallows HUGE phrases return false; long long *wids = words.getWordIds(); // do phrases stuff for ( long i = 0 ; i < numWords ; i++ ) { // get the ith QueryWord QueryWord *qw = &m_qwords[i]; // if word is ignored because it is opcode, or whatever, // it cannot start a phrase // THIS IS BROKEN //if ( qw->m_queryOp && qw->m_opcode == OP_PIPE){ // for (long j = i-1;j>=0;j--){ // if (!m_qwords[j].m_phraseId) continue; // m_qwords[j].m_ignorePhrase = IGNORE_BOOLOP; // break; // } // //} if ( qw->m_ignoreWord ) continue; if ( qw->m_fieldCode && qw->m_quoteStart < 0) continue; // get the first word # to our left that starts a phrase // of which we are a member qw->m_leftPhraseStart = -1; //long long tmp; for ( long j = i - 1 ; j >= 0 ; j-- ) { //if ( ! bits.isIndexable(j) ) continue; if ( ! bits.canPairAcross(j+1) ) break; //if ( ! bits.canStartPhrase(j) ) continue; if ( ! wids[j] ) continue; // phrases.getNumWordsInPhrase() //if( j + phrases.getMaxWordsInPhrase(j,&tmp)m_leftPhraseStart = j; // we can't pair across alnum words now, we just want bigrams if ( wids[j] ) break; //break; // now we do bigrams so only allow two words even // if they are stop words break; } // . is this word in a quoted phrase? // . the whole phrase must be in the same set of quotes // . 
if we're in a left phrase, he must be in our quotes if ( qw->m_leftPhraseStart >= 0 && qw->m_quoteStart >= 0 && qw->m_leftPhraseStart >= qw->m_quoteStart ) qw->m_inQuotedPhrase = true; // if we start a phrase, ensure next guy is in our quote if ( ! qw->m_ignorePhrase && i+1 < numWords && m_qwords[i+1].m_quoteStart >= 0 && m_qwords[i+1].m_quoteStart <= i ) qw->m_inQuotedPhrase = true; // are we the first word in the quote? if ( i-1>=0 && qw->m_quoteStart == i ) qw->m_inQuotedPhrase = true; // ignore single words that are in a quoted phrase if ( ! keepAllSingles && qw->m_inQuotedPhrase ) qw->m_ignoreWord = IGNORE_QUOTED; // . get phrase info for this term // . a pid (phraseId)of 0 indicates it does not start a phrase // . raw phrase termId //unsigned long long pid = phrases.getPhraseId(i); unsigned long long pid = 0LL; // nwp is a REGULAR WORD COUNT!! long nwp = 0; if ( qw->m_inQuotedPhrase ) // keep at a bigram for now... i'm not sure if we // will be indexing trigrams nwp = phrases.getMinWordsInPhrase(i,(long long *)&pid); // just get a two-word phrase term if not in quotes else nwp = phrases.getMinWordsInPhrase(i,(long long *)&pid); // store it qw->m_rawPhraseId = pid; // does word #i start a phrase? if ( pid != 0 ) { unsigned long long ph = qw->m_prefixHash ; // store the phrase id with coll/prefix //qw->m_phraseId = g_indexdb.getTermId ( ph , pid ); // like we do it in XmlDoc.cpp's hashString() if ( ph ) qw->m_phraseId = hash64 ( pid , ph ); else qw->m_phraseId = pid; // how many regular words long is the bigram? long plen2; phrases.getPhrase ( i , &plen2 ,2); // the trigram? 
long plen3; phrases.getPhrase ( i , &plen3 ,3); // get just the bigram for now qw->m_phraseLen = plen2; // do not ignore the phrase, it's valid qw->m_ignorePhrase = 0; // set our rightPhraseEnd point //qw->m_rightPhraseEnd = i + phrases.getNumWords(i); // leave it as 0 if it got truncated i guess by the // MAX_QUERY_WORDS of 320 qw->m_rightRawWordId = 0LL; // store left and right raw word ids long ni = i + nwp - 1; if ( ni < m_numWords ) qw->m_rightRawWordId=m_qwords[ni].m_rawWordId; } // . phrase sign is inherited from word's sign if it's a minus // . word sign is inherited from field, quote or right before // the word // . that is, all words in -"to be or not" will have a '-' sign // . phraseId may or may not be 0 at this point if ( qw->m_wordSign == '-' ) qw->m_phraseSign = '-'; // . dist word signs to others in the same connected string // . use "-cd-rom x-box" w/ no connector in between // . test queries: // . +cd-rom +x-box // . -cd-rom +x-box // . -m-o-n // . who was the first (was is a query stop word) // . www.xxx.com // . welcome to har.com // . hezekiah walker the love family affair ii live at radio // city music hall // . fotostudio +m-o-n-a-r-t // . fotostudio -m-o-n-a-r-t // . i'm home if ( qw->m_leftConnected && qw->m_leftPhraseStart >= 0 ) qw->m_wordSign = m_qwords[i-2].m_wordSign; // . if we connected to the alnum word on our right then // soft require the phrase (i.e. treat like a single term) // . example: cd-rom or www.xxx.com // . 'welcome to har.com' should get a '*' for "har.com" sign if ( qw->m_rightConnected ) { if ( qw->m_wordSign) qw->m_phraseSign = qw->m_wordSign; else qw->m_phraseSign = '*'; } // . if we're in quotes then any phrase we have should be // soft required (i.e. treated like a single term) // . we do not allow phrases in queries to pair across // quotes. See where we tweak the Bits class above. 
if ( qw->m_quoteStart >= 0 ) { //if (qw->m_wordSign)qw->m_phraseSign = qw->m_wordSign; //else qw->m_phraseSign = '*'; qw->m_phraseSign = '*'; } // . if we are the last word in a phrase that consists of all // PLAIN stop words then make the phrase have a '*' // . 'to be or not to be .. test' (cannot pair across "..") // . don't use QUERY stop words cuz of "who was the first?" qry if ( pid ) { long nw = phrases.getNumWordsInPhrase2(i); long j; // search up to this far long maxj = i + nw; // but not past our truncated limit if ( maxj > MAX_QUERY_WORDS ) maxj = MAX_QUERY_WORDS; for ( j = i ; j < maxj ; j++ ) { // skip punct if ( words.isPunct(j) ) continue; // break out if not a stop word if ( ! bits.isStopWord(j) ) break; // break out if has a term sign if ( m_qwords[j].m_wordSign ) break; } // if everybody in phrase #i was a signless stopword // and the phrase was signless, make it have a '*' sign if ( j >= maxj && m_qwords[i].m_phraseSign == '\0' ) m_qwords[i].m_phraseSign = '*'; // . if a constituent has a - sign, then the whole // phrase becomes negative, too // . fixes 'apple -computer' truncation problem for ( long j = i ; j < maxj ; j++ ) if ( m_qwords[j].m_wordSign == '-' ) qw->m_phraseSign = '-'; } // . ignore unsigned QUERY stop words that are not yet ignored // and are in unignored phrases // . 'who was the first taiwanese president' should not get // "who was" term sign changed to '*' because "was" is a // QUERY stop word. So ignore singles query stop words // in phrases now if ( //! keepAllSingles && (qw->m_isQueryStopWord && !m_isBoolean) && m_useQueryStopWords && ! qw->m_fieldCode && // fix 'the tigers' //(qw->m_leftPhraseStart >= 0 || qw->m_phraseId > 0 ) && ! qw->m_wordSign && ! qw->m_ignoreWord ) qw->m_ignoreWord = IGNORE_QSTOP; // . ignore word if connected to right or left alnum word // . we will be replaced by a phrase(s) // . do not worry about keepAllSingles because we turn // this into a phrase below! // . if ( ! 
keepAllSingles && if ( ( qw->m_leftConnected || qw->m_rightConnected ) ) qw->m_ignoreWord = IGNORE_CONNECTED; // . ignore and/or between quoted phrases, save user from // themselves (they meant AND/OR) if ( ! keepAllSingles && qw->m_isQueryStopWord && ! qw->m_fieldCode && m_useQueryStopWords && ! qw->m_phraseId && ! qw->m_inQuotes && ((qw->m_wordId == 255176654160863LL) || (qw->m_wordId == 46196171999655LL)) ) qw->m_ignoreWord = IGNORE_QSTOP; // . ignore repeated single words and phrases // . look at the old termIds for this, too // . should ignore 2nd 'time' in 'time after time' then // . but boolean queries often need to repeat terms // . NEW - words much be same sign and not in different // . quoted phrases to be ignored -partap m_hasDupWords = false; if ( ! m_isBoolean && !qw->m_ignoreWord ) { for ( long j = 0 ; j < i ; j++ ) { if ( m_qwords[j].m_ignoreWord ) continue; if ( m_qwords[j].m_wordId == qw->m_wordId && m_qwords[j].m_wordSign ==qw->m_wordSign && (!keepAllSingles || (m_qwords[j].m_quoteStart == qw->m_quoteStart))){ qw->m_ignoreWord = IGNORE_REPEAT; m_hasDupWords = true; } } } if ( ! m_isBoolean && !qw->m_ignorePhrase ) { // ignore repeated phrases too! 
for ( long j = 0 ; j < i ; j++ ) { if ( m_qwords[j].m_ignorePhrase ) continue; if ( m_qwords[j].m_phraseId == qw->m_phraseId && m_qwords[j].m_phraseSign == qw->m_phraseSign) qw->m_ignorePhrase = IGNORE_REPEAT; } } } // treat strongly connected phrases like cd-rom and 3.2.0.3 as being // in quotes for the most part, therefore, set m_quoteStart for them long j; long qs = -1; for ( j = 0 ; j < numWords ; j++ ) { // skip all but strongly connected words if ( m_qwords[j].m_ignoreWord != IGNORE_CONNECTED && // must also be non punct word OR a space ( !words.isPunct(j) || words.m_words[j][0] == ' ' ) ) { // break the "quote", if any qs = -1; continue; } // if he is punctuation and qs is -1, skip him, // punctuation words can no longer start a quote if ( words.isPunct(j) && qs == -1 ) continue; // uningore him if we should if ( keepAllSingles ) m_qwords[j].m_ignoreWord = 0; // if already in quotes, don't bother! if ( m_qwords[j].m_quoteStart >= 0 ) continue; // remember him if ( qs == -1 ) qs = j; // he starts the phrase m_qwords[j].m_quoteStart = qs; // force him into a quoted phrase m_qwords[j].m_inQuotes = true; //m_qwords[j].m_inQuotedPhrase = true; } // . if we only have one quoted query then force its sign to be '+' // . '"get the phrase" the' --> +"get the phrase" (last the is ignored) // . "time enough for love" --> +"time enough" +"enough for love" // . if all unignored words are in the same set of quotes then change // all '*' (soft-required) phrase signs to '+' for ( j= 0 ; j < numWords ; j++ ) { if ( words.isPunct(j)) continue; if ( m_qwords[j].m_quoteStart < 0 ) break; if ( m_qwords[j].m_ignoreWord ) continue; if ( j < 2 ) continue; if ( m_qwords[j-2].m_quoteStart != m_qwords[j].m_quoteStart ) break; } if ( j >= numWords ) { for ( j= 0 ; j < numWords ; j++ ) { if ( m_qwords[j].m_phraseSign == '*' ) m_qwords[j].m_phraseSign = '+'; } } // . force a plus on any site: or ip: query terms // . 
also disable site clustering if we have either of these terms for ( long i = 0 ; i < m_numWords ; i++ ) { QueryWord *qw = &m_qwords[i]; if ( qw->m_ignoreWord ) continue; if ( qw->m_wordSign ) continue; if ( qw->m_fieldCode != FIELD_SITE && qw->m_fieldCode != FIELD_IP ) continue; qw->m_wordSign = '+'; } // now check phrase terms. if you do a search in quotes like // "directions and nearby" it will now generate two phrases: // "directions and nearby" and "and nearby", so stop "and nearby" /* for ( long i = 0 ; i < m_numWords ; i++ ) { QueryWord *qw = &m_qwords[i]; if ( qw->m_ignorePhrase ) continue; if ( ! qw->m_phraseId ) continue; // skip if we start this phrase if ( qw->m_quoteStart == i ) continue; // . skip if are not a phrase stop word that is paired across // . not now, we support 3,4 and 5 word phrases... //if ( ! qw->m_isStopWord ) continue; // however, we some quoted phrases are more than 5 words // TODO: fix this!!! // ok, nuke this term otherwise qw->m_ignorePhrase = IGNORE_DEFAULT; } */ // . if one or more of a phrase's constituent terms exceeded // term #MAX_QUERY_TERMS then we should also soft require that phrase // . fixes 'hezekiah walker the love family affair ii live at // radio city music hall' // . how many non-ignored phrases? long count = 0; for ( long i = 0 ; i < m_numWords ; i++ ) { QueryWord *qw = &m_qwords[i]; if ( qw->m_ignorePhrase ) continue; if ( ! qw->m_phraseId ) continue; count++; } for ( long i = 0 ; i < numWords ; i++ ) { QueryWord *qw = &m_qwords[i]; // count non-ignored words if ( qw->m_ignoreWord ) continue; // if under limit, continue if ( count++ < MAX_QUERY_TERMS ) continue; // . otherwise, ignore // . if we set this for our UOR'ed terms from SearchInput.cpp's // UOR'ed facebook interests then it causes us to get no results! 
// so make sure that MAX_QUERY_TERMS is big enough with respect to // the opCount in SearchInput.cpp qw->m_ignoreWord = IGNORE_BREECH; // left phrase should get a '*' long left = qw->m_leftPhraseStart; if ( left >= 0 && ! m_qwords[left].m_phraseSign ) m_qwords[left].m_phraseSign = '*'; // our phrase should get a '*' if ( qw->m_phraseId && ! qw->m_phraseSign ) qw->m_phraseSign = '*'; } // . fix the 'x -50a' query so it returns results // . how many non-negative, non-ignored words/phrases do we have? count = 0; for ( long i = 0 ; i < m_numWords ; i++ ) { QueryWord *qw = &m_qwords[i]; if ( qw->m_ignoreWord ) continue; if ( qw->m_wordSign == '-' ) continue; count++; } for ( long i = 0 ; i < m_numWords ; i++ ) { QueryWord *qw = &m_qwords[i]; if ( qw->m_ignorePhrase ) continue; if ( qw->m_phraseSign == '-' ) continue; if ( qw->m_phraseId == 0LL ) continue; count++; } // if everybody is ignored or negative UNignore first query stop word if ( count == 0 ) { for ( long i = 0 ; i < m_numWords ; i++ ) { QueryWord *qw = &m_qwords[i]; if ( qw->m_ignoreWord != IGNORE_QSTOP ) continue; qw->m_ignoreWord = 0; count++; break; } } // . count ignored WORDS for logging stats // . do not IGNORE_DEFAULT though, that doesn't really count //m_numIgnored = 0; //for ( long i = 0 ; i < m_numWords ; i++ ) { // if ( ! m_qwords[i].m_ignoreWord ) continue; // if ( m_qwords[i].m_ignoreWord == IGNORE_DEFAULT ) continue; // m_numIgnored++; //} quoteStart = -1; long quoteEnd = -1; // set m_quoteENd for ( long i = m_numWords - 1 ; i >= 0 ; i-- ) { // get ith word QueryWord *qw = &m_qwords[i]; // skip if ignored if ( qw->m_ignoreWord ) continue; // skip if not in quotes if ( qw->m_quoteStart < 0 ) continue; // if match previous guy... 
if ( qw->m_quoteStart == quoteStart ) { // inherit the end qw->m_quoteEnd = quoteEnd; // all done continue; } // ok, we are the end then quoteEnd = i; quoteStart = qw->m_quoteStart; } long wkid = 0; long upTo = -1; long wk_start; long wk_nwk; //long long *wids = words.getWordIds(); // // set the wiki phrase ids // for ( long i = 0 ; i < m_numWords ; i++ ) { // get ith word QueryWord *qw = &m_qwords[i]; // in a phrase from before? if ( i < upTo ) { qw->m_wikiPhraseId = wkid; qw->m_wikiPhraseStart = wk_start; qw->m_numWordsInWikiPhrase = wk_nwk; continue; } // assume none qw->m_wikiPhraseId = 0; // skip if punct if ( ! wids[i] ) continue; // get word long nwk ; nwk = g_wiki.getNumWordsInWikiPhrase ( i , &words ); // bail if none if ( nwk <= 1 ) continue; // save these too wk_start = i; wk_nwk = nwk; // inc it wkid++; // store it qw->m_wikiPhraseId = wkid; qw->m_wikiPhraseStart = wk_start; qw->m_numWordsInWikiPhrase = wk_nwk; // set loop parm upTo = i + nwk; } // all done return true; } // return -1 if does not exist in query, otherwise return the query word num long Query::getWordNum ( long long wordId ) { // skip if punct or whatever if ( wordId == 0LL || wordId == -1LL ) return -1; for ( long i = 0 ; i < m_numWords ; i++ ) { QueryWord *qw = &m_qwords[i]; // the non-raw word id includes a hash with "0", which // signifies an empty field term if ( qw->m_rawWordId == wordId ) return i; } // otherwise, not found return -1; } //static TermTable s_table; static HashTableX s_table; static bool s_isInitialized = false; // 3rd field = m_hasColon struct QueryField g_fields[] = { {"url", FIELD_URL, true,"Match the exact url. Example: url:www.gigablast.com/addurl.htm"}, {"ext", FIELD_EXT, true,"Match the url extension. Example: ext:htm or ext:mpeg to find urls ending in .htm or .mpeg respectively."}, {"link", FIELD_LINK, true,"Match pages that link to the given url. 
Example: link:www.gigablast.com will return all pages linking to the www.gigablast.com page."}, {"links", FIELD_LINKS, true,"Same as link:."}, {"ilink", FIELD_ILINK, true,"Similar to above."}, {"sitelink", FIELD_SITELINK, true,"Matches all pages that link to the given site. Example:sitelink:www.gigablast.com matches all pages that link to some page on the www.gigablast.com site."}, {"site", FIELD_SITE, true,"Matches all pages from the given site. Example: site:www.gigablast.com will return all the pages on the gigablast site"}, {"coll", FIELD_COLL, true,"Not sure if this works."}, {"ip", FIELD_IP, true,"Matches all pages with the given ip. Example:1.2.3.4 will match all pages whose urls have that IP address."}, {"inurl", FIELD_SUBURL, true,"Matches all pages that have the given terms in the url. Example inurl:water will match all pages whose url has the word water in it, but the word must be delineated by punctuation."}, {"suburl", FIELD_SUBURL, true,"Same as inurl."}, {"intitle", FIELD_TITLE, false,"Matches all pages that have pages that have the given term in their title. Example: title:web returns all pages that have the word web in their title."}, {"title", FIELD_TITLE, false,"Same as intitle:"}, {"isclean", FIELD_ISCLEAN, true,"Matches all pages that are deemed non-offensive and safe for children."}, {"gbrss", FIELD_GBRSS, true,"Matches all pages that are rss feeds."}, //{"gbruleset",FIELD_GBRULESET, true,"Obsolete."}, {"type", FIELD_TYPE, false,"Matches all pages of the specified file type. Example: type:pdf will match pdf documents, regardless of their file extension."}, {"filetype", FIELD_TYPE, false,"Same as type:"}, {"gbisadult",FIELD_TYPE,false,"use gbisadult:0 and gbisadult:1 to restrict results to non-adult and adult documents respectively."}, {"gbtag*", FIELD_TAG, false,"Matches all pages whose tag named * have the specified value. 
Example: gbtagingoogle:1 matches all pages that have a value of 1 for their ingoogle tag in tagdb."}, {"zip", FIELD_ZIP, false,"Matches all pages that have the specified zip code in their meta zip code tag. Not to be used with events."}, {"zipcode", FIELD_ZIP, false,"Same as zip:"}, //{"range", FIELD_RANGE, false,""}, // obsolete, datedb replaced {"charset", FIELD_CHARSET, false,"Matches all pages in the given character set."}, {"urlhash",FIELD_URLHASH, false,""}, {"urlhashdiv10",FIELD_URLHASHDIV10, false,""}, {"urlhashdiv100",FIELD_URLHASHDIV100, false,""}, {"gblang",FIELD_GBLANG,false,"Matches all pages in the given language. Examples: gblang:en gblang:fr gblang:de"}, {"gbquality",FIELD_GBQUALITY,true,""}, {"gblinktextin",FIELD_LINKTEXTIN,true,""}, {"gblinktextout",FIELD_LINKTEXTOUT,true,""}, {"gbkeyword",FIELD_KEYWORD,true,""}, {"gbcharset", FIELD_CHARSET, false,""}, {"gbpathdepth", FIELD_GBOTHER, false,"the path depth of the url"}, {"gbhasfilename", FIELD_GBOTHER, false,""}, {"gbiscgi", FIELD_GBOTHER, false,""}, {"gbhasext", FIELD_GBOTHER, false,""}, {"gbsubmiturl", FIELD_GBOTHER, false,""}, {"qdom", FIELD_QUOTA, false,""}, {"qhost", FIELD_QUOTA, false,""}, {"gbtagvector", FIELD_GBTAGVECTOR, false,""}, {"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""}, {"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""}, {"gbcontenthash", FIELD_GBCONTENTHASH, false,""}, {"gbsortby", FIELD_GBSORTBY, false,""}, {"gbrevsortby", FIELD_GBREVSORTBY, false,""}, {"gbnumbermin", FIELD_GBNUMBERMIN, false,""}, {"gbnumbermax", FIELD_GBNUMBERMAX, false,""}, {"gbcountry",FIELD_GBCOUNTRY,false,""}, {"gbad",FIELD_GBAD,false,""}, {"gbsectionhash" ,FIELD_GBSECTIONHASH,false,"Internal use only."}, {"gbduphash" ,FIELD_GBOTHER,false,"Internal use only."}, {"gbsitetemplate" ,FIELD_GBOTHER,false,"Internal use only."}, {"gboutlinkedtitle" ,FIELD_GBOTHER,false,"gboutlinkedtitle:0 and gboutlinkedtitle:1 matches events whose title is not in and in a hyperlink, respectively."}, {"gbisaggregator" 
,FIELD_GBOTHER,false,"gbisaggregator:0|1 depending on if the event came from an event aggregator website, like eviesays.com."}, {"gbdeduped" ,FIELD_GBOTHER,false,""}, {"gbinjected", FIELD_GBOTHER,false,"Was the event injected?."}, //{"gbstartrange",FIELD_GBSTARTRANGE,false,""}, //{"gbendrange",FIELD_GBENDRANGE,false,""}, {"gbpermalink",FIELD_GBPERMALINK,false,""}, {"gbcsenum",FIELD_GBCSENUM,false,""}, {"gbdocid",FIELD_GBDOCID,false,"restrict results to this docid"} }; void resetQuery ( ) { s_table.reset(); } long getNumFieldCodes ( ) { return (long)sizeof(g_fields) / (long)sizeof(QueryField); } static bool initFieldTable(){ if ( ! s_isInitialized ) { // set up the hash table if ( ! s_table.set ( 8 , 4 , 255,NULL,0,false,0,"qryfldtbl" ) ) return log("build: Could not init table of " "query fields."); // now add in all the stop words long n = getNumFieldCodes(); for ( long i = 0 ; i < n ; i++ ) { long long h = hash64b ( g_fields[i].text ); // store the entity index in the hash table as score if ( ! 
s_table.addTerm ( &h, i+1 ) ) return false; } s_isInitialized = true; } return true; } char getFieldCode ( char *s , long len , bool *hasColon ) { // default if (hasColon) *hasColon = false; if (!initFieldTable()) return FIELD_GENERIC; long long h = hash64Lower_a(s, len );//>> 1) ; long i = (long) s_table.getScore ( &h ) ; if (i==0) return FIELD_GENERIC; //if (hasColon) *hasColon = g_fields[i-1].hasColon ; return g_fields[i-1].field; } char getFieldCode2 ( char *s , long len , bool *hasColon ) { // default if (hasColon) *hasColon = false; if (!initFieldTable()) return FIELD_GENERIC; // subtract the colon for matching if ( s[len-1]==':') len--; long long h = hash64 (s , len , 0LL ); long i = (long) s_table.getScore ( &h ) ; if (i==0) return FIELD_GENERIC; //if (hasColon) *hasColon = g_fields[i-1].hasColon ; return g_fields[i-1].field; } char getFieldCode3 ( long long h64 ) { if (!initFieldTable()) return FIELD_GENERIC; // subtract the colon for matching long i = (long) s_table.getScore ( &h64 ) ; if (i==0) return FIELD_GENERIC; //if (hasColon) *hasColon = g_fields[i-1].hasColon ; return g_fields[i-1].field; } // guaranteed to be punctuation bool Query::isConnection ( char *s , long len ) { if ( len == 1 ) { switch (*s) { // . only allow apostrophe if it's NOT a 's // . so contractions are ok, and names too case '\'': // no, i think we should require it. google seems to, // and msn and yahoo do. 'john's room -"john's" gives // no result son yahoo and msn. 
return true; if ( *(s+1) !='s' ) return true; return false; case ':': return true; case '-': return true; case '.': return true; case '@': return true; case '#': return true; case '/': return true; case '_': return true; case '&': return true; case '=': return true; case '\\': return true; default: return false; } return false; } //if ( len == 3 && s[0]==' ' && s[1]=='&' && s[2]==' ' ) return true; if ( len == 3 && s[0]==':' && s[1]=='/' && s[2]=='/' ) return true; return false; } void Query::printQueryTerms(){ for (long i=0;i 254 ) ttlen = 254; if ( ttlen < 0 ) ttlen = 0; // this is utf8 memcpy ( tt , getTerm(i) , ttlen ); tt[ttlen]='\0'; if ( c == '\0' ) c = ' '; logf(LOG_DEBUG, "query: Query Term #%ld " "phr=%li termId=%llu rawTermId=%llu" " sign=%c " "ebit=0x%0llx " "impBits=0x%0llx " "hc=%li " "component=%li " "otermLen=%li " "term=%s ", i, (long)isPhrase (i) , getTermId (i) , getRawTermId (i) , c , (long long)m_qterms[i].m_explicitBit , (long long)m_qterms[i].m_implicitBits , (long) m_qterms[i].m_hardCount , m_componentCodes[i], getTermLen(i), tt ); } } //////////////////////////////////////////////////////// //////////////////////////////////////////////////////// ////////// ONLY BOOLEAN STUFF BELOW HERE ///////////// //////////////////////////////////////////////////////// //////////////////////////////////////////////////////// bool Query::testBoolean(qvec_t bits, qvec_t bitmask){ if (!m_isBoolean) return false; Expression *e = &m_expressions [ 0 ]; // find top-level expression while (e->m_parent && e != e->m_parent) e = e->m_parent; return e->isTruth(bits, bitmask); } void Query::printBooleanTree(){ if (!m_isBoolean) return; Expression *e = &m_expressions [ 0 ]; // find top-level expression while (e->m_parent && e != e->m_parent) e = e->m_parent; SafeBuf sbuf(1024); e->print(&sbuf); logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart()); } // . also sets the m_underNOT member of each QueryTerm, too!! // . 
returns false and sets g_errno on error, true otherwise bool Query::setBooleanOperands ( ) { // we're done if we're not boolean if ( ! m_isBoolean ) return true; if ( m_truncated ) { g_errno = ETOOMANYOPERANDS; return log("query: Maximum number of bool operands " "exceeded (%ld).",m_numTerms); } // alloc the mem if we need to (mdw left off here) //long need = (m_numWords/3) * sizeof(Expression); // illegitmate bool expressions breech the buffer long need = (m_numWords) * sizeof(Expression); // sanity check if ( m_expressions || m_expressionsAllocSize ) { char *xx = NULL; *xx = 0; } // point m_qwords to our generic buffer if it will fit if ( m_gnext + need < m_gbuf + GBUF_SIZE ) { m_expressions = (Expression *)m_gnext; m_gnext += need; } // otherwise, we must allocate memory for it else { m_expressions = (Expression *)mmalloc ( need , "Query3" ); if ( ! m_expressions ) return log("query: Could not allocate " "expressions for query."); m_expressionsAllocSize = need; } // otherwise, we need to set the boolean Expression classes now // so we can determine which terms are UNDER the influence of // NOT operators so IndexReadInfo.cpp can read in the WHOLE termlist // for those terms. (like it would if they had a '-' m_termSign) Expression *e = &m_expressions [ 0 ]; m_numExpressions = 1; // . set the expression recursively // . just setting this will not set the m_hasNOT members of each // QueryTerm long status = e->set ( 0 , // first word # m_numWords , // last word # 0 , // parser position this , // array of QueryWords 0 ,// level NULL, NULL, // parent, leftchild false , // has NOT? false ); // under NOT? 
if ( status < 0 ) { g_errno = ETOOMANYOPERANDS; return log("query: Maximum number of bool operands " "(%li) exceeded.",(long)MAX_OPERANDS); } while (e->m_parent) { if (e == e->m_parent) { g_errno = EBADREQUEST; return log(LOG_WARN, "query: expression is own parent: " "%s", m_orig); } e = e->m_parent; } //log(LOG_DEBUG, "query: set %li operands", // m_numOperands); if (g_conf.m_logDebugQuery) { SafeBuf sbuf(1024); e->print(&sbuf); log(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart()); } // . get all the terms that are UNDER a NOT operator in some fashion // . these bits are 1-1 with m_qterms[] qvec_t notBits = e->getNOTBits( false ); for ( long i = 0 ; i < m_numTerms ; i++ ) { if ( m_qterms[i].m_explicitBit & notBits ) m_qterms[i].m_underNOT = true; else m_qterms[i].m_underNOT = false; } return true; } // . returns -1 on bad query error // . returns word AFTER the last word in our operand long Operand::set ( long a , long b , QueryWord *qwords , long level , bool underNOT ) { // clear these m_termBits = 0; m_hasNOT = false; //m_hardRequiredBits = 0; // . parse out the operands and OR in their term bits // . the boy AND girl --> (the AND boy) AND girl // . "the boy toy" AND girl --> "the boy" AND "boy toy" AND girl // . cd-rom AND buy --> "cd-rom" AND buy // . phraseSign will not be 0 if its important (in quotes, cd-rom,...) for ( long i = a ; i < b ; i++ ) { // get the QUERY word QueryWord *qw = &qwords[i]; // set the parenthetical level of the word qw->m_level = level; // set this qw->m_underNOT = underNOT; // skip punct if ( ! qw->isAlphaWord() ) { // if it is a parens, bail! 
if ( qw->m_opcode == OP_LEFTPAREN ) return i ; if ( qw->m_opcode == OP_RIGHTPAREN ) return i ; // otherwise, skip this punct and get next word else continue; } // bail if op code, return PUNCT word # before it if ( qw->m_opcode ) return i ; if ( qw->m_wordSign == '-' || qw->m_phraseSign == '-'){ if (i == a) { m_hasNOT = true; } else { if (!m_hasNOT) return i; } } else if (i>a && m_hasNOT) return i; // . does it have an unsigned phrase? or in phrase term bits // . might have a phrase that's not a QueryTerm because // query is too long if ( qw->m_phraseId && qw->m_queryPhraseTerm && qw->m_phraseSign ) { qvec_t e =qw->m_queryPhraseTerm->m_explicitBit; //if (qw->m_phraseSign == '+') m_hardRequiredBits |= e; m_termBits |= e; } // why would it be ignored? oh... if like cd-rom or in quotes if ( qw->m_ignoreWord ) continue; // . OR in the word term bits // . might be a word that's not a QueryTerm because // query is too long if ( qw->m_queryWordTerm ) { qvec_t e = qw->m_queryWordTerm->m_explicitBit; //if (qw->m_phraseSign == '+') m_hardRequiredBits |= e; m_termBits |= e; } } return b; } // . returns -1 on bad query error // . returns next word to parse (after expression) on success // . "*globalNumOperands" is how many expressions/operands are being used // in the global "expressions" and "operands" array // . new: organize query into sum of products normal form, ie: // . 
(a) OR (b AND c AND d) OR (e AND f) unsigned char precedence[] = { 0, // term 4, // OR 3, // AND 2, // NOT 1, // LEFTP 1, // RIGHTP 3, // UOR 5, // PIPE }; long Expression::set (long start, long end, long pos, // current parsing position class Query *q, long level, class Expression *parent, class Expression *leftChild, bool hasNOT , bool underNOT ) { m_start = start; m_end = end; m_opcode = 0; m_operand = NULL; m_numChildren = 0; m_hasNOT = hasNOT; m_parent = parent; uint8_t curOp = 0; QueryWord *qwords = q->m_qwords; Expression *o_expressions = q->m_expressions; Operand *o_operands = q->m_operands; long *o_numOperands = &q->m_numOperands; long *o_numExpressions = &q->m_numExpressions; long maxExpressions = q->m_numWords; // Lets really try to catch this if (m_parent == this) { //log(LOG_WARN, "query: Warning, setting expression " // "parent to self"); char *xx = NULL; *xx = 0; } //set initial args if (leftChild) { leftChild->m_parent = this; m_children[0] = leftChild; m_numChildren = 1; } hasNOT = false; for ( long i=pos ; im_underNOT = underNOT; // set leaf node if (!qw->m_opcode && qw->isAlphaWord()){ if (i > m_start) goto setChildExpr; // if we maxxed out, error out if ( *o_numOperands >= MAX_OPERANDS ) return -1; Operand *op = &o_operands [ *o_numOperands ]; *o_numOperands = *o_numOperands + 1; // . return ptr to next word for us to parse // . 
subtract once since for loop will inc it i = op->set ( i , end , qwords , level , underNOT ); if ( i < 0 ) return -1; m_operand = op; goto endExpr; } if (qw->m_opcode == OP_NOT){ hasNOT = !hasNOT; underNOT = hasNOT; continue; } else if (qw->m_opcode == OP_LEFTPAREN){ if (i == m_start) i++; goto setChildExpr; } else if (qw->m_opcode == OP_RIGHTPAREN){ goto endExpr; } else if (qw->m_opcode) { int delta = 0; curOp = qw->m_opcode; if (m_numChildren == 1) m_opcode = curOp; if (m_numChildren > 1 && curOp != m_opcode) { delta = (int)precedence[curOp] - (int)precedence[m_opcode]; } if (delta > 0){ goto endExpr; } if (delta < 0){ // set a subexpression conataining the // last operand we found as the first goto setChildExpr2; } } continue; endExpr: //log(LOG_DEBUG, "query: set Expr [%ld, %ld), opcode: %d", // a, i, curOp); // if we've matched parens, go to next word // but if we have an extra right paren, don't crash if (qw->m_opcode == OP_RIGHTPAREN && (qwords[m_start].m_opcode == OP_LEFTPAREN || m_start == 0)) i++; m_end = i; // We have an extra open paren if (qwords[m_start].m_opcode == OP_LEFTPAREN && qw->m_opcode != OP_RIGHTPAREN) goto setParentExpr; // we are top-level expr, but there is more to parse if (!m_parent && i < end-1) goto setParentExpr; // just return return i; // add a parent expression with this one as the left child setParentExpr: { if ( *o_numExpressions >= maxExpressions ) return -1; //if (qw->m_opcode == OP_RIGHTPAREN) i++; Expression *e = &o_expressions[*o_numExpressions]; *o_numExpressions = *o_numExpressions + 1; i = e->set ( m_start , end ,i, q , level+1, m_parent, this, false , underNOT ) ; return i; } // add a child expression setChildExpr: { if ( *o_numExpressions >= maxExpressions ) return -1; Expression *e = &o_expressions[*o_numExpressions]; *o_numExpressions = *o_numExpressions + 1; i = e->set ( i , end , i, q , level+1, this, NULL, hasNOT , underNOT ) -1; if ( i < 0 ) return -1; // trim needless parens while (e->m_numChildren == 1) { 
hasNOT = e->m_hasNOT; e = e->m_children[0]; if (hasNOT) e->m_hasNOT = ! e->m_hasNOT; } hasNOT = false; //cull empty expressions if (e->m_numChildren < 1 && e->m_operand == NULL) continue; if (m_numChildren >= MAX_OPERANDS) return -1; // add good expressions m_children [ m_numChildren] = e; m_numChildren++; if (m_numChildren > 1 && m_opcode == 0) m_opcode = OP_AND; // default AND continue; } // we need to make the last operand we passed // be the first operand of a subexpression setChildExpr2: { // remove the last expression from our list Expression *ce = m_children[m_numChildren-1]; m_numChildren--; if ( *o_numExpressions >= maxExpressions ) return -1; Expression *e = &o_expressions[*o_numExpressions]; *o_numExpressions = *o_numExpressions + 1; i = e->set ( ce->m_start , end , i, q , level+1, this, ce, false , underNOT ) -1; ce->m_parent = e; if ( i < 0 ) return -1; if (m_numChildren >= MAX_OPERANDS) return -1; m_children [ m_numChildren ] = e; hasNOT = false; m_numChildren++; continue; } } return end; } // . "bits" are 1-1 with the query terms in Query::m_qterms[] array bool Expression::isTruth ( qvec_t bits, qvec_t mask ) { //bool op1 = false ; // set to false so compiler shuts up //bool op2 ; //bool accumulator = false; //bool hadOR = false; bool result = false; // leaf node if (m_operand){ result = m_operand->isTruth(bits, mask); // handle masked terms better.. don't apply NOT operator if (!(m_operand->m_termBits & mask)) return true; } else if (m_numChildren == 1){ result = m_children[0]->isTruth(bits, mask); } else if (m_opcode == OP_OR || m_opcode == OP_UOR) { for ( long i=0 ; iisTruth(bits, mask); if (result) goto done; } } else if (m_opcode == OP_AND || m_opcode == OP_PIPE){ result = true; for (long i = 0 ; i < m_numChildren ; i++ ) { result = result && m_children[i]->isTruth(bits, mask); if (!result) goto done; } } done : if (m_hasNOT) return !result; else return result; } // . "bits" are 1-1 with the query terms in Query::m_qterms[] array // . 
hasNOT is true if there's a NOT just to the left of this WHOLE expressions // ourside the parens qvec_t Expression::getNOTBits ( bool hasNOT ) { qvec_t notBits = 0; // for ( long i = 0 ; i < m_numOperands ; i++ ) { // // get value of the ith operand, be it plain or an expression // if ( m_operands[i] ) { // if ( m_hasNOT[i] || hasNOT ) // notBits |= m_operands[i]->m_termBits; // } // else // notBits |= m_expressions[i]->getNOTBits (m_hasNOT[i]); // } // success, all operand pairs were true return notBits; } // print boolean expression for debug purposes void Expression::print(SafeBuf *sbuf) { if (m_hasNOT) sbuf->safePrintf("NOT "); if (m_operand){ m_operand->print(sbuf); return; } sbuf->safePrintf("("); for (long i=0; i < m_numChildren ; i++) { m_children[i]->print(sbuf); if (i >= m_numChildren-1) break; switch (m_opcode) { case OP_OR: sbuf->safePrintf(" OR " ); break; case OP_AND: sbuf->safePrintf(" AND " ); break; case OP_UOR: sbuf->safePrintf(" UOR " ); break; case OP_PIPE: sbuf->safePrintf(" PIPE "); break; } } sbuf->safePrintf(")"); } void Operand::print(SafeBuf *sbuf) { // long shift = 0; // while (m_termBits >> shift) shift++; // sbuf->safePrintf("%i", 1<<(shift-1)); if (m_hasNOT) sbuf->safePrintf("NOT 0x%lx", (long)m_termBits); else sbuf->safePrintf("0x%lx", (long)m_termBits); } // if any one query term is split, msg3a has to split the query bool Query::isSplit() { for(long i = 0; i < m_numTerms; i++) if(m_qterms[i].isSplit()) return true; return false; } bool QueryTerm::isSplit() { if(!m_fieldCode) return true; if(m_fieldCode == FIELD_QUOTA) return false; if(m_fieldCode == FIELD_GBTAGVECTOR) return false; if(m_fieldCode == FIELD_GBGIGABITVECTOR) return false; if(m_fieldCode == FIELD_GBSAMPLEVECTOR) return false; if(m_fieldCode == FIELD_GBSECTIONHASH) return false; if(m_fieldCode == FIELD_GBCONTENTHASH) return false; return true; } // hash of all the query terms long long Query::getQueryHash() { long long qh = 0LL; for ( long i = 0 ; i < m_numTerms ; i++ ) 
qh = hash64 ( m_termIds[i] , qh ); return qh; }