#include "gb-include.h"

#include "Matches.h"
#include "Titledb.h" // for getting total # of docs in db
#include "StopWords.h"
#include "Phrases.h"
#include "Title.h"
#include "CountryCode.h"
#include "Domains.h"
#include "Sections.h"
#include "XmlDoc.h"

//#define DEBUG_MATCHES 1

// TODO: have Matches set itself from all the meta tags, titles, link text,
//       neighborhoods and body. then proximity algo can utilize that info
//       as well as the summary generator, Summary.cpp. right now prox algo
//       was setting all those different classes itself.

// TODO: toss m_tscores. make Summary::getBestWindow() just use its the
//       scores array itself. just access it with Match::m_queryWordNum.

// Constructor. No match groups exist yet; reset() zeroes the counters.
Matches::Matches ( ) {
	m_detectSubPhrases = false;
	m_numMatchGroups = 0;
	reset();
}

Matches::~Matches( ) {
	reset();
}

// Clear all per-document match state and free the buffers of every match
// group we allocated. NOTE: this does NOT clear the query-term hash table
// (m_qtableIds / m_qtableFlags / m_qtableWordNums) -- that state belongs
// to setQuery(), so set() can call reset() per document without losing
// the query.
void Matches::reset ( ) {
	m_numMatches = 0;
	//m_maxNQT = -1;
	m_numAlnums = 0;
	// free all the classes' buffers
	for ( long i = 0 ; i < m_numMatchGroups ; i++ ) {
		m_wordsArray [i].reset();
		//m_sectionsArray[i].reset();
		m_posArray   [i].reset();
		m_bitsArray  [i].reset();
	}
	m_numMatchGroups = 0;
	//m_explicitsMatched = 0;
	//m_matchableRequiredBits = 0;
	//m_hasAllQueryTerms = false;
	//m_matchesQuery = false;
}

// Should this query term be looked for in document text (for highlighting
// and summary generation)? Returns false for terms we can never, or should
// never, match: ignored words, fielded terms other than title:, quoted
// words that do not start their own quote, terms truncated out of a long
// query, and terms under a NOT operator.
bool Matches::isMatchableTerm ( QueryTerm *qt ) { // , long i ) {
	// . skip if negative sign
	// . no, we need to match negative words/phrases now so we can
	//   big hack them out...
	//if ( qw->m_wordSign == '-' ) return false;
	QueryWord *qw = qt->m_qword;
	// not derived from a query word? how?
	if ( ! qw ) return false;
	if ( qw->m_ignoreWord == IGNORE_DEFAULT   ) return false;
	if ( qw->m_ignoreWord == IGNORE_FIELDNAME ) return false;
	if ( qw->m_ignoreWord == IGNORE_BOOLOP    ) return false;
	// stop words in 'all the king's men' query need to be highlighted
	//if ( qw->m_isQueryStopWord && ! qw->m_inQuotes ) return false;
	//if ( qw->m_isStopWord && ! qw->m_inQuotes ) return false;
	// take this out for now so we highlight for title: terms
	if ( qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE )
		return false;
	// what word # are we?
	long qwn = qw - m_q->m_qwords;
	// do not include if in a quote and does not start it!!
	//if ( qw->m_inQuotes && i-1 != qw->m_quoteStart ) return false;
	if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != qwn )
		return false;
	// if query is too long, a query word can be truncated!
	// this happens for some words if they are ignored, too!
	if ( ! qw->m_queryWordTerm && ! qw->m_queryPhraseTerm )
		return false;
	// after a NOT operator?
	if ( qw->m_underNOT ) return false;
	// in a field?
	//if ( qw->m_fieldCode != fieldCode ) continue;
	// skip if a query stop word w/o a sign and ignored
	//if ( q->m_isStopWord[i] &&
	//     q->m_termSigns[i] == '\0' &&
	//     q->m_ignore[i] ) continue;
	return true;
}

// a QueryMatch is a quote in the query or a single word.
// NOTE(review): this class appears unused in this file -- candidate for
// removal; confirm no other translation unit relies on it.
class QueryMatch {
public:
	// range in Query::m_qwords [m_a,m_b]
	long m_a;
	long m_b;
	long m_score; // lowest of the term freqs
};

// . remember the query and hash every matchable term id (and raw phrase
//   id) into a small power-of-two open-addressing (linear probe) hash
//   table: m_qtableIds / m_qtableFlags / m_qtableWordNums.
// . addMatches() later probes this table to recognize query terms in
//   document words.
// . also assigns each matchable term a highlight color number
//   (qw->m_colorNum) so alternating terms get different colors.
void Matches::setQuery ( Query *q ) {
	//long qtableScores [ MAX_QUERY_TERMS * 2 ];
	reset();
	// save it
	m_q = q;
	//m_tscores = tscores; // scores, 1-1 with query terms
	//m_numNegTerms = 0;
	//m_explicitsMatched = 0;
	// clear this vector
	//memset ( m_foundTermVector , 0 , m_q->getNumTerms() );
	//memset ( m_foundNegTermVector, 0, m_q->getNumTerms() );
	// # of WORDS in the query
	long nqt = m_q->m_numTerms;
	// how many query words do we have that can be matched?
	long numToMatch = 0;
	for ( long i = 0 ; i < nqt ; i++ ) {
		// rest this
		m_qwordFlags[i] = 0;
		// get query word #i
		//QueryWord *qw = &m_q->m_qwords[i];
		QueryTerm *qt = &m_q->m_qterms[i];
		// skip if ignored *in certain ways only*
		if ( ! isMatchableTerm ( qt ) ) {
			//if( (qw->m_wordSign == '-') && !qw->m_fieldCode )
			//	m_numNegTerms++;
			continue;
		}
		// count it
		numToMatch++;
		// don't breach. MDW: i made this >= from > (2/11/09)
		if ( numToMatch < MAX_QUERY_WORDS_TO_MATCH ) continue;
		// note it
		log("matches: hit %li max query words to match limit",
		    (long)MAX_QUERY_WORDS_TO_MATCH);
		break;
	}
	// fix a core the hack way for now!
	// NOTE(review): forcing a 256-word floor interacts with the sanity
	// check below -- presumably MAX_QUERY_WORDS_TO_MATCH*3 covers the
	// resulting slot count; confirm against Matches.h constants.
	if ( numToMatch < 256 ) numToMatch = 256;
	// keep number of slots in hash table a power of two for fast hashing
	m_numSlots = getHighestLitBitValue ( (unsigned long)(numToMatch * 3));
	// make the hash mask
	unsigned long mask = m_numSlots - 1;
	long n;
	// sanity check (deliberate NULL-deref crash sentinel)
	if ( m_numSlots > MAX_QUERY_WORDS_TO_MATCH * 3 ) {
		char *xx = NULL; *xx = 0; }
	// clear hash table (8 bytes per id slot -- presumably
	// sizeof(long long); TODO confirm m_qtableIds element type)
	memset ( m_qtableIds , 0 , m_numSlots * 8 );
	memset ( m_qtableFlags , 0 , m_numSlots );
	//memset ( m_qtableNegIds, 0 , m_numNegTerms );
	// alternate colors for highlighting
	long colorNum = 0;
	//long negIds = 0;
	// . hash all the query terms into the hash table
	// . the term's score should be 100 for a very rare term,
	//   and 1 for a stop word.
	//m_maxNQT = nqt;
	for ( long i = 0 ; i < nqt ; i++ ) {
		// get query word #i
		//QueryWord *qw = &m_q->m_qwords[i];
		QueryTerm *qt = &m_q->m_qterms[i];
		// skip if ignored *in certain ways only*
		if ( ! isMatchableTerm ( qt ) ) {
			//if( (qw->m_wordSign == '-') && !qw->m_fieldCode )
			//	m_qtableNegIds[negIds++] = qw->m_rawWordId;
			continue;
		}
		// get the word it is from
		QueryWord *qw = qt->m_qword;
		// get word #
		long qwn = qw - q->m_qwords;
		// assign color # for term highlighting with different colors
		qw->m_colorNum = colorNum++;
		// do not overfill table
		if ( colorNum > MAX_QUERY_WORDS_TO_MATCH ) {
			//m_maxNQT = nqt;
			break;
		}
		// this should be equivalent to the word id
		long long qid = qt->m_rawTermId;//qw->m_rawWordId;
		// but NOT for 'cheatcodes.com'
		if ( qt->m_isPhrase ) qid = qw->m_rawWordId;
		// if its a multi-word synonym, like "new jersey" we must
		// index the individual words... or compute the phrase ids
		// for all the words in the doc. right now the qid is
		// the phrase hash for this guy i think...
		if ( qt->m_synonymOf && qt->m_numAlnumWordsInSynonym == 2 )
			qid = qt->m_synWids0;
		// put in hash table
		n = ((unsigned long)qid) & mask;
		// chain to an empty slot (linear probe with wraparound)
		while ( m_qtableIds[n] && m_qtableIds[n] != qid )
			if ( ++n >= m_numSlots ) n = 0;
		// . if already occupied, do not overwrite this, keep this
		//   first word, the other is often ignored as IGNORE_REPEAT
		// . what word # in the query are we. save this.
		if ( ! m_qtableIds[n] ) m_qtableWordNums[n] = qwn;
		// store it
		m_qtableIds[n] = qid;
		// in quotes? this term may appear multiple times in the
		// query, in some cases in quotes, and in some cases not.
		// we need to know either way for logic below.
		if ( qw->m_inQuotes ) m_qtableFlags[n] |= 0x02;
		else                  m_qtableFlags[n] |= 0x01;
		// this is basically a quoted synonym
		if ( qt->m_numAlnumWordsInSynonym == 2 )
			m_qtableFlags[n] |= 0x08;
		//QueryTerm *qt = qw->m_queryWordTerm;
		if ( qt && qt->m_termSign == '+' ) m_qtableFlags[n] |= 0x04;
		//
		// if query has e-mail, then index phrase id "email" so
		// it matches "email" in the doc.
		// we need this for the 'cheat codes' query as well so it
		// highlights 'cheatcodes'
		//
		long long pid = qw->m_rawPhraseId;
		if ( pid == 0 ) continue;
		// put in hash table
		n = ((unsigned long)pid) & mask;
		// chain to an empty slot
		while ( m_qtableIds[n] && m_qtableIds[n] != pid )
			if ( ++n >= m_numSlots ) n = 0;
		// this too?
		if ( ! m_qtableIds[n] ) m_qtableWordNums[n] = qwn;
		// store it
		m_qtableIds[n] = pid;
	}
	/* (disabled code, condensed for readability -- full text in revision
	   history) set m_matchableRequiredBits: walked m_q->m_qterms and
	   OR'ed in the explicit bit of every required term whose field we
	   can verify here (no field, gblang:, gbcountry:, site:, ip:,
	   url:). */
}

// . this was in Summary.cpp, but is more useful here
// . we can also use this to replace the proximity algo setup where it
//   fills in the matrix for title, link text, etc.
// . gathers matches from every source of document text: body, title,
//   url, raw <title> tag, DMOZ titles/summaries, accepted meta tags,
//   and inlink anchor/neighborhood/RSS text (two passes: getLinkInfo1()
//   then imported getLinkInfo2()).
// . returns false and sets g_errno on error
bool Matches::set ( XmlDoc   *xd           ,
		    Words    *bodyWords    ,
		    //Synonyms *bodySynonyms,
		    Phrases  *bodyPhrases  ,
		    Sections *bodySections ,
		    Bits     *bodyBits     ,
		    Pos      *bodyPos      ,
		    Xml      *bodyXml      ,
		    Title    *tt           ,
		    long      niceness     ) {
	// don't reset query info!
	reset();
	// sanity check (deliberate crash if docid was never set)
	if ( ! xd->m_docIdValid ) { char *xx=NULL;*xx=0; }
	// . first add all the matches in the body of the doc
	// . add it first since it will kick out early if too many matches
	//   and we get all the explicit bits matched
	if ( ! addMatches ( bodyWords ,
			    //bodySynonyms ,
			    bodyPhrases ,
			    bodySections ,
			    //addToMatches ,
			    bodyBits ,
			    bodyPos ,
			    0 ,     // fieldCode of words, 0 for no field
			    true ,  // allowPunctInPhrase,
			    false , // exclQTOnlyinAnchTxt,
			    0 ,     // qvec_t reqMask ,
			    0 ,     // qvec_t negMask ,
			    1 ,     // long diversityWeight,
			    xd->m_docId,
			    MF_BODY ) )
		return false;
	// add the title in
	if ( ! addMatches ( tt->getTitle() ,
			    tt->getTitleSize() ,
			    MF_TITLEGEN ,
			    xd->m_docId ,
			    niceness ))
		return false;
	// add in the url terms
	Url *turl = xd->getFirstUrl();
	if ( ! addMatches ( turl->m_url ,
			    turl->m_ulen ,
			    MF_URL ,
			    xd->m_docId ,
			    niceness ) )
		return false;
	// also use the title from the title tag, because sometimes
	// it does not equal "tt->getTitle()"
	long a = tt->m_titleTagStart;
	long b = tt->m_titleTagEnd;
	char *start = NULL;
	char *end   = NULL;
	if ( a >= 0 && b >= 0 ) {
		start = bodyWords->getWord(a);
		end   = bodyWords->getWord(b-1) +
			bodyWords->getWordLen(b-1);
		if ( ! addMatches ( start ,
				    end - start ,
				    MF_TITLETAG ,
				    xd->m_docId ,
				    niceness ))
			return false;
	}
	// add in dmoz stuff. titles/summaries are NUL-separated runs,
	// one pair per category id (4 bytes each in size_catIds --
	// presumably longs; TODO confirm).
	char *dt = xd->ptr_dmozTitles;
	char *ds = xd->ptr_dmozSumms;
	long  nd = xd->size_catIds / 4;
	for ( long i = 0 ; i < nd ; i++ ) {
		// sanity check
		if ( ! dt[0] ) break;
		// add each dmoz title
		if ( ! addMatches ( dt ,
				    gbstrlen(dt) ,
				    MF_DMOZTITLE ,
				    xd->m_docId ,
				    niceness ))
			return false;
		// skip
		dt += gbstrlen(dt) + 1;
		// sanity check
		if ( ! ds[0] ) break;
		// and the summary
		if ( ! addMatches ( ds ,
				    gbstrlen(ds) ,
				    MF_DMOZSUMM ,
				    xd->m_docId ,
				    niceness ))
			return false;
		// skip
		ds += gbstrlen(ds) + 1;
	}
	// now add in the meta tags
	long n = bodyXml->getNumNodes();
	XmlNode *nodes = bodyXml->getNodes();
	// find the first meta summary node
	for ( long i = 0 ; i < n ; i++ ) {
		// continue if not a meta tag
		// (68 -- presumably the <meta> node id; TODO confirm
		// against the g_nodes table)
		if ( nodes[i].m_nodeId != 68 ) continue;
		// only get content for <meta>, not </meta>
		long tagLen;
		char *tag = bodyXml->getString ( i , "name" , &tagLen );
		// is it an accepted meta tag?
		long flag = 0;
		if (tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)
			flag = MF_METAKEYW;
		if (tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)
			flag = MF_METASUMM;
		if (tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)
			flag = MF_METAKEYW;
		if (tagLen==11&&strncasecmp(tag,"description",11)== 0)
			flag = MF_METADESC;
		if ( ! flag ) continue;
		// get the content
		long len;
		char *s = bodyXml->getString ( i , "content" , &len );
		if ( ! s || len <= 0 ) continue;
		// wordify
		if ( ! addMatches ( s ,
				    len ,
				    flag ,
				    xd->m_docId ,
				    niceness ) )
			return false;
	}
	// . now the link text
	// . loop through each link text and it its matches
	LinkInfo *info = xd->getLinkInfo1();
	// this is not the second pass, it is the first pass
	bool secondPass = false;
 loop:
	// loop through the Inlinks
	Inlink *k = NULL;
	for ( ; (k = info->getNextInlink(k)) ; ) {
		// does it have link text? skip if not.
		// (size includes the terminating NUL, hence <= 1)
		if ( k->size_linkText <= 1 ) continue;
		// set the flag, the type of match
		mf_t flags = MF_LINK;
		//if ( k->m_isAnomaly ) flags = MF_ALINK;
		// add it in
		if ( ! addMatches ( k->ptr_linkText ,
				    k->size_linkText - 1 ,
				    flags ,
				    xd->m_docId ,
				    niceness ))
			return false;
		// skip if no neighborhood text
		//if ( k->size_surroundingText <= 1 ) continue;
		// set flag for that
		flags = MF_HOOD;
		//if ( k->m_isAnomaly ) flags = MF_AHOOD;
		// add it in
		if ( ! addMatches ( k->ptr_surroundingText ,
				    k->size_surroundingText - 1 ,
				    flags ,
				    xd->m_docId ,
				    niceness ))
			return false;
		// parse the rss up into xml
		Xml rxml;
		if ( ! k->setXmlFromRSS ( &rxml , niceness ) )
			return false;
		// add rss description
		bool isHtmlEncoded;
		long rdlen;
		char *rd = rxml.getRSSDescription ( &rdlen ,
						    &isHtmlEncoded );
		if ( ! addMatches ( rd ,
				    rdlen ,
				    MF_RSSDESC ,
				    xd->m_docId ,
				    niceness ))
			return false;
		// add rss title
		long rtlen;
		char *rt = rxml.getRSSTitle ( &rtlen , &isHtmlEncoded );
		if ( ! addMatches ( rt ,
				    rtlen ,
				    MF_RSSTITLE ,
				    xd->m_docId ,
				    niceness ))
			return false;
	}
	// now repeat for imported link text!
	if ( ! secondPass ) {
		// only do this once
		secondPass = true;
		// set it
		info = *xd->getLinkInfo2();
		if ( info ) goto loop;
	}
	/* (disabled "BIG HACK", condensed for readability -- full text in
	   revision history) for each query term with a verifiable field
	   (gblang:, gbcountry:, site:, ip:, url:) this compared the term
	   directly against the TitleRec/Url, OR'ed bits into
	   m_explicitsMatched, filtered non-matching results out via
	   "return log(...)" with g_errno = EMISSINGQUERYTERMS, and finally
	   computed m_hasAllQueryTerms / m_matchesQuery from
	   m_q->m_matchRequiredBits and the boolean bit score. */
	// that should be it
	return true;
}

// . wordify a raw text buffer (s/slen) and add its query-term matches as
//   a brand new match group tagged with "flags" (MF_TITLEGEN, MF_URL,
//   MF_LINK, ...).
// . the group is rolled back (buffers reset, counter decremented) if it
//   produced no matches, so empty sources don't eat match-group slots.
// . returns false and sets g_errno on error; returns true (no-op) once
//   MAX_MATCHGROUPS is reached.
bool Matches::addMatches ( char      *s        ,
			   long       slen     ,
			   mf_t       flags    ,
			   long long  docId    ,
			   long       niceness ) {
	// . do not breach
	// . happens a lot with a lot of link info text
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		// . log it
		// . often we have a ton of inlink text!!
		//log("matches: could not add matches1 for docid=%lli because "
		//    "already have %li matchgroups",docId,
		//    (long)MAX_MATCHGROUPS);
		return true;
	}
	// get some new ptrs for this match group
	Words    *wp = &m_wordsArray [ m_numMatchGroups ];
	//Sections *sp = &m_sectionsArray [ m_numMatchGroups ];
	Sections *sp = NULL;
	Bits     *bp = &m_bitsArray  [ m_numMatchGroups ];
	Pos      *pb = &m_posArray   [ m_numMatchGroups ];
	// set the words class for this match group
	if ( ! wp->set ( s ,
			 slen , // in bytes
			 TITLEREC_CURRENT_VERSION ,
			 true , // computeIds?
			 niceness ))
		return false;
	// scores vector
	//if ( ! sp->set ( wp , TITLEREC_CURRENT_VERSION , false ) )
	//	return false;
	// bits vector
	if ( ! bp->setForSummary ( wp ) ) return false;
	// position vector
	if ( ! pb->set ( wp , sp ) ) return false;
	// record the start
	long startNumMatches = m_numMatches;
	// sometimes it returns true w/o incrementing this
	long n = m_numMatchGroups;
	// . add all the Match classes from this match group
	// . this increments m_numMatchGroups on success
	bool status = addMatches ( wp ,
				   //NULL , // synonyms
				   NULL ,  // phrases
				   sp ,
				   //true , // addToMatches
				   bp ,    // bits
				   pb ,    // pos
				   0 ,     // fieldCode
				   true ,  // allowPunctInPhrase?
				   false , // excludeQTOnlyInAnchTxt?
				   0 ,     // reqMask
				   0 ,     // negMask
				   1 ,     // diversityWeight
				   docId ,
				   flags );// docId
	// if this matchgroup had some, matches, then keep it
	if ( m_numMatches > startNumMatches ) return status;
	// otherwise, reset it, useless
	wp->reset();
	if ( sp ) sp->reset();
	bp->reset();
	pb->reset();
	// do not decrement the counter if we never incremented it
	if ( n == m_numMatchGroups ) return status;
	// ok, remove it
	m_numMatchGroups--;
	return status;
}

// Find the first match group carrying the given flag and hand back its
// Words/Pos pointers (Sections is currently always returned NULL since
// the sections array is disabled). Returns false if no group has the
// flag.
bool Matches::getMatchGroup ( mf_t       matchFlag ,
			      Words    **wp ,
			      Pos      **pp ,
			      Sections **sp ) {
	for ( long i = 0 ; i < m_numMatchGroups ; i++ ) {
		// must be the type we want
		if ( m_flags[i] != matchFlag ) continue;
		// get it
		*wp = &m_wordsArray [i];
		*pp = &m_posArray   [i];
		//*sp = &m_sectionsArray [i];
		*sp = NULL;
		return true;
	}
	// not found
	return false;
}

// . TODO: support stemming later. each word should then have multiple ids.
// . add to our m_matches[] array iff addToMatches is true, otherwise we just
//   set the m_foundTermVector for doing the BIG HACK described in Summary.cpp
bool Matches::addMatches ( Words    *words ,
			   //Synonyms *syn ,
			   Phrases  *phrases ,
			   Sections *sections ,
			   Bits     *bits ,
			   Pos      *pos ,
			   long      fieldCode , // of words,0=none
			   bool      allowPunctInPhrase ,
			   bool      exclQTOnlyinAnchTxt ,
			   qvec_t    reqMask ,
			   qvec_t    negMask ,
			   long      diversityWeight ,
			   long long docId ,
			   mf_t      flags ) {
	// if no query term, bail.
	if ( m_numSlots <= 0 ) return true;
	// . do not breach
	// . happens a lot with a lot of link info text
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		// . log it
		// . often we have a ton of inlink text!!
//log("matches: could not add matches2 for docid=%lli because " // "already have %li matchgroups",docId, // (long)MAX_MATCHGROUPS); return true; } // shortcut Section *sp = NULL; if ( sections ) sp = sections->m_sections; // we've added a lot of matches, if we don't need anymore // to confirm the big hack then break out //if ( m_numMatches >= MAX_MATCHES && // ( m_explicitsMatched & m_matchableRequiredBits ) ) // return true; mf_t eflag = 0; // set the ptrs m_wordsPtr [ m_numMatchGroups ] = words; m_sectionsPtr [ m_numMatchGroups ] = sections; m_bitsPtr [ m_numMatchGroups ] = bits; m_posPtr [ m_numMatchGroups ] = pos; m_flags [ m_numMatchGroups ] = flags; m_numMatchGroups++; long long *pids = NULL; if ( phrases ) pids = phrases->getPhraseIds2(); // set convenience vars unsigned long mask = m_numSlots - 1; long long *wids = words->getWordIds(); long *wlens = words->getWordLens(); char **wptrs = words->getWords(); // swids = word ids where accent marks, etc. are stripped //long long *swids = words->getStripWordIds(); nodeid_t *tids = words->getTagIds(); long nw = words->m_numWords; //long *wscores = NULL; //if ( scores ) wscores = scores->m_scores; long n;//,n2 ; long matchStack = 0; long long nextMatchWordIdMustBeThis = 0; long nextMatchWordPos = 0; long lasti = -3; //bool inAnchTag = false; long dist = 0; // . every tag increments "dist" by a value // . rather than use a switch/case statement, which does a binary // lookup thing which is really slow, let's use a 256 bucket table // for constant lookup, rather than log(N). static char s_tableInit = false; static int8_t s_tab[512]; if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; } for ( long i = 0 ; ! 
s_tableInit && i < 128 ; i++ ) { char step = 0; if ( i == TAG_TR ) step = 2; if ( i == TAG_P ) step = 10; if ( i == TAG_HR ) step = 10; if ( i == TAG_H1 ) step = 10; if ( i == TAG_H2 ) step = 10; if ( i == TAG_H3 ) step = 10; if ( i == TAG_H4 ) step = 10; if ( i == TAG_H5 ) step = 10; if ( i == TAG_H6 ) step = 10; if ( i == TAG_TABLE ) step = 30; if ( i == TAG_BLOCKQUOTE ) step = 10; // default if ( step == 0 ) { if ( g_nodes[i].m_isBreaking ) step = 10; else step = 1; } // account for both the back and the front tags s_tab[i ] = step; //s_tab[i|0x80] = step; } s_tableInit = true; // google seems to index SEC_MARQUEE so i took that out of here long badFlags =SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE; //long anum; //long long *aids; //long j; long qwn; long numQWords; long numWords; // // . set m_matches[] array // . loop over all words in the document // for ( long i = 0 ; i < nw ; i++ ) { //if (tids && (tids[i] ) == TAG_A) // inAnchTag = true; //else if (tids && (tids[i]&BACKBITCOMP) == TAG_A) // inAnchTag = false; // for each word increment distance dist++; //if ( addToMatches && tids && tids[i] ){ if ( tids && tids[i] ){ long tid = tids[i] & BACKBITCOMP; // accumulate distance dist += s_tab[tid]; // monitor boundaries so that the proximity algo // knows when two matches are separated by such tags // MDW: isn't the "dist" good enough for this????? // let's try just using "dist" then. // "crossedSection" is hereby replaced by "dist". //if ( s_tab[tid] // tagIds don't have wids and are skipped continue; } // skip if wid is 0, it is not an alnum word then if ( ! wids[i] ) { // and extra unit if it starts with \n i guess if ( words->m_words[i][0] == '\n' ) dist++; // dist += words->m_wordLens[i] / 3; continue; } // count the number of alnum words m_numAlnums++; // clear this eflag = 0; // . zero score words cannot match query terms either // . BUT if score is -1 that means it is in a