#include "gb-include.h"
#include "Matches.h"
#include "Titledb.h" // for getting total # of docs in db
#include "StopWords.h"
#include "Phrases.h"
#include "Title.h"
#include "CountryCode.h"
#include "Domains.h"
#include "Sections.h"
#include "XmlDoc.h"
//#define DEBUG_MATCHES 1
// TODO: have Matches set itself from all the meta tags, titles, link text,
// neighborhoods and body. then proximity algo can utilize that info
// as well as the summary generator, Summary.cpp. right now prox algo
// was setting all those different classes itself.
// TODO: toss m_tscores. make Summary::getBestWindow() just use the
// scores array itself. just access it with Match::m_queryWordNum.
// Constructor. m_numMatchGroups must be zeroed BEFORE calling reset(),
// because reset() iterates over the first m_numMatchGroups slots; with an
// uninitialized count it would touch uninitialized buffers.
Matches::Matches ( ) {
	m_detectSubPhrases = false;
	m_numMatchGroups = 0;
	reset();
}
Matches::~Matches( ) { reset(); }
// Return the object to its empty state, releasing the buffers owned by
// every match group. Note this does NOT clear the query hash table built
// by setQuery() -- see the "don't reset query info!" note in set().
void Matches::reset ( ) {
	m_numMatches = 0;
	//m_maxNQT = -1;
	m_numAlnums = 0;
	// free each match group's buffers
	for ( long g = 0 ; g < m_numMatchGroups ; g++ ) {
		m_wordsArray[g].reset();
		//m_sectionsArray[g].reset();
		m_posArray[g].reset();
		m_bitsArray[g].reset();
	}
	// no groups remain
	m_numMatchGroups = 0;
	//m_explicitsMatched = 0;
	//m_matchableRequiredBits = 0;
	//m_hasAllQueryTerms = false;
	//m_matchesQuery = false;
}
// Decide whether query term "qt" is eligible to produce matches for
// highlighting/summaries. Negative ('-') terms are deliberately still
// matchable so later logic can filter documents containing them.
bool Matches::isMatchableTerm ( QueryTerm *qt ) {
	// every matchable term must be derived from a query word
	QueryWord *qw = qt->m_qword;
	if ( ! qw ) return false;
	// words ignored for any of these reasons never match
	if ( qw->m_ignoreWord == IGNORE_DEFAULT   ||
	     qw->m_ignoreWord == IGNORE_FIELDNAME ||
	     qw->m_ignoreWord == IGNORE_BOOLOP      )
		return false;
	// fielded terms are excluded, except title: terms, which we still
	// want to highlight
	if ( qw->m_fieldCode && qw->m_fieldCode != FIELD_TITLE )
		return false;
	// index of this word within Query::m_qwords
	long wordNum = qw - m_q->m_qwords;
	// a word inside a quoted phrase only matches if it STARTS the quote
	if ( qw->m_quoteStart >= 0 && qw->m_quoteStart != wordNum )
		return false;
	// long queries can truncate words; ignored words can lack terms too.
	// such words have neither a word term nor a phrase term.
	if ( ! qw->m_queryWordTerm && ! qw->m_queryPhraseTerm )
		return false;
	// words under a NOT operator never match
	if ( qw->m_underNOT ) return false;
	// eligible
	return true;
}
// a QueryMatch is a quote in the query or a single word.
// a QueryMatch is a quote in the query or a single word.
// NOTE(review): not referenced anywhere in this file chunk -- presumably
// kept for future use; confirm before removing.
class QueryMatch {
public:
// inclusive range [m_a,m_b] into Query::m_qwords covered by this match
long m_a;
long m_b;
long m_score; // lowest of the term freqs
};
// . Build the query-term hash table used by addMatches() to recognize
//   query words/phrases in document text.
// . Hashes each matchable term's id (and, when present, its phrase id)
//   into m_qtableIds[], recording the query word number and per-slot
//   flags (0x01 unquoted, 0x02 quoted, 0x04 '+' sign, 0x08 2-word synonym).
// . Also assigns each matchable word a highlight color number.
// . Must be called before set()/addMatches(); reset() does not clear
//   the table, so it survives across documents.
void Matches::setQuery ( Query *q ) {
	//long qtableScores [ MAX_QUERY_TERMS * 2 ];
	reset();
	// save it
	m_q = q;
	//m_tscores = tscores; // scores, 1-1 with query terms
	//m_numNegTerms = 0;
	//m_explicitsMatched = 0;
	// clear this vector
	//memset ( m_foundTermVector , 0 , m_q->getNumTerms() );
	//memset ( m_foundNegTermVector, 0, m_q->getNumTerms() );
	// # of TERMS in the query (m_numTerms, not words)
	long nqt = m_q->m_numTerms;
	// how many query terms do we have that can be matched?
	long numToMatch = 0;
	for ( long i = 0 ; i < nqt ; i++ ) {
		// reset this term's flags
		m_qwordFlags[i] = 0;
		// get query term #i
		//QueryWord *qw = &m_q->m_qwords[i];
		QueryTerm *qt = &m_q->m_qterms[i];
		// skip if ignored *in certain ways only*
		if ( ! isMatchableTerm ( qt ) ) {
			//if( (qw->m_wordSign == '-') && !qw->m_fieldCode )
			//	m_numNegTerms++;
			continue;
		}
		// count it
		numToMatch++;
		// don't breach. MDW: i made this >= from > (2/11/09)
		if ( numToMatch < MAX_QUERY_WORDS_TO_MATCH ) continue;
		// note it
		log("matches: hit %li max query words to match limit",
		    (long)MAX_QUERY_WORDS_TO_MATCH);
		break;
	}
	// fix a core the hack way for now!
	if ( numToMatch < 256 ) numToMatch = 256;
	// keep number of slots in hash table a power of two for fast hashing
	m_numSlots = getHighestLitBitValue ( (unsigned long)(numToMatch * 3));
	// make the hash mask (works because m_numSlots is a power of two)
	unsigned long mask = m_numSlots - 1;
	long n;
	// sanity check
	if ( m_numSlots > MAX_QUERY_WORDS_TO_MATCH * 3 ) {
		char *xx = NULL; *xx = 0; }
	// clear hash table. use sizeof() for the id slots rather than a
	// hardcoded 8 so this stays correct if the element type changes.
	memset ( m_qtableIds   , 0 , m_numSlots * sizeof(m_qtableIds[0]) );
	memset ( m_qtableFlags , 0 , m_numSlots );
	//memset ( m_qtableNegIds, 0 , m_numNegTerms );
	// alternate colors for highlighting
	long colorNum = 0;
	//long negIds = 0;
	// . hash all the query terms into the hash table
	// . the term's score should be 100 for a very rare term,
	//   and 1 for a stop word.
	//m_maxNQT = nqt;
	for ( long i = 0 ; i < nqt ; i++ ) {
		// get query term #i
		//QueryWord *qw = &m_q->m_qwords[i];
		QueryTerm *qt = &m_q->m_qterms[i];
		// skip if ignored *in certain ways only*
		if ( ! isMatchableTerm ( qt ) ) {
			//if( (qw->m_wordSign == '-') && !qw->m_fieldCode )
			//	m_qtableNegIds[negIds++] = qw->m_rawWordId;
			continue;
		}
		// get the word it is from
		QueryWord *qw = qt->m_qword;
		// get word #
		long qwn = qw - q->m_qwords;
		// assign color # for term highlighting with different colors
		qw->m_colorNum = colorNum++;
		// do not overfill table
		if ( colorNum > MAX_QUERY_WORDS_TO_MATCH ) {
			//m_maxNQT = nqt;
			break;
		}
		// this should be equivalent to the word id
		long long qid = qt->m_rawTermId;//qw->m_rawWordId;
		// but NOT for 'cheatcodes.com'
		if ( qt->m_isPhrase ) qid = qw->m_rawWordId;
		// if its a multi-word synonym, like "new jersey" we must
		// index the individual words... or compute the phrase ids
		// for all the words in the doc. right now the qid is
		// the phrase hash for this guy i think...
		if ( qt->m_synonymOf && qt->m_numAlnumWordsInSynonym == 2 )
			qid = qt->m_synWids0;
		// put in hash table
		n = ((unsigned long)qid) & mask;
		// chain to an empty slot (linear probe with wraparound)
		while ( m_qtableIds[n] && m_qtableIds[n] != qid )
			if ( ++n >= m_numSlots ) n = 0;
		// . if already occupied, do not overwrite this, keep this
		//   first word, the other is often ignored as IGNORE_REPEAT
		// . what word # in the query are we. save this.
		if ( ! m_qtableIds[n] ) m_qtableWordNums[n] = qwn;
		// store it
		m_qtableIds[n] = qid;
		// in quotes? this term may appear multiple times in the
		// query, in some cases in quotes, and in some cases not.
		// we need to know either way for logic below.
		if ( qw->m_inQuotes ) m_qtableFlags[n] |= 0x02;
		else                  m_qtableFlags[n] |= 0x01;
		// this is basically a quoted synonym
		if ( qt->m_numAlnumWordsInSynonym == 2 )
			m_qtableFlags[n] |= 0x08;
		//QueryTerm *qt = qw->m_queryWordTerm;
		if ( qt && qt->m_termSign == '+' ) m_qtableFlags[n] |= 0x04;
		//
		// if query has e-mail, then index phrase id "email" so
		// it matches "email" in the doc.
		// we need this for the 'cheat codes' query as well so it
		// highlights 'cheatcodes'
		//
		long long pid = qw->m_rawPhraseId;
		if ( pid == 0 ) continue;
		// put in hash table
		n = ((unsigned long)pid) & mask;
		// chain to an empty slot
		while ( m_qtableIds[n] && m_qtableIds[n] != pid )
			if ( ++n >= m_numSlots ) n = 0;
		// this too?
		if ( ! m_qtableIds[n] ) m_qtableWordNums[n] = qwn;
		// store it
		m_qtableIds[n] = pid;
	}
	/*
	// set what bits we need to match
	for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
		// get it
		QueryTerm *qt = &m_q->m_qterms[i];
		// get its explicit bit
		qvec_t ebit = qt->m_explicitBit;
		// must be a required term
		if ( (m_q->m_matchRequiredBits & ebit) == 0 ) continue;
		// we only check for certain fields in this logic right now
		bool skip = true;
		// if no field, must match it
		if ( qt->m_fieldCode == 0 ) skip = false;
		if ( qt->m_fieldCode == FIELD_GBLANG ) skip = false;
		if ( qt->m_fieldCode == FIELD_GBCOUNTRY ) skip = false;
		if ( qt->m_fieldCode == FIELD_SITE ) skip = false;
		if ( qt->m_fieldCode == FIELD_IP ) skip = false;
		if ( qt->m_fieldCode == FIELD_URL ) skip = false;
		if ( skip ) continue;
		// we need this ebit
		m_matchableRequiredBits |= ebit;
	}
	*/
}
// . this was in Summary.cpp, but is more useful here
// . we can also use this to replace the proximity algo setup where it
// fills in the matrix for title, link text, etc.
// . returns false and sets g_errno on error
// . Build the match arrays for document "xd" against the query set by
//   setQuery(). Scans, in order: body, title, first url, raw title-tag
//   span, DMOZ titles/summaries, accepted meta tags, then inlink text,
//   surrounding ("neighborhood") text and RSS title/description for both
//   LinkInfo1 and LinkInfo2 (two passes via the "loop:" label).
// . Each text source becomes one match group via addMatches().
// . Returns false and sets g_errno on error.
bool Matches::set ( XmlDoc *xd ,
		    Words *bodyWords ,
		    //Synonyms *bodySynonyms,
		    Phrases *bodyPhrases ,
		    Sections *bodySections ,
		    Bits *bodyBits ,
		    Pos *bodyPos ,
		    Xml *bodyXml ,
		    Title *tt ,
		    long niceness ) {
	// don't reset query info! (reset() leaves setQuery()'s table alone)
	reset();
	// sanity check: we read xd->m_docId below
	if ( ! xd->m_docIdValid ) { char *xx=NULL;*xx=0; }
	// . first add all the matches in the body of the doc
	// . add it first since it will kick out early if too many matches
	//   and we get all the explicit bits matched
	if ( ! addMatches ( bodyWords ,
			    //bodySynonyms ,
			    bodyPhrases ,
			    bodySections ,
			    //addToMatches ,
			    bodyBits ,
			    bodyPos ,
			    0 , // fieldCode of words, 0 for no field
			    true , // allowPunctInPhrase,
			    false , // exclQTOnlyinAnchTxt,
			    0 , // qvec_t reqMask ,
			    0 , // qvec_t negMask ,
			    1 , // long diversityWeight,
			    xd->m_docId,
			    MF_BODY ) )
		return false;
	// add the title in
	if ( ! addMatches ( tt->getTitle() ,
			    tt->getTitleSize() ,
			    MF_TITLEGEN ,
			    xd->m_docId ,
			    niceness ))
		return false;
	// add in the url terms
	Url *turl = xd->getFirstUrl();
	if ( ! addMatches ( turl->m_url ,
			    turl->m_ulen ,
			    MF_URL ,
			    xd->m_docId ,
			    niceness ) )
		return false;
	// also use the title from the title tag, because sometimes
	// it does not equal "tt->getTitle()"
	long a = tt->m_titleTagStart;
	long b = tt->m_titleTagEnd;
	char *start = NULL;
	char *end   = NULL;
	// [a,b) are word indices into the body; -1 means no title tag
	if ( a >= 0 && b >= 0 ) {
		start = bodyWords->getWord(a);
		end   = bodyWords->getWord(b-1) + bodyWords->getWordLen(b-1);
		if ( ! addMatches ( start ,
				    end - start ,
				    MF_TITLETAG ,
				    xd->m_docId ,
				    niceness ))
			return false;
	}
	// add in dmoz stuff. titles and summaries are stored as
	// back-to-back NUL-terminated strings, one pair per catid.
	char *dt = xd->ptr_dmozTitles;
	char *ds = xd->ptr_dmozSumms;
	long  nd = xd->size_catIds / 4;
	for ( long i = 0 ; i < nd ; i++ ) {
		// sanity check: empty string marks the end
		if ( ! dt[0] ) break;
		// add each dmoz title
		if ( ! addMatches ( dt ,
				    gbstrlen(dt) ,
				    MF_DMOZTITLE ,
				    xd->m_docId ,
				    niceness ))
			return false;
		// skip past this title's NUL terminator
		dt += gbstrlen(dt) + 1;
		// sanity check
		if ( ! ds[0] ) break;
		// and the summary
		if ( ! addMatches ( ds ,
				    gbstrlen(ds) ,
				    MF_DMOZSUMM ,
				    xd->m_docId ,
				    niceness ))
			return false;
		// skip past this summary's NUL terminator
		ds += gbstrlen(ds) + 1;
	}
	// now add in the meta tags
	long     n     = bodyXml->getNumNodes();
	XmlNode *nodes = bodyXml->getNodes();
	// find the first meta summary node
	for ( long i = 0 ; i < n ; i++ ) {
		// continue if not a meta tag (node id 68)
		if ( nodes[i].m_nodeId != 68 ) continue;
		// get the meta tag's "name" attribute
		long tagLen;
		char *tag = bodyXml->getString ( i , "name" , &tagLen );
		// is it an accepted meta tag?
		long flag = 0;
		if (tagLen== 7&&strncasecmp(tag,"keyword" , 7)== 0)
			flag = MF_METAKEYW;
		if (tagLen== 7&&strncasecmp(tag,"summary" , 7)== 0)
			flag = MF_METASUMM;
		if (tagLen== 8&&strncasecmp(tag,"keywords" , 8)== 0)
			flag = MF_METAKEYW;
		if (tagLen==11&&strncasecmp(tag,"description",11)== 0)
			flag = MF_METADESC;
		if ( ! flag ) continue;
		// get the content
		long len;
		char *s = bodyXml->getString ( i , "content" , &len );
		if ( ! s || len <= 0 ) continue;
		// wordify
		if ( ! addMatches ( s ,
				    len ,
				    flag ,
				    xd->m_docId ,
				    niceness ) )
			return false;
	}
	// . now the link text
	// . loop through each link text and it its matches
	LinkInfo *info = xd->getLinkInfo1();
	// this is not the second pass, it is the first pass
	bool secondPass = false;
 loop:
	// loop through the Inlinks
	Inlink *k = NULL;
	for ( ; (k = info->getNextInlink(k)) ; ) {
		// does it have link text? skip if not.
		// (size includes the NUL, so <= 1 means empty)
		if ( k->size_linkText <= 1 ) continue;
		// set the flag, the type of match
		mf_t flags = MF_LINK;
		//if ( k->m_isAnomaly ) flags = MF_ALINK;
		// add it in
		if ( ! addMatches ( k->ptr_linkText ,
				    k->size_linkText - 1 ,
				    flags ,
				    xd->m_docId ,
				    niceness ))
			return false;
		// skip if no neighborhood text
		//if ( k->size_surroundingText <= 1 ) continue;
		// set flag for that
		flags = MF_HOOD;
		//if ( k->m_isAnomaly ) flags = MF_AHOOD;
		// add it in
		if ( ! addMatches ( k->ptr_surroundingText ,
				    k->size_surroundingText - 1 ,
				    flags ,
				    xd->m_docId ,
				    niceness ))
			return false;
		// parse the rss up into xml
		Xml rxml;
		if ( ! k->setXmlFromRSS ( &rxml , niceness ) ) return false;
		// add rss description
		bool isHtmlEncoded;
		long rdlen;
		char *rd = rxml.getRSSDescription ( &rdlen , &isHtmlEncoded );
		if ( ! addMatches ( rd ,
				    rdlen ,
				    MF_RSSDESC ,
				    xd->m_docId ,
				    niceness ))
			return false;
		// add rss title
		long rtlen;
		char *rt = rxml.getRSSTitle ( &rtlen , &isHtmlEncoded );
		if ( ! addMatches ( rt ,
				    rtlen ,
				    MF_RSSTITLE ,
				    xd->m_docId ,
				    niceness ))
			return false;
	}
	// now repeat for imported link text!
	if ( ! secondPass ) {
		// only do this once
		secondPass = true;
		// set it
		info = *xd->getLinkInfo2();
		if ( info ) goto loop;
	}
	/*
	// convenience
	Query *q = m_q;
	// any error we have will be this
	g_errno = EMISSINGQUERYTERMS;
	// . add in match bits from query!
	// . used for the BIG HACK
	for( long i = 0; i < q->m_numTerms ; i++ ) {
		// get it
		QueryTerm *qt = &q->m_qterms[i];
		bool isNeg = qt->m_termSign == '-';
		qvec_t ebit = qt->m_explicitBit;
		// save it
		long fc = qt->m_fieldCode;
		// . length stops at space for fielded terms
		// . get word
		QueryWord *w = qt->m_qword;
		// get word index
		long wi = w - q->m_qwords;
		// point to word
		char *qw = q->m_qwords[wi].m_word;
		// total length
		long qwLen = 0;
		// keep including more words until not in field anymore
		for ( ; wi < q->m_numWords ; wi++ ) {
			if ( q->m_qwords[wi].m_fieldCode != fc ) break;
			// include its length
			qwLen += q->m_qwords[wi].m_wordLen;
		}
		if( !qw || !qwLen )
			return log( "query: Error, no query word found!" );
		char tmp[512];
		//long tmpLen;
		//tmpLen = utf16ToUtf8( tmp, 512, qw, qwLen );
		long tmpLen = qwLen;
		if ( tmpLen > 500 ) tmpLen = 500;
		memcpy ( tmp , qw , tmpLen );
		tmp[tmpLen] = '\0';
		log(LOG_DEBUG,"query: term#=%li fieldLen=%li:%s",i,tmpLen,tmp);
		if ( fc == FIELD_GBLANG ) {
			char lang = atoi( tmp );
			log( LOG_DEBUG, "query: TitleRec "
			     "Lang=%i", *xd->getLangId() );
			if( q->m_isBoolean ) {
				if (*xd->getLangId() == lang)
					m_explicitsMatched |= ebit;
				continue;
			}
			if ( isNeg && (*xd->getLangId() == lang)){
				if( q->m_hasUOR ) continue;
				return log("query: Result contains "
					   "-gblang: term, filtering. "
					   " q=%s", q->m_orig);
			}
			else if( !isNeg
				 && (*xd->getLangId() != lang)){
				if( q->m_hasUOR ) continue;
				return log("query: Result is missing "
					   "gblang: term, filtering. "
					   "q=%s", q->m_orig);
			}
			else
				m_explicitsMatched |= ebit;
		}
		else if ( fc == FIELD_GBCOUNTRY ) {
			unsigned char country ;
			country = g_countryCode.getIndexOfAbbr(tmp);
			log( LOG_DEBUG, "query: TitleRec "
			     "Country=%i", *xd->getCountryId() );
			if ( q->m_isBoolean ) {
				if ( *xd->getCountryId() == country)
					m_explicitsMatched |= ebit;
				continue;
			}
			if ( isNeg && (*xd->getCountryId() == country)){
				if( q->m_hasUOR ) continue;
				return log("query: Result contains "
					   "-gbcountry: term, filtering. "
					   " q=%s", q->m_orig);
			}
			else if ( !isNeg && (*xd->getCountryId() != country)){
				if( q->m_hasUOR ) continue;
				return log("query: Result is missing "
					   "gbcountry: term, filtering. "
					   "q=%s", q->m_orig);
			}
			else
				m_explicitsMatched |= ebit;
		}
		else if( fc == FIELD_SITE ) {
			// . Site Colon Field Terms:
			//   1.) match tld first (if only tld)
			//   2.) match domain (contains tld)
			//   3.) match host (sub-domain)
			//   4.) match path
			//   * 1 is the minimal specificity for
			//     a site: query. 2,3, and 4 are
			//     only required if specified in
			//     query
			bool fail = false;
			Url *turl = xd->getFirstUrl();
			char *ttld = turl->getTLD();
			long ttlen = turl->getTLDLen();
			char *tdom = turl->getDomain();
			long tdlen = turl->getDomainLen();
			char *thost = turl->getHost();
			long thlen = turl->getHostLen();
			char *tpath = turl->getPath();
			long tplen = turl->getPathLen();
			//bool hasWWW = turl->isHostWWW();
			log( LOG_DEBUG, "query: TitleRec "
			     "Site=%s", tdom );
			// . Check to see if site: is querying
			//   only a TLD, then we can't put it
			//   into Url.
			if(isTLD(tmp, tmpLen)) {
				if(ttlen != tmpLen ||
				   strncmp(ttld, tmp, tmpLen))
					fail = true;
			}
			else {
				Url qurl;
				// false --> add www?
				qurl.set( tmp, tmpLen, false);//hasWWW );
				char *qdom = qurl.getDomain();
				long qdlen = qurl.
					getDomainLen();
				char *qhost = qurl.getHost();
				long qhlen = qurl.getHostLen();
				char *qpath = qurl.getPath();
				long qplen = qurl.getPathLen();
				if(tdlen != qdlen ||
				   strncmp(tdom, qdom, qdlen))
					fail = true;
				if(!fail &&
				   qhlen != qdlen &&
				   (thlen != qhlen ||
				    strncmp(thost,
					    qhost, qhlen)))
					fail = true;
				if(!fail && qplen > 1 &&
				   (tplen < qplen ||
				    strncmp(tpath,
					    qpath, qplen)))
					fail = true;
			}
			if( q->m_isBoolean){
				if ( ! fail )
					m_explicitsMatched |= ebit;
				continue;
			}
			if( fail && !isNeg ){
				if( q->m_hasUOR ) continue;
				return log("query: Result is missing "
					   "site: term, filtering. "
					   "q=%s", q->m_orig);
			}
			else if( !fail && isNeg ){
				if( q->m_hasUOR ) continue;
				return log("query: Result contains "
					   "-site: term, filtering. "
					   "q=%s", q->m_orig );
			}
			else
				m_explicitsMatched |= ebit;
		}
		else if ( fc == FIELD_IP ) {
			long ip = *xd->getIp();
			char *oip = iptoa( ip );
			log(LOG_DEBUG, "query: TitleRec Ip=%s", oip );
			long olen = gbstrlen(oip);
			bool matched = false;
			if (olen>=tmpLen && strncmp(oip,tmp,tmpLen)==0 )
				matched = true;
			if( q->m_isBoolean){
				if (matched) m_explicitsMatched |= ebit;
				continue;
			}
			if ( ! matched && ! isNeg ) {
				if( q->m_hasUOR ) continue;
				return log("query: Result is missing ip: term,"
					   " filtering. q=%s", q->m_orig );
			}
			else if ( matched && isNeg ) {
				if( q->m_hasUOR ) continue;
				return log("query: Result contains -ip: term, "
					   "filtering. q=%s", q->m_orig );
			}
			else
				m_explicitsMatched |= ebit;
		}
		else if ( fc == FIELD_URL ) {
			char *url = xd->getFirstUrl()->getUrl();
			long slen = xd->getFirstUrl()->getUrlLen();
			Url u;
			// do not force add the "www." cuz titleRec does not
			u.set( tmp, tmpLen, false );//true );
			char * qs = u.getUrl();
			long qsl = u.getUrlLen();
			log( LOG_DEBUG, "query: TitleRec Url=%s", url );
			if( qsl > slen ) qsl = slen;
			long result = strncmp( url, qs, qsl );
			if( q->m_isBoolean){
				if (result)
					m_explicitsMatched |= ebit;
				continue;
			}
			if( result && !isNeg ){
				if( q->m_hasUOR ) continue;
				return log("query: Result is missing "
					   "url: term, filtering. q=%s",
					   q->m_orig );
			}
			else if( !result && isNeg ){
				if( q->m_hasUOR ) continue;
				return log("query: Result contains "
					   "-url: term, filtering. "
					   "q=%s", q->m_orig );
			}
			else
				m_explicitsMatched |= ebit;
		}
	}
	// clear just in case
	g_errno = 0;
	// what bits are not matchable
	qvec_t unmatchable = m_q->m_matchRequiredBits -m_matchableRequiredBits;
	// modify what we got
	qvec_t matched = m_explicitsMatched | unmatchable;
	// need to set Query::m_bmap before calling getBitScore()
	if ( ! m_q->m_bmapIsSet ) m_q->setBitMap();
	// if boolean, do the truth table
	long bitScore = m_q->getBitScore ( matched );
	// assume we are missing some. if false, may still be in the results
	// if we have rat=0 (Require All Terms = false)
	m_hasAllQueryTerms = false;
	// assume not a match. if this is false big hack excludes from results
	m_matchesQuery = false;
	// see Query.h for these bits defined. do not include 0x80 because
	// we may not have any forced bits...
	if ( bitScore & (0x20|0x40) ) m_matchesQuery = true;
	// it may not have all the query terms because of rat=0
	if ( (matched & m_q->m_matchRequiredBits)== m_q->m_matchRequiredBits ){
		m_hasAllQueryTerms = true;
		m_matchesQuery = true;
	}
	*/
	// that should be it
	return true;
}
// . Wordify raw text [s,s+slen) and add its query matches as a new
//   match group tagged with "flags" (MF_TITLEGEN, MF_URL, MF_LINK, ...).
// . Builds temporary Words/Bits/Pos classes for the text and delegates
//   to the main addMatches() overload.
// . If the text produced no matches, the group's buffers are freed and
//   the group slot is reclaimed.
// . Returns false and sets g_errno on error; returns true (a no-op) if
//   we already have MAX_MATCHGROUPS groups.
bool Matches::addMatches ( char *s ,
			   long slen ,
			   mf_t flags ,
			   long long docId ,
			   long niceness ) {
	// . do not breach
	// . happens a lot with a lot of link info text
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		// . log it
		// . often we have a ton of inlink text!!
		//log("matches: could not add matches1 for docid=%lli because "
		//    "already have %li matchgroups",docId,
		//    (long)MAX_MATCHGROUPS);
		return true;
	}
	// get some new ptrs for this match group
	Words *wp = &m_wordsArray [ m_numMatchGroups ];
	//Sections *sp = &m_sectionsArray [ m_numMatchGroups ];
	Sections *sp = NULL;
	Bits *bp = &m_bitsArray [ m_numMatchGroups ];
	Pos  *pb = &m_posArray  [ m_numMatchGroups ];
	// set the words class for this match group
	if ( ! wp->set ( s ,
			 slen , // in bytes
			 TITLEREC_CURRENT_VERSION ,
			 true , // computeIds?
			 niceness ))
		return false;
	// scores vector
	//if ( ! sp->set ( wp , TITLEREC_CURRENT_VERSION , false ) )
	//	return false;
	// bits vector
	if ( ! bp->setForSummary ( wp ) )
		return false;
	// position vector (sp is NULL since sections are disabled)
	if ( ! pb->set ( wp , sp ) )
		return false;
	// record the start so we can tell if the call below added anything
	long startNumMatches = m_numMatches;
	// sometimes it returns true w/o incrementing this
	long n = m_numMatchGroups;
	// . add all the Match classes from this match group
	// . this increments m_numMatchGroups on success
	bool status = addMatches ( wp ,
				   //NULL , // synonyms
				   NULL , // phrases
				   sp ,
				   //true , // addToMatches
				   bp , // bits
				   pb , // pos
				   0 , // fieldCode
				   true , // allowPunctInPhrase?
				   false , // excludeQTOnlyInAnchTxt?
				   0 , // reqMask
				   0 , // negMask
				   1 , // diversityWeight
				   docId ,
				   flags );// docId
	// if this matchgroup had some, matches, then keep it
	if ( m_numMatches > startNumMatches ) return status;
	// otherwise, reset it, useless
	wp->reset();
	if ( sp ) sp->reset();
	bp->reset();
	pb->reset();
	// do not decrement the counter if we never incremented it
	if ( n == m_numMatchGroups ) return status;
	// ok, remove it
	m_numMatchGroups--;
	return status;
}
// Find the first match group whose flags equal "matchFlag" and hand back
// pointers to its Words and Pos arrays. Sections are currently disabled,
// so *sp is always set to NULL on a hit. Returns true if a group of that
// type exists, false otherwise (output pointers untouched on failure).
bool Matches::getMatchGroup ( mf_t matchFlag ,
			      Words **wp ,
			      Pos **pp ,
			      Sections **sp ) {
	long g = 0;
	while ( g < m_numMatchGroups ) {
		// looking for this exact group type only
		if ( m_flags[g] == matchFlag ) {
			*wp = &m_wordsArray[g];
			*pp = &m_posArray[g];
			//*sp = &m_sectionsArray[g];
			*sp = NULL;
			return true;
		}
		g++;
	}
	// no group of that type
	return false;
}
// . TODO: support stemming later. each word should then have multiple ids.
// . add to our m_matches[] array iff addToMatches is true, otherwise we just
// set the m_foundTermVector for doing the BIG HACK described in Summary.cpp
bool Matches::addMatches ( Words *words ,
//Synonyms *syn ,
Phrases *phrases ,
Sections *sections ,
Bits *bits ,
Pos *pos ,
long fieldCode , // of words,0=none
bool allowPunctInPhrase ,
bool exclQTOnlyinAnchTxt ,
qvec_t reqMask ,
qvec_t negMask ,
long diversityWeight ,
long long docId ,
mf_t flags ) {
// if no query term, bail.
if ( m_numSlots <= 0 ) return true;
// . do not breach
// . happens a lot with a lot of link info text
if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
// . log it
// . often we have a ton of inlink text!!
//log("matches: could not add matches2 for docid=%lli because "
// "already have %li matchgroups",docId,
// (long)MAX_MATCHGROUPS);
return true;
}
// shortcut
Section *sp = NULL;
if ( sections ) sp = sections->m_sections;
// we've added a lot of matches, if we don't need anymore
// to confirm the big hack then break out
//if ( m_numMatches >= MAX_MATCHES &&
// ( m_explicitsMatched & m_matchableRequiredBits ) )
// return true;
mf_t eflag = 0;
// set the ptrs
m_wordsPtr [ m_numMatchGroups ] = words;
m_sectionsPtr [ m_numMatchGroups ] = sections;
m_bitsPtr [ m_numMatchGroups ] = bits;
m_posPtr [ m_numMatchGroups ] = pos;
m_flags [ m_numMatchGroups ] = flags;
m_numMatchGroups++;
long long *pids = NULL;
if ( phrases ) pids = phrases->getPhraseIds2();
// set convenience vars
unsigned long mask = m_numSlots - 1;
long long *wids = words->getWordIds();
long *wlens = words->getWordLens();
char **wptrs = words->getWords();
// swids = word ids where accent marks, etc. are stripped
//long long *swids = words->getStripWordIds();
nodeid_t *tids = words->getTagIds();
long nw = words->m_numWords;
//long *wscores = NULL;
//if ( scores ) wscores = scores->m_scores;
long n;//,n2 ;
long matchStack = 0;
long long nextMatchWordIdMustBeThis = 0;
long nextMatchWordPos = 0;
long lasti = -3;
//bool inAnchTag = false;
long dist = 0;
// . every tag increments "dist" by a value
// . rather than use a switch/case statement, which does a binary
// lookup thing which is really slow, let's use a 256 bucket table
// for constant lookup, rather than log(N).
static char s_tableInit = false;
static int8_t s_tab[512];
if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
for ( long i = 0 ; ! s_tableInit && i < 128 ; i++ ) {
char step = 0;
if ( i == TAG_TR ) step = 2;
if ( i == TAG_P ) step = 10;
if ( i == TAG_HR ) step = 10;
if ( i == TAG_H1 ) step = 10;
if ( i == TAG_H2 ) step = 10;
if ( i == TAG_H3 ) step = 10;
if ( i == TAG_H4 ) step = 10;
if ( i == TAG_H5 ) step = 10;
if ( i == TAG_H6 ) step = 10;
if ( i == TAG_TABLE ) step = 30;
if ( i == TAG_BLOCKQUOTE ) step = 10;
// default
if ( step == 0 ) {
if ( g_nodes[i].m_isBreaking ) step = 10;
else step = 1;
}
// account for both the back and the front tags
s_tab[i ] = step;
//s_tab[i|0x80] = step;
}
s_tableInit = true;
// google seems to index SEC_MARQUEE so i took that out of here
long badFlags =SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
//long anum;
//long long *aids;
//long j;
long qwn;
long numQWords;
long numWords;
//
// . set m_matches[] array
// . loop over all words in the document
//
for ( long i = 0 ; i < nw ; i++ ) {
//if (tids && (tids[i] ) == TAG_A)
// inAnchTag = true;
//else if (tids && (tids[i]&BACKBITCOMP) == TAG_A)
// inAnchTag = false;
// for each word increment distance
dist++;
//if ( addToMatches && tids && tids[i] ){
if ( tids && tids[i] ){
long tid = tids[i] & BACKBITCOMP;
// accumulate distance
dist += s_tab[tid];
// monitor boundaries so that the proximity algo
// knows when two matches are separated by such tags
// MDW: isn't the "dist" good enough for this?????
// let's try just using "dist" then.
// "crossedSection" is hereby replaced by "dist".
//if ( s_tab[tid]
// tagIds don't have wids and are skipped
continue;
}
// skip if wid is 0, it is not an alnum word then
if ( ! wids[i] ) {
// and extra unit if it starts with \n i guess
if ( words->m_words[i][0] == '\n' ) dist++;
// dist += words->m_wordLens[i] / 3;
continue;
}
// count the number of alnum words
m_numAlnums++;
// clear this
eflag = 0;
// . zero score words cannot match query terms either
// . BUT if score is -1 that means it is in a