open-source-search-engine/Matches.cpp
mwells 87285ba3cd use gbmemcpy not memcpy so we can get profiler working again
since memcpy can't be interrupted and backtrace() called.
2015-01-13 12:25:42 -07:00

1832 lines
55 KiB
C++

#include "gb-include.h"
#include "Matches.h"
#include "Titledb.h" // for getting total # of docs in db
#include "StopWords.h"
#include "Phrases.h"
#include "Title.h"
#include "CountryCode.h"
#include "Domains.h"
#include "Sections.h"
#include "XmlDoc.h"
//#define DEBUG_MATCHES 1
// TODO: have Matches set itself from all the meta tags, titles, link text,
// neighborhoods and body. then proximity algo can utilize that info
// as well as the summary generator, Summary.cpp. right now prox algo
// was setting all those different classes itself.
// TODO: toss m_tscores. make Summary::getBestWindow() just use the
// scores array itself. just access it with Match::m_queryWordNum.
// . construct an empty Matches with no match groups
// . m_numMatchGroups must be zeroed BEFORE reset() is called because
//   reset() walks the first m_numMatchGroups entries of the group arrays
Matches::Matches ( ) {
	// no groups yet, so reset() below has nothing to free
	m_numMatchGroups   = 0;
	// sub-phrase detection is off unless somebody turns it on
	m_detectSubPhrases = false;
	// zero out the remaining counters
	reset();
}
// release whatever the match groups are still holding on to
Matches::~Matches ( ) {
	reset();
}
// . forget all matches and match groups
// . does NOT touch the query info (m_q and the query hash table), so
//   setQuery() does not have to be called again after this
void Matches::reset ( ) {
	// free the buffers of every match group we currently own
	int32_t g = 0;
	while ( g < m_numMatchGroups ) {
		m_wordsArray [g].reset();
		//m_sectionsArray[g].reset();
		m_posArray   [g].reset();
		m_bitsArray  [g].reset();
		g++;
	}
	// now we have no groups, no matches and no alnum words counted
	m_numMatchGroups = 0;
	m_numMatches     = 0;
	m_numAlnums      = 0;
	//m_maxNQT = -1;
	//m_explicitsMatched = 0;
	//m_matchableRequiredBits = 0;
	//m_hasAllQueryTerms = false;
	//m_matchesQuery = false;
}
// . returns true if query term "qt" is one we should try to find in the
//   document, false if it should be left out of the query hash table
// . negative terms are deliberately NOT excluded here: we need to match
//   negative words/phrases too so they can be big hacked out
// . query stop words are deliberately NOT excluded either, so that the
//   stop words in a query like 'all the king's men' get highlighted
bool Matches::isMatchableTerm ( QueryTerm *qt ) {
	// the query word this term came from
	QueryWord *word = qt->m_qword;
	// not derived from a query word? how?
	if ( ! word ) return false;
	// these ways of being ignored rule out matching
	if ( word->m_ignoreWord == IGNORE_DEFAULT   ||
	     word->m_ignoreWord == IGNORE_FIELDNAME ||
	     word->m_ignoreWord == IGNORE_BOOLOP     )
		return false;
	// fielded terms are out, except title: terms which we do highlight
	if ( word->m_fieldCode && word->m_fieldCode != FIELD_TITLE )
		return false;
	// which word # of the query is this?
	int32_t wordNum = word - m_q->m_qwords;
	// do not include if in a quote and does not start it!!
	if ( word->m_quoteStart >= 0 && word->m_quoteStart != wordNum )
		return false;
	// . if query is too long, a query word can be truncated!
	// . this happens for some words if they are ignored, too!
	if ( ! word->m_queryWordTerm && ! word->m_queryPhraseTerm )
		return false;
	// matchable unless it comes after a NOT operator
	return ! word->m_underNOT;
}
// . a QueryMatch is either a quoted phrase in the query or a single
//   query word
// . all members are public, plain data
struct QueryMatch {
	// first word of the inclusive range [m_a,m_b] in Query::m_qwords
	int32_t m_a;
	// last word of the inclusive range [m_a,m_b] in Query::m_qwords
	int32_t m_b;
	// lowest of the term freqs
	int32_t m_score;
};
// . hash the matchable terms of query "q" into m_qtableIds[] so that
//   addMatches() can look document word/phrase ids up quickly
// . m_qtableWordNums[] maps an occupied slot back to its query word #
// . m_qtableFlags[] carries per-slot bits:
//     0x01 = term appears outside of quotes somewhere in the query
//     0x02 = term appears inside quotes somewhere in the query
//     0x04 = term has a '+' sign
//     0x08 = two-word synonym, basically a quoted synonym
// . the table is open-addressed with linear probing and a slot whose id
//   is 0 counts as empty, so we must never let the table fill up
//   completely or the probe loops (here and in addMatches()) would spin
//   forever
// . also assigns QueryWord::m_colorNum for multi-color highlighting
void Matches::setQuery ( Query *q ) {
	reset();
	// save it
	m_q = q;
	// this is word based, one flag entry per query WORD
	memset ( m_qwordFlags , 0 , m_q->m_numWords * sizeof(mf_t) );
	// # of TERMS (not words) in the query
	int32_t nqt = m_q->m_numTerms;
	// how many query terms do we have that can be matched?
	int32_t numToMatch = 0;
	for ( int32_t i = 0 ; i < nqt ; i++ ) {
		// skip if ignored *in certain ways only*
		if ( ! isMatchableTerm ( &m_q->m_qterms[i] ) ) continue;
		// count it
		numToMatch++;
		// don't breach. MDW: i made this >= from > (2/11/09)
		if ( numToMatch < MAX_QUERY_WORDS_TO_MATCH ) continue;
		// note it. (whitespace around INT32 keeps this well-formed
		// for C++11 and later, the resulting string is unchanged)
		log("matches: hit %" INT32 " max query words to match limit",
		    (int32_t)MAX_QUERY_WORDS_TO_MATCH);
		break;
	}
	// fix a core the hack way for now!
	if ( numToMatch < 256 ) numToMatch = 256;
	// keep number of slots in hash table a power of two for fast hashing
	m_numSlots = getHighestLitBitValue ( (uint32_t)(numToMatch * 3));
	// make the hash mask
	uint32_t mask = m_numSlots - 1;
	int32_t n;
	// sanity check
	if ( m_numSlots > MAX_QUERY_WORDS_TO_MATCH * 3 ) {
		char *xx = NULL; *xx = 0; }
	// clear hash table. ids are 8 bytes each, flags 1 byte each.
	memset ( m_qtableIds   , 0 , m_numSlots * 8 );
	memset ( m_qtableFlags , 0 , m_numSlots );
	// . every term below can insert TWO ids (word id and phrase id)
	// . NOTE(review): getHighestLitBitValue() presumably rounds DOWN to
	//   a power of two, which would leave as few as 1.5 slots per
	//   counted term -- confirm
	// . so count the slots we occupy and always keep one slot empty so
	//   that every linear probe is guaranteed to terminate
	int32_t numUsed = 0;
	int32_t maxUsed = m_numSlots - 1;
	bool    full    = false;
	// alternate colors for highlighting
	int32_t colorNum = 0;
	// . hash all the query terms into the hash table
	// . the term's score should be 100 for a very rare term,
	//   and 1 for a stop word.
	for ( int32_t i = 0 ; i < nqt ; i++ ) {
		// get query term #i
		QueryTerm *qt = &m_q->m_qterms[i];
		// skip if ignored *in certain ways only*
		if ( ! isMatchableTerm ( qt ) ) continue;
		// get the word it is from
		QueryWord *qw = qt->m_qword;
		// get word #
		int32_t qwn = qw - q->m_qwords;
		// assign color # for term highlighting with different colors
		qw->m_colorNum = colorNum++;
		// do not overfill table
		if ( colorNum > MAX_QUERY_WORDS_TO_MATCH ) break;
		// this should be equivalent to the word id
		int64_t qid = qt->m_rawTermId;
		// but NOT for 'cheatcodes.com'
		if ( qt->m_isPhrase ) qid = qw->m_rawWordId;
		// if its a multi-word synonym, like "new jersey" we must
		// index the individual words... or compute the phrase ids
		// for all the words in the doc. right now the qid is
		// the phrase hash for this guy i think...
		if ( qt->m_synonymOf && qt->m_numAlnumWordsInSynonym == 2 )
			qid = qt->m_synWids0;
		// put in hash table
		n = ((uint32_t)qid) & mask;
		// chain to an empty slot, or to the slot already holding qid
		while ( m_qtableIds[n] && m_qtableIds[n] != qid )
			if ( ++n >= m_numSlots ) n = 0;
		// . if already occupied, do not overwrite this, keep this
		//   first word, the other is often ignored as IGNORE_REPEAT
		// . what word # in the query are we. save this.
		if ( ! m_qtableIds[n] ) {
			// never take the last empty slot
			if ( numUsed >= maxUsed ) { full = true; break; }
			// a zero id leaves the slot looking empty
			if ( qid ) numUsed++;
			m_qtableWordNums[n] = qwn;
		}
		// store it
		m_qtableIds[n] = qid;
		// in quotes? this term may appear multiple times in the
		// query, in some cases in quotes, and in some cases not.
		// we need to know either way for logic below.
		if ( qw->m_inQuotes ) m_qtableFlags[n] |= 0x02;
		else                  m_qtableFlags[n] |= 0x01;
		// this is basically a quoted synonym
		if ( qt->m_numAlnumWordsInSynonym == 2 )
			m_qtableFlags[n] |= 0x08;
		// explicitly required with a plus sign?
		if ( qt->m_termSign == '+' ) m_qtableFlags[n] |= 0x04;
		//
		// if query has e-mail, then index phrase id "email" so
		// it matches "email" in the doc.
		// we need this for the 'cheat codes' query as well so it
		// highlights 'cheatcodes'
		//
		int64_t pid = qw->m_rawPhraseId;
		if ( pid == 0 ) continue;
		// put in hash table
		n = ((uint32_t)pid) & mask;
		// chain to an empty slot, or to the slot already holding pid
		while ( m_qtableIds[n] && m_qtableIds[n] != pid )
			if ( ++n >= m_numSlots ) n = 0;
		// this too?
		if ( ! m_qtableIds[n] ) {
			// never take the last empty slot
			if ( numUsed >= maxUsed ) { full = true; break; }
			numUsed++;
			m_qtableWordNums[n] = qwn;
		}
		// store it
		m_qtableIds[n] = pid;
	}
	// note it if we had to leave query terms out
	if ( full )
		log("matches: query hash table is full. ignoring the "
		    "remaining query terms.");
	/*
	// set what bits we need to match
	for ( int32_t i = 0 ; i < m_q->m_numTerms ; i++ ) {
		// get it
		QueryTerm *qt = &m_q->m_qterms[i];
		// get its explicit bit
		qvec_t ebit = qt->m_explicitBit;
		// must be a required term
		if ( (m_q->m_matchRequiredBits & ebit) == 0 ) continue;
		// we only check for certain fields in this logic right now
		bool skip = true;
		// if no field, must match it
		if ( qt->m_fieldCode == 0 ) skip = false;
		if ( qt->m_fieldCode == FIELD_GBLANG ) skip = false;
		if ( qt->m_fieldCode == FIELD_GBCOUNTRY ) skip = false;
		if ( qt->m_fieldCode == FIELD_SITE ) skip = false;
		if ( qt->m_fieldCode == FIELD_IP ) skip = false;
		if ( qt->m_fieldCode == FIELD_URL ) skip = false;
		if ( skip ) continue;
		// we need this ebit
		m_matchableRequiredBits |= ebit;
	}
	*/
}
// . this was in Summary.cpp, but is more useful here
// . we can also use this to replace the proximity algo setup where it
//   fills in the matrix for title, link text, etc.
// . adds one match group per text source of the document, in this order:
//   body, generated title, url, <title> tag, dmoz titles/summaries,
//   meta tags, then for each inlink its link text, neighborhood text,
//   rss description and rss title (local link info first, then the
//   imported link info)
// . setQuery() must have been called first. the query info is kept.
// . returns false and sets g_errno on error
bool Matches::set ( XmlDoc   *xd           ,
		    Words    *bodyWords    ,
		    //Synonyms *bodySynonyms,
		    Phrases  *bodyPhrases  ,
		    Sections *bodySections ,
		    Bits     *bodyBits     ,
		    Pos      *bodyPos      ,
		    Xml      *bodyXml      ,
		    Title    *tt           ,
		    int32_t   niceness     ) {
	// don't reset query info! this only drops matches and match groups
	reset();
	// sanity check
	if ( ! xd->m_docIdValid ) { char *xx=NULL;*xx=0; }
	// every group we add is tagged with this docid
	int64_t docId = xd->m_docId;

	// . first add all the matches in the body of the doc
	// . add it first since it will kick out early if too many matches
	//   and we get all the explicit bits matched
	bool ok = addMatches ( bodyWords    ,
			       bodyPhrases  ,
			       bodySections ,
			       bodyBits     ,
			       bodyPos      ,
			       0            , // fieldCode of words, 0 for no field
			       true         , // allowPunctInPhrase,
			       false        , // exclQTOnlyinAnchTxt,
			       0            , // qvec_t reqMask ,
			       0            , // qvec_t negMask ,
			       1            , // int32_t diversityWeight,
			       docId        ,
			       MF_BODY      );
	if ( ! ok ) return false;

	// add the title in
	if ( ! addMatches ( tt->getTitle() , tt->getTitleLen() ,
			    MF_TITLEGEN , docId , niceness ) )
		return false;

	// add in the url terms
	Url *turl = xd->getFirstUrl();
	if ( ! addMatches ( turl->m_url , turl->m_ulen ,
			    MF_URL , docId , niceness ) )
		return false;

	// also use the title from the title tag, because sometimes
	// it does not equal "tt->getTitle()"
	int32_t a = tt->m_titleTagStart;
	int32_t b = tt->m_titleTagEnd;
	// need a non-empty, non-negative word range [a,b)
	if ( a >= 0 && b > a ) {
		char *start = bodyWords->getWord(a);
		char *end   = bodyWords->getWord(b-1) +
			      bodyWords->getWordLen(b-1);
		if ( ! addMatches ( start , end - start ,
				    MF_TITLETAG , docId , niceness ) )
			return false;
	}

	// . add in dmoz stuff
	// . titles and summaries are each a list of back-to-back
	//   NUL-terminated strings
	char *dt = xd->ptr_dmozTitles;
	char *ds = xd->ptr_dmozSumms;
	int32_t nd = xd->size_catIds / 4;
	for ( int32_t i = 0 ; i < nd ; i++ ) {
		// sanity check
		if ( ! dt[0] ) break;
		// add each dmoz title
		if ( ! addMatches ( dt , gbstrlen(dt) ,
				    MF_DMOZTITLE , docId , niceness ) )
			return false;
		// skip to the next title
		dt += gbstrlen(dt) + 1;
		// sanity check
		if ( ! ds[0] ) break;
		// and the summary
		if ( ! addMatches ( ds , gbstrlen(ds) ,
				    MF_DMOZSUMM , docId , niceness ) )
			return false;
		// skip to the next summary
		ds += gbstrlen(ds) + 1;
	}

	// now add in the meta tags
	int32_t  n     = bodyXml->getNumNodes();
	XmlNode *nodes = bodyXml->getNodes();
	for ( int32_t i = 0 ; i < n ; i++ ) {
		// continue if not a meta tag
		if ( nodes[i].m_nodeId != 68 ) continue;
		// only get content for <meta name=..> not <meta http-equiv=..>
		int32_t tagLen;
		char *tag = bodyXml->getString ( i , "name" , &tagLen );
		// is it an accepted meta tag? the names are mutually
		// exclusive so at most one branch can fire.
		int32_t flag = 0;
		if      ( tagLen ==  7 && strncasecmp(tag,"keyword"    , 7) == 0 )
			flag = MF_METAKEYW;
		else if ( tagLen ==  7 && strncasecmp(tag,"summary"    , 7) == 0 )
			flag = MF_METASUMM;
		else if ( tagLen ==  8 && strncasecmp(tag,"keywords"   , 8) == 0 )
			flag = MF_METAKEYW;
		else if ( tagLen == 11 && strncasecmp(tag,"description",11) == 0 )
			flag = MF_METADESC;
		if ( ! flag ) continue;
		// get the content
		int32_t len;
		char *s = bodyXml->getString ( i , "content" , &len );
		if ( ! s || len <= 0 ) continue;
		// wordify
		if ( ! addMatches ( s , len , flag , docId , niceness ) )
			return false;
	}

	// . now the link text
	// . pass 0 walks the doc's own link info, pass 1 repeats the very
	//   same thing for the imported link info, if there is any
	LinkInfo *info = xd->getLinkInfo1();
	for ( int32_t pass = 0 ; pass < 2 ; pass++ ) {
		// now repeat for imported link text!
		if ( pass == 1 ) {
			info = *xd->getLinkInfo2();
			if ( ! info ) break;
		}
		// loop through the Inlinks
		Inlink *k = NULL;
		while ( (k = info->getNextInlink(k)) ) {
			// does it have link text? skip if not.
			if ( k->size_linkText <= 1 ) continue;
			// add the link text in
			//if ( k->m_isAnomaly ) flags = MF_ALINK;
			if ( ! addMatches ( k->getLinkText()     ,
					    k->size_linkText - 1 ,
					    MF_LINK , docId , niceness ) )
				return false;
			// . then the neighborhood text
			// . we can not "continue" on empty neighborhood
			//   text since the rss stuff below must still run
			//if ( k->size_surroundingText <= 1 ) continue;
			//if ( k->m_isAnomaly ) flags = MF_AHOOD;
			if ( ! addMatches ( k->getSurroundingText()     ,
					    k->size_surroundingText - 1 ,
					    MF_HOOD , docId , niceness ) )
				return false;
			// parse the rss up into xml
			Xml rxml;
			if ( ! k->setXmlFromRSS ( &rxml , niceness ) )
				return false;
			// add rss description
			bool    isHtmlEncoded;
			int32_t rdlen;
			char *rd = rxml.getRSSDescription ( &rdlen ,
							    &isHtmlEncoded );
			if ( ! addMatches ( rd , rdlen ,
					    MF_RSSDESC , docId , niceness ) )
				return false;
			// add rss title
			int32_t rtlen;
			char *rt = rxml.getRSSTitle ( &rtlen , &isHtmlEncoded );
			if ( ! addMatches ( rt , rtlen ,
					    MF_RSSTITLE , docId , niceness ) )
				return false;
		}
	}
	// what follows is the disabled "BIG HACK" filtering of fielded
	// query terms (gblang, gbcountry, site, ip, url). kept for reference.
	/*
// convenience
Query *q = m_q;
// any error we have will be this
g_errno = EMISSINGQUERYTERMS;
// . add in match bits from query!
// . used for the BIG HACK
for( int32_t i = 0; i < q->m_numTerms ; i++ ) {
// get it
QueryTerm *qt = &q->m_qterms[i];
bool isNeg = qt->m_termSign == '-';
qvec_t ebit = qt->m_explicitBit;
// save it
int32_t fc = qt->m_fieldCode;
// . length stops at space for fielded terms
// . get word
QueryWord *w = qt->m_qword;
// get word index
int32_t wi = w - q->m_qwords;
// point to word
char *qw = q->m_qwords[wi].m_word;
// total length
int32_t qwLen = 0;
// keep including more words until not in field anymore
for ( ; wi < q->m_numWords ; wi++ ) {
if ( q->m_qwords[wi].m_fieldCode != fc ) break;
// include its length
qwLen += q->m_qwords[wi].m_wordLen;
}
if( !qw || !qwLen )
return log( "query: Error, no query word found!" );
char tmp[512];
//int32_t tmpLen;
//tmpLen = utf16ToUtf8( tmp, 512, qw, qwLen );
int32_t tmpLen = qwLen;
if ( tmpLen > 500 ) tmpLen = 500;
gbmemcpy ( tmp , qw , tmpLen );
tmp[tmpLen] = '\0';
log(LOG_DEBUG,"query: term#=%"INT32" fieldLen=%"INT32":%s",i,tmpLen,tmp);
if ( fc == FIELD_GBLANG ) {
char lang = atoi( tmp );
log( LOG_DEBUG, "query: TitleRec "
"Lang=%i", *xd->getLangId() );
if( q->m_isBoolean ) {
if (*xd->getLangId() == lang)
m_explicitsMatched |= ebit;
continue;
}
if ( isNeg && (*xd->getLangId() == lang)){
if( q->m_hasUOR ) continue;
return log("query: Result contains "
"-gblang: term, filtering. "
" q=%s", q->m_orig);
}
else if( !isNeg
&& (*xd->getLangId() != lang)){
if( q->m_hasUOR ) continue;
return log("query: Result is missing "
"gblang: term, filtering. "
"q=%s", q->m_orig);
}
else
m_explicitsMatched |= ebit;
}
else if ( fc == FIELD_GBCOUNTRY ) {
unsigned char country ;
country = g_countryCode.getIndexOfAbbr(tmp);
log( LOG_DEBUG, "query: TitleRec "
"Country=%i", *xd->getCountryId() );
if ( q->m_isBoolean ) {
if ( *xd->getCountryId() == country)
m_explicitsMatched |= ebit;
continue;
}
if ( isNeg && (*xd->getCountryId() == country)){
if( q->m_hasUOR ) continue;
return log("query: Result contains "
"-gbcountry: term, filtering. "
" q=%s", q->m_orig);
}
else if ( !isNeg && (*xd->getCountryId() != country)){
if( q->m_hasUOR ) continue;
return log("query: Result is missing "
"gbcountry: term, filtering. "
"q=%s", q->m_orig);
}
else
m_explicitsMatched |= ebit;
}
else if( fc == FIELD_SITE ) {
// . Site Colon Field Terms:
// 1.) match tld first (if only tld)
// 2.) match domain (contains tld)
// 3.) match host (sub-domain)
// 4.) match path
// * 1 is the minimal specificity for
// a site: query. 2,3, and 4 are
// only required if specified in
// query
bool fail = false;
Url *turl = xd->getFirstUrl();
char *ttld = turl->getTLD();
int32_t ttlen = turl->getTLDLen();
char *tdom = turl->getDomain();
int32_t tdlen = turl->getDomainLen();
char *thost = turl->getHost();
int32_t thlen = turl->getHostLen();
char *tpath = turl->getPath();
int32_t tplen = turl->getPathLen();
//bool hasWWW = turl->isHostWWW();
log( LOG_DEBUG, "query: TitleRec "
"Site=%s", tdom );
// . Check to see if site: is querying
// only a TLD, then we can't put it
// into Url.
if(isTLD(tmp, tmpLen)) {
if(ttlen != tmpLen ||
strncmp(ttld, tmp, tmpLen))
fail = true;
}
else {
Url qurl;
// false --> add www?
qurl.set( tmp, tmpLen, false);//hasWWW );
char *qdom = qurl.getDomain();
int32_t qdlen = qurl.
getDomainLen();
char *qhost = qurl.getHost();
int32_t qhlen = qurl.getHostLen();
char *qpath = qurl.getPath();
int32_t qplen = qurl.getPathLen();
if(tdlen != qdlen ||
strncmp(tdom, qdom, qdlen))
fail = true;
if(!fail &&
qhlen != qdlen &&
(thlen != qhlen ||
strncmp(thost,
qhost, qhlen)))
fail = true;
if(!fail && qplen > 1 &&
(tplen < qplen ||
strncmp(tpath,
qpath, qplen)))
fail = true;
}
if( q->m_isBoolean){
if ( ! fail )
m_explicitsMatched |= ebit;
continue;
}
if( fail && !isNeg ){
if( q->m_hasUOR ) continue;
return log("query: Result is missing "
"site: term, filtering. "
"q=%s", q->m_orig);
}
else if( !fail && isNeg ){
if( q->m_hasUOR ) continue;
return log("query: Result contains "
"-site: term, filtering. "
"q=%s", q->m_orig );
}
else
m_explicitsMatched |= ebit;
}
else if ( fc == FIELD_IP ) {
int32_t ip = *xd->getIp();
char *oip = iptoa( ip );
log(LOG_DEBUG, "query: TitleRec Ip=%s", oip );
int32_t olen = gbstrlen(oip);
bool matched = false;
if (olen>=tmpLen && strncmp(oip,tmp,tmpLen)==0 )
matched = true;
if( q->m_isBoolean){
if (matched) m_explicitsMatched |= ebit;
continue;
}
if ( ! matched && ! isNeg ) {
if( q->m_hasUOR ) continue;
return log("query: Result is missing ip: term,"
" filtering. q=%s", q->m_orig );
}
else if ( matched && isNeg ) {
if( q->m_hasUOR ) continue;
return log("query: Result contains -ip: term, "
"filtering. q=%s", q->m_orig );
}
else
m_explicitsMatched |= ebit;
}
else if ( fc == FIELD_URL ) {
char *url = xd->getFirstUrl()->getUrl();
int32_t slen = xd->getFirstUrl()->getUrlLen();
Url u;
// do not force add the "www." cuz titleRec does not
u.set( tmp, tmpLen, false );//true );
char * qs = u.getUrl();
int32_t qsl = u.getUrlLen();
log( LOG_DEBUG, "query: TitleRec Url=%s", url );
if( qsl > slen ) qsl = slen;
int32_t result = strncmp( url, qs, qsl );
if( q->m_isBoolean){
if (result)
m_explicitsMatched |= ebit;
continue;
}
if( result && !isNeg ){
if( q->m_hasUOR ) continue;
return log("query: Result is missing "
"url: term, filtering. q=%s",
q->m_orig );
}
else if( !result && isNeg ){
if( q->m_hasUOR ) continue;
return log("query: Result contains "
"-url: term, filtering. "
"q=%s", q->m_orig );
}
else
m_explicitsMatched |= ebit;
}
}
// clear just in case
g_errno = 0;
// what bits are not matchable
qvec_t unmatchable = m_q->m_matchRequiredBits -m_matchableRequiredBits;
// modify what we got
qvec_t matched = m_explicitsMatched | unmatchable;
// need to set Query::m_bmap before calling getBitScore()
if ( ! m_q->m_bmapIsSet ) m_q->setBitMap();
// if boolean, do the truth table
int32_t bitScore = m_q->getBitScore ( matched );
// assume we are missing some. if false, may still be in the results
// if we have rat=0 (Require All Terms = false)
m_hasAllQueryTerms = false;
// assume not a match. if this is false big hack excludes from results
m_matchesQuery = false;
// see Query.h for these bits defined. do not include 0x80 because
// we may not have any forced bits...
if ( bitScore & (0x20|0x40) ) m_matchesQuery = true;
// it may not have all the query terms because of rat=0
if ( (matched & m_q->m_matchRequiredBits)== m_q->m_matchRequiredBits ){
m_hasAllQueryTerms = true;
m_matchesQuery = true;
}
	*/
	// that should be it
	return true;
}
// . tokenize the text "s" of "slen" bytes into the next free match
//   group slot and add all of its query matches, tagged with "flags"
// . "flags" says where the text came from (MF_TITLETAG, MF_LINK, ...)
// . the group is only kept if it produced at least one match, otherwise
//   its buffers are freed and the slot is given back
// . silently does nothing and returns true if the text is NULL or empty,
//   or if we already have MAX_MATCHGROUPS groups
// . returns false on error (a failure to set the words, bits or pos
//   classes, or whatever the Words-based addMatches() returned)
bool Matches::addMatches ( char    *s        ,
			   int32_t  slen     ,
			   mf_t     flags    ,
			   int64_t  docId    ,
			   int32_t  niceness ) {
	// . no text means no matches, so do not even wordify it
	// . callers hand us lengths like "size - 1" and pointers straight
	//   from rss/xml lookups, so NULL and lengths <= 0 do show up here
	if ( ! s || slen <= 0 ) return true;
	// . do not breach
	// . happens a lot with a lot of link info text
	// . often we have a ton of inlink text!!
	if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
		//log("matches: could not add matches1 for docid=%"INT64" because "
		//    "already have %"INT32" matchgroups",docId,
		//    (int32_t)MAX_MATCHGROUPS);
		return true;
	}
	// . the slot we are going to fill in
	// . remember it because the call below sometimes returns true
	//   without having incremented m_numMatchGroups
	int32_t groupNum = m_numMatchGroups;
	// get some new ptrs for this match group
	Words    *wp = &m_wordsArray [ groupNum ];
	//Sections *sp = &m_sectionsArray [ groupNum ];
	// we do not compute sections for these little text groups
	Sections *sp = NULL;
	Bits     *bp = &m_bitsArray  [ groupNum ];
	Pos      *pb = &m_posArray   [ groupNum ];
	// set the words class for this match group
	if ( ! wp->set ( s                        ,
			 slen                     , // in bytes
			 TITLEREC_CURRENT_VERSION ,
			 true                     , // computeIds?
			 niceness                 ) )
		return false;
	// bits vector
	if ( ! bp->setForSummary ( wp ) )
		return false;
	// position vector
	if ( ! pb->set ( wp , sp ) )
		return false;
	// record the start so we can tell if this group matched anything
	int32_t startNumMatches = m_numMatches;
	// . add all the Match classes from this match group
	// . this increments m_numMatchGroups on success
	bool status = addMatches ( wp    ,
				   NULL  , // phrases
				   sp    , // sections
				   bp    , // bits
				   pb    , // pos
				   0     , // fieldCode
				   true  , // allowPunctInPhrase?
				   false , // excludeQTOnlyInAnchTxt?
				   0     , // reqMask
				   0     , // negMask
				   1     , // diversityWeight
				   docId ,
				   flags ); // match flags
	// if this matchgroup had some matches, then keep it
	if ( m_numMatches > startNumMatches ) return status;
	// otherwise, free it, useless
	wp->reset();
	bp->reset();
	pb->reset();
	// give the slot back, but only if it was actually taken
	if ( m_numMatchGroups != groupNum ) m_numMatchGroups--;
	return status;
}
// . look up the first match group whose flags equal "matchFlag"
// . on success sets *wp, *pp and *sp to the Words, Pos and Sections that
//   were registered for that group and returns true. *sp is NULL for the
//   groups that were built from raw text since those have no sections.
// . returns false and leaves *wp, *pp and *sp untouched if there is no
//   such group
// . we hand out the registered m_wordsPtr[]/m_posPtr[]/m_sectionsPtr[]
//   entries rather than the addresses of our own m_wordsArray[]/
//   m_posArray[] slots. for groups built from raw text these are the
//   very same objects, but a group added from caller-owned classes (the
//   document body) never fills in our own arrays, so pointing into them
//   would hand back an empty Words/Pos.
bool Matches::getMatchGroup ( mf_t       matchFlag ,
			      Words    **wp        ,
			      Pos      **pp        ,
			      Sections **sp        ) {
	for ( int32_t i = 0 ; i < m_numMatchGroups ; i++ ) {
		// must be the type we want
		if ( m_flags[i] != matchFlag ) continue;
		// get it
		*wp = m_wordsPtr    [i];
		*pp = m_posPtr      [i];
		*sp = m_sectionsPtr [i];
		return true;
	}
	// not found
	return false;
}
// . TODO: support stemming later. each word should then have multiple ids.
// . add to our m_matches[] array iff addToMatches is true, otherwise we just
// set the m_foundTermVector for doing the BIG HACK described in Summary.cpp
bool Matches::addMatches ( Words *words ,
//Synonyms *syn ,
Phrases *phrases ,
Sections *sections ,
Bits *bits ,
Pos *pos ,
int32_t fieldCode , // of words,0=none
bool allowPunctInPhrase ,
bool exclQTOnlyinAnchTxt ,
qvec_t reqMask ,
qvec_t negMask ,
int32_t diversityWeight ,
int64_t docId ,
mf_t flags ) {
// if no query term, bail.
if ( m_numSlots <= 0 ) return true;
// . do not breach
// . happens a lot with a lot of link info text
if ( m_numMatchGroups >= MAX_MATCHGROUPS ) {
// . log it
// . often we have a ton of inlink text!!
//log("matches: could not add matches2 for docid=%"INT64" because "
// "already have %"INT32" matchgroups",docId,
// (int32_t)MAX_MATCHGROUPS);
return true;
}
// int16_tcut
Section *sp = NULL;
if ( sections ) sp = sections->m_sections;
// we've added a lot of matches, if we don't need anymore
// to confirm the big hack then break out
//if ( m_numMatches >= MAX_MATCHES &&
// ( m_explicitsMatched & m_matchableRequiredBits ) )
// return true;
mf_t eflag = 0;
// set the ptrs
m_wordsPtr [ m_numMatchGroups ] = words;
m_sectionsPtr [ m_numMatchGroups ] = sections;
m_bitsPtr [ m_numMatchGroups ] = bits;
m_posPtr [ m_numMatchGroups ] = pos;
m_flags [ m_numMatchGroups ] = flags;
m_numMatchGroups++;
int64_t *pids = NULL;
if ( phrases ) pids = phrases->getPhraseIds2();
// set convenience vars
uint32_t mask = m_numSlots - 1;
int64_t *wids = words->getWordIds();
int32_t *wlens = words->getWordLens();
char **wptrs = words->getWords();
// swids = word ids where accent marks, etc. are stripped
//int64_t *swids = words->getStripWordIds();
nodeid_t *tids = words->getTagIds();
int32_t nw = words->m_numWords;
//int32_t *wscores = NULL;
//if ( scores ) wscores = scores->m_scores;
int32_t n;//,n2 ;
int32_t matchStack = 0;
int64_t nextMatchWordIdMustBeThis = 0;
int32_t nextMatchWordPos = 0;
int32_t lasti = -3;
//bool inAnchTag = false;
int32_t dist = 0;
// . every tag increments "dist" by a value
// . rather than use a switch/case statement, which does a binary
// lookup thing which is really slow, let's use a 256 bucket table
// for constant lookup, rather than log(N).
static char s_tableInit = false;
static int8_t s_tab[512];
if ( getNumXmlNodes() > 512 ) { char *xx=NULL;*xx=0; }
for ( int32_t i = 0 ; ! s_tableInit && i < 128 ; i++ ) {
char step = 0;
if ( i == TAG_TR ) step = 2;
if ( i == TAG_P ) step = 10;
if ( i == TAG_HR ) step = 10;
if ( i == TAG_H1 ) step = 10;
if ( i == TAG_H2 ) step = 10;
if ( i == TAG_H3 ) step = 10;
if ( i == TAG_H4 ) step = 10;
if ( i == TAG_H5 ) step = 10;
if ( i == TAG_H6 ) step = 10;
if ( i == TAG_TABLE ) step = 30;
if ( i == TAG_BLOCKQUOTE ) step = 10;
// default
if ( step == 0 ) {
if ( g_nodes[i].m_isBreaking ) step = 10;
else step = 1;
}
// account for both the back and the front tags
s_tab[i ] = step;
//s_tab[i|0x80] = step;
}
s_tableInit = true;
// google seems to index SEC_MARQUEE so i took that out of here
int32_t badFlags =SEC_SCRIPT|SEC_STYLE|SEC_SELECT|SEC_IN_TITLE;
//int32_t anum;
//int64_t *aids;
//int32_t j;
int32_t qwn;
int32_t numQWords;
int32_t numWords;
//
// . set m_matches[] array
// . loop over all words in the document
//
for ( int32_t i = 0 ; i < nw ; i++ ) {
//if (tids && (tids[i] ) == TAG_A)
// inAnchTag = true;
//else if (tids && (tids[i]&BACKBITCOMP) == TAG_A)
// inAnchTag = false;
// for each word increment distance
dist++;
//if ( addToMatches && tids && tids[i] ){
if ( tids && tids[i] ){
int32_t tid = tids[i] & BACKBITCOMP;
// accumulate distance
dist += s_tab[tid];
// monitor boundaries so that the proximity algo
// knows when two matches are separated by such tags
// MDW: isn't the "dist" good enough for this?????
// let's try just using "dist" then.
// "crossedSection" is hereby replaced by "dist".
//if ( s_tab[tid]
// tagIds don't have wids and are skipped
continue;
}
// skip if wid is 0, it is not an alnum word then
if ( ! wids[i] ) {
// and extra unit if it starts with \n i guess
if ( words->m_words[i][0] == '\n' ) dist++;
// dist += words->m_wordLens[i] / 3;
continue;
}
// count the number of alnum words
m_numAlnums++;
// clear this
eflag = 0;
// . zero score words cannot match query terms either
// . BUT if score is -1 that means it is in a <select> or
// a <marquee> tag (see Scores.cpp)
// . FIX: neg word terms cannot be in quotes!!
//for( int32_t j = 0; j < m_numNegTerms; j++ ) {
// if( wids[i] == m_qtableNegIds[j]
// || swids[i] == m_qtableNegIds[j] )
// m_foundNegTermVector[j] = 1;
//}
// NO NO, a score of -1 means in a select tag, and
// we do index that!! so only skip if wscores is 0 now.
// -1 means in script, style, select or marquee. it is
// indexed but with very little weight... this is really
// a hack in Scores.cpp and should be fixed.
// in Scores.cpp we set even the select tag stuff to -1...
//if ( wscores && wscores[i] == -1 ) continue;
if ( sp && (sp->m_flags & badFlags) ) continue;
// . does it match a query term?
// . hash to the slot in the hash table
n = ((uint32_t)wids[i]) & mask;
//n2 = swids[i]?((uint32_t)swids[i]) & mask:n;
chain1:
// skip if slot is empty (doesn't match query term)
//if ( ! m_qtableIds[n] && ! m_qtableIds[n2]) continue;
if ( ! m_qtableIds[n] ) goto tryPhrase;
// otherwise chain
if ( (m_qtableIds[n] != wids[i]) ) {
if ( m_qtableIds[n] && ++n >= m_numSlots ) n = 0;
goto chain1;
}
// we got one!
goto gotMatch;
//
// fix so we hihglight "woman's" when query term is "woman"
// for 'spiritual books for women' query
//
tryPhrase:
// try without 's if it had it
if ( wlens[i] >= 3 &&
wptrs[i][wlens[i]-2] == '\'' &&
to_lower_a(wptrs[i][wlens[i]-1]) == 's' ) {
// move 's from word hash... very tricky
int64_t nwid = wids[i];
// undo hash64Lower_utf8 in hash.h
nwid ^= g_hashtab[wlens[i]-1][(uint8_t)'s'];
nwid ^= g_hashtab[wlens[i]-2][(uint8_t)'\''];
n = ((uint32_t)nwid) & mask;
chain2:
if ( ! m_qtableIds[n] ) goto tryPhrase2;
if ( (m_qtableIds[n] != nwid) ) {
if ( m_qtableIds[n] && ++n >= m_numSlots ) n=0;
goto chain2;
}
qwn = m_qtableWordNums[n];
numWords = 1;
numQWords = 1;
// we got one!
goto gotMatch2;
}
tryPhrase2:
// try phrase first
if ( pids && pids[i] ) {
n = ((uint32_t)pids[i]) & mask;
chain3:
if ( ! m_qtableIds[n] ) continue;
if ( (m_qtableIds[n] != pids[i]) ) {
if ( m_qtableIds[n] && ++n >= m_numSlots)n = 0;
goto chain3;
}
// what query word # do we match?
qwn = m_qtableWordNums[n];
// get that query word #
QueryWord *qw = &m_q->m_qwords[qwn];
// . do we match it as a single word?
// . did they search for "bluetribe" ...?
if ( qw->m_rawWordId == pids[i] ) {
// set our # of words basically to 3
numWords = 3;
// matching a single query word
numQWords = 1;
// got a match
goto gotMatch2;
}
if ( qw->m_phraseId == pids[i] ) {
// might match more if we had more query
// terms in the quote
numWords = getNumWordsInMatch( words,
i,
n,
&numQWords,
&qwn,
allowPunctInPhrase );
// this is 0 if we were an unmatched quote
if ( numWords <= 0 ) continue;
// we matched a bigram in the document
//numWords = 3;
// i guess we matched the query phrase bigram
//numQWords = 3;
// got a match
goto gotMatch2;
}
// otherwise we are matching a query phrase id
log("matches: wtf? query word not matched for "
"highlighting... strange.");
// assume one word for now
numWords = 1;
numQWords = 1;
goto gotMatch2;
//char *xx=NULL;*xx=0;
}
//
// shucks, no match
//
continue;
gotMatch:
// what query word # do we match?
qwn = m_qtableWordNums[n];
// . how many words are in this match?
// . it may match a single word or a phrase or both
// . this will be 1 for just matching a single word, and
// multiple words for quotes/phrases. The number of words
// in both cases will included unmatched punctuation words
// and tags in between matching words.
numQWords = 0;
numWords = getNumWordsInMatch( words, i, n, &numQWords,
&qwn, allowPunctInPhrase );
// this is 0 if we were an unmatched quote
if ( numWords <= 0 ) continue;
gotMatch2:
// get query word
QueryWord *qw = &m_q->m_qwords[qwn];
// point to next word in the query
QueryWord *nq = NULL;
if ( qwn+2 < m_q->m_numWords ) nq = &m_q->m_qwords[qwn+2];
// . if only one word matches and its a stop word, make sure
// it's next to the correct words in the query
// . if phraseId is 0, that means we do not start a phrase,
// because stop words can start phrases if they are the
// first word, are capitalized, or have breaking punct before
// them.
if ( numWords == 1 &&
! qw->m_inQuotes &&
m_q->m_numWords > 2 &&
qw->m_wordSign == '\0' &&
(nq && nq->m_wordId) && // no field names can follow
//(qw->m_isQueryStopWord || qw->m_isStopWord ) ) {
// we no longer consider single alnum chars to be
// query stop words as stated in StopWords.cpp to fix
// the query 'j. w. eagan'
qw->m_isQueryStopWord ) {
// if stop word does not start a phrase in the query
// then he must have a matched word before him in the
// document. if he doesn't then do not count as a match
if ( qw->m_phraseId == 0LL && i-2 != lasti ) {
// peel off anybody before us
m_numMatches -= matchStack;
if ( m_numMatches < 0 ) m_numMatches = 0;
// don't forget to reset the match stack
matchStack = 0;
/*
//
// count him at least for big hack though
//
// incorporate the explicit bit of this term
QueryTerm *qt = qw->m_queryWordTerm;
// are we in quotes?
if ( ! qt ) qt = qw->m_queryPhraseTerm;
// record it as matched. this is used for the
// BIG HACK
if ( qt ) m_explicitsMatched |=
qt->m_explicitBit | qt->m_implicitBits;
//
// done BIG HACK fix
//
*/
continue;
}
// if we already have a match stack, we must
// be in nextMatchWordPos
if ( matchStack && nextMatchWordPos != i ) {
// peel off anybody before us
m_numMatches -= matchStack;
if ( m_numMatches < 0 ) m_numMatches = 0;
// don't forget to reset the match stack
matchStack = 0;
//continue;
}
// if the phraseId is 0 and the previous word
// is a match, then we're ok, but put us on a stack
// so if we lose a match, we'll be erased
QueryWord *nq = &m_q->m_qwords[qwn+2];
// next match is only required if next word in query
// is indeed valid.
if ( nq->m_wordId && nq->m_fieldCode == 0 ) {
nextMatchWordIdMustBeThis = nq->m_rawWordId;
nextMatchWordPos = i + 2;
matchStack++;
}
}
else if ( matchStack ) {
// if the last word matched was a stop word, we have to
// match otherwise we have to remove the whole stack.
if ( qw->m_rawWordId != nextMatchWordIdMustBeThis ||
i > nextMatchWordPos ) {
m_numMatches -= matchStack;
// ensure we never go negative like for
// www.experian.com query
if ( m_numMatches < 0 ) m_numMatches = 0;
}
// always reset this here if we're not a stop word
matchStack = 0;
}
// record word # of last match
lasti = i;
// . we MUST map the QueryWords to their respective QueryTerms
// . that is done already pretty much in Query.cpp
// . this allows us to set our m_foundTermVector[] as well as
// compute the termFreq for our matching quote
// . MDW: WHAT IS THIS?????
/*
//int64_t max = -1;
for ( int32_t j = qwn ; j < qwn + numQWords &&
// if the word is repeated twice in two different
// phrases, qwn sometimes ends up in the later,
// phrase which may have less words in it than
// the other, so check for breech here
j < m_q->m_numWords ; j++ ) {
// get the ith query word
QueryWord *qw = &m_q->m_qwords[j];
// does it have a query word or phrase term?
QueryTerm *qt1 = qw->m_queryWordTerm ;
QueryTerm *qt2 = qw->m_queryPhraseTerm;
int32_t qtn1 = -1;
int32_t qtn2 = -1;
if ( qt1 ) qtn1 = qt1 - m_q->m_qterms;
if ( qt2 ) qtn2 = qt2 - m_q->m_qterms;
// we must match X words to match the phrase!
if ( numWords <= 1 ) qt2 = NULL;
// MDW: why do this here instead of below where we
// actually add the Match?
if ( qt1 &&
!(exclQTOnlyinAnchTxt && inAnchTag) )
m_explicitsMatched |= qt1->m_matchesExplicitBits;
if ( qt2 &&
!(exclQTOnlyinAnchTxt && inAnchTag) ) {
m_explicitsMatched |= qt2->m_matchesExplicitBits;
m_explicitsMatched |= qt2->m_implicitBits;
}
// . set the score
// . MDW: these scores are set in Summary.cpp based on
// tf, etc. i think it should handle this , not us
if ( ! m_tscores ) continue;
if ( max == -1 && qt1 ) max=m_tscores[qtn1];
if ( max == -1 && qt2 ) max=m_tscores[qtn2];
if ( qt1 && m_tscores[qtn1]>max ) max=m_tscores[qtn1];
if ( qt2 && m_tscores[qtn2]>max ) max=m_tscores[qtn2];
}
*/
if(m_detectSubPhrases)
detectSubPhrase(words, i, numWords, qwn,
diversityWeight);
// . if not adding to m_matches, keep going
// . MDW: why wouldn't we add to the matches array?
//if ( ! addToMatches ) continue;
// don't store it in our m_matches array if the max is negative
// i.e. we matched a '-' unwanted word
/*
if ( max < -1 ) {
log("query: found neg word in doc! should be taken "
"care of in summary and doc should not be "
"displayed! query=%s docId=%"INT64"",
m_q->m_orig, docId);
return false;
}
// sanity check
if ( m_tscores && max == -1 ) {
g_errno = EBADENGINEER;
log("query: bad matches error. fix me! query=%s "
"docId=%"INT64"", m_q->m_orig, docId);
return false;
char *xx = NULL; *xx = 0;
}
*/
// otherwise, store it in our m_matches[] array
Match *m = &m_matches[m_numMatches];
// use the max score of all query terms we contain as our score
//if ( max >= 0 ) m->m_score = max;
// the word # in the doc, and how many of 'em are in the match
m->m_wordNum = i;
m->m_numWords = numWords;
// the word # in the query, and how many of 'em we match
m->m_qwordNum = qwn;
m->m_numQWords = numQWords;
// get the first query word # of this match
//QueryWord *qw = &m_q->m_qwords[qwn];
qw = &m_q->m_qwords[qwn];
// get its color. for highlighting under different colors.
m->m_colorNum = qw->m_colorNum;
// sanity check
if ( m->m_colorNum < 0 ) { char *xx = NULL; *xx = 0; }
// convenience, used by Summary.cpp
m->m_words = words;
m->m_sections = sections;
m->m_bits = bits;
m->m_pos = pos;
m->m_dist = dist;
m->m_flags = flags | eflag ;
// this is used by the proximity algorithm in Summary.cpp
//m->m_crossedSection = false;
// add to our vector. we want to know where each QueryWord
// is. i.e. in the title, link text, meta tag, etc. so
// the proximity algo in Summary.cpp can use that info.
m_qwordFlags[qwn] |= flags;
// loop over the query words in the match and add in all
// their explicit bits. fixes www.gmail.com query which
// matches query words, and we assume it is in quotes...
/*
for ( int32_t qi = qwn ; qi < qwn + numQWords ; qi++ ) {
// get it
QueryWord *ww = &m_q->m_qwords[qi];
// incorporate the explicit bit of this term
QueryTerm *qt = ww->m_queryWordTerm;
// are we in quotes?
if ( ! qt ) qt = ww->m_queryPhraseTerm;
// record it as matched. this is used for the BIG HACK
if ( qt ) m_explicitsMatched |=
qt->m_explicitBit | qt->m_implicitBits;
}
*/
// advance
m_numMatches++;
// i think we use "dist" for the proximity algo now, but what
// was it used for before?
//dist = 0;
// reset stack
// no! we need to be able to pop off this match if it
// requires the next query term to follow it, like in the
// case of a query stop word...
//matchStack = 0;
// we get atleast MAX_MATCHES
if ( m_numMatches < MAX_MATCHES ) continue;
// we've added a lot of matches, if we don't need anymore
// to confirm the big hack then break out
//if ( m_explicitsMatched & m_matchableRequiredBits ) {
// log(LOG_DEBUG,
// "query: found all query terms for big hack after "
// "%"INT32" matches. docId=%"INT64"", m_numMatches, docId);
// break;
//}
//bool hadPhrases ;
//bool hadWords ;
//int32_t matchedBits = getTermsFound2 (&hadPhrases,&hadWords);
//if ( (matchedBits & reqMask) == reqMask &&
// !(matchedBits & negMask) ) {
// log("query: found all query terms for big hack after "
// "%"INT32" matches. docId=%"INT64"", m_numMatches, docId);
// break;
//}
// don't breech MAX_MATCHES_FOR_BIG_HACK
if ( m_numMatches < MAX_MATCHES_FOR_BIG_HACK ) continue;
log("query: Exceed match buffer of %"INT32" matches. docId=%"INT64"",
(int32_t)MAX_MATCHES_FOR_BIG_HACK, docId);
break;
}
// peel off anybody before us
m_numMatches -= matchStack;
if ( m_numMatches < 0 ) m_numMatches = 0;
return true;
}
// . word #wn in the doc matches slot #n in the hash table
// . decides whether that doc word starts a match of a two-word synonym,
//   of a quoted phrase from the query, or of a plain single query word
// . returns the number of document words the match spans, counted from
//   word #wn through the last matched word (punctuation/tag words that sit
//   between the matched words are included), or 0 if nothing matched
// . sets *numQWords to the number of query words covered by the match
// . sets *qwn to the first query word of the match, but ONLY on the
//   quoted-phrase path and in the single-word fallback that follows a failed
//   phrase attempt. the two early returns below leave *qwn untouched.
// . if "allowPunctInPhrase" is false then a doc word that has no word id and
//   is not made up purely of spaces breaks a quoted phrase
int32_t Matches::getNumWordsInMatch ( Words *words ,
int32_t wn ,
int32_t n ,
int32_t *numQWords ,
int32_t *qwn ,
bool allowPunctInPhrase ) {
// is it a two-word synonym?
if ( m_qtableFlags[n] & 0x08 ) {
// get the word following this
// (wn+1 is skipped; wid2 stays 0 if wn+2 is past the end of the doc)
int64_t wid2 = 0LL;
if ( wn+2 < words->m_numWords ) wid2 = words->m_wordIds[wn+2];
// scan the synonyms...
int64_t *wids = words->m_wordIds;
for ( int32_t k = 0 ; k < m_q->m_numTerms ; k++ ) {
QueryTerm *qt = &m_q->m_qterms[k];
// only terms flagged as a synonym of something are candidates
if ( ! qt->m_synonymOf ) continue;
// both word ids of the synonym must line up with the doc
if ( qt->m_synWids0 != wids[wn] ) continue;
if ( qt->m_synWids1 != wid2 ) continue;
// spans doc words wn, wn+1 and wn+2. *qwn is not set here.
*numQWords = 3;
return 3;
}
}
// save the first word in the doc that we match first
int32_t wn0 = wn;
// CAUTION: the query "business development center" (in quotes)
// would match a doc with "business development" and
// "development center" as two separate phrases.
// if query word never appears in quotes, it's a single word match
// (*qwn is not set on this path either)
if ( ! (m_qtableFlags[n] & 0x02) ) { *numQWords = 1; return 1; }
// get word ids array for the doc
int64_t *wids = words->getWordIds();
//int64_t *swids = words->getStripWordIds();
char **ws = words->getWords();
int32_t *wl = words->getWordLens();
//the word we match in the query appears in quotes in the query
// k = query word # that starts the quote being tried (-1 = none yet)
int32_t k = -1;
// count = # of query words of that quote matched so far. it is also
// zeroed by the punctuation scan below to signal "phrase broken".
int32_t count = 0;
int32_t nw = words->m_numWords;
// loop through all the quotes in the query and find
// which one we match, if any. we will have to advance the
// query word and doc word simultaneously and make sure they
// match as we advance.
int32_t nqw = m_q->m_numWords;
// do not look through more words than were hashed, wastes time
//if ( nqw >= m_maxNQW && m_maxNQW > 0 ) nqw = m_maxNQW;
// j is declared outside the loop because its final value is used below
// to compute *numQWords
int32_t j;
for ( j = 0 ; j < nqw ; j++ ) {
// get ith query word
QueryWord *qw = &m_q->m_qwords[j];
if ( !qw->m_rawWordId ) continue;
// query word must match wid of first word in quote
if ( (qw->m_rawWordId != wids[wn]) ) continue;
// (qw->m_rawWordId != swids[wn])) continue;
// skip if in field
// . we were doing an intitle:"fight club" query and
// needed to match that in the title...
//if ( qw->m_fieldCode ) continue;
// query word must be in quotes
if ( ! qw->m_inQuotes ) continue;
// skip it if it does NOT start the quote. quoteStart
// is actually the query word # that contains the quote
//if ( qw->m_quoteStart != j-1 ) continue;
// not any more it isn't...
if ( qw->m_quoteStart != j ) continue;
// save the first word # in the query of the quote
k = j; // -1;
// count number of words we match in the quote, we've
// already matched the first one
count = 0;
// . each pass from this label compares one query word of the quote
//   (qw) against the current doc word (wn)
// . every failure path below restores wn to wn0 and zeroes count
//   before "continue"-ing the outer for loop, whose j++ then moves
//   on to look for another quote
subloop:
// query word must match wid of first word in phrase
if ( (qw->m_rawWordId != wids[wn]) ) {
// (qw->m_rawWordId != swids[wn])) {
// reset and try another quote in the query
count = 0;
wn = wn0;
continue;
}
// up the count of query words matched in the quote
count++;
// ADVANCE QUERY WORD
j++;
// if no more, we got a match
if ( j >= nqw ) break;
// skip punct words
// (only a single punct query word is skipped here)
if ( m_q->m_qwords[j].m_isPunct ) j++;
// if no more, we got a match
if ( j >= nqw ) break;
// now we should point to the next query word in quote
qw = &m_q->m_qwords[j];
// if not in quotes, we're done, we got a match
if ( ! qw->m_inQuotes ) break;
// or if in a different set of quotes, we got a match
if ( qw->m_quoteStart != k ) break;
// . ADVANCE DOCUMENT WORD
// . tags and punctuation words have 0 for their wid
for ( wn++ ; wn < nw ; wn++ ) {
// . if NO PUNCT, IN QUOTES, AND word id is zero
// then check for punctuation
if(!allowPunctInPhrase && qw->m_inQuotes && !wids[wn]) {
// . check if its a space [0x20, 0x00]
if( (wl[wn] == 2) && (ws[wn][0] == ' ') )
continue;
// . if the length is greater than a space
else if( wl[wn] > 2 ) {
// . increment until we find no space
// . increment by 2 since its utf16
// NOTE(review): only every other byte is inspected, which
// assumes 2-byte characters -- confirm this still holds
for( int32_t i = 0; i < wl[wn]; i+=2 )
// . if its not a space, its punc
if( ws[wn][i] != ' ' ) {
count=0; break;
}
// . if count is 0, punc found break
// . count was incremented above, so it can only be 0 here
//   if the scan just zeroed it
if( count == 0 ) break;
}
// . otherwise its solo punc, set count and break
else { count=0; break; }
}
// . we incremented to a new word break and check
if ( wids[wn] ) break;
}
// there was a following query word in the quote
// so there must be a following word, if not, continue
// to try to find another quote in the query we match
if ( wn >= nw ) {
// reset and try another quote in the query
count = 0;
wn = wn0;
continue;
}
// see if the next word and query term match
// . if punctuation broke the phrase above then wids[wn] is 0 here.
//   presumably qw->m_rawWordId is non-zero for a quoted query word, so
//   the compare at "subloop" fails and resets -- TODO confirm
goto subloop;
}
// if we did not match any quote in the query
// check if we did match a single word. e.g.
// Hello World "HelloWorld" "Hello World Example"
// (wn equals wn0 again here since every failed attempt restored it)
if ( count <= 0 ) {
// NOTE(review): 0x01 presumably flags that the word also occurs
// outside of quotes in the query -- confirm where m_qtableFlags is set
if ( m_qtableFlags[n] & 0x01 ) {
*numQWords = 1;
// we did match a single word. m_qtableWordNums[n] may
// not be pointing to the right qword. Set it to a
// qword that is the single word
// . there is no break in this loop, so if several unquoted query
//   words match, *qwn ends up as the LAST of them
// . if none match, *qwn is left as the caller passed it in
for ( j = 0 ; j < nqw ; j++ ) {
// get ith query word
QueryWord *qw = &m_q->m_qwords[j];
if ( !qw->m_rawWordId ) continue;
// query word must match wid of word
if ( (qw->m_rawWordId != wids[wn]) ) continue;
// (qw->m_rawWordId != swids[wn])) continue;
// skip if in field
// . fix intitle:"fight club"
//if ( qw->m_fieldCode ) continue;
// query word must NOT be in quotes
if ( qw->m_inQuotes ) continue;
*qwn = j;
}
return 1;
}
else
return 0;
}
// sanity check
// (deliberately dereferences NULL to crash if no quote start was recorded)
if ( k < 0 ) { char *xx = NULL; *xx = 0; }
// skip punct words
// (j is one past the last matched query word. back up over a punct
//  word just before it so that it is not counted.)
if ( j-1>=0 && m_q->m_qwords[j-1].m_isPunct ) j--;
// . ok, we got a quote match
// . it had this many query words in it
//*numQWords = j - (k+1);
*numQWords = j - k;
// fix the start word
*qwn = k ;
if (m_q->m_qwords[k].m_isPunct) *qwn = k+1;
// # of doc words from the first matched word through the last, inclusive
return wn - wn0 + 1;
}
/*
int32_t Matches::getTermsFound ( bool *hadPhrases ,
bool *hadWords ) {
*hadPhrases = true;
*hadWords = true;
int32_t n = m_q->getNumTerms();
int32_t count = 0;
for ( int32_t i = 0 ; i < n ; i++ ) {
// do not count query stop words if not in quotes
//if ( m_q->m_qterms[i].m_isQueryStopWord &&
// ! m_q->m_qterms[i].m_inQuotes )
// continue;
if ( m_foundTermVector[i] ) { count++; continue; }
// if we missed a phrase, flag it
if ( m_q->m_qterms[i].m_inQuotes ) *hadPhrases = false;
else *hadWords = false;
}
return count;
}
*/
// new version for explicit bit mask
/*
uint32_t Matches::getTermsFound2(bool *hadPhrases, bool *hadWords) {
*hadPhrases = true;
*hadWords = true;
int32_t n = m_q->getNumTerms();
//int32_t count = 0;
for ( int32_t i = 0 ; i < n ; i++ ) {
QueryTerm *qt = &m_q->m_qterms[i];
if (qt->m_fieldCode) continue;
if (qt->m_isPhrase && qt->m_termSign == 0) continue;
if ( m_explicitsMatched & qt->m_explicitBit ) continue;
// if we missed a phrase, flag it
if ( qt->m_inQuotes ) *hadPhrases = false;
else *hadWords = false;
}
return m_explicitsMatched;
}
*/
/*
void Matches::setSubPhraseDetection() {
char* pbuf = m_subPhraseBuf;
m_pre.set(128, pbuf , m_htSize);
pbuf += m_htSize;
m_post.set(128, pbuf, m_htSize);
m_detectSubPhrases = true;
m_leftDiversity = 0;
m_rightDiversity = 0;
int64_t h = hash64LowerE("www",3);
m_pre.addKey(h, LONG_MIN);
}
*/
// . looks at the document words on either side of a match and bumps
//   m_leftDiversity / m_rightDiversity by "diversityWeight" when that
//   neighbor is missing, has not been seen on that side before, or is
//   stored in the table with a negative ("exempt") value
// . a neighbor that merely continues the query (it equals the query word
//   before/after the matched query words) is ignored
// . m_pre and m_post remember the neighbors already seen on each side
void Matches::detectSubPhrase(Words* words,
			      int32_t matchWordNum,
			      int32_t numMatchedWords,
			      int32_t queryWordNum ,
			      int32_t diversityWeight ) {
	const int32_t  docWordCount = words->getNumWords();
	int64_t       *docWordIds   = words->getWordIds();

	//
	// LEFT SIDE: the word preceding the match
	//
	int32_t before = matchWordNum - 2;
	// back up over words that have no word id
	while ( before > 0 && docWordIds[before] == 0 )
		before--;

	const bool nothingBefore =
		( before < 0 || docWordIds[before] == 0 );

	if ( nothingBefore ) {
		// the match begins this section
		m_leftDiversity += diversityWeight;
	}
	else {
		// does the doc simply continue with the previous query word?
		const bool isPrevQueryWord =
			( queryWordNum != 0 &&
			  m_q->m_qwords[queryWordNum-1].m_rawWordId ==
			  docWordIds[before] );
		if ( ! isPrevQueryWord ) {
			const int64_t leftId   = docWordIds[before];
			const int32_t leftSlot = m_pre.getSlot(leftId);
			if ( leftSlot == -1 ) {
				// first time we see this word on the left
				m_pre.addKey(leftId, 1);
				m_leftDiversity += diversityWeight;
			}
			else {
				// our exempt words are negative
				const int32_t stored =
					m_pre.getValueFromSlot(leftSlot);
				if ( stored < 0 )
					m_leftDiversity += diversityWeight;
			}
		}
	}

	//
	// RIGHT SIDE: the word trailing the match
	//
	// n words + n-1 punctuation separators.
	int32_t after          = matchWordNum + 2 * numMatchedWords;
	const int32_t followQw = queryWordNum + numMatchedWords;
	// step forward over words that have no word id
	while ( after < docWordCount && docWordIds[after] == 0 )
		after++;

	const bool nothingAfter =
		( after >= docWordCount || docWordIds[after] == 0 );

	if ( nothingAfter ) {
		// the match ends this section
		m_rightDiversity += diversityWeight;
		return;
	}

	// does the doc simply continue with the next query word?
	const bool isNextQueryWord =
		( followQw < m_q->m_numWords &&
		  m_q->m_qwords[followQw].m_rawWordId == docWordIds[after] );
	if ( isNextQueryWord ) return;

	const int64_t rightId   = docWordIds[after];
	const int32_t rightSlot = m_post.getSlot(rightId);
	if ( rightSlot == -1 ) {
		// first time we see this word on the right
		m_post.addKey(rightId, 1);
		m_rightDiversity += diversityWeight;
		return;
	}
	const int32_t stored = m_post.getValueFromSlot(rightSlot);
	if ( stored < 0 )
		m_rightDiversity += diversityWeight;
}
// . returns the log of the smaller of the left and right diversity totals
// . returns 0 when that smaller total is 1 or less
float Matches::getDiversity() {
	// the weaker side decides
	float weakest = m_leftDiversity;
	if ( m_rightDiversity < weakest )
		weakest = m_rightDiversity;
	//0 means we did not get a match, doc will be big hacked out
	//1 means not diverse at all
	return ( weakest <= 1 ) ? 0 : logf(weakest);
}
/*
bool Matches::negTermsFound ( ) {
for( int32_t i = 0; i < m_numNegTerms; i++ ) {
if( m_foundNegTermVector[i] ) return true;
}
return false;
}
*/
// start out empty: no matches, no breaks, no alnum count
MatchOffsets::MatchOffsets ( ) { reset(); }
// nothing to release here
MatchOffsets::~MatchOffsets ( ) { }
// . zero all three counters
// . the offset arrays themselves are not cleared, the counts alone say
//   how much of them is in use
void MatchOffsets::reset ( ) {
	m_numAlnums  = 0;
	m_numBreaks  = 0;
	m_numMatches = 0;
}
// . fill in our offset arrays from the matches already found in "matches"
// . for every match we record its query word # and its position, and
//   between each pair of consecutive matches we record the position of the
//   FIRST breaking tag, if there is one
// . "offsetType" selects the unit of the recorded positions:
//   OFFSET_WORDS = word # in "words"
//   OFFSET_BYTES = byte distance from the start of the first word
//   any other value leaves us with zero matches and zero breaks
// . "xml" is not used
// . always returns true
// . NOTE(review): entries are written without checking the capacity of
//   m_queryWords/m_matchOffsets/m_breakOffsets, whose declarations are not
//   visible from this file -- confirm they can hold matches->m_numMatches
bool MatchOffsets::set(Xml * xml, Words *words, Matches *matches,
		       unsigned char offsetType) {
	//m_numMatches = matches->m_numMatches;
	m_numMatches = 0;
	// start the break list over too, otherwise calling set() a second
	// time would append to whatever breaks the last call left behind
	m_numBreaks  = 0;
	m_numAlnums  = matches->m_numAlnums;

	const bool inWords = ( offsetType == OFFSET_WORDS );
	const bool inBytes = ( offsetType == OFFSET_BYTES );
	// unknown offset type, record nothing
	if ( ! inWords && ! inBytes ) return true;

	const int32_t numMatches = matches->m_numMatches;
	for ( int32_t i = 0 ; i < numMatches ; i++ ) {
		const int32_t wordNum = matches->m_matches[i].m_wordNum;
		m_queryWords[i] = matches->m_matches[i].m_qwordNum;
		if ( inBytes )
			m_matchOffsets[i] =
				words->m_words[wordNum] -
				words->m_words[0];
		else
			m_matchOffsets[i] = wordNum;
		m_numMatches++;
		// the last match has no following match to look between
		if ( i == numMatches - 1 ) continue;
		// look for breaking tags strictly between this match and
		// the next one
		const int32_t nextWordNum = matches->m_matches[i+1].m_wordNum;
		for ( int32_t j = wordNum + 1 ; j < nextWordNum ; j++ ) {
			nodeid_t tag = words->m_tagIds[j] & BACKBITCOMP;
			if ( ! g_nodes[tag].m_isBreaking ) continue;
			if ( inBytes )
				m_breakOffsets[m_numBreaks++] =
					words->m_words[j] -
					words->m_words[0];
			else
				m_breakOffsets[m_numBreaks++] = j;
			// only store the first one
			break;
		}
	}
	return true;
}
// . number of bytes serialize() needs
// . four 4-byte fields (totalsize, numMatches, numAlnums, numBreaks),
//   then 5 bytes per match (1 for the query word # plus 4 for the offset)
//   and 4 bytes per break
int32_t MatchOffsets::getStoredSize ( ) {
	int32_t size = 4 * 4;
	size += m_numMatches * 5;
	size += m_numBreaks  * 4;
	return size;
}
// . store a 4-byte integer at "p", which need not be 4-byte aligned
// . returns the position just past what was written
static char *putInt32Unaligned ( char *p , int32_t v ) {
	gbmemcpy ( p , &v , 4 );
	return p + 4;
}

// . write ourselves into "buf", which holds "bufsize" bytes
// . layout, in order:
//   4 bytes              total size of this record (getStoredSize())
//   4 bytes              m_numMatches
//   4 bytes              m_numAlnums
//   m_numMatches bytes   m_queryWords
//   m_numMatches*4 bytes m_matchOffsets
//   4 bytes              m_numBreaks
//   m_numBreaks*4 bytes  m_breakOffsets
// . returns the number of bytes written
// . returns -1 and sets g_errno to EBUFTOOSMALL if "buf" is too small
int32_t MatchOffsets::serialize(char *buf, int32_t bufsize){
	//if (m_numMatches == 0 ) return 0;
	int32_t need = getStoredSize();
	if ( need > bufsize ) {
		g_errno = EBUFTOOSMALL;
		log(LOG_LOGIC,"query: matchoffsets: serialize Buf too small.");
		return -1;
	}
	char *p = buf;
	// the integers are copied in byte-wise rather than stored through an
	// int32_t pointer, because the one-byte-per-match m_queryWords array
	// leaves "p" misaligned whenever m_numMatches is not a multiple of 4
	p = putInt32Unaligned ( p , need         );
	p = putInt32Unaligned ( p , m_numMatches );
	p = putInt32Unaligned ( p , m_numAlnums  );
	gbmemcpy(p, m_queryWords, m_numMatches); p += m_numMatches;
	gbmemcpy(p, m_matchOffsets, m_numMatches*4); p += m_numMatches*4;
	p = putInt32Unaligned ( p , m_numBreaks  );
	gbmemcpy(p, m_breakOffsets, m_numBreaks*4); p += m_numBreaks*4;
	return p - buf;
}
int32_t MatchOffsets::deserialize(char *buf, int32_t bufsize){
//if (bufsize == 0 ) return 0;
char *p = buf;
int32_t need = *(int32_t*) p ; p += 4;
if (bufsize < need) {
g_errno = EBUFTOOSMALL;
log(LOG_LOGIC,"query: matchoffsets: deserialize "
"buf too small.");
return -1;
}
m_numMatches = *(int32_t*) p ; p += 4;
m_numAlnums = *(int32_t*) p ; p += 4;
gbmemcpy(m_queryWords, p, m_numMatches); p += m_numMatches;
gbmemcpy(m_matchOffsets, p, m_numMatches*4); p += m_numMatches*4;
m_numBreaks = *(int32_t*) p ; p += 4;
gbmemcpy(m_breakOffsets, p, m_numBreaks*4); p += m_numBreaks*4;
return p - buf;
}