Merge remote-tracking branch 'origin/diffbot' into diffbot-dan

This commit is contained in:
Daniel Steinberg 2014-04-01 19:48:24 -07:00
commit 0988a134d0
27 changed files with 1068 additions and 442 deletions

View File

@ -433,7 +433,7 @@ char *JsonItem::getValueAsString ( long *valueLen ) {
// numbers...
static char s_numBuf[64];
if ( m_valueLong == (long)m_valueDouble ) {
if ( (float)m_valueLong == m_valueDouble ) {
*valueLen = sprintf ( s_numBuf,"%li", m_valueLong );
return s_numBuf;
}

View File

@ -721,6 +721,25 @@ void downloadTheDocForReals ( Msg13Request *r ) {
"(compatible; MSIE 6.0; Windows 98; "
"Win 9x 4.90)" ;
// for bulk jobs avoid actual downloads of the page for efficiency
if ( r->m_isCustomCrawl == 2 ) {
char *s =
"HTTP/1.0 200 (OK)\r\n"
"Content-Length: 0\r\n"
"Connection: Close\r\n"
"Content-Type: text/html\r\n\r\n";
long slen = gbstrlen(s);
long fakeBufSize = slen + 1;
char *fakeBuf = mdup ( s , fakeBufSize , "fkblk");
gotHttpReply2 ( r ,
fakeBuf,
fakeBufSize, // include \0
fakeBufSize, // allocsize
NULL ); // tcpsock
return;
}
// download it
if ( ! g_httpServer.getDoc ( r->m_url ,
r->m_urlIp ,

View File

@ -32,6 +32,8 @@ public:
// if doing spider compression, compute contentHash32 of document
// downloaded, and if it matches this then send back EDOCUNCHANGED
long m_contentHash32;
// copy of CollectionRec::m_customCrawl, 0 1 for crawls or 2 for bulks
char m_isCustomCrawl;
// send back error ENOGOODDATE if it does not have one. but if
// harvestLinks is true, just send back a filtered list of links
long m_requireGoodDate:1;

View File

@ -543,7 +543,7 @@ bool Msg39::getLists () {
"component=%li "
"otermLen=%li "
"isSynonym=%li "
"querylangid=%li ",
"querylangid=%li " ,
(long)this ,
i ,
qt->m_term,//bb ,
@ -569,7 +569,7 @@ bool Msg39::getLists () {
(long)m_tmpq.m_componentCodes[i],
(long)m_tmpq.getTermLen(i) ,
isSynonym,
(long)m_tmpq.m_langId); // ,tt
(long)m_tmpq.m_langId ); // ,tt
// put it back
*tpc = tmp;
if ( st ) {
@ -661,6 +661,7 @@ void gotListsWrapper ( void *state ) {
Msg39 *THIS = (Msg39 *) state;
// . hash the lists into our index table
// . this will send back a reply or recycle and read more list data
if ( ! THIS->gotLists ( true ) ) return;
// . if he did not block and there was an errno we send reply
@ -671,6 +672,12 @@ void gotListsWrapper ( void *state ) {
log("msg39: sending back error reply = %s",mstrerror(g_errno));
sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
}
// no, block? call the docid split loop
//if ( numDocIdSplits <= 1 ) return;
// if we get the lists and processed them without blocking, repeat!
THIS->doDocIdSplitLoop();
}
// . now come here when we got the necessary index lists
@ -753,10 +760,25 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
// . now we must call this separately here, not in allocTopTree()
// . we have to re-set the QueryTermInfos with each docid range split
// since it will set the list ptrs from the msg2 lists
if ( m_r->m_useNewAlgo && ! m_posdbTable.setQueryTermInfo () ) {
return true;
if ( ! m_posdbTable.setQueryTermInfo () ) return true;
// print query term bit numbers here
for ( long i = 0 ;
m_debug && i < m_tmpq.getNumTerms() ; i++ ) {
QueryTerm *qt = &m_tmpq.m_qterms[i];
//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
char *tpc = qt->m_term + qt->m_termLen;
char tmp = *tpc;
*tpc = '\0';
SafeBuf sb;
sb.safePrintf("query: msg39: BITNUM query term #%li \"%s\" "
"bitnum=%li ", i , qt->m_term, qt->m_bitNum );
// put it back
*tpc = tmp;
logf(LOG_DEBUG,"%s",sb.getBufStart());
}
// timestamp log
if ( m_debug ) {
log(LOG_DEBUG,"query: msg39: [%lu] Preparing to intersect "
@ -817,6 +839,7 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
// time it
diff = gettimeofdayInMilliseconds() - start;
if ( diff > 10 ) log("query: Took %lli ms for intersection",diff);
// returns false if blocked, true otherwise
return addedLists ();
}

View File

@ -859,9 +859,9 @@ bool Msg5::needsRecall ( ) {
if ( m_round == 0 ) logIt = false;
if ( logIt )
logf(LOG_DEBUG,"db: Reading %li again from %s (need %li total "
"got %li) this=0x%lx round=%li.",
"got %li) cn=%li this=0x%lx round=%li.",
m_newMinRecSizes , base->m_dbname , m_minRecSizes,
m_list->m_listSize, (long)this , m_round );
m_list->m_listSize, (long)m_collnum,(long)this, m_round );
m_round++;
// record how many screw ups we had so we know if it hurts performance
base->m_rdb->didReSeek ( );

View File

@ -85,7 +85,9 @@ bool sendPageAddUrl2 ( TcpSocket *s , HttpRequest *r ) {
if ( url ) {
// normalize and add www. if it needs it
Url uu;
uu.set ( url , gbstrlen(url) , true );
// do not convert xyz.com to www.xyz.com because sometimes
// people want xyz.com exactly
uu.set ( url , gbstrlen(url) , false ); // true );
// remove >'s i guess and store in st1->m_url[] buffer
st1->m_urlLen=cleanInput ( st1->m_url,
MAX_URL_LEN,

View File

@ -623,6 +623,7 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
//"</td>"
//"</tr>"
/*
// local subdir match
"<tr>"
"<td>file://C/mydir/mysubdir/"
@ -637,6 +638,7 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
"Do not spider files in this subdirectory."
"</td>"
"</tr>"
*/
// connect to a device and index it as a stream
//"<tr>"

View File

@ -582,6 +582,7 @@ skipReplaceHost:
sb.safePrintf ( "</table><br>\n" );
/*
// print spare hosts table
sb.safePrintf (
"<table %s>"
@ -646,7 +647,9 @@ skipReplaceHost:
h->m_note );
}
sb.safePrintf ( "</table><br>" );
*/
/*
// print proxy hosts table
sb.safePrintf (
"<table %s>"
@ -754,6 +757,7 @@ skipReplaceHost:
h->m_note );
}
sb.safePrintf ( "</table><br><br>" );
*/
sb.safePrintf(
"<style>"
@ -812,7 +816,6 @@ skipReplaceHost:
"<td>The UDP port used to send and receive dns traffic with."
"</td>"
"</tr>\n"
*/
"<tr class=poo>"
"<td>http port</td>"
@ -820,7 +823,6 @@ skipReplaceHost:
"</td>"
"</tr>\n"
/*
"<tr class=poo>"
"<td>best switch id</td>"
"<td>The host prefers to be on this switch because it "
@ -886,6 +888,43 @@ skipReplaceHost:
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>avg split time</td>"
"<td>Average time this host took to compute the docids "
"for a query. Useful for guaging the slowness of a host "
"compare to other hosts."
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>splits done</td>"
"<td>Number of queries this host completed. Used in "
"computation of the <i>avg split time</i>."
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>status</td>"
"<td>Status flags for the host. See key below."
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>slow reads</td>"
"<td>Number of slow disk reads the host has had. "
"When this is big compared to other hosts it is a good "
"indicator its drives are relatively slow."
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>docs indexed</td>"
"<td>Number of documents this host has indexed over all "
"collections. All hosts should have close to the same "
"number in a well-sharded situation."
"</td>"
"</tr>\n"
//"<tr class=poo>"
//"<td>loadavg</td>"
//"<td>1-minute sliding-window load average from "
@ -895,13 +934,26 @@ skipReplaceHost:
"<tr class=poo>"
"<td>mem used</td>"
"<td>percentage of memory currently used."
"<td>Percentage of memory currently used."
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>cpu usage</td>"
"<td>percentage of cpu resources in use by the gb process."
"<td>Percentage of cpu resources in use by the gb process."
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>disk usage</td>"
"<td>Percentage of disk in use. When this gets close to "
"100%% you need to do something."
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>max ping1</td>"
"<td>The worst ping latency from host to host."
"</td>"
"</tr>\n"
@ -918,6 +970,7 @@ skipReplaceHost:
"</td>"
"</tr>\n"
/*
"<tr class=poo>"
"<td>ping2</td>"
"<td>Ping time to this host on the seconday/shotgun "
@ -925,6 +978,7 @@ skipReplaceHost:
"network is not enabled in the master controls."
"</td>"
"</tr>\n"
*/
"<tr class=poo>"
"<td>M (status flag)</td>"
@ -950,6 +1004,27 @@ skipReplaceHost:
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>R (status flag)</td>"
"<td>Indicates host is performing a rebalance operation."
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>F (status flag)</td>"
"<td>Indicates host has foreign records and requires "
"a rebalance operation."
"</td>"
"</tr>\n"
"<tr class=poo>"
"<td>x (status flag)</td>"
"<td>Indicates host has abruptly exited due to a fatal "
"error (cored) and "
"restarted itself."
"</td>"
"</tr>\n"
,
TABLE_STYLE

View File

@ -233,13 +233,13 @@ bool sendReply ( void *state ) {
"By default, injected urls "
"take precedence over the \"insitelist\" directive in the "
"<a href=/admin/scheduler>spider scheduler</a> "
"<a href=/admin/filters>url filters</a> "
"so injected urls need not match the "
"<a href=/admin/sites>spider sites</a> patterns. You can "
"change that behavior in the <a href=/scheduler>spider "
"scheduler</a> if you want. "
"change that behavior in the <a href=/admin/filters>url "
"filters</a> if you want. "
"Injected urls will have a "
"<a href=/admin/scheduler#hopcount>hopcount</a> of 0. "
"<a href=/admin/filters#hopcount>hopcount</a> of 0. "
"The injection api is described on the "
"<a href=/admin/api>api</a> page."

View File

@ -1,8 +1,8 @@
#include "gb-include.h"
#include "PageParser.h"
#include "IndexTable.h"
#include "IndexTable2.h"
//#include "IndexTable.h"
//#include "IndexTable2.h"
//#include "XmlDoc.h" // addCheckboxSpan()
bool g_inPageParser = false;
@ -101,7 +101,7 @@ bool sendPageParser2 ( TcpSocket *s ,
st->m_termFreqs = termFreqs;
st->m_termFreqWeights = termFreqWeights;
st->m_affWeights = affWeights;
st->m_total = (score_t)-1;
//st->m_total = (score_t)-1;
st->m_indexCode = 0;
st->m_blocked = false;
st->m_didRootDom = false;
@ -654,7 +654,7 @@ bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
//st->m_termFreqs = termFreqs;
//st->m_termFreqWeights = termFreqWeights;
//st->m_affWeights = affWeights;
st->m_total = (score_t)-1;
//st->m_total = (score_t)-1;
st->m_indexCode = 0;
st->m_blocked = false;
st->m_didRootDom = false;

View File

@ -80,7 +80,7 @@ public:
long long *m_termFreqs;
float *m_termFreqWeights;
float *m_affWeights;
score_t m_total;
//score_t m_total;
bool m_freeIt;
bool m_blocked;

View File

@ -1324,7 +1324,7 @@ bool printSearchResultsHeader ( State0 *st ) {
if ( isAdmin ) {
sb->safePrintf(" &nbsp; "
"<font color=red><b>"
"<a href=\"/admin/basic?c=%s\">"
"<a href=\"/admin/settings?c=%s\">"
"[admin]"
"</a></b></font>",coll);
// print reindex link
@ -2141,7 +2141,9 @@ bool printResult ( State0 *st, long ix ) {
*end == '}' ) {
// replace trailing } with spidertime}
sb->incrementLength(-1);
sb->safePrintf(",\"docId\":%lli\n", mr->m_docId);
sb->safePrintf(",\"docId\":%lli", mr->m_docId);
// for deduping
//sb->safePrintf(",\"crc\":%lu",mr->m_contentHash32);
// crap, we lose resolution storing as a float
// so fix that shit here...
//float f = mr->m_lastSpidered;

View File

@ -78,7 +78,7 @@ bool printNav ( SafeBuf &sb , HttpRequest *r ) {
//" &nbsp; &nbsp; <a href=/logout>Logout</a>"
);
if ( r->isLocal() )
//if ( r->isLocal() )
sb.safePrintf("&nbsp; &nbsp; [<a href=\"/admin/settings\">"
"<font color=red>Admin</font></a>]");
sb.safePrintf("</p></b></center></body></html>");

View File

@ -233,9 +233,9 @@ static WebPage s_pages[] = {
"what sites can be spidered",
sendPageGeneric , 0 } , // sendPageBasicSettings
{ PAGE_FILTERS , "admin/scheduler", 0 , "spider scheduler" , 1 , 1,
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
//USER_ADMIN | USER_MASTER ,
"schedule urls to be spidered",
"prioritize urls for spidering",
sendPageGeneric , 0 } ,
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
@ -1353,7 +1353,7 @@ bool Pages::printAdminBottom ( SafeBuf *sb, HttpRequest *r ) {
bool Pages::printSubmit ( SafeBuf *sb ) {
// update button
return sb->safePrintf (
"<br>"
//"<br>"
"<center>"
"<input type=submit name=action value=submit>"
"</center>"
@ -1764,7 +1764,9 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
//if ( g_users.hasPermission(username,PAGE_ADMIN ) )
// sprintf(buf,"&master=0");
//sb->safePrintf("<div style=max-width:1000px;>");
// unfortunately width:100% is percent of the virtual window, not the
// visible window... so just try 1000px max
sb->safePrintf("<div style=max-width:800px;>");
//long matt1 = atoip ( MATTIP1 , gbstrlen(MATTIP1) );
//long matt2 = atoip ( MATTIP2 , gbstrlen(MATTIP2) );
@ -1904,7 +1906,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
"href=/developer.html>"
"<b>dev guide</b></a>" );
//sb->safePrintf("</div>");
sb->safePrintf("</div>");
//sb->safePrintf("</center>" );
//sb->safePrintf("<br/>" );

View File

@ -929,7 +929,7 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
long fromIp = s->m_ip;
char fmt = r->getReplyFormat();
/*
if ( fmt == FORMAT_HTML )
sb->safePrintf (
"<script type=\"text/javascript\">"
@ -959,7 +959,7 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
" }\n"
"}\n"
"</script>");
*/
// print the start of the table
char *tt = "None";
if ( page == PAGE_LOG ) tt = "Log Controls";
@ -969,7 +969,7 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
if ( page == PAGE_SPIDER ) tt = "Spider Controls";
if ( page == PAGE_SEARCH ) tt = "Search Controls";
if ( page == PAGE_ACCESS ) tt = "Access Controls";
if ( page == PAGE_FILTERS ) tt = "Spider Scheduler";
if ( page == PAGE_FILTERS ) tt = "Url Filters";
if ( page == PAGE_BASIC_SETTINGS ) tt = "Settings";
if ( page == PAGE_BASIC_SECURITY ) tt = "Security";
if ( page == PAGE_SITES ) tt = "Site List";
@ -1049,11 +1049,12 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
//p= g_parms.printParms (p, pend, page, user, THIS, coll, pwd, nc, pd);
g_parms.printParms ( sb , s , r );
if ( fmt == FORMAT_HTML ) sb->safePrintf ( "<br><br>\n" );
// end the table
if ( fmt == FORMAT_HTML ) sb->safePrintf ( "</table>\n" );
// this must be outside of table, submit button follows
if ( fmt == FORMAT_HTML ) sb->safePrintf ( "<br>\n" );
// url filter page has a test table
if ( page == PAGE_FILTERS && fmt == FORMAT_HTML ) {
// wrap up the form, print a submit button
@ -1698,10 +1699,12 @@ bool Parms::printParm ( SafeBuf* sb,
// and default value if it exists
if ( m->m_def && m->m_def[0] && t != TYPE_CMD ) {
char *d = m->m_def;
if ( t == TYPE_BOOL ) {
if ( t == TYPE_BOOL || t == TYPE_CHECKBOX ) {
if ( d[0]=='0' ) d = "NO";
else d = "YES";
sb->safePrintf ( " Default: %s.",d);
sb->safePrintf ( " <nobr>"
"Default: %s."
"</nobr>",d);
}
else {
sb->safePrintf (" Default: ");
@ -1782,7 +1785,8 @@ bool Parms::printParm ( SafeBuf* sb,
}
}
else {
sb->safePrintf("<center><nobr>");
//sb->safePrintf("<center><nobr>");
sb->safePrintf("<nobr>");
// this is part of the "HACK" fix below. you have to
// specify the cgi parm in the POST request, and
// unchecked checkboxes are not included in the POST
@ -1829,7 +1833,9 @@ bool Parms::printParm ( SafeBuf* sb,
// sb->safePrintf("value=0 name=%s%s>",
// cgi,ddd2);
//}
sb->safePrintf("</nobr></center>");
sb->safePrintf("</nobr>"
//"</center>"
);
}
}
else if ( t == TYPE_CHAR )
@ -5778,11 +5784,19 @@ void Parms::init ( ) {
m++;
m->m_title = "email server 1";
m->m_desc = "Connects to this server directly when sending email 1 ";
m->m_desc = "Connects to this IP or hostname "
"directly when sending email 1. "
"Use <i>apt-get install sendmail</i> to install sendmail "
"on that IP or hostname. Add <i>From:10.5 RELAY</i> to "
"/etc/mail/access to allow sendmail to forward email it "
"receives from gigablast if gigablast hosts are on the "
"10.5.*.* IPs. Then run <i>/etc/init.d/sendmail restart</i> "
"as root to pick up those changes so sendmail will forward "
"Gigablast's mail to the address you give below.";
m->m_cgi = "esrvone";
m->m_off = (char *)&g_conf.m_email1MX - g;
m->m_type = TYPE_STRING;
m->m_def = "10.5.54.47";
m->m_def = "127.0.0.1";
m->m_size = MAX_MX_LEN;
m->m_priv = 2;
m->m_group = 0;
@ -7487,7 +7501,7 @@ void Parms::init ( ) {
"If your url does not index as you expect you "
"can check it's history. " // (spiderdb lookup)
"Added urls will have a "
"<a href=/admin/scheduler#hopcount>hopcount</a> of 0. "
"<a href=/admin/filters#hopcount>hopcount</a> of 0. "
"The add url api is described on the "
"<a href=/admin/api>api</a> page.";
m->m_cgi = "urls";
@ -7509,7 +7523,7 @@ void Parms::init ( ) {
m++;
m->m_title = "strip sessionids";
m->m_desc = "strip added urls of their session ids.";
m->m_desc = "Strip added urls of their session ids.";
m->m_cgi = "strip";
m->m_page = PAGE_ADDURL2;
m->m_obj = OBJ_NONE;
@ -7518,7 +7532,7 @@ void Parms::init ( ) {
m++;
m->m_title = "harvest links";
m->m_desc = "harvest links of added urls so we can spider them?.";
m->m_desc = "Harvest links of added urls so we can spider them?.";
m->m_cgi = "spiderLinks";
m->m_page = PAGE_ADDURL2;
m->m_obj = OBJ_NONE;
@ -7557,17 +7571,17 @@ void Parms::init ( ) {
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
"Gigablast uses the "
"<a href=/admin/scheduler#insitelist>insitelist</a> "
"<a href=/admin/filters#insitelist>insitelist</a> "
"directive on "
"the <a href=/admin/scheduler>spider scheduler</a> "
"the <a href=/admin/filters>url filters</a> "
"page to make sure that the spider only indexes urls "
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
"tools. "
"See <a href=#examples>example site list</a> below. "
"Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
"to add then consider using the <a href=/admin/addurl>addurl"
"</a> interface.";
"to add then consider using the <a href=/admin/addurl>add "
"urls</a> interface.";
m->m_cgi = "sitelist";
m->m_off = (char *)&cr.m_siteListBuf - x;
m->m_page = PAGE_BASIC_SETTINGS;
@ -7625,9 +7639,9 @@ void Parms::init ( ) {
m->m_xml = "siteList";
m->m_desc = "List of sites to spider, one per line. "
"Gigablast uses the "
"<a href=/admin/scheduler#insitelist>insitelist</a> "
"<a href=/admin/filters#insitelist>insitelist</a> "
"directive on "
"the <a href=/admin/scheduler>spider scheduler</a> "
"the <a href=/admin/filters>url filters</a> "
"page to make sure that the spider only indexes urls "
"that match the site patterns you specify here, other than "
"urls you add individually via the add urls or inject url "
@ -10691,7 +10705,7 @@ void Parms::init ( ) {
m->m_off = (char *)&cr.m_siteClusterByDefault - x;
m->m_soff = (char *)&si.m_doSiteClustering - y;
m->m_type = TYPE_BOOL;
m->m_def = "1";
m->m_def = "0";
m->m_sparm = 1;
m->m_scgi = "sc";
m++;
@ -18523,9 +18537,9 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
"the list of sites on the <a href=/admin/sites>"
"site list</a> page. That site list is useful for "
"adding a large number of sites that can not be "
"accomodated by the spider scheduler table. Plus "
"accomodated by the url fitlers table. Plus "
"it is higher performance and easier to use, but "
"lacks the spider scheduler's "
"lacks the url filter table's "
"fine level of control."
"</td></tr>"

362
Posdb.cpp
View File

@ -1263,7 +1263,6 @@ char *getHashGroupString ( unsigned char hg ) {
//
////////////////
#define MAX_SUBLISTS 50
/*
// . these lists[] are 1-1 with q->m_qterms
void PosdbTable::intersectLists9_r ( ) {
@ -4075,38 +4074,6 @@ float PosdbTable::getTermPairScoreForAny ( long i, long j,
//
// . each QueryTerm has this attached additional info now:
// . these should be 1-1 with query terms, Query::m_qterms[]
// . groups the lists read for one query term (the term itself plus its
//   synonym/bigram lists) together with per-sublist state kept in parallel
//   arrays, each holding at most MAX_SUBLISTS entries
class QueryTermInfo {
public:
// the required lists for this query term, synonym lists, etc.
RdbList *m_subLists [MAX_SUBLISTS];
// flags to indicate if bigram list should be scored higher
// (one byte of BF_* bits per sublist, 1-1 with m_subLists[])
char m_bigramFlags [MAX_SUBLISTS];
// shrinkSubLists() set this:
// (the m_newSubList* arrays and the cursors below; m_numNewSubLists is
// presumably how many of those entries are in use -- TODO confirm)
long m_newSubListSize [MAX_SUBLISTS];
char *m_newSubListStart [MAX_SUBLISTS];
char *m_newSubListEnd [MAX_SUBLISTS];
char *m_cursor [MAX_SUBLISTS];
char *m_savedCursor [MAX_SUBLISTS];
long m_numNewSubLists;
// how many are valid?
// (i.e. how many entries of m_subLists[] are in use)
long m_numSubLists;
// size of all m_subLists in bytes
long long m_totalSubListsSize;
// the term freq weight for this term
float m_termFreqWeight;
// what query term # do we correspond to in Query.h
// (used as an index into Query::m_qterms[])
long m_qtermNum;
// the word position of this query term in the Words.h class
long m_qpos;
// the wikipedia phrase id if we start one
long m_wikiPhraseId;
// phrase id term or bigram is in
long m_quotedStartId;
};
// returns false and sets g_errno on error
bool PosdbTable::setQueryTermInfo ( ) {
@ -4215,6 +4182,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_bigramFlags[nn] = BF_HALFSTOPWIKIBIGRAM;
// before a pipe operator?
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
// add list of member terms as well
//qti->m_qtermList[nn] = &m_q->m_qterms[left];
m_q->m_qterms[left].m_bitNum = nrg;
// only really add if useful
if ( list && list->m_listSize ) nn++;
@ -4231,6 +4201,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_bigramFlags[nn] |= BF_SYNONYM;
if (qt->m_piped)
qti->m_bigramFlags[nn]|=BF_PIPED;
// add list of member terms as well
//qti->m_qtermList[nn] = bt;
bt->m_bitNum = nrg;
if ( list && list->m_listSize ) nn++;
}
@ -4252,6 +4225,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_bigramFlags[nn] = BF_HALFSTOPWIKIBIGRAM;
// before a pipe operator?
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
// add list of member terms as well
//qti->m_qtermList[nn] = &m_q->m_qterms[right];
m_q->m_qterms[right].m_bitNum = nrg;
// only really add if useful
if ( list && list->m_listSize ) nn++;
@ -4268,6 +4244,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_bigramFlags[nn] |= BF_SYNONYM;
if (qt->m_piped)
qti->m_bigramFlags[nn]|=BF_PIPED;
// add list of member terms as well
//qti->m_qtermList[nn] = bt;
bt->m_bitNum = nrg;
if ( list && list->m_listSize ) nn++;
}
@ -4312,6 +4291,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
if (qt->m_fieldCode == FIELD_GBNUMBERMAXINT )
qti->m_bigramFlags[nn]|=BF_NUMBER;
// add list of member terms
//qti->m_qtermList[nn] = qt;
qt->m_bitNum = nrg;
// only really add if useful
// no, because when inserting NEW (related) terms that are
// not currently in the document, this list may initially
@ -4334,6 +4317,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
// call it a synonym i guess
qti->m_bigramFlags[nn] |= BF_BIGRAM;
// add list of member terms
//qti->m_qtermList[nn] = &m_q->m_qterms[left];
m_q->m_qterms[left].m_bitNum = nrg;
// only really add if useful
if ( list && list->m_listSize ) nn++;
@ -4349,6 +4335,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_bigramFlags[nn] = BF_SYNONYM;
if (qt->m_piped)
qti->m_bigramFlags[nn]|=BF_PIPED;
// add list of member terms
//qti->m_qtermList[nn] = bt;
bt->m_bitNum = nrg;
if ( list && list->m_listSize ) nn++;
}
@ -4370,6 +4359,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_bigramFlags[nn] |= BF_BIGRAM;
// before a pipe operator?
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
// add list of query terms too that are in this group
//qti->m_qtermList[nn] = &m_q->m_qterms[right];
m_q->m_qterms[right].m_bitNum = nrg;
// only really add if useful
if ( list && list->m_listSize ) nn++;
@ -4385,6 +4377,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_bigramFlags[nn] = BF_SYNONYM;
if (qt->m_piped)
qti->m_bigramFlags[nn]|=BF_PIPED;
// add list of member terms
//qti->m_qtermList[nn] = bt;
bt->m_bitNum = nrg;
if ( list && list->m_listSize ) nn++;
}
@ -4408,6 +4403,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_bigramFlags[nn] = BF_SYNONYM;
// before a pipe operator?
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
// add list of member terms as well
//qti->m_qtermList[nn] = qt2;
// set bitnum here i guess
qt2->m_bitNum = nrg;
// only really add if useful
if ( list && list->m_listSize ) nn++;
}
@ -4448,11 +4447,34 @@ bool PosdbTable::setQueryTermInfo ( ) {
nrg++;
}
//
// now set QueryTerm::m_bitNum for use by Expression::isTruth()
// in Query.cpp for boolean queries, so we can get the bit vector
// of a docid that is 1-1 with the queryterminfos and see which
// query words in the boolean expression it contains.
// used by matchesBoolQuery() which we call below.
//
/*
for ( long i = 0 ; i < nrg ; i++ ) {
// get one
QueryTermInfo *qti = &qip[i];
// how many query terms are in this group?
for ( long j = 0 ; j < qti->m_numSubLists ; j++ ) {
// get the query term
QueryTerm *qt = qti->m_qtermList[j];
// set the bit num member
qt->m_bitNum = i;
}
}
*/
//
// get the query term with the least data in posdb including syns
//
m_minListSize = 0;
m_minListi = -1;
long long grand = 0LL;
// hopefully no more than 100 sublists per term
//char *listEnds [ MAX_QUERY_TERMS ][ MAX_SUBLISTS ];
// set ptrs now i guess
@ -4465,6 +4487,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
// add to it
total = qti->m_totalSubListsSize;
// add up this now
grand += total;
// get min
if ( total < m_minListSize || m_minListi == -1 ) {
m_minListSize = total;
@ -4485,9 +4509,40 @@ bool PosdbTable::setQueryTermInfo ( ) {
long maxDocIds = m_minListSize / 12;
// store all interesected docids in here for new algo plus 1 byte vote
long need = maxDocIds * 6;
// they could all be OR'd together!
if ( m_q->m_isBoolean ) need = grand;
// so we can always cast a long long from a ptr in there
// for setting m_docId when m_booleanQuery is true below
need += 8;
// get max # of docids we got in an intersection from all the lists
if ( ! m_docIdVoteBuf.reserve ( need,"divbuf" ) ) return false;
// i'm feeling if a boolean query put this in there too, the
// hashtable that maps each docid to its boolean bit vector
// where each bit stands for an operand so we can quickly evaluate
// the bit vector in a truth table
long maxSlots = maxDocIds * 2;
// get total operands we used
//long numOperands = m_q->m_numWords;//Operands;
// a quoted phrase counts as a single operand
// . QueryTerm::m_bitNum <== m_numQueryTermInfos
// . each queryTermInfo class corresponds to one bit in our bit vec
// . essentially each queryTermInfo is a query term, but it has
// all the synonym and word forms for that query, etc.
m_vecSize = m_numQueryTermInfos;//numOperands / 8 ;
// allow an extra byte for remainders
if ( m_numQueryTermInfos % 8 ) m_vecSize++;
// now preallocate the hashtable. 0 niceness.
if ( m_q->m_isBoolean &&
! m_bt.set (8,m_vecSize,maxSlots,NULL,0,false,0,"booltbl"))
return false;
if ( m_q->m_isBoolean &&
! m_ct.set (8,1,maxSlots,NULL,0,false,0,
"booltbl"))
return false;
return true;
}
@ -5110,7 +5165,7 @@ void PosdbTable::intersectLists10_r ( ) {
// . if smallest required list is empty, 0 results
// . also set in setQueryTermInfo
if ( m_minListSize == 0 ) return;
if ( m_minListSize == 0 && ! m_q->m_isBoolean ) return;
/*
for ( long k = 0 ; seoHack && k < m_q->m_numTerms ; k++ ) {
@ -5165,6 +5220,20 @@ void PosdbTable::intersectLists10_r ( ) {
//if ( ! m_msg2 ) goto seoHackSkip;
// for boolean queries we scan every docid in all termlists,
// then we see what query terms it has, and make a bit vector for it.
// then use a hashtable to map that bit vector to a true or false
// as to whether we should include it in the results or not.
// we use Query::getBitScore(qvec_t ebits) to evaluate a docid's
// query term explicit term bit vector.
if ( m_q->m_isBoolean ) {
// keeping the docids sorted is the challenge here...
makeDocIdVoteBufForBoolQuery_r();
goto skip3;
}
// . create "m_docIdVoteBuf" filled with just the docids from the
// smallest group of sublists
// . m_minListi is the queryterminfo that had the smallest total
@ -5238,6 +5307,8 @@ void PosdbTable::intersectLists10_r ( ) {
}
*/
skip3:
if ( m_debug ) {
now = gettimeofdayInMilliseconds();
took = now - lastTime;
@ -5662,6 +5733,16 @@ void PosdbTable::intersectLists10_r ( ) {
}
}
if ( m_q->m_isBoolean ) {
minScore = 1.0;
// since we are jumping, we need to set m_docId here
m_docId = *(unsigned long *)(docIdPtr+1);
m_docId <<= 8;
m_docId |= (unsigned char)docIdPtr[0];
m_docId >>= 2;
goto boolJump;
}
// TODO: consider skipping this pre-filter if it sucks, as it does
// for 'time enough for love'. it might save time!
@ -6512,6 +6593,8 @@ void PosdbTable::intersectLists10_r ( ) {
goto advance;
boolJump:
// try dividing it by 3! (or multiply by .33333 faster)
score = minScore * (((float)siteRank)*SITERANKMULTIPLIER+1.0);
@ -6670,6 +6753,8 @@ void PosdbTable::intersectLists10_r ( ) {
// set the score and docid ptr
t->m_score = score;
t->m_docId = m_docId;
// sanity
if ( m_docId == 0 ) { char *xx=NULL;*xx=0; }
// use an integer score like lastSpidered timestamp?
if ( m_sortByTermNumInt >= 0 ) {
t->m_intScore = intScore;
@ -6961,4 +7046,213 @@ void printTermList ( long i, char *list, long listSize ) {
}
}
// . qsort()-style comparator for 6-byte keys (presumably the 6-byte docid
//   keys stored in m_docIdVoteBuf -- confirm against the caller)
// . treats each key as one unsigned 48-bit little-endian integer, i.e.
//   byte 5 is the most significant and byte 0 the least, which is the
//   layout memcpy(dst,&docId,6) produces on a little-endian host
// . sorts in ASCENDING order: returns -1 if h1 < h2, 1 if h1 > h2 and
//   0 if the two keys are identical
// . compares byte-by-byte so it never reads past the 6 bytes of a key,
//   does not depend on sizeof(long) and does not need the keys aligned
int dcmp6 ( const void *h1 , const void *h2 ) {
	const unsigned char *k1 = (const unsigned char *)h1;
	const unsigned char *k2 = (const unsigned char *)h2;
	// scan from the most significant byte down to the least significant
	for ( int i = 5 ; i >= 0 ; i-- ) {
		if ( k1[i] < k2[i] ) return -1;
		if ( k1[i] > k2[i] ) return  1;
	}
	// there shouldn't be any dups in there, but if there are then report
	// them as equal so the ordering stays consistent for qsort()
	return 0;
}
// TODO: do this in docid range phases to save memory and be much faster
// since we could then confine the hashing to the L1 cache
bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
// . make a hashtable of all the docids from all the termlists
// . the value slot will be the operand bit vector i guess
// . the size of the vector needs one bit per query operand
// . if the vector is only 1-2 bytes we can just evaluate each
// combination we encounter and store it into an array, otherwise,
// we can use a another hashtable in order to avoid re-evaluation
// on if it passes the boolean query.
char bitVec[MAX_OVEC_SIZE];
if ( m_vecSize > MAX_OVEC_SIZE ) m_vecSize = MAX_OVEC_SIZE;
QueryTermInfo *qip = (QueryTermInfo *)m_qiBuf.getBufStart();
// . scan each list of docids to a get a new docid, keep a dedup
// table to avoid re-processing the same docid.
// . each posdb list we read corresponds to a query term,
// or a synonym of a query term, or bigram of a query term, etc.
// but we really want to know what operand, so we associate an
// operand bit with each query term, and each list can map to
// the base query term so we can get the operand # from that.
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// get it
QueryTermInfo *qti = &qip[i];
QueryTerm *qt = &m_q->m_qterms[qti->m_qtermNum];
// get the query word
//QueryWord *qw = qt->m_qword;
// just use the word # now
//long opNum = qw->m_wordNum;//opNum;
// . make it consistent with Query::isTruth()
// . m_bitNum is set above to the QueryTermInfo #
long bitNum = qt->m_bitNum;
// do not consider for adding if negative ('my house -home')
//if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
// set all to zeroes
memset ( bitVec , 0 , m_vecSize );
// set bitvec for him
long byte = bitNum / 8;
unsigned char mask = 1<<(bitNum % 8);
bitVec[byte] |= mask;
// each query term can have synonym lists etc. scan those
for ( long j = 0 ; j < qti->m_numSubLists ; j++ ) {
// scan all docids in this list
char *p = qti->m_subLists[j]->getList();
char *pend = qti->m_subLists[j]->getListEnd();
//long long lastDocId = 0LL;
for ( ; p < pend ; ) {
// place holder
long long docId = g_posdb.getDocId(p);
// sanity
//if ( d < lastDocId ) { char *xx=NULL;*xx=0; }
//lastDocId = d;
// point to it
//char *dp = p + 8;
// this was the first key for this docid for
// this termid and possible the first key for
// this termid, so skip it, either 12 or 18
// bytes
if ( (((char *)p)[0])&0x02 ) p += 12;
// the first key for this termid?
else p += 18;
// then only 6 byte keys would follow from the
// same docid, so skip those as well
subloop:
if((((char *)p)[0])&0x04){p += 6;goto subloop;}
// convert docid into hash key
//long long docId = *(long long *)dp;
// shift down 2 bits
//docId >>= 2;
// and mask
//docId &= DOCID_MASK;
// test it
//long long docId = g_posdb.getDocId(dp-8);
//if ( d2 != docId ) { char *xx=NULL;*xx=0; }
// store this docid though. treat as long long
// but we mask with keymask
long slot = m_bt.getSlot ( &docId );
if ( slot < 0 ) {
// we can't alloc in a thread, careful
if ( ! m_bt.addKey(&docId,bitVec) ) {
char *xx=NULL;*xx=0; }
continue;
}
// or the bit in otherwise
char *bv = (char *)m_bt.getValueFromSlot(slot);
bv[byte] |= mask;
}
}
}
char *dst = m_docIdVoteBuf.getBufStart();
// . now our hash table is filled with all the docids
// . evaluate each bit vector
for ( long i = 0 ; i < m_bt.m_numSlots ; i++ ) {
// skip if empty
if ( ! m_bt.m_flags[i] ) continue;
// get the bit vector
unsigned char *vec = (unsigned char *)m_bt.getValueFromSlot(i);
// hash the vector
long long h64 = 0LL;
for ( long k = 0 ; k < m_vecSize ; k++ )
h64^=g_hashtab[(unsigned char)vec[k]][(unsigned char)k];
// check in hash table
char *val = (char *)m_ct.getValue ( &h64 );
// it passes, add the ocid
if ( m_debug ) {
long long docId =*(long long *)m_bt.getKeyFromSlot(i);
log("query: eval d=%llu vec[0]=%lx h64=%lli",
docId,(long)vec[0],h64);
//if ( docId == 47801316261LL )
// log("hy");
}
// add him to the good table
if ( val && *val ) {
// it passes, add the ocid
long long docId =*(long long *)m_bt.getKeyFromSlot(i);
// fix it up
if ( m_debug ) {
log("query: adding d=%llu vec[0]=%lx",
docId,(long)vec[0]);
}
// shift up
docId <<= 2;
// a 6 byte key means you pass
memcpy ( dst , &docId , 6 );
dst += 6;
continue;
}
// evaluate the vector
char include = m_q->matchesBoolQuery ( (unsigned char *)vec ,
m_vecSize );
if ( include ) {
// it passes, add the ocid
long long docId =*(long long *)m_bt.getKeyFromSlot(i);
// fix it up
if ( m_debug ) {
log("query: adding d=%llu vec[0]=0x%lx",
docId,(long)vec[0]);
}
// shift up
docId <<= 2;
// a 6 byte key means you pass
memcpy ( dst , &docId , 6 );
// test it
long long d2;
d2 = *(unsigned long *)(dst+1);
d2 <<= 8;
d2 |= (unsigned char)dst[0];
d2 >>= 2;
docId >>= 2;
if ( d2 != docId ) { char *xx=NULL;*xx=0; }
// end test
dst += 6;
}
// store in hash table
m_ct.addKey ( &h64 , &include );
}
// update SafeBuf::m_length
m_docIdVoteBuf.setLength ( dst - m_docIdVoteBuf.getBufStart() );
// now sort the docids. TODO: break makeDocIdVoteBufForBoolQuery_r()
// up into docid ranges so we have like 1/100th the # of docids to
// sort. that should make this part a lot faster.
// i.e. 1000*log(1000) > 1000*(10*log(10))) --> 3000 > 1000
// i.e. it's faster to break it down into 1000 pieces
// i.e. for log base 2 maybe it's like 10x faster...
qsort ( m_docIdVoteBuf.getBufStart() ,
m_docIdVoteBuf.length() / 6 ,
6 ,
dcmp6 );
return true;
}

45
Posdb.h
View File

@ -395,6 +395,42 @@ class Posdb {
DiskPageCache m_pc;
};
#define MAX_SUBLISTS 50
// . each QueryTerm has this attached additional info now:
// . these should be 1-1 with query terms, Query::m_qterms[]
// . per-term bookkeeping used when intersecting posdb lists
// . holds the posting lists ("sublists") that count for one query term;
//   makeDocIdVoteBufForBoolQuery_r() scans m_subLists[0..m_numSubLists-1]
//   and ORs the term's single bit (QueryTerm::m_bitNum) in for every docid
//   found in any of them
// . the [MAX_SUBLISTS] arrays below look like parallel arrays indexed by
//   sublist # -- NOTE(review): confirm against shrinkSubLists()
class QueryTermInfo {
public:
// the required lists for this query term, synonym lists, etc.
// only the first m_numSubLists entries are valid.
RdbList *m_subLists [MAX_SUBLISTS];
// flags to indicate if bigram list should be scored higher
// (presumably BF_* bit flags like BF_NEGATIVE -- TODO confirm)
char m_bigramFlags [MAX_SUBLISTS];
// shrinkSubLists() set this:
// (presumably the five arrays below describe the shrunken sublists and the
// read positions into them -- shrinkSubLists() is not visible here)
long m_newSubListSize [MAX_SUBLISTS];
char *m_newSubListStart [MAX_SUBLISTS];
char *m_newSubListEnd [MAX_SUBLISTS];
char *m_cursor [MAX_SUBLISTS];
char *m_savedCursor [MAX_SUBLISTS];
// the corresponding QueryTerm for this sublist
//class QueryTerm *m_qtermList [MAX_SUBLISTS];
// presumably how many m_newSubList* entries are valid -- TODO confirm
long m_numNewSubLists;
// how many are valid?
// i.e. the number of usable entries in m_subLists[]
long m_numSubLists;
// size of all m_subLists in bytes
long long m_totalSubListsSize;
// the term freq weight for this term
float m_termFreqWeight;
// what query term # do we correspond to in Query.h
// (used as an index into Query::m_qterms[])
long m_qtermNum;
// the word position of this query term in the Words.h class
long m_qpos;
// the wikipedia phrase id if we start one
long m_wikiPhraseId;
// phrase id term or bigram is in
long m_quotedStartId;
};
/*
#include "RdbList.h"
@ -523,6 +559,8 @@ class PosdbTable {
char *endi, char *endj,
class DocIdScore *pdcs );
bool makeDocIdVoteBufForBoolQuery_r ( ) ;
// some generic stuff
PosdbTable();
~PosdbTable();
@ -670,6 +708,13 @@ class PosdbTable {
long m_minListi;
// intersect docids from each QueryTermInfo into here
SafeBuf m_docIdVoteBuf;
// boolean truth table for boolean queries
HashTableX m_bt;
HashTableX m_ct;
// size of the data slot in m_bt
long m_vecSize;
// are all positive query terms in same wikipedia phrase like
// 'time enough for love'?
bool m_allInSameWikiPhrase;

535
Query.cpp
View File

@ -24,11 +24,11 @@ void Query::constructor ( ) {
//m_bmap = NULL;
m_bitScores = NULL;
m_qwords = NULL;
m_expressions = NULL;
//m_expressions = NULL;
m_qwordsAllocSize = 0;
m_expressionsAllocSize = 0;
//m_expressionsAllocSize = 0;
m_qwords = NULL;
m_expressions = NULL;
//m_expressions = NULL;
reset ( );
}
@ -46,7 +46,7 @@ void Query::reset ( ) {
m_bufLen = 0;
m_origLen = 0;
m_numWords = 0;
m_numOperands = 0;
//m_numOperands = 0;
m_numTerms = 0;
m_synTerm = 0;
//m_numIgnored = 0;
@ -60,14 +60,14 @@ void Query::reset ( ) {
m_bitScores = NULL;
//m_bmapSize = 0;
m_bitScoresSize = 0;
if ( m_expressionsAllocSize )
mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
//if ( m_expressionsAllocSize )
// mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
if ( m_qwordsAllocSize )
mfree ( m_qwords , m_qwordsAllocSize , "Query4" );
m_expressionsAllocSize = 0;
//m_expressionsAllocSize = 0;
m_qwordsAllocSize = 0;
m_qwords = NULL;
m_expressions = NULL;
//m_expressions = NULL;
m_numExpressions = 0;
m_gnext = m_gbuf;
m_hasUOR = false;
@ -149,7 +149,7 @@ bool Query::set2 ( char *query ,
char *q = query;
// see if it should be boolean...
for ( long i = 0 ; boolFlag && i < queryLen ; i++ ) {
for ( long i = 0 ; i < queryLen ; i++ ) {
if ( q[i]=='A' && q[i+1]=='N' && q[i+2]=='D' &&
(q[i+3]==' ' || q[i+3]=='(') )
boolFlag = 1;
@ -343,8 +343,8 @@ bool Query::set2 ( char *query ,
// set m_expressions[] and m_operands[] arrays and m_numOperands
// for boolean queries
if ( m_isBoolean )
if ( ! setBooleanOperands() ) return false;
//if ( m_isBoolean )
// if ( ! setBooleanOperands() ) return false;
// disable stuff for site:, ip: and url: queries
for ( long i = 0 ; i < m_numWords ; i++ ) {
@ -386,6 +386,17 @@ bool Query::set2 ( char *query ,
break;
}
// . keep it simple for now
// . we limit to MAX_EXPRESSIONS, which is 10 for now
if ( m_isBoolean ) {
m_numExpressions = 1;
m_expressions[0].add ( 0 ,
m_numWords ,
this , // Query
0 ); // level
}
// . if it is not truncated, no need to use hard counts
// . comment this line and the next one out for testing hard counts
if ( ! m_truncated ) return true;
@ -450,16 +461,16 @@ bool Query::set2 ( char *query ,
// "(nt=%li)",
// m_numExplicitBits,m_numTerms-m_numExplicitBits,m_numTerms);
if ( ! m_isBoolean ) return true;
//if ( ! m_isBoolean ) return true;
// free cuz it was already set
if ( m_expressionsAllocSize )
mfree(m_expressions,m_expressionsAllocSize , "Query" );
m_expressionsAllocSize = 0;
m_expressions = NULL;
//if ( m_expressionsAllocSize )
// mfree(m_expressions,m_expressionsAllocSize , "Query" );
//m_expressionsAllocSize = 0;
//m_expressions = NULL;
// also set the boolean stuff again too!
if ( ! setBooleanOperands() ) return false;
//if ( ! setBooleanOperands() ) return false;
return true;
}
@ -498,7 +509,6 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
long max = (long)MAX_EXPLICIT_BITS;
if ( max > m_maxQueryTerms ) max = m_maxQueryTerms;
//char u8Buf[256];
for ( long i = 0 ; i < m_numWords && n < MAX_QUERY_TERMS ; i++ ) {
// break out if no more explicit bits!
/*
@ -617,7 +627,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// doh! gotta reset to 0
qt->m_implicitBits = 0;
// assume not under a NOT bool op
qt->m_underNOT = false;
//qt->m_underNOT = false;
// assign score weight, we're a phrase here
qt->m_userWeight = qw->m_userWeightPhrase ;
qt->m_userType = qw->m_userTypePhrase ;
@ -819,7 +829,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// break;
// }
// assume not under a NOT bool op
qt->m_underNOT = false;
//qt->m_underNOT = false;
// assign score weight, we're a phrase here
qt->m_userWeight = qw->m_userWeight ;
qt->m_userType = qw->m_userType ;
@ -1162,7 +1172,8 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
if ( qw->m_wordSign == '+' ) continue;
// no url: stuff, maybe only title
if ( qw->m_fieldCode &&
qw->m_fieldCode != FIELD_TITLE )
qw->m_fieldCode != FIELD_TITLE &&
qw->m_fieldCode != FIELD_GENERIC )
continue;
// skip if ignored like a stopword (stop to->too)
//if ( qw->m_ignoreWord ) continue;
@ -1232,8 +1243,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// stop word? no, we're a phrase term
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
// change in both places
qt->m_termId = syn.m_aids[j] & TERMID_MASK;
m_termIds[n] = syn.m_aids[j] & TERMID_MASK;
long long wid = syn.m_aids[j];
// might be in a title: field or something
if ( qw->m_prefixHash ) {
long long ph = qw->m_prefixHash;
wid= hash64h(wid,ph);
}
qt->m_termId = wid & TERMID_MASK;
m_termIds[n] = wid & TERMID_MASK;
qt->m_rawTermId = syn.m_aids[j];
// assume explicit bit is 0
qt->m_explicitBit = 0;
@ -1265,7 +1282,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
// reset our implicit bits to 0
qt->m_implicitBits = 0;
// assume not under a NOT bool op
qt->m_underNOT = false;
//qt->m_underNOT = false;
// assign score weight, we're a phrase here
qt->m_userWeight = qw->m_userWeight ;
qt->m_userType = qw->m_userType ;
@ -1902,7 +1919,7 @@ bool Query::setQWords ( char boolFlag ,
// assume QueryWord is ignored by default
qw->m_ignoreWord = IGNORE_DEFAULT;
qw->m_ignorePhrase = IGNORE_DEFAULT;
qw->m_wordNum = i;
// get word as a string
//char *w = words.getWord(i);
//long wlen = words.getWordLen(i);
@ -3308,24 +3325,24 @@ void Query::printQueryTerms(){
////////// ONLY BOOLEAN STUFF BELOW HERE /////////////
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
bool Query::testBoolean(qvec_t bits, qvec_t bitmask){
bool Query::testBoolean( unsigned char *bits ,long vecSize){//qvec_t bitmask){
if (!m_isBoolean) return false;
Expression *e = &m_expressions [ 0 ];
// find top-level expression
while (e->m_parent && e != e->m_parent) e = e->m_parent;
return e->isTruth(bits, bitmask);
//while (e->m_parent && e != e->m_parent) e = e->m_parent;
return e->isTruth(bits,vecSize);//, bitmask);
}
void Query::printBooleanTree(){
if (!m_isBoolean) return;
Expression *e = &m_expressions [ 0 ];
//Expression *e = &m_expressions [ 0 ];
// find top-level expression
while (e->m_parent && e != e->m_parent) e = e->m_parent;
SafeBuf sbuf(1024);
e->print(&sbuf);
logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
//while (e->m_parent && e != e->m_parent) e = e->m_parent;
//SafeBuf sbuf(1024,"botree");
//e->print(&sbuf);
//logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
}
/*
// . also sets the m_underNOT member of each QueryTerm, too!!
// . returns false and sets g_errno on error, true otherwise
bool Query::setBooleanOperands ( ) {
@ -3338,6 +3355,20 @@ bool Query::setBooleanOperands ( ) {
"exceeded (%ld).",m_numTerms);
}
// set the QueryWord::m_opBit member of each query word.
// so if you have a query like 'A B OR C' then you need
// to have both A and B if you don't have C. so every word
// unless its an operator needs its own bit. quoted phrases
// may present a problem down the road we'll have to deal with.
long opNum = 0;
for ( long i = 0 ; i < m_numWords ; i++ ) {
// skip if field, opcode, punct. etc.
if ( m_qwords[i].m_ignoreWord ) continue;
// assign it a # i guess
m_qwords[i].m_opNum = opNum++;
}
// alloc the mem if we need to (mdw left off here)
//long need = (m_numWords/3) * sizeof(Expression);
// illegitmate bool expressions breech the buffer
@ -3367,14 +3398,11 @@ bool Query::setBooleanOperands ( ) {
// . set the expression recursively
// . just setting this will not set the m_hasNOT members of each
// QueryTerm
long status = e->set ( 0 , // first word #
m_numWords , // last word #
0 , // parser position
this , // array of QueryWords
0 ,// level
NULL, NULL, // parent, leftchild
false , // has NOT?
false ); // under NOT?
long status = e->add ( 0 , // first word #
m_numWords , // last word #
this , // array of QueryWords
0 ,// level
false ); // has NOT?
if ( status < 0 ) {
g_errno = ETOOMANYOPERANDS;
return log("query: Maximum number of bool operands "
@ -3399,6 +3427,8 @@ bool Query::setBooleanOperands ( ) {
// . get all the terms that are UNDER a NOT operator in some fashion
// . these bits are 1-1 with m_qterms[]
*/
/*
qvec_t notBits = e->getNOTBits( false );
for ( long i = 0 ; i < m_numTerms ; i++ ) {
if ( m_qterms[i].m_explicitBit & notBits )
@ -3406,15 +3436,20 @@ bool Query::setBooleanOperands ( ) {
else
m_qterms[i].m_underNOT = false;
}
*/
/*
return true;
}
*/
/*
// . returns -1 on bad query error
// . returns word AFTER the last word in our operand
long Operand::set ( long a , long b , QueryWord *qwords , long level ,
bool underNOT ) {
// clear these
m_termBits = 0;
//m_termBits = 0;
memset(m_opBits,0,MAX_OVEC_SIZE);
m_hasNOT = false;
//m_hardRequiredBits = 0;
@ -3429,7 +3464,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
// set the parenthetical level of the word
qw->m_level = level;
// set this
qw->m_underNOT = underNOT;
//qw->m_underNOT = underNOT;
// skip punct
if ( ! qw->isAlphaWord() ) {
// if it is a parens, bail!
@ -3459,9 +3494,12 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
// query is too long
if ( qw->m_phraseId && qw->m_queryPhraseTerm &&
qw->m_phraseSign ) {
qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
//qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
m_termBits |= e;
//m_termBits |= e;
long byte = qw->m_opNum / 8;
long mask = 1<<(qw->m_opNum % 8);
if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
}
// why would it be ignored? oh... if like cd-rom or in quotes
if ( qw->m_ignoreWord ) continue;
@ -3469,13 +3507,17 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
// . might be a word that's not a QueryTerm because
// query is too long
if ( qw->m_queryWordTerm ) {
qvec_t e = qw->m_queryWordTerm->m_explicitBit;
//qvec_t e = qw->m_queryWordTerm->m_explicitBit;
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
m_termBits |= e;
//m_termBits |= e;
long byte = qw->m_opNum / 8;
long mask = 1<<(qw->m_opNum % 8);
if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
}
}
return b;
}
*/
// . returns -1 on bad query error
// . returns next word to parse (after expression) on success
@ -3485,6 +3527,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
// . new: organize query into sum of products normal form, ie:
// . (a) OR (b AND c AND d) OR (e AND f)
/*
unsigned char precedence[] = {
0, // term
4, // OR
@ -3495,238 +3538,214 @@ unsigned char precedence[] = {
3, // UOR
5, // PIPE
};
*/
long Expression::set (long start,
long end,
long pos, // current parsing position
class Query *q,
long level,
class Expression *parent,
class Expression *leftChild,
bool hasNOT ,
bool underNOT ) {
m_start = start;
m_end = end;
m_opcode = 0;
m_operand = NULL;
m_numChildren = 0;
m_hasNOT = hasNOT;
m_parent = parent;
uint8_t curOp = 0;
//#define TYPE_OPERAND 1
//#define TYPE_OPCODE 2
//#define TYPE_EXPRESSION 3
QueryWord *qwords = q->m_qwords;
Expression *o_expressions = q->m_expressions;
Operand *o_operands = q->m_operands;
long *o_numOperands = &q->m_numOperands;
long *o_numExpressions = &q->m_numExpressions;
long maxExpressions = q->m_numWords;
// Lets really try to catch this
if (m_parent == this) {
//log(LOG_WARN, "query: Warning, setting expression "
// "parent to self");
char *xx = NULL; *xx = 0;
}
// return -1 and set g_errno on error
// returns how many words expression was
bool Expression::add (long start,
long end,
class Query *q,
long level
) {
if ( level >= MAX_EXPRESSIONS ) { g_errno = EBADENGINEER; return -1;}
// the # of the first alnumpunct word in the expression
m_expressionStartWord = start;
// and the last one
//m_end = end;
//m_hasNOT = hasNOT;
m_q = q;
//m_cc = 0;
long i = m_expressionStartWord;
// "start" is the current alnumpunct word we are parsing out
for ( ; i<end ; i++ ) {
QueryWord *qwords = q->m_qwords;
//set initial args
if (leftChild) {
leftChild->m_parent = this;
m_children[0] = leftChild;
m_numChildren = 1;
}
hasNOT = false;
for ( long i=pos ; i<end ; i++ ){
QueryWord * qw = &qwords[i];
// set this
qw->m_underNOT = underNOT;
// set leaf node
if (!qw->m_opcode && qw->isAlphaWord()){
if (i > m_start) goto setChildExpr;
// if we maxxed out, error out
if ( *o_numOperands >= MAX_OPERANDS ) return -1;
Operand *op = &o_operands [ *o_numOperands ];
*o_numOperands = *o_numOperands + 1;
// . return ptr to next word for us to parse
// . subtract once since for loop will inc it
i = op->set ( i , end , qwords , level , underNOT );
if ( i < 0 ) return -1;
m_operand = op;
goto endExpr;
//qw->m_underNOT = underNOT;
// set leaf node if not an opcode like "AND" and not punct.
if ( ! qw->m_opcode && qw->isAlphaWord()){
//m_opSlots[m_cc] = i;
//m_opTypes[m_cc] = TYPE_OPERAND;
//qw->m_opBitNum = m_cc;
continue;//goto endExpr; mdw
}
if (qw->m_opcode == OP_NOT){
hasNOT = !hasNOT;
underNOT = hasNOT;
//hasNOT = !hasNOT;
//underNOT = hasNOT;
continue;
}
else if (qw->m_opcode == OP_LEFTPAREN){
if (i == m_start) i++;
goto setChildExpr;
// this is expression
// . it should advance "i" to end of expression
// point to next...
q->m_numExpressions++;
// make a new one:
Expression *e=&q->m_expressions[q->m_numExpressions-1];
// now set it
e->add ( i+1, // skip over (
end ,
q ,
level + 1);
// skip over it. pt to ')'
i += e->m_numWordsInExpression;
qw->m_expressionPtr = e;
//m_opSlots[m_cc] = (long)e;
//m_opTypes[m_cc] = TYPE_EXPRESSION;
//qw->m_opBitNum = m_cc;
}
else if (qw->m_opcode == OP_RIGHTPAREN){
goto endExpr;
}
else if (qw->m_opcode) {
int delta = 0;
curOp = qw->m_opcode;
if (m_numChildren == 1)
m_opcode = curOp;
if (m_numChildren > 1 && curOp != m_opcode) {
delta = (int)precedence[curOp] -
(int)precedence[m_opcode];
}
if (delta > 0){
goto endExpr;
}
if (delta < 0){
// set a subexpression conataining the
// last operand we found as the first
goto setChildExpr2;
}
}
continue;
endExpr:
//log(LOG_DEBUG, "query: set Expr [%ld, %ld), opcode: %d",
// a, i, curOp);
// if we've matched parens, go to next word
// but if we have an extra right paren, don't crash
if (qw->m_opcode == OP_RIGHTPAREN &&
(qwords[m_start].m_opcode == OP_LEFTPAREN ||
m_start == 0))
i++;
m_end = i;
// We have an extra open paren
if (qwords[m_start].m_opcode == OP_LEFTPAREN &&
qw->m_opcode != OP_RIGHTPAREN)
goto setParentExpr;
// we are top-level expr, but there is more to parse
if (!m_parent && i < end-1)
goto setParentExpr;
// just return
return i;
// add a parent expression with this one as the left child
setParentExpr:
{
if ( *o_numExpressions >= maxExpressions ) return -1;
//if (qw->m_opcode == OP_RIGHTPAREN) i++;
Expression *e = &o_expressions[*o_numExpressions];
*o_numExpressions = *o_numExpressions + 1;
i = e->set ( m_start , end ,i, q ,
level+1,
m_parent,
this,
false ,
underNOT ) ;
// return size i guess, include )
m_numWordsInExpression = i - m_expressionStartWord+1;
return i;
}
else if (qw->m_opcode) {
// add that mdw
//m_opSlots[m_cc] = qw->m_opcode;
//m_opTypes[m_cc] = TYPE_OPCODE;
//qw->m_opBitNum = m_cc;
//m_cc++;
continue;
}
// white space?
continue;
}
// add a child expression
setChildExpr:
{
if ( *o_numExpressions >= maxExpressions ) return -1;
Expression *e = &o_expressions[*o_numExpressions];
*o_numExpressions = *o_numExpressions + 1;
i = e->set ( i , end , i, q ,
level+1,
this, NULL, hasNOT ,
underNOT ) -1;
if ( i < 0 ) return -1;
// trim needless parens
while (e->m_numChildren == 1) {
hasNOT = e->m_hasNOT;
e = e->m_children[0];
if (hasNOT) e->m_hasNOT = ! e->m_hasNOT;
m_numWordsInExpression = i - m_expressionStartWord;
return true;
}
// each bit is 1-1 with the explicit terms in the boolean query
bool Query::matchesBoolQuery ( unsigned char *bitVec , long vecSize ) {
return m_expressions[0].isTruth ( bitVec , vecSize );
}
// . returns true iff bit # "opBitNum" is set in "bitVec"
// . bit N lives in byte N/8 under mask 1<<(N%8), the same layout the
//   posdb intersection code uses when it sets a term's bit
// . "vecSize" is the size of "bitVec" in bytes
// . a bit # outside of [0,vecSize*8) is a caller bug: we core on purpose
//   with the usual NULL-write rather than read outside the vector. a
//   negative bit # used to slip past the range check and then index
//   before the buffer and shift by a negative count.
bool isBitNumSet ( long opBitNum, unsigned char *bitVec, long vecSize ) {
	// sanity: must address a byte inside the vector
	if ( opBitNum < 0 || opBitNum / 8 >= vecSize ) { char *xx=NULL;*xx=0; }
	long byte = opBitNum / 8;
	unsigned char mask = (unsigned char)(1<<(opBitNum % 8));
	// make the int -> bool conversion explicit
	return ( bitVec[byte] & mask ) != 0;
}
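// . each bit in "bitVec" corresponds to a QueryTerm::m_bitNum, i.e. the
//   QueryTermInfo # the term belongs to (set in Posdb.cpp), not to a
//   position in the Query::m_qwords[] array
// . we still walk the query words, including ignored words and spaces
//   i guess, since Expression::add() seems to lay the expression out
//   that way.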
bool Expression::isTruth ( unsigned char *bitVec ,long vecSize ) {
//
// operand1 operand2 operator1 operand3 operator2 ....
//
// result: -1 means unknown at this point
long result = -1;
char prevOpCode = 0;
long prevResult ;
// result of current operand
long opResult = -1;
long i = m_expressionStartWord;
long iend = i + m_numWordsInExpression;
bool hasNot = false;
for ( ; i < iend ; i++ ) {
QueryWord *qw = &m_q->m_qwords[i];
if ( qw->m_opcode == OP_NOT ) {
hasNot = true;
continue;
}
// so operands are expressions as well
Expression *e = (Expression *)qw->m_expressionPtr;
if ( e ) {
// save prev one. -1 means no prev.
prevResult = opResult;
// set new onw
opResult = e->isTruth ( bitVec , vecSize );
// skip over that expression. point to ')'
i += e->m_numWordsInExpression;
// flip?
if ( hasNot ) {
if ( opResult == 1 ) opResult = 0;
else opResult = 1;
hasNot = false;
}
hasNOT = false;
//cull empty expressions
if (e->m_numChildren < 1 &&
e->m_operand == NULL) continue;
}
if (m_numChildren >= MAX_OPERANDS) return -1;
// add good expressions
m_children [ m_numChildren] = e;
m_numChildren++;
if (m_numChildren > 1 && m_opcode == 0)
m_opcode = OP_AND; // default AND
if ( qw->m_opcode && ! e ) {
prevOpCode = qw->m_opcode;//m_opSlots[i];
continue;
}
// we need to make the last operand we passed
// be the first operand of a subexpression
setChildExpr2:
{
// remove the last expression from our list
Expression *ce = m_children[m_numChildren-1];
// simple operand
if ( ! qw->m_opcode && ! e ) {
// for regular word operands
// ignore it like a space?
if ( qw->m_ignoreWord ) continue;
// save old one
prevResult = opResult;
// convert word to term #
QueryTerm *qt = qw->m_queryWordTerm;
if ( ! qt ) continue;
// . m_bitNum is set in Posdb.cpp when it sets its
// QueryTermInfo array
// . it is basically the query term #
// . see iff that bit is set in this docid's vec
opResult = isBitNumSet ( qt->m_bitNum,bitVec,vecSize );
// flip?
if ( hasNot ) {
if ( opResult == 1 ) opResult = 0;
else opResult = 1;
hasNot = false;
}
}
m_numChildren--;
// need two to tango. i.e. (true OR false)
if ( prevResult == -1 ) continue;
if ( *o_numExpressions >= maxExpressions ) return -1;
Expression *e = &o_expressions[*o_numExpressions];
*o_numExpressions = *o_numExpressions + 1;
i = e->set ( ce->m_start , end , i, q ,
level+1,
this, ce,
false ,
underNOT ) -1;
ce->m_parent = e;
if ( i < 0 ) return -1;
if (m_numChildren >= MAX_OPERANDS) return -1;
m_children [ m_numChildren ] = e;
hasNOT = false;
m_numChildren++;
continue;
// if this is not the first time... we got two
if ( prevOpCode == OP_AND ) {
// if first operation we encount is A AND B then
// default result to on. only allow an AND operation
// to turn if off.
if ( result == -1 ) result = true;
if ( ! prevResult ) result = false;
if ( ! opResult ) result = false;
}
else if ( prevOpCode == OP_OR ) {
// if first operation we encount is A OR B then
// default result to off
if ( result == -1 ) result = false;
if ( prevResult ) result = true;
if ( opResult ) result = true;
}
}
return end;
}
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
bool Expression::isTruth ( qvec_t bits, qvec_t mask ) {
//bool op1 = false ; // set to false so compiler shuts up
//bool op2 ;
//bool accumulator = false;
//bool hadOR = false;
bool result = false;
// leaf node
if (m_operand){
result = m_operand->isTruth(bits, mask);
// handle masked terms better.. don't apply NOT operator
if (!(m_operand->m_termBits & mask)) return true;
}
else if (m_numChildren == 1){
result = m_children[0]->isTruth(bits, mask);
}
else if (m_opcode == OP_OR || m_opcode == OP_UOR) {
for ( long i=0 ; i<m_numChildren ; i++ ) {
result = result || m_children[i]->isTruth(bits, mask);
if (result) goto done;
}
}
else if (m_opcode == OP_AND || m_opcode == OP_PIPE){
result = true;
for (long i = 0 ; i < m_numChildren ; i++ ) {
result = result && m_children[i]->isTruth(bits, mask);
if (!result) goto done;
}
}
done :
if (m_hasNOT) return !result;
else return result;
if ( result == -1 ) return true;
if ( result == 0 ) return false;
return true;
}
/*
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
// . hasNOT is true if there's a NOT just to the left of this WHOLE expressions
// ourside the parens
@ -3744,9 +3763,11 @@ qvec_t Expression::getNOTBits ( bool hasNOT ) {
// success, all operand pairs were true
return notBits;
}
*/
// print boolean expression for debug purposes
void Expression::print(SafeBuf *sbuf) {
/*
if (m_hasNOT) sbuf->safePrintf("NOT ");
if (m_operand){
m_operand->print(sbuf);
@ -3765,16 +3786,18 @@ void Expression::print(SafeBuf *sbuf) {
}
}
sbuf->safePrintf(")");
*/
}
/*
void Operand::print(SafeBuf *sbuf) {
// long shift = 0;
// while (m_termBits >> shift) shift++;
// sbuf->safePrintf("%i", 1<<(shift-1));
if (m_hasNOT) sbuf->safePrintf("NOT 0x%lx", (long)m_termBits);
else sbuf->safePrintf("0x%lx", (long)m_termBits);
if (m_hasNOT) sbuf->safePrintf("NOT 0x%llx",*(long long *)m_opBits);
else sbuf->safePrintf("0x%llx", *(long long *)m_opBits);
}
*/
// if any one query term is split, msg3a has to split the query
bool Query::isSplit() {

154
Query.h
View File

@ -49,6 +49,8 @@ typedef unsigned long long qvec_t;
#define MAX_EXPLICIT_BITS (sizeof(qvec_t)*8)
#define MAX_OVEC_SIZE 256
// only can use 16-bit since have to make a 64k truth table!
#define MAX_EXPLICIT_BITS_BOOLEAN (16*8)
@ -166,6 +168,7 @@ extern struct QueryField g_fields[];
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
/*
// . creating a QueryBoolean class was unnecessary since it was only functional
// and had nothing new it would store that the Query class doesn't store
// . the entry point is the Query::setBitScoresBoolean() function below
@ -181,76 +184,46 @@ public:
long set ( long a , long b , class QueryWord *qwords , long level ,
bool underNOT ) ;
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
// . Operand::m_termBits is the required bits for operand to be true
// . Operand::m_opBits is the required bits for operand to be true
// . does not include signless phrases
bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
//bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
bool isTruth ( unsigned char *bitVec , long vecSize ) {
// must always satisfy hard required terms (+ sign)
//if ( (bits & m_forcedBits) != m_forcedBits )
// return false;
if (m_hasNOT) return (bits & m_termBits & mask) == 0;
return ( (bits & m_termBits & mask) == (m_termBits & mask));
//if (m_hasNOT) return (bits & m_opBits & mask) == 0;
//return ( (bits & m_opBits & mask) == (m_opBits & mask));
if ( m_hasNOT ) {
for ( long i = 0 ; i < vecSize ; i++ )
if ( m_opBits[i] & bitVec[i] ) return false;
return true;
}
for ( long i = 0 ; i < vecSize ; i++ )
if ( m_opBits[i] & bitVec[i] ) return true;
return false;
// . we are now back to good ol' default OR
// . m_termBits should have been masked with
// . m_opBits should have been masked with
// m_requiredBits so as not to include signless phrases
//return ( (bits & m_termBits) != 0 );
//return ( (bits & m_opBits) != 0 );
};
void print (SafeBuf *sbuf);
// we are a sequence of QueryWords
//long m_startWordNum;
//long m_lastWordNum;
// . we treat the required term bits of those words as one unit (ANDed)
// . unsigned phrases are not included in these term bits
// . doc just needs one of these bits for this op to be considered true
qvec_t m_termBits;
// . terms under the same QueryTermInfo class should have the same
// termbit here
unsigned char m_opBits[MAX_OVEC_SIZE];
//long m_vecSize;
// does the word NOT preceed the operand?
bool m_hasNOT;
class Expression *m_parent;
//class Expression *m_parent;
// we MUST have these for this OPERAND to be true
//unsigned short m_forcedBits;
};
*/
// operand1 AND operand2 OR ...
// operand1 OR operand2 AND ...
class Expression {
public:
long set (long start,
long end,
long pos, // current parsing position
class Query *q,
long level,
class Expression *parent,
class Expression *leftChild,
bool hasNOT ,
bool underNOT );
bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) ;
// . what QueryTerms are UNDER the influence of the NOT opcode?
// . we read in the WHOLE termlist of those that are (like '-' sign)
// . returned bit vector is 1-1 with m_qterms in Query class
qvec_t getNOTBits ( bool hasNOT );
void print (SafeBuf *sbuf);
// . a list of operands separated by op codes (a AND b OR c ...)
// . sometimes and operand is another expression: a AND (b OR c)
// . use NULL in m_operands slot if we got an expression and vice versa
// . m_opcodes[i] is the opcode after operand #i
class Expression *m_parent;
//class Operand *m_operands [ MAX_OPERANDS ];
class Expression *m_children [ MAX_OPERANDS ];
//char m_opcodes [ MAX_OPERANDS ];
//long m_numOperands;
// now expressions can have either child expressions or 1 operand
long m_numChildren;
// do we have a NOT operator before operand #i?
//bool m_hasNOT [ MAX_OPERANDS ];
// only one opcode, operand, hasNOT per expression now
uint8_t m_opcode;
class Operand *m_operand;
bool m_hasNOT;
// needed for nesting
long m_start;
long m_end;
};
////////////////////////////////////////////////////////
////////////////////////////////////////////////////////
@ -292,7 +265,7 @@ class QueryWord {
long long m_phraseId;
// hash of field name then collection, used to hash termId
long long m_prefixHash;
long m_wordNum;
// are we in a phrase in a wikipedia title?
long m_wikiPhraseId;
long m_wikiPhraseStart;
@ -373,6 +346,10 @@ class QueryWord {
float m_float;
// for gbminint:99 etc. uses integers instead of floats for better res
long m_int;
// what operand bit # is it for doing boolean queries?
//long m_opBitNum;
// when an operand is an expression...
class Expression *m_expressionPtr;
};
// . we filter the QueryWords and turn them into QueryTerms
@ -415,6 +392,13 @@ class QueryTerm {
// expressions) and just use a hardCount to see how many hard required
// terms are contained by a document. see IndexTable.cpp "hardCount"
char m_hardCount;
// the "number" of the query term used for evaluating boolean
// expressions in Expression::isTruth(). Basically just the
// QueryTermInfo for which this query term belongs. each QueryTermInfo
// is like a single query term and all its synonyms, etc.
long m_bitNum;
// point to term, either m_word or m_phrase
char *m_term;
long m_termLen;
@ -485,6 +469,14 @@ class QueryTerm {
// we can be in? uses -1 to indicate none.
long m_leftPhraseTermNum;
long m_rightPhraseTermNum;
// . what operand # are we a part of in a boolean query?
// . like for (x AND y) x would have an opNum of 0 and y an
// opNum of 1 for instance.
// . for things like (x1 OR x2 OR x3 ... ) we try to give all
// those query terms the same m_opNum for efficiency since
// they all have the same effect
//long m_opNum;
// same as above basically
class QueryTerm *m_leftPhraseTerm;
class QueryTerm *m_rightPhraseTerm;
@ -501,6 +493,41 @@ class QueryTerm {
};
//#define MAX_OPSLOTS 256
#define MAX_EXPRESSIONS 10
// operand1 AND operand2 OR ...
// operand1 OR operand2 AND ...
// . one (possibly parenthesized) piece of a boolean query, stored as a
//   span of query words: [m_expressionStartWord,
//   m_expressionStartWord + m_numWordsInExpression)
// . a nested "( ... )" gets its own Expression, reachable through
//   QueryWord::m_expressionPtr of the word holding the left paren
class Expression {
public:
// . scan query words [start,end) of "q" and record the span we cover
// . on a left paren, takes the next free slot in q->m_expressions[],
//   calls add() on it with level+1 and stores it in that word's
//   m_expressionPtr
// . stops at the matching right paren or at "end"
// . "level" is the nesting depth; the definition in Query.cpp sets
//   g_errno when level >= MAX_EXPRESSIONS
// . NOTE(review): the definition has "return -1" and "return i" paths,
//   which convert to true for a bool return -- confirm callers do not
//   rely on the return value to detect errors
bool add (long start,
long end,
class Query *q,
long level );
// . evaluate this expression for one docid
// . "bitVec" is "vecSize" bytes; a word operand is true iff the bit
//   for its QueryTerm::m_bitNum is set in it (see isBitNumSet())
// . nested expressions are evaluated recursively and a preceding NOT
//   flips the operand's result
bool isTruth ( unsigned char *bitVec , long vecSize );
// . print boolean expression for debug purposes
// . NOTE(review): the body in Query.cpp is commented out right now, so
//   this looks like a no-op -- confirm
void print (SafeBuf *sbuf);
// . old layout kept for reference, no longer members of this class:
//class Expression *m_parent;
//bool m_hasNOT;
//long m_start;
//long m_end;
// # of the first query word in this expression (the "start" arg of add())
long m_expressionStartWord;
// how many query words we span. set by add(), used by isTruth() and by
// the parent to skip over us.
long m_numWordsInExpression;
// the query we belong to. set by add(), used to reach m_qwords[].
Query *m_q;
// . opSlots can be operands operators or expressions
// . m_opTypes tells which of the 3 they are
// . (disabled along with the members below)
//long m_opSlots[MAX_OPSLOTS];
//char m_opTypes[MAX_OPSLOTS];
//long m_cc;
};
// . this is the main class for representing a query
// . it contains array of QueryWords (m_qwords[]) and QueryTerms (m_qterms[])
class Query {
@ -589,11 +616,17 @@ class Query {
// sets m_bmap[][] so getImplicits() works
void setBitMap ( );
bool testBoolean(qvec_t bits, qvec_t bitmask=(qvec_t)-1);
bool testBoolean(unsigned char *bits,long vecSize);
// print to log
void printBooleanTree();
void printQueryTerms();
// the new way as of 3/12/2014. just determine if matches the bool
// query or not. let's try to offload the scoring logic to other places
// if possible.
// bitVec is all the QueryWord::m_opBits some docid contains, so
// does it match our boolean query or not?
bool matchesBoolQuery ( unsigned char *bitVec , long vecSize ) ;
// . call this before calling getBitScore() to set m_bitScores[] table
@ -613,6 +646,7 @@ class Query {
// through the phrase
// . the greater the number of IMplicit SINGLE words a doc has the
// bigger its bit score
/*
uint8_t getBitScore ( qvec_t ebits ) {
// get implicit bits from explicit bits
qvec_t ibits = getImplicits ( ebits );
@ -661,6 +695,7 @@ class Query {
if (ibits == m_requiredBits ) bscore|=0x20;
return bscore;
};
*/
// return an implicit vector from an explicit which contains the explic
qvec_t getImplicits ( qvec_t ebits ) {
@ -716,7 +751,7 @@ class Query {
bool isConnection ( char *s , long len ) ;
// set the QueryTerm::m_hasNOT members
void setHasNOTs();
//void setHasNOTs();
// . used by IndexTable.cpp to make a ptr map of the query terms
// to make intersecting the termlists one at a time efficient
@ -874,11 +909,12 @@ class Query {
// . we now contain the parsing components for boolean queries
// . m_expressions points into m_gbuf or is allocated
class Expression *m_expressions; // [ MAX_OPERANDS ];
long m_expressionsAllocSize;
//class Expression *m_expressions; // [ MAX_OPERANDS ];
//long m_expressionsAllocSize;
Expression m_expressions[MAX_EXPRESSIONS];
long m_numExpressions;
class Operand m_operands [ MAX_OPERANDS ];
long m_numOperands ;
//class Operand m_operands [ MAX_OPERANDS ];
//long m_numOperands ;
// does query contain the pipe operator
bool m_piped;

View File

@ -9920,7 +9920,8 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
char *row;
bool checkedRow = false;
SpiderColl *sc = cr->m_spiderColl;
//SpiderColl *sc = cr->m_spiderColl;
SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum);
//if ( strstr(url,"http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2" ))
// log("hey");

View File

@ -71,6 +71,7 @@ void Title::reset() {
mfree ( m_title , m_titleAllocSize , "Title" );
m_title = NULL;
m_titleBytes = 0;
m_titleAllocSize = 0;
m_query = NULL;
m_titleTagStart = -1;
m_titleTagEnd = -1;
@ -113,7 +114,7 @@ bool Title::setTitle ( XmlDoc *xd ,
char *val = NULL;
// look for the "title:" field in json then use that
SafeBuf jsonTitle;
long vlen;
long vlen = 0;
if ( xd->m_contentType == CT_JSON ) {
char *jt;
jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
@ -124,7 +125,6 @@ bool Title::setTitle ( XmlDoc *xd ,
val = jsonTitle.getBufStart();
vlen = jsonTitle.length();
}
}
// if we had a title: field in the json...
if ( val && vlen > 0 ) {
@ -135,6 +135,7 @@ bool Title::setTitle ( XmlDoc *xd ,
else {
dst = (char *)mmalloc ( m_titleBytes+1,"titdst" );
if ( ! dst ) return false;
m_titleAllocSize = m_titleBytes+1;
}
m_title = dst;
memcpy ( dst , val , m_titleBytes );
@ -142,6 +143,13 @@ bool Title::setTitle ( XmlDoc *xd ,
return true;
}
// json content, if has no explicit title field, has no title then
if ( xd->m_contentType == CT_JSON ) {
m_localBuf[0] = '\0';
m_title = m_localBuf;
m_titleBytes = 0;
return true;
}
bool status = setTitle4 ( xd ,
xml ,

View File

@ -9,7 +9,7 @@
#define _TOPTREE_H_
#include "Clusterdb.h" // SAMPLE_VECTOR_SIZE, 48 bytes for now
#include "IndexTable2.h" // score_t definition
//#include "IndexTable2.h" // score_t definition
#include "RdbTree.h"
class TopNode {

View File

@ -14474,6 +14474,7 @@ char **XmlDoc::getHttpReply2 ( ) {
// turn off
r->m_useCompressionProxy = false;
r->m_compressReply = false;
r->m_isCustomCrawl = cr->m_isCustomCrawl;
// set it for this too
if ( g_conf.m_useCompressionProxy &&
@ -17199,12 +17200,16 @@ long *XmlDoc::getContentHashJson32 ( ) {
JsonItem *ji = jp->getFirstItem();
long totalHash32 = 0;
//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);
for ( ; ji ; ji = ji->m_next ) {
QUICKPOLL(m_niceness);
// skip if not number or string
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
continue;
char *topName = NULL;
// what name level are we?
long numNames = 1;
JsonItem *pi = ji->m_parent;
@ -17212,6 +17217,7 @@ long *XmlDoc::getContentHashJson32 ( ) {
// empty name?
if ( ! pi->m_name ) continue;
if ( ! pi->m_name[0] ) continue;
topName = pi->m_name;
numNames++;
}
@ -17230,6 +17236,22 @@ long *XmlDoc::getContentHashJson32 ( ) {
strcmp(ji->m_name,"resolved_url") == 0 )
continue;
if ( topName && strcmp(topName,"stats") == 0 )
continue;
if ( topName && strcmp(topName,"queryString") == 0 )
continue;
if ( topName && strcmp(topName,"nextPages") == 0 )
continue;
if ( topName && strcmp(topName,"textAnalysis") == 0 )
continue;
if ( topName && strcmp(topName,"links") == 0 )
continue;
// hash the fully compound name
long nameHash32 = 0;
JsonItem *p = ji;
@ -17275,6 +17297,11 @@ long *XmlDoc::getContentHashJson32 ( ) {
long combined32 = hash32h ( nameHash32 , vh32 );
// accumulate field/val pairs order independently
totalHash32 ^= combined32;
// debug note
//logf(LOG_DEBUG,"ch32: field=%s nh32=%lu vallen=%li",
// ji->m_name,
// nameHash32,
// vlen);
}
m_contentHash32 = totalHash32;
@ -29753,7 +29780,10 @@ bool XmlDoc::hashWords3 ( //long wordStart ,
long plen = 0;
if ( hi->m_prefix ) plen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && plen ) {
prefixHash = hash64 ( hi->m_prefix , plen );
// we gotta make this case insensitive, and skip spaces
// because if it is 'focal length' we can't search
// 'focal length:10' because that comes across as TWO terms.
prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen );
// . sanity test, make sure it is in supported list
// . hashing diffbot json output of course fails this so
// skip in that case if diffbot
@ -30287,6 +30317,9 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
// . this now allows for commas in numbers like "1,500.62"
float f = atof2 ( p , bufEnd - p );
// debug
//log("build: hashing %s %f",hi->m_prefix,f);
if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
return false;
@ -30324,7 +30357,7 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
long nameLen = 0;
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && nameLen )
nameHash = hash64Lower_utf8 ( hi->m_prefix , nameLen );
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
// need a prefix for hashing numbers... for now
else { char *xx=NULL; *xx=0; }
@ -30429,7 +30462,7 @@ bool XmlDoc::hashNumber3 ( long n , HashInfo *hi , char *sortByStr ) {
long nameLen = 0;
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
if ( hi->m_prefix && nameLen )
nameHash = hash64Lower_utf8 ( hi->m_prefix , nameLen );
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
// need a prefix for hashing numbers... for now
else { char *xx=NULL; *xx=0; }

View File

@ -1,8 +1,8 @@
# List of sites to spider, one per line. Gigablast uses the <a
# href=/admin/scheduler#insitelist>insitelist</a> directive on the <a
# href=/admin/scheduler>spider scheduler</a> page to make sure that the spider
# only indexes urls that match the site patterns you specify here, other than
# urls you add individually via the add urls or inject url tools. See <a
# href=/admin/filters#insitelist>insitelist</a> directive on the <a
# href=/admin/filters>url filters</a> page to make sure that the spider only
# indexes urls that match the site patterns you specify here, other than urls
# you add individually via the add urls or inject url tools. See <a
# href=#examples>example site list</a> below. Limit list to 300MB. If you have
# a lot of INDIVIDUAL URLS to add then consider using the <a
# href=/admin/addurl>addurl</a> interface.
@ -12,7 +12,7 @@
# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
# Controls just the spiders for this collection.
<spideringEnabled>0</>
<spideringEnabled>1</>
# What is the maximum number of web pages the spider is allowed to download
# simultaneously PER HOST for THIS collection?

10
gb.conf
View File

@ -51,7 +51,7 @@
<readOnlyMode>0</>
# Controls all spidering for all collections
<spideringEnabled>0</>
<spideringEnabled>1</>
# What is the maximum number of web pages the spider is allowed to download
# simultaneously for ALL collections PER HOST?
@ -144,7 +144,13 @@
# Sends to email address 1 through email server 1 if any parm is changed.
<sendParmChangeEmailAlertsToEmail1>0</>
# Connects to this server directly when sending email 1
# Connects to this IP or hostname directly when sending email 1. Use
# <i>apt-get install sendmail</i> to install sendmail on that IP or hostname.
# Add <i>From:10.5 RELAY</i> to /etc/mail/access to allow sendmail to forward
# email it receives from gigablast if gigablast hosts are on the 10.5.*.* IPs.
# Then run <i>/etc/init.d/sendmail restart</i> as root to pick up those
# changes so sendmail will forward Gigablast's mail to the address you give
# below.
<emailServer1><![CDATA[10.5.54.47]]></>
# Sends to this address when sending email 1

View File

@ -138,6 +138,7 @@ rather your current working directory, where the 'gb' binary resides.
<li> Indexes JSON and XML natively. Provides ability to search individual structured fields.
<li> Sorting. Sort the search results by meta tags or JSON fields that contain numbers, simply by adding something like gbsortby:price or gbrevsortby:price as a query term, assuming you have meta price tags.
<li>Easy Scaling. Add new servers to the hosts.conf file then click 'rebalance shards' to automatically rebalance the sharded data.
<li>Using &stream=1 can stream back millions of search results for a query without running out of memory.
</ul>
<br>

View File

@ -411,8 +411,8 @@ int main ( int argc , char *argv[] ) {
"-h\tprint this help.\n\n"
"-v\tprint version and exit.\n\n"
"-o\tprint the overview documentation in HTML. "
"Contains the format of hosts.conf.\n\n"
//"-o\tprint the overview documentation in HTML. "
//"Contains the format of hosts.conf.\n\n"
"-r\tindicates recovery mode, "
"sends email to addresses "
"specified in Conf.h upon startup.\n\n"
@ -440,6 +440,7 @@ int main ( int argc , char *argv[] ) {
"\ttwo hostids with a hyphen in between indicates a "
"range.\n\n"
/*
"tmpstart [hostId]\n"
"\tstart the gb process on all hosts or just on "
"[hostId] if specified, but "
@ -456,6 +457,7 @@ int main ( int argc , char *argv[] ) {
"\tsaves and exits for all gb hosts or "
"just on [hostId] if specified, for the "
"tmpstart command.\n\n"
*/
"spidersoff [hostId]\n"
"\tdisables spidering for all gb hosts or "
@ -465,6 +467,7 @@ int main ( int argc , char *argv[] ) {
"\tensables spidering for all gb hosts or "
"just on [hostId] if specified.\n\n"
/*
"cacheoff [hostId]\n"
"\tdisables all disk PAGE caches on all hosts or "
"just on [hostId] if specified.\n\n"
@ -472,11 +475,17 @@ int main ( int argc , char *argv[] ) {
"freecache [maxShmid]\n"
"\tfinds and frees all shared memory up to shmid "
"maxShmid, default is 3000000.\n\n"
*/
/*
"ddump [hostId]\n"
"\tdisk dump in memory trees to binary files "
"just on [hostId] if specified.\n\n"
"\tdump all b-trees in memory to sorted files on "
"disk. "
"Will likely trigger merges on files on disk. "
"Restrict to just host [hostId] if given.\n\n"
*/
/*
"pmerge [hostId|hostId1-hostId2]\n"
"\tforce merge of posdb files "
"just on [hostId] if specified.\n\n"
@ -492,16 +501,19 @@ int main ( int argc , char *argv[] ) {
"merge [hostId|hostId1-hostId2]\n"
"\tforce merge of all rdb files "
"just on [hostId] if specified.\n\n"
*/
"dsh <CMD>\n"
"\trun this command on the primary IPs of "
"all active hosts in hosts.conf. Example: "
"gb dsh 'ps auxw; uptime'\n\n"
/*
"dsh2 <CMD>\n"
"\trun this command on the secondary IPs of "
"all active hosts in hosts.conf. Example: "
"gb dsh2 'ps auxw; uptime'\n\n"
*/
"install [hostId]\n"
"\tinstall all required files for gb from "
@ -509,13 +521,16 @@ int main ( int argc , char *argv[] ) {
"to [hostId]. If no [hostId] is specified install "
"to ALL hosts.\n\n"
/*
"install2 [hostId]\n"
"\tlike above, but use the secondary IPs in the "
"hosts.conf.\n\n"
*/
"installgb [hostId]\n"
"\tlike above, but install just the gb executable.\n\n"
/*
"installgb2 [hostId]\n"
"\tlike above, but use the secondary IPs in the "
"hosts.conf.\n\n"
@ -592,7 +607,9 @@ int main ( int argc , char *argv[] ) {
"search for them on server2. If you do not want to"
" use the proxy server "
"on gk10, use -p\n\n"
*/
/*
"blaster [-l|-u|-i] <file> <maxNumThreads> <wait>\n"
"\tget documents from the urls given in file. The "
"-l argument is to "
@ -606,7 +623,9 @@ int main ( int argc , char *argv[] ) {
"\tmaxNumThreads is the"
" number of concurrent threads at one time and wait "
" is the time to wait between threads.\n\n"
*/
/*
"scale <newHosts.conf>\n"
"\tGenerate a script to be called to migrate the "
"data to the new places. Remaining hosts will "
@ -647,7 +666,9 @@ int main ( int argc , char *argv[] ) {
"ping <hostId> [clientport]\n"
"\tperforms pings to <hostId>. [clientport] defaults "
"to 2050.\n\n"
*/
/*
"spellcheck <file>\n"
"\tspellchecks the the queries in <file>.\n\n"
@ -701,7 +722,9 @@ int main ( int argc , char *argv[] ) {
"parsetest <docIdToTest> [coll] [query]\n\t"
"parser speed tests\n\n"
*/
/*
"thrutest [dir] [fileSize]\n\tdisk write/read speed "
"test\n\n"
@ -711,6 +734,9 @@ int main ( int argc , char *argv[] ) {
"memtest\n"
"\t Test how much memory we can use\n\n"
*/
/*
// Quality Tests
"countdomains <coll> <X>\n"
"\tCounts the domains and IPs in collection coll and "
@ -738,33 +764,38 @@ int main ( int argc , char *argv[] ) {
"dump es <coll> <UTCtimestamp>\n\tdump stats for "
"all events as if the time is UTCtimestamp.\n\n"
*/
/*
#ifdef _CLIENT_
//there was <hostId> in this command but it
// wasn't used in the program, so deleting it from
// here
"dump <V> [C [X [Y [Z]]]]\n\tdump a db in "
#else
*/
/*
"dump <V> [C [X [Y [Z [T]]]]]\n\tdump a db in "
#endif
//#endif
"working directory.\n"
#ifndef _CLIENT_
#ifndef _METALINCS_
//#ifndef _CLIENT_
//#ifndef _METALINCS_
//"\tV is u to dump tfndb.\n"
"\tV is d to dump datedb.\n"
#endif
#endif
//#endif
//#endif
"\tV is s to dump spiderdb. set [T] to 1 to print "
"new stats. 2 to print old stats. T is ip of firstip."
"\n"
"\tV is t to dump titledb.\n"
"\tV is ts to dump sentences from events.\n"
"\tV is tw to dump words from events.\n"
//"\tV is ts to dump sentences from events.\n"
//"\tV is tw to dump words from events.\n"
"\tV is D to dump duplicate docids in titledb.\n"
"\tV is c to dump checksumdb.\n"
"\tV is S to dump tagdb.\n"
"\tV is W to dump tagdb for wget.\n"
"\tV is V to dump revdb.\n"
//"\tV is V to dump revdb.\n"
"\tV is x to dump doledb.\n"
"\tV is w to dump waiting tree.\n"
"\tV is B to dump sectiondb.\n"
@ -779,13 +810,13 @@ int main ( int argc , char *argv[] ) {
"\tX is start file num. (default 0)\n"
"\tY is num files. (default -1)\n"
"\tZ is 1 to include tree. (default 1)\n"
#ifndef _CLIENT_
#ifndef _METALINCS_
#ifndef _GLOBALSPEC_
//#ifndef _CLIENT_
//#ifndef _METALINCS_
//#ifndef _GLOBALSPEC_
"\tT is the termid to dump. Applies only to indexdb.\n"
#endif
#endif
#endif
//#endif
//#endif
//#endif
"\tT is the first docId to dump. Applies only to "
"titledb. "
//"(default none)\n\n"
@ -806,22 +837,27 @@ int main ( int argc , char *argv[] ) {
//"\tB is -1 to dump all priorities\n"
"\tC is 1 to just show the stats. (default 0)\n"
"\n"
*/
//"dump i X Y Z t\n\tdump indexdb termId t in working "
//"directory.\n"
//"\tX is start file num. (default 0)\n"
//"\tY is num files. (default -1)\n"
//"\tZ is 1 to include tree. (default 1)\n"
//"\tt is the termid to dump. (default none)\n\n"
#ifndef _CLIENT_
#ifndef _METALINCS_
//#ifndef _CLIENT_
//#ifndef _METALINCS_
/*
"dump I [X [V]]\n\tdump indexdb in working "
"directory at "
"an offset.\n"
#endif
#endif
//#endif
//#endif
"\tX is the file NAME. (default NULL)\n"
"\tV is the start offset. (default 0)\n"
*/
/*
"\n"
"dumpmissing <coll> [hostId]\n\t"
"dump the docIds in indexdb but not "
@ -867,6 +903,7 @@ int main ( int argc , char *argv[] ) {
"in the current gb. Use synchost2 to use secondary "
"IPs.\n"
"\n"
*/
//#endif
);
SafeBuf sb2;
@ -894,6 +931,7 @@ int main ( int argc , char *argv[] ) {
if ( strcmp ( cmd , "-h" ) == 0 ) goto printHelp;
// version
if ( strcmp ( cmd , "-v" ) == 0 ) {
fprintf(stdout,"Gigablast March-2014\n");
// fprintf(stderr,"Gigablast %s\nMD5KEY: %s\n"
// "TAG: %s\nPATH: %s\n",
// GBVersion, GBCommitID, GBTag, GBBuildPath);
@ -901,10 +939,10 @@ int main ( int argc , char *argv[] ) {
}
// print overview
if ( strcmp ( cmd , "-o" ) == 0 ) {
//printOverview ( );
return 0;
}
//if ( strcmp ( cmd , "-o" ) == 0 ) {
// //printOverview ( );
// return 0;
//}
bool hadHostId = false;