mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
Merge remote-tracking branch 'origin/diffbot' into diffbot-dan
This commit is contained in:
commit
0988a134d0
2
Json.cpp
2
Json.cpp
@ -433,7 +433,7 @@ char *JsonItem::getValueAsString ( long *valueLen ) {
|
||||
|
||||
// numbers...
|
||||
static char s_numBuf[64];
|
||||
if ( m_valueLong == (long)m_valueDouble ) {
|
||||
if ( (float)m_valueLong == m_valueDouble ) {
|
||||
*valueLen = sprintf ( s_numBuf,"%li", m_valueLong );
|
||||
return s_numBuf;
|
||||
}
|
||||
|
19
Msg13.cpp
19
Msg13.cpp
@ -721,6 +721,25 @@ void downloadTheDocForReals ( Msg13Request *r ) {
|
||||
"(compatible; MSIE 6.0; Windows 98; "
|
||||
"Win 9x 4.90)" ;
|
||||
|
||||
// for bulk jobs avoid actual downloads of the page for efficiency
|
||||
if ( r->m_isCustomCrawl == 2 ) {
|
||||
char *s =
|
||||
"HTTP/1.0 200 (OK)\r\n"
|
||||
"Content-Length: 0\r\n"
|
||||
"Connection: Close\r\n"
|
||||
"Content-Type: text/html\r\n\r\n";
|
||||
long slen = gbstrlen(s);
|
||||
long fakeBufSize = slen + 1;
|
||||
char *fakeBuf = mdup ( s , fakeBufSize , "fkblk");
|
||||
gotHttpReply2 ( r ,
|
||||
fakeBuf,
|
||||
fakeBufSize, // include \0
|
||||
fakeBufSize, // allocsize
|
||||
NULL ); // tcpsock
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// download it
|
||||
if ( ! g_httpServer.getDoc ( r->m_url ,
|
||||
r->m_urlIp ,
|
||||
|
2
Msg13.h
2
Msg13.h
@ -32,6 +32,8 @@ public:
|
||||
// if doing spider compression, compute contentHash32 of document
|
||||
// downloaded, and if it matches this then send back EDOCUNCHANGED
|
||||
long m_contentHash32;
|
||||
// copy of CollectionRec::m_customCrawl, 0 1 for crawls or 2 for bulks
|
||||
char m_isCustomCrawl;
|
||||
// send back error ENOGOODDATE if it does not have one. but if
|
||||
// harvestLinks is true, just send back a filtered list of links
|
||||
long m_requireGoodDate:1;
|
||||
|
31
Msg39.cpp
31
Msg39.cpp
@ -543,7 +543,7 @@ bool Msg39::getLists () {
|
||||
"component=%li "
|
||||
"otermLen=%li "
|
||||
"isSynonym=%li "
|
||||
"querylangid=%li ",
|
||||
"querylangid=%li " ,
|
||||
(long)this ,
|
||||
i ,
|
||||
qt->m_term,//bb ,
|
||||
@ -569,7 +569,7 @@ bool Msg39::getLists () {
|
||||
(long)m_tmpq.m_componentCodes[i],
|
||||
(long)m_tmpq.getTermLen(i) ,
|
||||
isSynonym,
|
||||
(long)m_tmpq.m_langId); // ,tt
|
||||
(long)m_tmpq.m_langId ); // ,tt
|
||||
// put it back
|
||||
*tpc = tmp;
|
||||
if ( st ) {
|
||||
@ -661,6 +661,7 @@ void gotListsWrapper ( void *state ) {
|
||||
Msg39 *THIS = (Msg39 *) state;
|
||||
// . hash the lists into our index table
|
||||
// . this will send back a reply or recycle and read more list data
|
||||
|
||||
if ( ! THIS->gotLists ( true ) ) return;
|
||||
|
||||
// . if he did not block and there was an errno we send reply
|
||||
@ -671,6 +672,12 @@ void gotListsWrapper ( void *state ) {
|
||||
log("msg39: sending back error reply = %s",mstrerror(g_errno));
|
||||
sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
|
||||
}
|
||||
|
||||
// no, block? call the docid split loop
|
||||
//if ( numDocIdSplits <= 1 ) return;
|
||||
|
||||
// if we get the lists and processed them without blocking, repeat!
|
||||
THIS->doDocIdSplitLoop();
|
||||
}
|
||||
|
||||
// . now come here when we got the necessary index lists
|
||||
@ -753,10 +760,25 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
// . now we must call this separately here, not in allocTopTree()
|
||||
// . we have to re-set the QueryTermInfos with each docid range split
|
||||
// since it will set the list ptrs from the msg2 lists
|
||||
if ( m_r->m_useNewAlgo && ! m_posdbTable.setQueryTermInfo () ) {
|
||||
return true;
|
||||
if ( ! m_posdbTable.setQueryTermInfo () ) return true;
|
||||
|
||||
// print query term bit numbers here
|
||||
for ( long i = 0 ;
|
||||
m_debug && i < m_tmpq.getNumTerms() ; i++ ) {
|
||||
QueryTerm *qt = &m_tmpq.m_qterms[i];
|
||||
//utf16ToUtf8(bb, 256, qt->m_term, qt->m_termLen);
|
||||
char *tpc = qt->m_term + qt->m_termLen;
|
||||
char tmp = *tpc;
|
||||
*tpc = '\0';
|
||||
SafeBuf sb;
|
||||
sb.safePrintf("query: msg39: BITNUM query term #%li \"%s\" "
|
||||
"bitnum=%li ", i , qt->m_term, qt->m_bitNum );
|
||||
// put it back
|
||||
*tpc = tmp;
|
||||
logf(LOG_DEBUG,"%s",sb.getBufStart());
|
||||
}
|
||||
|
||||
|
||||
// timestamp log
|
||||
if ( m_debug ) {
|
||||
log(LOG_DEBUG,"query: msg39: [%lu] Preparing to intersect "
|
||||
@ -817,6 +839,7 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
// time it
|
||||
diff = gettimeofdayInMilliseconds() - start;
|
||||
if ( diff > 10 ) log("query: Took %lli ms for intersection",diff);
|
||||
|
||||
// returns false if blocked, true otherwise
|
||||
return addedLists ();
|
||||
}
|
||||
|
4
Msg5.cpp
4
Msg5.cpp
@ -859,9 +859,9 @@ bool Msg5::needsRecall ( ) {
|
||||
if ( m_round == 0 ) logIt = false;
|
||||
if ( logIt )
|
||||
logf(LOG_DEBUG,"db: Reading %li again from %s (need %li total "
|
||||
"got %li) this=0x%lx round=%li.",
|
||||
"got %li) cn=%li this=0x%lx round=%li.",
|
||||
m_newMinRecSizes , base->m_dbname , m_minRecSizes,
|
||||
m_list->m_listSize, (long)this , m_round );
|
||||
m_list->m_listSize, (long)m_collnum,(long)this, m_round );
|
||||
m_round++;
|
||||
// record how many screw ups we had so we know if it hurts performance
|
||||
base->m_rdb->didReSeek ( );
|
||||
|
@ -85,7 +85,9 @@ bool sendPageAddUrl2 ( TcpSocket *s , HttpRequest *r ) {
|
||||
if ( url ) {
|
||||
// normalize and add www. if it needs it
|
||||
Url uu;
|
||||
uu.set ( url , gbstrlen(url) , true );
|
||||
// do not convert xyz.com to www.xyz.com because sometimes
|
||||
// people want xyz.com exactly
|
||||
uu.set ( url , gbstrlen(url) , false ); // true );
|
||||
// remove >'s i guess and store in st1->m_url[] buffer
|
||||
st1->m_urlLen=cleanInput ( st1->m_url,
|
||||
MAX_URL_LEN,
|
||||
|
@ -623,6 +623,7 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
//"</td>"
|
||||
//"</tr>"
|
||||
|
||||
/*
|
||||
// local subdir match
|
||||
"<tr>"
|
||||
"<td>file://C/mydir/mysubdir/"
|
||||
@ -637,6 +638,7 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
"Do not spider files in this subdirectory."
|
||||
"</td>"
|
||||
"</tr>"
|
||||
*/
|
||||
|
||||
// connect to a device and index it as a stream
|
||||
//"<tr>"
|
||||
|
@ -582,6 +582,7 @@ skipReplaceHost:
|
||||
sb.safePrintf ( "</table><br>\n" );
|
||||
|
||||
|
||||
/*
|
||||
// print spare hosts table
|
||||
sb.safePrintf (
|
||||
"<table %s>"
|
||||
@ -646,7 +647,9 @@ skipReplaceHost:
|
||||
h->m_note );
|
||||
}
|
||||
sb.safePrintf ( "</table><br>" );
|
||||
*/
|
||||
|
||||
/*
|
||||
// print proxy hosts table
|
||||
sb.safePrintf (
|
||||
"<table %s>"
|
||||
@ -754,6 +757,7 @@ skipReplaceHost:
|
||||
h->m_note );
|
||||
}
|
||||
sb.safePrintf ( "</table><br><br>" );
|
||||
*/
|
||||
|
||||
sb.safePrintf(
|
||||
"<style>"
|
||||
@ -812,7 +816,6 @@ skipReplaceHost:
|
||||
"<td>The UDP port used to send and receive dns traffic with."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
*/
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>http port</td>"
|
||||
@ -820,7 +823,6 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
/*
|
||||
"<tr class=poo>"
|
||||
"<td>best switch id</td>"
|
||||
"<td>The host prefers to be on this switch because it "
|
||||
@ -886,6 +888,43 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>avg split time</td>"
|
||||
"<td>Average time this host took to compute the docids "
|
||||
"for a query. Useful for guaging the slowness of a host "
|
||||
"compare to other hosts."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>splits done</td>"
|
||||
"<td>Number of queries this host completed. Used in "
|
||||
"computation of the <i>avg split time</i>."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>status</td>"
|
||||
"<td>Status flags for the host. See key below."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>slow reads</td>"
|
||||
"<td>Number of slow disk reads the host has had. "
|
||||
"When this is big compared to other hosts it is a good "
|
||||
"indicator its drives are relatively slow."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>docs indexed</td>"
|
||||
"<td>Number of documents this host has indexed over all "
|
||||
"collections. All hosts should have close to the same "
|
||||
"number in a well-sharded situation."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
//"<tr class=poo>"
|
||||
//"<td>loadavg</td>"
|
||||
//"<td>1-minute sliding-window load average from "
|
||||
@ -895,13 +934,26 @@ skipReplaceHost:
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>mem used</td>"
|
||||
"<td>percentage of memory currently used."
|
||||
"<td>Percentage of memory currently used."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>cpu usage</td>"
|
||||
"<td>percentage of cpu resources in use by the gb process."
|
||||
"<td>Percentage of cpu resources in use by the gb process."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>disk usage</td>"
|
||||
"<td>Percentage of disk in use. When this gets close to "
|
||||
"100%% you need to do something."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>max ping1</td>"
|
||||
"<td>The worst ping latency from host to host."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
@ -918,6 +970,7 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
/*
|
||||
"<tr class=poo>"
|
||||
"<td>ping2</td>"
|
||||
"<td>Ping time to this host on the seconday/shotgun "
|
||||
@ -925,6 +978,7 @@ skipReplaceHost:
|
||||
"network is not enabled in the master controls."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
*/
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>M (status flag)</td>"
|
||||
@ -950,6 +1004,27 @@ skipReplaceHost:
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>R (status flag)</td>"
|
||||
"<td>Indicates host is performing a rebalance operation."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>F (status flag)</td>"
|
||||
"<td>Indicates host has foreign records and requires "
|
||||
"a rebalance operation."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
"<tr class=poo>"
|
||||
"<td>x (status flag)</td>"
|
||||
"<td>Indicates host has abruptly exited due to a fatal "
|
||||
"error (cored) and "
|
||||
"restarted itself."
|
||||
"</td>"
|
||||
"</tr>\n"
|
||||
|
||||
|
||||
,
|
||||
TABLE_STYLE
|
||||
|
@ -233,13 +233,13 @@ bool sendReply ( void *state ) {
|
||||
|
||||
"By default, injected urls "
|
||||
"take precedence over the \"insitelist\" directive in the "
|
||||
"<a href=/admin/scheduler>spider scheduler</a> "
|
||||
"<a href=/admin/filters>url filters</a> "
|
||||
"so injected urls need not match the "
|
||||
"<a href=/admin/sites>spider sites</a> patterns. You can "
|
||||
"change that behavior in the <a href=/scheduler>spider "
|
||||
"scheduler</a> if you want. "
|
||||
"change that behavior in the <a href=/admin/filters>url "
|
||||
"filters</a> if you want. "
|
||||
"Injected urls will have a "
|
||||
"<a href=/admin/scheduler#hopcount>hopcount</a> of 0. "
|
||||
"<a href=/admin/filters#hopcount>hopcount</a> of 0. "
|
||||
"The injection api is described on the "
|
||||
"<a href=/admin/api>api</a> page."
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
#include "gb-include.h"
|
||||
|
||||
#include "PageParser.h"
|
||||
#include "IndexTable.h"
|
||||
#include "IndexTable2.h"
|
||||
//#include "IndexTable.h"
|
||||
//#include "IndexTable2.h"
|
||||
//#include "XmlDoc.h" // addCheckboxSpan()
|
||||
|
||||
bool g_inPageParser = false;
|
||||
@ -101,7 +101,7 @@ bool sendPageParser2 ( TcpSocket *s ,
|
||||
st->m_termFreqs = termFreqs;
|
||||
st->m_termFreqWeights = termFreqWeights;
|
||||
st->m_affWeights = affWeights;
|
||||
st->m_total = (score_t)-1;
|
||||
//st->m_total = (score_t)-1;
|
||||
st->m_indexCode = 0;
|
||||
st->m_blocked = false;
|
||||
st->m_didRootDom = false;
|
||||
@ -654,7 +654,7 @@ bool sendPageAnalyze ( TcpSocket *s , HttpRequest *r ) {
|
||||
//st->m_termFreqs = termFreqs;
|
||||
//st->m_termFreqWeights = termFreqWeights;
|
||||
//st->m_affWeights = affWeights;
|
||||
st->m_total = (score_t)-1;
|
||||
//st->m_total = (score_t)-1;
|
||||
st->m_indexCode = 0;
|
||||
st->m_blocked = false;
|
||||
st->m_didRootDom = false;
|
||||
|
@ -80,7 +80,7 @@ public:
|
||||
long long *m_termFreqs;
|
||||
float *m_termFreqWeights;
|
||||
float *m_affWeights;
|
||||
score_t m_total;
|
||||
//score_t m_total;
|
||||
bool m_freeIt;
|
||||
bool m_blocked;
|
||||
|
||||
|
@ -1324,7 +1324,7 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
if ( isAdmin ) {
|
||||
sb->safePrintf(" "
|
||||
"<font color=red><b>"
|
||||
"<a href=\"/admin/basic?c=%s\">"
|
||||
"<a href=\"/admin/settings?c=%s\">"
|
||||
"[admin]"
|
||||
"</a></b></font>",coll);
|
||||
// print reindex link
|
||||
@ -2141,7 +2141,9 @@ bool printResult ( State0 *st, long ix ) {
|
||||
*end == '}' ) {
|
||||
// replace trailing } with spidertime}
|
||||
sb->incrementLength(-1);
|
||||
sb->safePrintf(",\"docId\":%lli\n", mr->m_docId);
|
||||
sb->safePrintf(",\"docId\":%lli", mr->m_docId);
|
||||
// for deduping
|
||||
//sb->safePrintf(",\"crc\":%lu",mr->m_contentHash32);
|
||||
// crap, we lose resolution storing as a float
|
||||
// so fix that shit here...
|
||||
//float f = mr->m_lastSpidered;
|
||||
|
@ -78,7 +78,7 @@ bool printNav ( SafeBuf &sb , HttpRequest *r ) {
|
||||
//" <a href=/logout>Logout</a>"
|
||||
);
|
||||
|
||||
if ( r->isLocal() )
|
||||
//if ( r->isLocal() )
|
||||
sb.safePrintf(" [<a href=\"/admin/settings\">"
|
||||
"<font color=red>Admin</font></a>]");
|
||||
sb.safePrintf("</p></b></center></body></html>");
|
||||
|
12
Pages.cpp
12
Pages.cpp
@ -233,9 +233,9 @@ static WebPage s_pages[] = {
|
||||
"what sites can be spidered",
|
||||
sendPageGeneric , 0 } , // sendPageBasicSettings
|
||||
|
||||
{ PAGE_FILTERS , "admin/scheduler", 0 , "spider scheduler" , 1 , 1,
|
||||
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
|
||||
//USER_ADMIN | USER_MASTER ,
|
||||
"schedule urls to be spidered",
|
||||
"prioritize urls for spidering",
|
||||
sendPageGeneric , 0 } ,
|
||||
|
||||
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0 , 1 ,
|
||||
@ -1353,7 +1353,7 @@ bool Pages::printAdminBottom ( SafeBuf *sb, HttpRequest *r ) {
|
||||
bool Pages::printSubmit ( SafeBuf *sb ) {
|
||||
// update button
|
||||
return sb->safePrintf (
|
||||
"<br>"
|
||||
//"<br>"
|
||||
"<center>"
|
||||
"<input type=submit name=action value=submit>"
|
||||
"</center>"
|
||||
@ -1764,7 +1764,9 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
|
||||
//if ( g_users.hasPermission(username,PAGE_ADMIN ) )
|
||||
// sprintf(buf,"&master=0");
|
||||
|
||||
//sb->safePrintf("<div style=max-width:1000px;>");
|
||||
// unfortunately width:100% is percent of the virtual window, not the
|
||||
// visible window... so just try 1000px max
|
||||
sb->safePrintf("<div style=max-width:800px;>");
|
||||
|
||||
//long matt1 = atoip ( MATTIP1 , gbstrlen(MATTIP1) );
|
||||
//long matt2 = atoip ( MATTIP2 , gbstrlen(MATTIP2) );
|
||||
@ -1904,7 +1906,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
|
||||
"href=/developer.html>"
|
||||
"<b>dev guide</b></a>" );
|
||||
|
||||
//sb->safePrintf("</div>");
|
||||
sb->safePrintf("</div>");
|
||||
|
||||
//sb->safePrintf("</center>" );
|
||||
//sb->safePrintf("<br/>" );
|
||||
|
60
Parms.cpp
60
Parms.cpp
@ -929,7 +929,7 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
|
||||
long fromIp = s->m_ip;
|
||||
|
||||
char fmt = r->getReplyFormat();
|
||||
|
||||
/*
|
||||
if ( fmt == FORMAT_HTML )
|
||||
sb->safePrintf (
|
||||
"<script type=\"text/javascript\">"
|
||||
@ -959,7 +959,7 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
|
||||
" }\n"
|
||||
"}\n"
|
||||
"</script>");
|
||||
|
||||
*/
|
||||
// print the start of the table
|
||||
char *tt = "None";
|
||||
if ( page == PAGE_LOG ) tt = "Log Controls";
|
||||
@ -969,7 +969,7 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
|
||||
if ( page == PAGE_SPIDER ) tt = "Spider Controls";
|
||||
if ( page == PAGE_SEARCH ) tt = "Search Controls";
|
||||
if ( page == PAGE_ACCESS ) tt = "Access Controls";
|
||||
if ( page == PAGE_FILTERS ) tt = "Spider Scheduler";
|
||||
if ( page == PAGE_FILTERS ) tt = "Url Filters";
|
||||
if ( page == PAGE_BASIC_SETTINGS ) tt = "Settings";
|
||||
if ( page == PAGE_BASIC_SECURITY ) tt = "Security";
|
||||
if ( page == PAGE_SITES ) tt = "Site List";
|
||||
@ -1049,11 +1049,12 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
|
||||
//p= g_parms.printParms (p, pend, page, user, THIS, coll, pwd, nc, pd);
|
||||
g_parms.printParms ( sb , s , r );
|
||||
|
||||
if ( fmt == FORMAT_HTML ) sb->safePrintf ( "<br><br>\n" );
|
||||
|
||||
// end the table
|
||||
if ( fmt == FORMAT_HTML ) sb->safePrintf ( "</table>\n" );
|
||||
|
||||
// this must be outside of table, submit button follows
|
||||
if ( fmt == FORMAT_HTML ) sb->safePrintf ( "<br>\n" );
|
||||
|
||||
// url filter page has a test table
|
||||
if ( page == PAGE_FILTERS && fmt == FORMAT_HTML ) {
|
||||
// wrap up the form, print a submit button
|
||||
@ -1698,10 +1699,12 @@ bool Parms::printParm ( SafeBuf* sb,
|
||||
// and default value if it exists
|
||||
if ( m->m_def && m->m_def[0] && t != TYPE_CMD ) {
|
||||
char *d = m->m_def;
|
||||
if ( t == TYPE_BOOL ) {
|
||||
if ( t == TYPE_BOOL || t == TYPE_CHECKBOX ) {
|
||||
if ( d[0]=='0' ) d = "NO";
|
||||
else d = "YES";
|
||||
sb->safePrintf ( " Default: %s.",d);
|
||||
sb->safePrintf ( " <nobr>"
|
||||
"Default: %s."
|
||||
"</nobr>",d);
|
||||
}
|
||||
else {
|
||||
sb->safePrintf (" Default: ");
|
||||
@ -1782,7 +1785,8 @@ bool Parms::printParm ( SafeBuf* sb,
|
||||
}
|
||||
}
|
||||
else {
|
||||
sb->safePrintf("<center><nobr>");
|
||||
//sb->safePrintf("<center><nobr>");
|
||||
sb->safePrintf("<nobr>");
|
||||
// this is part of the "HACK" fix below. you have to
|
||||
// specify the cgi parm in the POST request, and
|
||||
// unchecked checkboxes are not included in the POST
|
||||
@ -1829,7 +1833,9 @@ bool Parms::printParm ( SafeBuf* sb,
|
||||
// sb->safePrintf("value=0 name=%s%s>",
|
||||
// cgi,ddd2);
|
||||
//}
|
||||
sb->safePrintf("</nobr></center>");
|
||||
sb->safePrintf("</nobr>"
|
||||
//"</center>"
|
||||
);
|
||||
}
|
||||
}
|
||||
else if ( t == TYPE_CHAR )
|
||||
@ -5778,11 +5784,19 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
m->m_title = "email server 1";
|
||||
m->m_desc = "Connects to this server directly when sending email 1 ";
|
||||
m->m_desc = "Connects to this IP or hostname "
|
||||
"directly when sending email 1. "
|
||||
"Use <i>apt-get install sendmail</i> to install sendmail "
|
||||
"on that IP or hostname. Add <i>From:10.5 RELAY</i> to "
|
||||
"/etc/mail/access to allow sendmail to forward email it "
|
||||
"receives from gigablast if gigablast hosts are on the "
|
||||
"10.5.*.* IPs. Then run <i>/etc/init.d/sendmail restart</i> "
|
||||
"as root to pick up those changes so sendmail will forward "
|
||||
"Gigablast's mail to the address you give below.";
|
||||
m->m_cgi = "esrvone";
|
||||
m->m_off = (char *)&g_conf.m_email1MX - g;
|
||||
m->m_type = TYPE_STRING;
|
||||
m->m_def = "10.5.54.47";
|
||||
m->m_def = "127.0.0.1";
|
||||
m->m_size = MAX_MX_LEN;
|
||||
m->m_priv = 2;
|
||||
m->m_group = 0;
|
||||
@ -7487,7 +7501,7 @@ void Parms::init ( ) {
|
||||
"If your url does not index as you expect you "
|
||||
"can check it's history. " // (spiderdb lookup)
|
||||
"Added urls will have a "
|
||||
"<a href=/admin/scheduler#hopcount>hopcount</a> of 0. "
|
||||
"<a href=/admin/filters#hopcount>hopcount</a> of 0. "
|
||||
"The add url api is described on the "
|
||||
"<a href=/admin/api>api</a> page.";
|
||||
m->m_cgi = "urls";
|
||||
@ -7509,7 +7523,7 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
m->m_title = "strip sessionids";
|
||||
m->m_desc = "strip added urls of their session ids.";
|
||||
m->m_desc = "Strip added urls of their session ids.";
|
||||
m->m_cgi = "strip";
|
||||
m->m_page = PAGE_ADDURL2;
|
||||
m->m_obj = OBJ_NONE;
|
||||
@ -7518,7 +7532,7 @@ void Parms::init ( ) {
|
||||
m++;
|
||||
|
||||
m->m_title = "harvest links";
|
||||
m->m_desc = "harvest links of added urls so we can spider them?.";
|
||||
m->m_desc = "Harvest links of added urls so we can spider them?.";
|
||||
m->m_cgi = "spiderLinks";
|
||||
m->m_page = PAGE_ADDURL2;
|
||||
m->m_obj = OBJ_NONE;
|
||||
@ -7557,17 +7571,17 @@ void Parms::init ( ) {
|
||||
m->m_xml = "siteList";
|
||||
m->m_desc = "List of sites to spider, one per line. "
|
||||
"Gigablast uses the "
|
||||
"<a href=/admin/scheduler#insitelist>insitelist</a> "
|
||||
"<a href=/admin/filters#insitelist>insitelist</a> "
|
||||
"directive on "
|
||||
"the <a href=/admin/scheduler>spider scheduler</a> "
|
||||
"the <a href=/admin/filters>url filters</a> "
|
||||
"page to make sure that the spider only indexes urls "
|
||||
"that match the site patterns you specify here, other than "
|
||||
"urls you add individually via the add urls or inject url "
|
||||
"tools. "
|
||||
"See <a href=#examples>example site list</a> below. "
|
||||
"Limit list to 300MB. If you have a lot of INDIVIDUAL URLS "
|
||||
"to add then consider using the <a href=/admin/addurl>addurl"
|
||||
"</a> interface.";
|
||||
"to add then consider using the <a href=/admin/addurl>add "
|
||||
"urls</a> interface.";
|
||||
m->m_cgi = "sitelist";
|
||||
m->m_off = (char *)&cr.m_siteListBuf - x;
|
||||
m->m_page = PAGE_BASIC_SETTINGS;
|
||||
@ -7625,9 +7639,9 @@ void Parms::init ( ) {
|
||||
m->m_xml = "siteList";
|
||||
m->m_desc = "List of sites to spider, one per line. "
|
||||
"Gigablast uses the "
|
||||
"<a href=/admin/scheduler#insitelist>insitelist</a> "
|
||||
"<a href=/admin/filters#insitelist>insitelist</a> "
|
||||
"directive on "
|
||||
"the <a href=/admin/scheduler>spider scheduler</a> "
|
||||
"the <a href=/admin/filters>url filters</a> "
|
||||
"page to make sure that the spider only indexes urls "
|
||||
"that match the site patterns you specify here, other than "
|
||||
"urls you add individually via the add urls or inject url "
|
||||
@ -10691,7 +10705,7 @@ void Parms::init ( ) {
|
||||
m->m_off = (char *)&cr.m_siteClusterByDefault - x;
|
||||
m->m_soff = (char *)&si.m_doSiteClustering - y;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "1";
|
||||
m->m_def = "0";
|
||||
m->m_sparm = 1;
|
||||
m->m_scgi = "sc";
|
||||
m++;
|
||||
@ -18523,9 +18537,9 @@ bool printUrlExpressionExamples ( SafeBuf *sb ) {
|
||||
"the list of sites on the <a href=/admin/sites>"
|
||||
"site list</a> page. That site list is useful for "
|
||||
"adding a large number of sites that can not be "
|
||||
"accomodated by the spider scheduler table. Plus "
|
||||
"accomodated by the url fitlers table. Plus "
|
||||
"it is higher performance and easier to use, but "
|
||||
"lacks the spider scheduler's "
|
||||
"lacks the url filter table's "
|
||||
"fine level of control."
|
||||
"</td></tr>"
|
||||
|
||||
|
362
Posdb.cpp
362
Posdb.cpp
@ -1263,7 +1263,6 @@ char *getHashGroupString ( unsigned char hg ) {
|
||||
//
|
||||
////////////////
|
||||
|
||||
#define MAX_SUBLISTS 50
|
||||
/*
|
||||
// . these lists[] are 1-1 with q->m_qterms
|
||||
void PosdbTable::intersectLists9_r ( ) {
|
||||
@ -4075,38 +4074,6 @@ float PosdbTable::getTermPairScoreForAny ( long i, long j,
|
||||
//
|
||||
|
||||
|
||||
// . each QueryTerm has this attached additional info now:
|
||||
// . these should be 1-1 with query terms, Query::m_qterms[]
|
||||
class QueryTermInfo {
|
||||
public:
|
||||
// the required lists for this query term, synonym lists, etc.
|
||||
RdbList *m_subLists [MAX_SUBLISTS];
|
||||
// flags to indicate if bigram list should be scored higher
|
||||
char m_bigramFlags [MAX_SUBLISTS];
|
||||
// shrinkSubLists() set this:
|
||||
long m_newSubListSize [MAX_SUBLISTS];
|
||||
char *m_newSubListStart [MAX_SUBLISTS];
|
||||
char *m_newSubListEnd [MAX_SUBLISTS];
|
||||
char *m_cursor [MAX_SUBLISTS];
|
||||
char *m_savedCursor [MAX_SUBLISTS];
|
||||
long m_numNewSubLists;
|
||||
// how many are valid?
|
||||
long m_numSubLists;
|
||||
// size of all m_subLists in bytes
|
||||
long long m_totalSubListsSize;
|
||||
// the term freq weight for this term
|
||||
float m_termFreqWeight;
|
||||
// what query term # do we correspond to in Query.h
|
||||
long m_qtermNum;
|
||||
// the word position of this query term in the Words.h class
|
||||
long m_qpos;
|
||||
// the wikipedia phrase id if we start one
|
||||
long m_wikiPhraseId;
|
||||
// phrase id term or bigram is in
|
||||
long m_quotedStartId;
|
||||
};
|
||||
|
||||
|
||||
// returns false and sets g_errno on error
|
||||
bool PosdbTable::setQueryTermInfo ( ) {
|
||||
|
||||
@ -4215,6 +4182,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_bigramFlags[nn] = BF_HALFSTOPWIKIBIGRAM;
|
||||
// before a pipe operator?
|
||||
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
|
||||
// add list of member terms as well
|
||||
//qti->m_qtermList[nn] = &m_q->m_qterms[left];
|
||||
m_q->m_qterms[left].m_bitNum = nrg;
|
||||
// only really add if useful
|
||||
if ( list && list->m_listSize ) nn++;
|
||||
|
||||
@ -4231,6 +4201,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_bigramFlags[nn] |= BF_SYNONYM;
|
||||
if (qt->m_piped)
|
||||
qti->m_bigramFlags[nn]|=BF_PIPED;
|
||||
// add list of member terms as well
|
||||
//qti->m_qtermList[nn] = bt;
|
||||
bt->m_bitNum = nrg;
|
||||
if ( list && list->m_listSize ) nn++;
|
||||
}
|
||||
|
||||
@ -4252,6 +4225,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_bigramFlags[nn] = BF_HALFSTOPWIKIBIGRAM;
|
||||
// before a pipe operator?
|
||||
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
|
||||
// add list of member terms as well
|
||||
//qti->m_qtermList[nn] = &m_q->m_qterms[right];
|
||||
m_q->m_qterms[right].m_bitNum = nrg;
|
||||
// only really add if useful
|
||||
if ( list && list->m_listSize ) nn++;
|
||||
|
||||
@ -4268,6 +4244,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_bigramFlags[nn] |= BF_SYNONYM;
|
||||
if (qt->m_piped)
|
||||
qti->m_bigramFlags[nn]|=BF_PIPED;
|
||||
// add list of member terms as well
|
||||
//qti->m_qtermList[nn] = bt;
|
||||
bt->m_bitNum = nrg;
|
||||
if ( list && list->m_listSize ) nn++;
|
||||
}
|
||||
|
||||
@ -4312,6 +4291,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
if (qt->m_fieldCode == FIELD_GBNUMBERMAXINT )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
|
||||
// add list of member terms
|
||||
//qti->m_qtermList[nn] = qt;
|
||||
qt->m_bitNum = nrg;
|
||||
|
||||
// only really add if useful
|
||||
// no, because when inserting NEW (related) terms that are
|
||||
// not currently in the document, this list may initially
|
||||
@ -4334,6 +4317,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
|
||||
// call it a synonym i guess
|
||||
qti->m_bigramFlags[nn] |= BF_BIGRAM;
|
||||
// add list of member terms
|
||||
//qti->m_qtermList[nn] = &m_q->m_qterms[left];
|
||||
m_q->m_qterms[left].m_bitNum = nrg;
|
||||
// only really add if useful
|
||||
if ( list && list->m_listSize ) nn++;
|
||||
|
||||
@ -4349,6 +4335,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_bigramFlags[nn] = BF_SYNONYM;
|
||||
if (qt->m_piped)
|
||||
qti->m_bigramFlags[nn]|=BF_PIPED;
|
||||
// add list of member terms
|
||||
//qti->m_qtermList[nn] = bt;
|
||||
bt->m_bitNum = nrg;
|
||||
if ( list && list->m_listSize ) nn++;
|
||||
}
|
||||
|
||||
@ -4370,6 +4359,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_bigramFlags[nn] |= BF_BIGRAM;
|
||||
// before a pipe operator?
|
||||
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
|
||||
// add list of query terms too that are in this group
|
||||
//qti->m_qtermList[nn] = &m_q->m_qterms[right];
|
||||
m_q->m_qterms[right].m_bitNum = nrg;
|
||||
// only really add if useful
|
||||
if ( list && list->m_listSize ) nn++;
|
||||
|
||||
@ -4385,6 +4377,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_bigramFlags[nn] = BF_SYNONYM;
|
||||
if (qt->m_piped)
|
||||
qti->m_bigramFlags[nn]|=BF_PIPED;
|
||||
// add list of member terms
|
||||
//qti->m_qtermList[nn] = bt;
|
||||
bt->m_bitNum = nrg;
|
||||
if ( list && list->m_listSize ) nn++;
|
||||
}
|
||||
|
||||
@ -4408,6 +4403,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_bigramFlags[nn] = BF_SYNONYM;
|
||||
// before a pipe operator?
|
||||
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
|
||||
// add list of member terms as well
|
||||
//qti->m_qtermList[nn] = qt2;
|
||||
// set bitnum here i guess
|
||||
qt2->m_bitNum = nrg;
|
||||
// only really add if useful
|
||||
if ( list && list->m_listSize ) nn++;
|
||||
}
|
||||
@ -4448,11 +4447,34 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
nrg++;
|
||||
}
|
||||
|
||||
//
|
||||
// now set QueryTerm::m_bitNum for use by Expression::isTruth()
|
||||
// in Query.cpp for boolean queries, so we can get the bit vector
|
||||
// of a docid that is 1-1 with the queryterminfos and see which
|
||||
// query words in the boolean expression it contains.
|
||||
// used by matchesBoolQuery() which we call below.
|
||||
//
|
||||
/*
|
||||
for ( long i = 0 ; i < nrg ; i++ ) {
|
||||
// get one
|
||||
QueryTermInfo *qti = &qip[i];
|
||||
// how many query terms are in this group?
|
||||
for ( long j = 0 ; j < qti->m_numSubLists ; j++ ) {
|
||||
// get the query term
|
||||
QueryTerm *qt = qti->m_qtermList[j];
|
||||
// set the bit num member
|
||||
qt->m_bitNum = i;
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
//
|
||||
// get the query term with the least data in posdb including syns
|
||||
//
|
||||
m_minListSize = 0;
|
||||
m_minListi = -1;
|
||||
long long grand = 0LL;
|
||||
// hopefully no more than 100 sublists per term
|
||||
//char *listEnds [ MAX_QUERY_TERMS ][ MAX_SUBLISTS ];
|
||||
// set ptrs now i guess
|
||||
@ -4465,6 +4487,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
|
||||
// add to it
|
||||
total = qti->m_totalSubListsSize;
|
||||
// add up this now
|
||||
grand += total;
|
||||
// get min
|
||||
if ( total < m_minListSize || m_minListi == -1 ) {
|
||||
m_minListSize = total;
|
||||
@ -4485,9 +4509,40 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
long maxDocIds = m_minListSize / 12;
|
||||
// store all interesected docids in here for new algo plus 1 byte vote
|
||||
long need = maxDocIds * 6;
|
||||
|
||||
// they could all be OR'd together!
|
||||
if ( m_q->m_isBoolean ) need = grand;
|
||||
|
||||
// so we can always cast a long long from a ptr in there
|
||||
// for setting m_docId when m_booleanQuery is true below
|
||||
need += 8;
|
||||
|
||||
// get max # of docids we got in an intersection from all the lists
|
||||
if ( ! m_docIdVoteBuf.reserve ( need,"divbuf" ) ) return false;
|
||||
|
||||
// i'm feeling if a boolean query put this in there too, the
|
||||
// hashtable that maps each docid to its boolean bit vector
|
||||
// where each bit stands for an operand so we can quickly evaluate
|
||||
// the bit vector in a truth table
|
||||
long maxSlots = maxDocIds * 2;
|
||||
// get total operands we used
|
||||
//long numOperands = m_q->m_numWords;//Operands;
|
||||
// a quoted phrase counts as a single operand
|
||||
// . QueryTerm::m_bitNum <== m_numQueryTermInfos
|
||||
// . each queryTermInfo class corresponds to one bit in our bit vec
|
||||
// . essentially each queryTermInfo is a query term, but it has
|
||||
// all the synonym and word forms for that query, etc.
|
||||
m_vecSize = m_numQueryTermInfos;//numOperands / 8 ;
|
||||
// allow an extra byte for remainders
|
||||
if ( m_numQueryTermInfos % 8 ) m_vecSize++;
|
||||
// now preallocate the hashtable. 0 niceness.
|
||||
if ( m_q->m_isBoolean &&
|
||||
! m_bt.set (8,m_vecSize,maxSlots,NULL,0,false,0,"booltbl"))
|
||||
return false;
|
||||
if ( m_q->m_isBoolean &&
|
||||
! m_ct.set (8,1,maxSlots,NULL,0,false,0,
|
||||
"booltbl"))
|
||||
return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -5110,7 +5165,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
// . if smallest required list is empty, 0 results
|
||||
// . also set in setQueryTermInfo
|
||||
if ( m_minListSize == 0 ) return;
|
||||
if ( m_minListSize == 0 && ! m_q->m_isBoolean ) return;
|
||||
|
||||
/*
|
||||
for ( long k = 0 ; seoHack && k < m_q->m_numTerms ; k++ ) {
|
||||
@ -5165,6 +5220,20 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
//if ( ! m_msg2 ) goto seoHackSkip;
|
||||
|
||||
|
||||
// for boolean queries we scan every docid in all termlists,
|
||||
// then we see what query terms it has, and make a bit vector for it.
|
||||
// then use a hashtable to map that bit vector to a true or false
|
||||
// as to whether we should include it in the results or not.
|
||||
// we use Query::getBitScore(qvec_t ebits) to evaluate a docid's
|
||||
// query term explicit term bit vector.
|
||||
if ( m_q->m_isBoolean ) {
|
||||
// keeping the docids sorted is the challenge here...
|
||||
makeDocIdVoteBufForBoolQuery_r();
|
||||
goto skip3;
|
||||
}
|
||||
|
||||
|
||||
|
||||
// . create "m_docIdVoteBuf" filled with just the docids from the
|
||||
// smallest group of sublists
|
||||
// . m_minListi is the queryterminfo that had the smallest total
|
||||
@ -5238,6 +5307,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
}
|
||||
*/
|
||||
|
||||
skip3:
|
||||
|
||||
if ( m_debug ) {
|
||||
now = gettimeofdayInMilliseconds();
|
||||
took = now - lastTime;
|
||||
@ -5662,6 +5733,16 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
}
|
||||
}
|
||||
|
||||
if ( m_q->m_isBoolean ) {
|
||||
minScore = 1.0;
|
||||
// since we are jumping, we need to set m_docId here
|
||||
m_docId = *(unsigned long *)(docIdPtr+1);
|
||||
m_docId <<= 8;
|
||||
m_docId |= (unsigned char)docIdPtr[0];
|
||||
m_docId >>= 2;
|
||||
goto boolJump;
|
||||
}
|
||||
|
||||
// TODO: consider skipping this pre-filter if it sucks, as it does
|
||||
// for 'time enough for love'. it might save time!
|
||||
|
||||
@ -6512,6 +6593,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
goto advance;
|
||||
|
||||
|
||||
boolJump:
|
||||
|
||||
// try dividing it by 3! (or multiply by .33333 faster)
|
||||
score = minScore * (((float)siteRank)*SITERANKMULTIPLIER+1.0);
|
||||
|
||||
@ -6670,6 +6753,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
// set the score and docid ptr
|
||||
t->m_score = score;
|
||||
t->m_docId = m_docId;
|
||||
// sanity
|
||||
if ( m_docId == 0 ) { char *xx=NULL;*xx=0; }
|
||||
// use an integer score like lastSpidered timestamp?
|
||||
if ( m_sortByTermNumInt >= 0 ) {
|
||||
t->m_intScore = intScore;
|
||||
@ -6961,4 +7046,213 @@ void printTermList ( long i, char *list, long listSize ) {
|
||||
}
|
||||
}
|
||||
|
||||
// . qsort() comparator for fixed-size 6-byte keys
// . sorts in ASCENDING order of the key taken as a 48-bit little-endian
//   integer: byte 5 is the most significant byte, byte 0 the least. this
//   is the same ordering the old code produced on a little-endian host by
//   comparing the 4 bytes at offset 2 and then the 2 bytes at offset 0.
// . compares byte by byte so it never reads outside the 6 bytes of either
//   key, no matter how wide "long" is, and never does an unaligned load
// . returns -1 if h1 < h2, 1 if h1 > h2 and 0 if the keys are identical.
//   there shouldn't be any dups in there, but qsort() still needs a
//   consistent answer when an element is compared against an equal one.
int dcmp6 ( const void *h1 , const void *h2 ) {
	const unsigned char *k1 = (const unsigned char *)h1;
	const unsigned char *k2 = (const unsigned char *)h2;
	// walk from the most significant byte down to the least
	for ( long i = 5 ; i >= 0 ; i-- ) {
		if ( k1[i] < k2[i] ) return -1;
		if ( k1[i] > k2[i] ) return  1;
	}
	// all 6 bytes match
	return 0;
}
|
||||
|
||||
// TODO: do this in docid range phases to save memory and be much faster
|
||||
// since we could contain to the L1 cache for hashing
|
||||
bool PosdbTable::makeDocIdVoteBufForBoolQuery_r ( ) {
|
||||
|
||||
// . make a hashtable of all the docids from all the termlists
|
||||
// . the value slot will be the operand bit vector i guess
|
||||
// . the size of the vector needs one bit per query operand
|
||||
// . if the vector is only 1-2 bytes we can just evaluate each
|
||||
// combination we encounter and store it into an array, otherwise,
|
||||
// we can use a another hashtable in order to avoid re-evaluation
|
||||
// on if it passes the boolean query.
|
||||
char bitVec[MAX_OVEC_SIZE];
|
||||
if ( m_vecSize > MAX_OVEC_SIZE ) m_vecSize = MAX_OVEC_SIZE;
|
||||
|
||||
QueryTermInfo *qip = (QueryTermInfo *)m_qiBuf.getBufStart();
|
||||
|
||||
// . scan each list of docids to a get a new docid, keep a dedup
|
||||
// table to avoid re-processing the same docid.
|
||||
// . each posdb list we read corresponds to a query term,
|
||||
// or a synonym of a query term, or bigram of a query term, etc.
|
||||
// but we really want to know what operand, so we associate an
|
||||
// operand bit with each query term, and each list can map to
|
||||
// the base query term so we can get the operand # from that.
|
||||
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
|
||||
// get it
|
||||
QueryTermInfo *qti = &qip[i];
|
||||
|
||||
QueryTerm *qt = &m_q->m_qterms[qti->m_qtermNum];
|
||||
// get the query word
|
||||
//QueryWord *qw = qt->m_qword;
|
||||
|
||||
// just use the word # now
|
||||
//long opNum = qw->m_wordNum;//opNum;
|
||||
|
||||
// . make it consistent with Query::isTruth()
|
||||
// . m_bitNum is set above to the QueryTermInfo #
|
||||
long bitNum = qt->m_bitNum;
|
||||
|
||||
// do not consider for adding if negative ('my house -home')
|
||||
//if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
|
||||
|
||||
// set all to zeroes
|
||||
memset ( bitVec , 0 , m_vecSize );
|
||||
// set bitvec for him
|
||||
long byte = bitNum / 8;
|
||||
unsigned char mask = 1<<(bitNum % 8);
|
||||
bitVec[byte] |= mask;
|
||||
|
||||
// each query term can have synonym lists etc. scan those
|
||||
for ( long j = 0 ; j < qti->m_numSubLists ; j++ ) {
|
||||
|
||||
// scan all docids in this list
|
||||
char *p = qti->m_subLists[j]->getList();
|
||||
char *pend = qti->m_subLists[j]->getListEnd();
|
||||
|
||||
//long long lastDocId = 0LL;
|
||||
|
||||
for ( ; p < pend ; ) {
|
||||
// place holder
|
||||
long long docId = g_posdb.getDocId(p);
|
||||
|
||||
// sanity
|
||||
//if ( d < lastDocId ) { char *xx=NULL;*xx=0; }
|
||||
//lastDocId = d;
|
||||
|
||||
// point to it
|
||||
//char *dp = p + 8;
|
||||
|
||||
// this was the first key for this docid for
|
||||
// this termid and possible the first key for
|
||||
// this termid, so skip it, either 12 or 18
|
||||
// bytes
|
||||
if ( (((char *)p)[0])&0x02 ) p += 12;
|
||||
// the first key for this termid?
|
||||
else p += 18;
|
||||
|
||||
// then only 6 byte keys would follow from the
|
||||
// same docid, so skip those as well
|
||||
subloop:
|
||||
if((((char *)p)[0])&0x04){p += 6;goto subloop;}
|
||||
|
||||
// convert docid into hash key
|
||||
//long long docId = *(long long *)dp;
|
||||
// shift down 2 bits
|
||||
//docId >>= 2;
|
||||
// and mask
|
||||
//docId &= DOCID_MASK;
|
||||
// test it
|
||||
//long long docId = g_posdb.getDocId(dp-8);
|
||||
//if ( d2 != docId ) { char *xx=NULL;*xx=0; }
|
||||
// store this docid though. treat as long long
|
||||
// but we mask with keymask
|
||||
long slot = m_bt.getSlot ( &docId );
|
||||
if ( slot < 0 ) {
|
||||
// we can't alloc in a thread, careful
|
||||
if ( ! m_bt.addKey(&docId,bitVec) ) {
|
||||
char *xx=NULL;*xx=0; }
|
||||
continue;
|
||||
}
|
||||
// or the bit in otherwise
|
||||
char *bv = (char *)m_bt.getValueFromSlot(slot);
|
||||
bv[byte] |= mask;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
char *dst = m_docIdVoteBuf.getBufStart();
|
||||
|
||||
// . now our hash table is filled with all the docids
|
||||
// . evaluate each bit vector
|
||||
for ( long i = 0 ; i < m_bt.m_numSlots ; i++ ) {
|
||||
// skip if empty
|
||||
if ( ! m_bt.m_flags[i] ) continue;
|
||||
// get the bit vector
|
||||
unsigned char *vec = (unsigned char *)m_bt.getValueFromSlot(i);
|
||||
// hash the vector
|
||||
long long h64 = 0LL;
|
||||
for ( long k = 0 ; k < m_vecSize ; k++ )
|
||||
h64^=g_hashtab[(unsigned char)vec[k]][(unsigned char)k];
|
||||
// check in hash table
|
||||
char *val = (char *)m_ct.getValue ( &h64 );
|
||||
|
||||
// it passes, add the ocid
|
||||
if ( m_debug ) {
|
||||
long long docId =*(long long *)m_bt.getKeyFromSlot(i);
|
||||
log("query: eval d=%llu vec[0]=%lx h64=%lli",
|
||||
docId,(long)vec[0],h64);
|
||||
//if ( docId == 47801316261LL )
|
||||
// log("hy");
|
||||
}
|
||||
|
||||
// add him to the good table
|
||||
if ( val && *val ) {
|
||||
// it passes, add the ocid
|
||||
long long docId =*(long long *)m_bt.getKeyFromSlot(i);
|
||||
// fix it up
|
||||
if ( m_debug ) {
|
||||
log("query: adding d=%llu vec[0]=%lx",
|
||||
docId,(long)vec[0]);
|
||||
}
|
||||
// shift up
|
||||
docId <<= 2;
|
||||
// a 6 byte key means you pass
|
||||
memcpy ( dst , &docId , 6 );
|
||||
dst += 6;
|
||||
continue;
|
||||
}
|
||||
// evaluate the vector
|
||||
char include = m_q->matchesBoolQuery ( (unsigned char *)vec ,
|
||||
m_vecSize );
|
||||
if ( include ) {
|
||||
// it passes, add the ocid
|
||||
long long docId =*(long long *)m_bt.getKeyFromSlot(i);
|
||||
// fix it up
|
||||
if ( m_debug ) {
|
||||
log("query: adding d=%llu vec[0]=0x%lx",
|
||||
docId,(long)vec[0]);
|
||||
}
|
||||
// shift up
|
||||
docId <<= 2;
|
||||
// a 6 byte key means you pass
|
||||
memcpy ( dst , &docId , 6 );
|
||||
// test it
|
||||
long long d2;
|
||||
d2 = *(unsigned long *)(dst+1);
|
||||
d2 <<= 8;
|
||||
d2 |= (unsigned char)dst[0];
|
||||
d2 >>= 2;
|
||||
docId >>= 2;
|
||||
if ( d2 != docId ) { char *xx=NULL;*xx=0; }
|
||||
// end test
|
||||
dst += 6;
|
||||
}
|
||||
// store in hash table
|
||||
m_ct.addKey ( &h64 , &include );
|
||||
}
|
||||
|
||||
// update SafeBuf::m_length
|
||||
m_docIdVoteBuf.setLength ( dst - m_docIdVoteBuf.getBufStart() );
|
||||
|
||||
// now sort the docids. TODO: break makeDocIdVoteBufForBoolQuery_r()
|
||||
// up into docid ranges so we have like 1/100th the # of docids to
|
||||
// sort. that should make this part a lot faster.
|
||||
// i.e. 1000*log(1000) > 1000*(10*log(10))) --> 3000 > 1000
|
||||
// i.e. it's faster to break it down into 1000 pieces
|
||||
// i.e. for log base 2 maybe it's like 10x faster...
|
||||
qsort ( m_docIdVoteBuf.getBufStart() ,
|
||||
m_docIdVoteBuf.length() / 6 ,
|
||||
6 ,
|
||||
dcmp6 );
|
||||
|
||||
return true;
|
||||
}
|
||||
|
45
Posdb.h
45
Posdb.h
@ -395,6 +395,42 @@ class Posdb {
|
||||
DiskPageCache m_pc;
|
||||
};
|
||||
|
||||
#define MAX_SUBLISTS 50
|
||||
|
||||
// . each QueryTerm has this attached additional info now:
|
||||
// . these should be 1-1 with query terms, Query::m_qterms[]
|
||||
class QueryTermInfo {
|
||||
public:
|
||||
// the required lists for this query term, synonym lists, etc.
|
||||
RdbList *m_subLists [MAX_SUBLISTS];
|
||||
// flags to indicate if bigram list should be scored higher
|
||||
char m_bigramFlags [MAX_SUBLISTS];
|
||||
// shrinkSubLists() set this:
|
||||
long m_newSubListSize [MAX_SUBLISTS];
|
||||
char *m_newSubListStart [MAX_SUBLISTS];
|
||||
char *m_newSubListEnd [MAX_SUBLISTS];
|
||||
char *m_cursor [MAX_SUBLISTS];
|
||||
char *m_savedCursor [MAX_SUBLISTS];
|
||||
// the corresponding QueryTerm for this sublist
|
||||
//class QueryTerm *m_qtermList [MAX_SUBLISTS];
|
||||
long m_numNewSubLists;
|
||||
// how many are valid?
|
||||
long m_numSubLists;
|
||||
// size of all m_subLists in bytes
|
||||
long long m_totalSubListsSize;
|
||||
// the term freq weight for this term
|
||||
float m_termFreqWeight;
|
||||
// what query term # do we correspond to in Query.h
|
||||
long m_qtermNum;
|
||||
// the word position of this query term in the Words.h class
|
||||
long m_qpos;
|
||||
// the wikipedia phrase id if we start one
|
||||
long m_wikiPhraseId;
|
||||
// phrase id term or bigram is in
|
||||
long m_quotedStartId;
|
||||
};
|
||||
|
||||
|
||||
/*
|
||||
#include "RdbList.h"
|
||||
|
||||
@ -523,6 +559,8 @@ class PosdbTable {
|
||||
char *endi, char *endj,
|
||||
class DocIdScore *pdcs );
|
||||
|
||||
bool makeDocIdVoteBufForBoolQuery_r ( ) ;
|
||||
|
||||
// some generic stuff
|
||||
PosdbTable();
|
||||
~PosdbTable();
|
||||
@ -670,6 +708,13 @@ class PosdbTable {
|
||||
long m_minListi;
|
||||
// intersect docids from each QueryTermInfo into here
|
||||
SafeBuf m_docIdVoteBuf;
|
||||
|
||||
// boolean truth table for boolean queries
|
||||
HashTableX m_bt;
|
||||
HashTableX m_ct;
|
||||
// size of the data slot in m_bt
|
||||
long m_vecSize;
|
||||
|
||||
// are all positive query terms in same wikipedia phrase like
|
||||
// 'time enough for love'?
|
||||
bool m_allInSameWikiPhrase;
|
||||
|
535
Query.cpp
535
Query.cpp
@ -24,11 +24,11 @@ void Query::constructor ( ) {
|
||||
//m_bmap = NULL;
|
||||
m_bitScores = NULL;
|
||||
m_qwords = NULL;
|
||||
m_expressions = NULL;
|
||||
//m_expressions = NULL;
|
||||
m_qwordsAllocSize = 0;
|
||||
m_expressionsAllocSize = 0;
|
||||
//m_expressionsAllocSize = 0;
|
||||
m_qwords = NULL;
|
||||
m_expressions = NULL;
|
||||
//m_expressions = NULL;
|
||||
reset ( );
|
||||
}
|
||||
|
||||
@ -46,7 +46,7 @@ void Query::reset ( ) {
|
||||
m_bufLen = 0;
|
||||
m_origLen = 0;
|
||||
m_numWords = 0;
|
||||
m_numOperands = 0;
|
||||
//m_numOperands = 0;
|
||||
m_numTerms = 0;
|
||||
m_synTerm = 0;
|
||||
//m_numIgnored = 0;
|
||||
@ -60,14 +60,14 @@ void Query::reset ( ) {
|
||||
m_bitScores = NULL;
|
||||
//m_bmapSize = 0;
|
||||
m_bitScoresSize = 0;
|
||||
if ( m_expressionsAllocSize )
|
||||
mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
|
||||
//if ( m_expressionsAllocSize )
|
||||
// mfree ( m_expressions , m_expressionsAllocSize , "Query3" );
|
||||
if ( m_qwordsAllocSize )
|
||||
mfree ( m_qwords , m_qwordsAllocSize , "Query4" );
|
||||
m_expressionsAllocSize = 0;
|
||||
//m_expressionsAllocSize = 0;
|
||||
m_qwordsAllocSize = 0;
|
||||
m_qwords = NULL;
|
||||
m_expressions = NULL;
|
||||
//m_expressions = NULL;
|
||||
m_numExpressions = 0;
|
||||
m_gnext = m_gbuf;
|
||||
m_hasUOR = false;
|
||||
@ -149,7 +149,7 @@ bool Query::set2 ( char *query ,
|
||||
|
||||
char *q = query;
|
||||
// see if it should be boolean...
|
||||
for ( long i = 0 ; boolFlag && i < queryLen ; i++ ) {
|
||||
for ( long i = 0 ; i < queryLen ; i++ ) {
|
||||
if ( q[i]=='A' && q[i+1]=='N' && q[i+2]=='D' &&
|
||||
(q[i+3]==' ' || q[i+3]=='(') )
|
||||
boolFlag = 1;
|
||||
@ -343,8 +343,8 @@ bool Query::set2 ( char *query ,
|
||||
|
||||
// set m_expressions[] and m_operands[] arrays and m_numOperands
|
||||
// for boolean queries
|
||||
if ( m_isBoolean )
|
||||
if ( ! setBooleanOperands() ) return false;
|
||||
//if ( m_isBoolean )
|
||||
// if ( ! setBooleanOperands() ) return false;
|
||||
|
||||
// disable stuff for site:, ip: and url: queries
|
||||
for ( long i = 0 ; i < m_numWords ; i++ ) {
|
||||
@ -386,6 +386,17 @@ bool Query::set2 ( char *query ,
|
||||
break;
|
||||
}
|
||||
|
||||
// . keep it simple for now
|
||||
// . we limit to MAX_EXPRESSIONS to like 10 now i guess
|
||||
if ( m_isBoolean ) {
|
||||
m_numExpressions = 1;
|
||||
m_expressions[0].add ( 0 ,
|
||||
m_numWords ,
|
||||
this , // Query
|
||||
0 ); // level
|
||||
}
|
||||
|
||||
|
||||
// . if it is not truncated, no need to use hard counts
|
||||
// . comment this line and the next one out for testing hard counts
|
||||
if ( ! m_truncated ) return true;
|
||||
@ -450,16 +461,16 @@ bool Query::set2 ( char *query ,
|
||||
// "(nt=%li)",
|
||||
// m_numExplicitBits,m_numTerms-m_numExplicitBits,m_numTerms);
|
||||
|
||||
if ( ! m_isBoolean ) return true;
|
||||
//if ( ! m_isBoolean ) return true;
|
||||
|
||||
// free cuz it was already set
|
||||
if ( m_expressionsAllocSize )
|
||||
mfree(m_expressions,m_expressionsAllocSize , "Query" );
|
||||
m_expressionsAllocSize = 0;
|
||||
m_expressions = NULL;
|
||||
//if ( m_expressionsAllocSize )
|
||||
// mfree(m_expressions,m_expressionsAllocSize , "Query" );
|
||||
//m_expressionsAllocSize = 0;
|
||||
//m_expressions = NULL;
|
||||
|
||||
// also set the boolean stuff again too!
|
||||
if ( ! setBooleanOperands() ) return false;
|
||||
//if ( ! setBooleanOperands() ) return false;
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -498,7 +509,6 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
long max = (long)MAX_EXPLICIT_BITS;
|
||||
if ( max > m_maxQueryTerms ) max = m_maxQueryTerms;
|
||||
//char u8Buf[256];
|
||||
|
||||
for ( long i = 0 ; i < m_numWords && n < MAX_QUERY_TERMS ; i++ ) {
|
||||
// break out if no more explicit bits!
|
||||
/*
|
||||
@ -617,7 +627,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// doh! gotta reset to 0
|
||||
qt->m_implicitBits = 0;
|
||||
// assume not under a NOT bool op
|
||||
qt->m_underNOT = false;
|
||||
//qt->m_underNOT = false;
|
||||
// assign score weight, we're a phrase here
|
||||
qt->m_userWeight = qw->m_userWeightPhrase ;
|
||||
qt->m_userType = qw->m_userTypePhrase ;
|
||||
@ -819,7 +829,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// break;
|
||||
// }
|
||||
// assume not under a NOT bool op
|
||||
qt->m_underNOT = false;
|
||||
//qt->m_underNOT = false;
|
||||
// assign score weight, we're a phrase here
|
||||
qt->m_userWeight = qw->m_userWeight ;
|
||||
qt->m_userType = qw->m_userType ;
|
||||
@ -1162,7 +1172,8 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
if ( qw->m_wordSign == '+' ) continue;
|
||||
// no url: stuff, maybe only title
|
||||
if ( qw->m_fieldCode &&
|
||||
qw->m_fieldCode != FIELD_TITLE )
|
||||
qw->m_fieldCode != FIELD_TITLE &&
|
||||
qw->m_fieldCode != FIELD_GENERIC )
|
||||
continue;
|
||||
// skip if ignored like a stopword (stop to->too)
|
||||
//if ( qw->m_ignoreWord ) continue;
|
||||
@ -1232,8 +1243,14 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// stop word? no, we're a phrase term
|
||||
qt->m_isQueryStopWord = qw->m_isQueryStopWord;
|
||||
// change in both places
|
||||
qt->m_termId = syn.m_aids[j] & TERMID_MASK;
|
||||
m_termIds[n] = syn.m_aids[j] & TERMID_MASK;
|
||||
long long wid = syn.m_aids[j];
|
||||
// might be in a title: field or something
|
||||
if ( qw->m_prefixHash ) {
|
||||
long long ph = qw->m_prefixHash;
|
||||
wid= hash64h(wid,ph);
|
||||
}
|
||||
qt->m_termId = wid & TERMID_MASK;
|
||||
m_termIds[n] = wid & TERMID_MASK;
|
||||
qt->m_rawTermId = syn.m_aids[j];
|
||||
// assume explicit bit is 0
|
||||
qt->m_explicitBit = 0;
|
||||
@ -1265,7 +1282,7 @@ bool Query::setQTerms ( Words &words , Phrases &phrases ) {
|
||||
// reset our implicit bits to 0
|
||||
qt->m_implicitBits = 0;
|
||||
// assume not under a NOT bool op
|
||||
qt->m_underNOT = false;
|
||||
//qt->m_underNOT = false;
|
||||
// assign score weight, we're a phrase here
|
||||
qt->m_userWeight = qw->m_userWeight ;
|
||||
qt->m_userType = qw->m_userType ;
|
||||
@ -1902,7 +1919,7 @@ bool Query::setQWords ( char boolFlag ,
|
||||
// assume QueryWord is ignored by default
|
||||
qw->m_ignoreWord = IGNORE_DEFAULT;
|
||||
qw->m_ignorePhrase = IGNORE_DEFAULT;
|
||||
|
||||
qw->m_wordNum = i;
|
||||
// get word as a string
|
||||
//char *w = words.getWord(i);
|
||||
//long wlen = words.getWordLen(i);
|
||||
@ -3308,24 +3325,24 @@ void Query::printQueryTerms(){
|
||||
////////// ONLY BOOLEAN STUFF BELOW HERE /////////////
|
||||
////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////
|
||||
bool Query::testBoolean(qvec_t bits, qvec_t bitmask){
|
||||
bool Query::testBoolean( unsigned char *bits ,long vecSize){//qvec_t bitmask){
|
||||
if (!m_isBoolean) return false;
|
||||
Expression *e = &m_expressions [ 0 ];
|
||||
// find top-level expression
|
||||
while (e->m_parent && e != e->m_parent) e = e->m_parent;
|
||||
return e->isTruth(bits, bitmask);
|
||||
//while (e->m_parent && e != e->m_parent) e = e->m_parent;
|
||||
return e->isTruth(bits,vecSize);//, bitmask);
|
||||
|
||||
}
|
||||
void Query::printBooleanTree(){
|
||||
if (!m_isBoolean) return;
|
||||
Expression *e = &m_expressions [ 0 ];
|
||||
//Expression *e = &m_expressions [ 0 ];
|
||||
// find top-level expression
|
||||
while (e->m_parent && e != e->m_parent) e = e->m_parent;
|
||||
SafeBuf sbuf(1024);
|
||||
e->print(&sbuf);
|
||||
logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
|
||||
//while (e->m_parent && e != e->m_parent) e = e->m_parent;
|
||||
//SafeBuf sbuf(1024,"botree");
|
||||
//e->print(&sbuf);
|
||||
//logf(LOG_DEBUG, "query: Boolean Query: %s", sbuf.getBufStart());
|
||||
}
|
||||
|
||||
/*
|
||||
// . also sets the m_underNOT member of each QueryTerm, too!!
|
||||
// . returns false and sets g_errno on error, true otherwise
|
||||
bool Query::setBooleanOperands ( ) {
|
||||
@ -3338,6 +3355,20 @@ bool Query::setBooleanOperands ( ) {
|
||||
"exceeded (%ld).",m_numTerms);
|
||||
}
|
||||
|
||||
// set the QueryWord::m_opBit member of each query word.
|
||||
// so if you have a query like 'A B OR C' then you need
|
||||
// to have both A and B if you don't have C. so every word
|
||||
// unless its an operator needs its own bit. quoted phrases
|
||||
// may present a problem down the road we'll have to deal with.
|
||||
long opNum = 0;
|
||||
for ( long i = 0 ; i < m_numWords ; i++ ) {
|
||||
// skip if field, opcode, punct. etc.
|
||||
if ( m_qwords[i].m_ignoreWord ) continue;
|
||||
// assign it a # i guess
|
||||
m_qwords[i].m_opNum = opNum++;
|
||||
}
|
||||
|
||||
|
||||
// alloc the mem if we need to (mdw left off here)
|
||||
//long need = (m_numWords/3) * sizeof(Expression);
|
||||
// illegitmate bool expressions breech the buffer
|
||||
@ -3367,14 +3398,11 @@ bool Query::setBooleanOperands ( ) {
|
||||
// . set the expression recursively
|
||||
// . just setting this will not set the m_hasNOT members of each
|
||||
// QueryTerm
|
||||
long status = e->set ( 0 , // first word #
|
||||
m_numWords , // last word #
|
||||
0 , // parser position
|
||||
this , // array of QueryWords
|
||||
0 ,// level
|
||||
NULL, NULL, // parent, leftchild
|
||||
false , // has NOT?
|
||||
false ); // under NOT?
|
||||
long status = e->add ( 0 , // first word #
|
||||
m_numWords , // last word #
|
||||
this , // array of QueryWords
|
||||
0 ,// level
|
||||
false ); // has NOT?
|
||||
if ( status < 0 ) {
|
||||
g_errno = ETOOMANYOPERANDS;
|
||||
return log("query: Maximum number of bool operands "
|
||||
@ -3399,6 +3427,8 @@ bool Query::setBooleanOperands ( ) {
|
||||
|
||||
// . get all the terms that are UNDER a NOT operator in some fashion
|
||||
// . these bits are 1-1 with m_qterms[]
|
||||
*/
|
||||
/*
|
||||
qvec_t notBits = e->getNOTBits( false );
|
||||
for ( long i = 0 ; i < m_numTerms ; i++ ) {
|
||||
if ( m_qterms[i].m_explicitBit & notBits )
|
||||
@ -3406,15 +3436,20 @@ bool Query::setBooleanOperands ( ) {
|
||||
else
|
||||
m_qterms[i].m_underNOT = false;
|
||||
}
|
||||
*/
|
||||
/*
|
||||
return true;
|
||||
}
|
||||
|
||||
*/
|
||||
/*
|
||||
// . returns -1 on bad query error
|
||||
// . returns word AFTER the last word in our operand
|
||||
long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
bool underNOT ) {
|
||||
// clear these
|
||||
m_termBits = 0;
|
||||
//m_termBits = 0;
|
||||
memset(m_opBits,0,MAX_OVEC_SIZE);
|
||||
|
||||
m_hasNOT = false;
|
||||
|
||||
//m_hardRequiredBits = 0;
|
||||
@ -3429,7 +3464,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
// set the parenthetical level of the word
|
||||
qw->m_level = level;
|
||||
// set this
|
||||
qw->m_underNOT = underNOT;
|
||||
//qw->m_underNOT = underNOT;
|
||||
// skip punct
|
||||
if ( ! qw->isAlphaWord() ) {
|
||||
// if it is a parens, bail!
|
||||
@ -3459,9 +3494,12 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
// query is too long
|
||||
if ( qw->m_phraseId && qw->m_queryPhraseTerm &&
|
||||
qw->m_phraseSign ) {
|
||||
qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
|
||||
//qvec_t e =qw->m_queryPhraseTerm->m_explicitBit;
|
||||
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
|
||||
m_termBits |= e;
|
||||
//m_termBits |= e;
|
||||
long byte = qw->m_opNum / 8;
|
||||
long mask = 1<<(qw->m_opNum % 8);
|
||||
if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
|
||||
}
|
||||
// why would it be ignored? oh... if like cd-rom or in quotes
|
||||
if ( qw->m_ignoreWord ) continue;
|
||||
@ -3469,13 +3507,17 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
// . might be a word that's not a QueryTerm because
|
||||
// query is too long
|
||||
if ( qw->m_queryWordTerm ) {
|
||||
qvec_t e = qw->m_queryWordTerm->m_explicitBit;
|
||||
//qvec_t e = qw->m_queryWordTerm->m_explicitBit;
|
||||
//if (qw->m_phraseSign == '+') m_hardRequiredBits |= e;
|
||||
m_termBits |= e;
|
||||
//m_termBits |= e;
|
||||
long byte = qw->m_opNum / 8;
|
||||
long mask = 1<<(qw->m_opNum % 8);
|
||||
if ( byte < MAX_OVEC_SIZE ) m_opBits[byte] |= mask;
|
||||
}
|
||||
}
|
||||
return b;
|
||||
}
|
||||
*/
|
||||
|
||||
// . returns -1 on bad query error
|
||||
// . returns next word to parse (after expression) on success
|
||||
@ -3485,6 +3527,7 @@ long Operand::set ( long a , long b , QueryWord *qwords , long level ,
|
||||
// . new: organize query into sum of products normal form, ie:
|
||||
// . (a) OR (b AND c AND d) OR (e AND f)
|
||||
|
||||
/*
|
||||
unsigned char precedence[] = {
|
||||
0, // term
|
||||
4, // OR
|
||||
@ -3495,238 +3538,214 @@ unsigned char precedence[] = {
|
||||
3, // UOR
|
||||
5, // PIPE
|
||||
};
|
||||
*/
|
||||
|
||||
long Expression::set (long start,
|
||||
long end,
|
||||
long pos, // current parsing position
|
||||
class Query *q,
|
||||
long level,
|
||||
class Expression *parent,
|
||||
class Expression *leftChild,
|
||||
bool hasNOT ,
|
||||
bool underNOT ) {
|
||||
m_start = start;
|
||||
m_end = end;
|
||||
m_opcode = 0;
|
||||
m_operand = NULL;
|
||||
m_numChildren = 0;
|
||||
m_hasNOT = hasNOT;
|
||||
m_parent = parent;
|
||||
uint8_t curOp = 0;
|
||||
//#define TYPE_OPERAND 1
|
||||
//#define TYPE_OPCODE 2
|
||||
//#define TYPE_EXPRESSION 3
|
||||
|
||||
QueryWord *qwords = q->m_qwords;
|
||||
Expression *o_expressions = q->m_expressions;
|
||||
Operand *o_operands = q->m_operands;
|
||||
long *o_numOperands = &q->m_numOperands;
|
||||
long *o_numExpressions = &q->m_numExpressions;
|
||||
long maxExpressions = q->m_numWords;
|
||||
|
||||
|
||||
// Lets really try to catch this
|
||||
if (m_parent == this) {
|
||||
//log(LOG_WARN, "query: Warning, setting expression "
|
||||
// "parent to self");
|
||||
char *xx = NULL; *xx = 0;
|
||||
}
|
||||
// return -1 and set g_errno on error
|
||||
// returns how many words expression was
|
||||
bool Expression::add (long start,
|
||||
long end,
|
||||
class Query *q,
|
||||
long level
|
||||
) {
|
||||
|
||||
if ( level >= MAX_EXPRESSIONS ) { g_errno = EBADENGINEER; return -1;}
|
||||
|
||||
// the # of the first alnumpunct word in the expression
|
||||
m_expressionStartWord = start;
|
||||
// and the last one
|
||||
//m_end = end;
|
||||
//m_hasNOT = hasNOT;
|
||||
m_q = q;
|
||||
|
||||
//m_cc = 0;
|
||||
|
||||
long i = m_expressionStartWord;
|
||||
|
||||
// "start" is the current alnumpunct word we are parsing out
|
||||
for ( ; i<end ; i++ ) {
|
||||
|
||||
QueryWord *qwords = q->m_qwords;
|
||||
|
||||
//set initial args
|
||||
if (leftChild) {
|
||||
leftChild->m_parent = this;
|
||||
m_children[0] = leftChild;
|
||||
m_numChildren = 1;
|
||||
}
|
||||
hasNOT = false;
|
||||
for ( long i=pos ; i<end ; i++ ){
|
||||
QueryWord * qw = &qwords[i];
|
||||
// set this
|
||||
qw->m_underNOT = underNOT;
|
||||
// set leaf node
|
||||
if (!qw->m_opcode && qw->isAlphaWord()){
|
||||
if (i > m_start) goto setChildExpr;
|
||||
// if we maxxed out, error out
|
||||
if ( *o_numOperands >= MAX_OPERANDS ) return -1;
|
||||
Operand *op = &o_operands [ *o_numOperands ];
|
||||
*o_numOperands = *o_numOperands + 1;
|
||||
// . return ptr to next word for us to parse
|
||||
// . subtract once since for loop will inc it
|
||||
i = op->set ( i , end , qwords , level , underNOT );
|
||||
if ( i < 0 ) return -1;
|
||||
m_operand = op;
|
||||
goto endExpr;
|
||||
//qw->m_underNOT = underNOT;
|
||||
|
||||
// set leaf node if not an opcode like "AND" and not punct.
|
||||
if ( ! qw->m_opcode && qw->isAlphaWord()){
|
||||
//m_opSlots[m_cc] = i;
|
||||
//m_opTypes[m_cc] = TYPE_OPERAND;
|
||||
//qw->m_opBitNum = m_cc;
|
||||
continue;//goto endExpr; mdw
|
||||
}
|
||||
if (qw->m_opcode == OP_NOT){
|
||||
hasNOT = !hasNOT;
|
||||
underNOT = hasNOT;
|
||||
//hasNOT = !hasNOT;
|
||||
//underNOT = hasNOT;
|
||||
continue;
|
||||
}
|
||||
else if (qw->m_opcode == OP_LEFTPAREN){
|
||||
if (i == m_start) i++;
|
||||
goto setChildExpr;
|
||||
// this is expression
|
||||
// . it should advance "i" to end of expression
|
||||
// point to next...
|
||||
q->m_numExpressions++;
|
||||
// make a new one:
|
||||
Expression *e=&q->m_expressions[q->m_numExpressions-1];
|
||||
// now set it
|
||||
e->add ( i+1, // skip over (
|
||||
end ,
|
||||
q ,
|
||||
level + 1);
|
||||
// skip over it. pt to ')'
|
||||
i += e->m_numWordsInExpression;
|
||||
qw->m_expressionPtr = e;
|
||||
//m_opSlots[m_cc] = (long)e;
|
||||
//m_opTypes[m_cc] = TYPE_EXPRESSION;
|
||||
//qw->m_opBitNum = m_cc;
|
||||
}
|
||||
else if (qw->m_opcode == OP_RIGHTPAREN){
|
||||
goto endExpr;
|
||||
}
|
||||
else if (qw->m_opcode) {
|
||||
int delta = 0;
|
||||
curOp = qw->m_opcode;
|
||||
if (m_numChildren == 1)
|
||||
m_opcode = curOp;
|
||||
|
||||
if (m_numChildren > 1 && curOp != m_opcode) {
|
||||
|
||||
delta = (int)precedence[curOp] -
|
||||
(int)precedence[m_opcode];
|
||||
}
|
||||
|
||||
if (delta > 0){
|
||||
goto endExpr;
|
||||
}
|
||||
if (delta < 0){
|
||||
// set a subexpression conataining the
|
||||
// last operand we found as the first
|
||||
goto setChildExpr2;
|
||||
}
|
||||
}
|
||||
continue;
|
||||
endExpr:
|
||||
//log(LOG_DEBUG, "query: set Expr [%ld, %ld), opcode: %d",
|
||||
// a, i, curOp);
|
||||
// if we've matched parens, go to next word
|
||||
// but if we have an extra right paren, don't crash
|
||||
if (qw->m_opcode == OP_RIGHTPAREN &&
|
||||
(qwords[m_start].m_opcode == OP_LEFTPAREN ||
|
||||
m_start == 0))
|
||||
i++;
|
||||
|
||||
m_end = i;
|
||||
// We have an extra open paren
|
||||
if (qwords[m_start].m_opcode == OP_LEFTPAREN &&
|
||||
qw->m_opcode != OP_RIGHTPAREN)
|
||||
goto setParentExpr;
|
||||
// we are top-level expr, but there is more to parse
|
||||
if (!m_parent && i < end-1)
|
||||
goto setParentExpr;
|
||||
// just return
|
||||
return i;
|
||||
// add a parent expression with this one as the left child
|
||||
setParentExpr:
|
||||
{
|
||||
if ( *o_numExpressions >= maxExpressions ) return -1;
|
||||
//if (qw->m_opcode == OP_RIGHTPAREN) i++;
|
||||
Expression *e = &o_expressions[*o_numExpressions];
|
||||
*o_numExpressions = *o_numExpressions + 1;
|
||||
i = e->set ( m_start , end ,i, q ,
|
||||
level+1,
|
||||
m_parent,
|
||||
this,
|
||||
false ,
|
||||
underNOT ) ;
|
||||
// return size i guess, include )
|
||||
m_numWordsInExpression = i - m_expressionStartWord+1;
|
||||
return i;
|
||||
}
|
||||
else if (qw->m_opcode) {
|
||||
// add that mdw
|
||||
//m_opSlots[m_cc] = qw->m_opcode;
|
||||
//m_opTypes[m_cc] = TYPE_OPCODE;
|
||||
//qw->m_opBitNum = m_cc;
|
||||
//m_cc++;
|
||||
continue;
|
||||
}
|
||||
// white space?
|
||||
continue;
|
||||
}
|
||||
|
||||
// add a child expression
|
||||
setChildExpr:
|
||||
{
|
||||
if ( *o_numExpressions >= maxExpressions ) return -1;
|
||||
|
||||
Expression *e = &o_expressions[*o_numExpressions];
|
||||
*o_numExpressions = *o_numExpressions + 1;
|
||||
i = e->set ( i , end , i, q ,
|
||||
level+1,
|
||||
this, NULL, hasNOT ,
|
||||
underNOT ) -1;
|
||||
if ( i < 0 ) return -1;
|
||||
|
||||
// trim needless parens
|
||||
while (e->m_numChildren == 1) {
|
||||
hasNOT = e->m_hasNOT;
|
||||
e = e->m_children[0];
|
||||
if (hasNOT) e->m_hasNOT = ! e->m_hasNOT;
|
||||
m_numWordsInExpression = i - m_expressionStartWord;
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// each bit is 1-1 with the explicit terms in the boolean query
|
||||
// . returns whether the first expression, m_expressions[0], evaluates to
//   true for the given bit vector
// . "bitVec" and "vecSize" are passed through unchanged to
//   Expression::isTruth()
bool Query::matchesBoolQuery ( unsigned char *bitVec , long vecSize ) {
|
||||
return m_expressions[0].isTruth ( bitVec , vecSize );
|
||||
}
|
||||
|
||||
|
||||
// . returns true if bit number "opBitNum" is set in the bit vector "bitVec"
// . bits are packed 8 per byte, lowest bit first: bit #n lives in byte n/8
//   under the mask 1<<(n%8)
// . "vecSize" is the number of bytes in "bitVec"
// . a bit number that is negative or past the end of the vector is a caller
//   bug, so we force a core dump instead of reading outside the vector
bool isBitNumSet ( long opBitNum, unsigned char *bitVec, long vecSize ) {
	// sanity check. a negative opBitNum would make the shift count below
	// negative (undefined) and, for values <= -8, would index before the
	// start of bitVec, so treat it like any other out-of-range bit number
	if ( opBitNum < 0 || opBitNum / 8 >= vecSize ) {
		char *xx=NULL;*xx=0; }
	long byte = opBitNum / 8;
	long mask = 1<<(opBitNum % 8);
	// compare explicitly rather than relying on int-to-bool narrowing
	return ( bitVec[byte] & mask ) != 0;
}
|
||||
|
||||
// . "bits" are 1-1 with the query words in Query::m_qwords[] array
|
||||
// including ignored words and spaces i guess since Expression::add()
|
||||
// seems to do that.
|
||||
bool Expression::isTruth ( unsigned char *bitVec ,long vecSize ) {
|
||||
|
||||
//
|
||||
// operand1 operand2 operator1 operand3 operator2 ....
|
||||
//
|
||||
|
||||
// result: -1 means unknown at this point
|
||||
long result = -1;
|
||||
|
||||
char prevOpCode = 0;
|
||||
long prevResult ;
|
||||
// result of current operand
|
||||
long opResult = -1;
|
||||
|
||||
long i = m_expressionStartWord;
|
||||
long iend = i + m_numWordsInExpression;
|
||||
|
||||
bool hasNot = false;
|
||||
|
||||
for ( ; i < iend ; i++ ) {
|
||||
|
||||
QueryWord *qw = &m_q->m_qwords[i];
|
||||
|
||||
if ( qw->m_opcode == OP_NOT ) {
|
||||
hasNot = true;
|
||||
continue;
|
||||
}
|
||||
|
||||
|
||||
// so operands are expressions as well
|
||||
Expression *e = (Expression *)qw->m_expressionPtr;
|
||||
if ( e ) {
|
||||
// save prev one. -1 means no prev.
|
||||
prevResult = opResult;
|
||||
// set new one
|
||||
opResult = e->isTruth ( bitVec , vecSize );
|
||||
// skip over that expression. point to ')'
|
||||
i += e->m_numWordsInExpression;
|
||||
// flip?
|
||||
if ( hasNot ) {
|
||||
if ( opResult == 1 ) opResult = 0;
|
||||
else opResult = 1;
|
||||
hasNot = false;
|
||||
}
|
||||
hasNOT = false;
|
||||
//cull empty expressions
|
||||
if (e->m_numChildren < 1 &&
|
||||
e->m_operand == NULL) continue;
|
||||
}
|
||||
|
||||
if (m_numChildren >= MAX_OPERANDS) return -1;
|
||||
// add good expressions
|
||||
m_children [ m_numChildren] = e;
|
||||
m_numChildren++;
|
||||
if (m_numChildren > 1 && m_opcode == 0)
|
||||
m_opcode = OP_AND; // default AND
|
||||
if ( qw->m_opcode && ! e ) {
|
||||
prevOpCode = qw->m_opcode;//m_opSlots[i];
|
||||
continue;
|
||||
}
|
||||
|
||||
// we need to make the last operand we passed
|
||||
// be the first operand of a subexpression
|
||||
setChildExpr2:
|
||||
{
|
||||
// remove the last expression from our list
|
||||
Expression *ce = m_children[m_numChildren-1];
|
||||
// simple operand
|
||||
if ( ! qw->m_opcode && ! e ) {
|
||||
// for regular word operands
|
||||
// ignore it like a space?
|
||||
if ( qw->m_ignoreWord ) continue;
|
||||
// save old one
|
||||
prevResult = opResult;
|
||||
// convert word to term #
|
||||
QueryTerm *qt = qw->m_queryWordTerm;
|
||||
if ( ! qt ) continue;
|
||||
// . m_bitNum is set in Posdb.cpp when it sets its
|
||||
// QueryTermInfo array
|
||||
// . it is basically the query term #
|
||||
// . see iff that bit is set in this docid's vec
|
||||
opResult = isBitNumSet ( qt->m_bitNum,bitVec,vecSize );
|
||||
// flip?
|
||||
if ( hasNot ) {
|
||||
if ( opResult == 1 ) opResult = 0;
|
||||
else opResult = 1;
|
||||
hasNot = false;
|
||||
}
|
||||
}
|
||||
|
||||
m_numChildren--;
|
||||
// need two to tango. i.e. (true OR false)
|
||||
if ( prevResult == -1 ) continue;
|
||||
|
||||
|
||||
if ( *o_numExpressions >= maxExpressions ) return -1;
|
||||
|
||||
Expression *e = &o_expressions[*o_numExpressions];
|
||||
*o_numExpressions = *o_numExpressions + 1;
|
||||
i = e->set ( ce->m_start , end , i, q ,
|
||||
level+1,
|
||||
this, ce,
|
||||
false ,
|
||||
underNOT ) -1;
|
||||
ce->m_parent = e;
|
||||
if ( i < 0 ) return -1;
|
||||
|
||||
if (m_numChildren >= MAX_OPERANDS) return -1;
|
||||
m_children [ m_numChildren ] = e;
|
||||
|
||||
hasNOT = false;
|
||||
m_numChildren++;
|
||||
continue;
|
||||
// if this is not the first time... we got two
|
||||
if ( prevOpCode == OP_AND ) {
|
||||
// if the first operation we encounter is A AND B then
|
||||
// default result to on. only allow an AND operation
|
||||
// to turn it off.
|
||||
if ( result == -1 ) result = true;
|
||||
if ( ! prevResult ) result = false;
|
||||
if ( ! opResult ) result = false;
|
||||
}
|
||||
else if ( prevOpCode == OP_OR ) {
|
||||
// if the first operation we encounter is A OR B then
|
||||
// default result to off
|
||||
if ( result == -1 ) result = false;
|
||||
if ( prevResult ) result = true;
|
||||
if ( opResult ) result = true;
|
||||
}
|
||||
}
|
||||
return end;
|
||||
}
|
||||
|
||||
|
||||
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
|
||||
bool Expression::isTruth ( qvec_t bits, qvec_t mask ) {
|
||||
//bool op1 = false ; // set to false so compiler shuts up
|
||||
//bool op2 ;
|
||||
//bool accumulator = false;
|
||||
//bool hadOR = false;
|
||||
bool result = false;
|
||||
|
||||
// leaf node
|
||||
if (m_operand){
|
||||
result = m_operand->isTruth(bits, mask);
|
||||
// handle masked terms better.. don't apply NOT operator
|
||||
if (!(m_operand->m_termBits & mask)) return true;
|
||||
}
|
||||
else if (m_numChildren == 1){
|
||||
result = m_children[0]->isTruth(bits, mask);
|
||||
}
|
||||
else if (m_opcode == OP_OR || m_opcode == OP_UOR) {
|
||||
for ( long i=0 ; i<m_numChildren ; i++ ) {
|
||||
result = result || m_children[i]->isTruth(bits, mask);
|
||||
if (result) goto done;
|
||||
}
|
||||
}
|
||||
else if (m_opcode == OP_AND || m_opcode == OP_PIPE){
|
||||
result = true;
|
||||
for (long i = 0 ; i < m_numChildren ; i++ ) {
|
||||
result = result && m_children[i]->isTruth(bits, mask);
|
||||
if (!result) goto done;
|
||||
}
|
||||
}
|
||||
|
||||
done :
|
||||
if (m_hasNOT) return !result;
|
||||
else return result;
|
||||
|
||||
if ( result == -1 ) return true;
|
||||
if ( result == 0 ) return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
|
||||
// . hasNOT is true if there's a NOT just to the left of this WHOLE expressions
|
||||
// outside the parens
|
||||
@ -3744,9 +3763,11 @@ qvec_t Expression::getNOTBits ( bool hasNOT ) {
|
||||
// success, all operand pairs were true
|
||||
return notBits;
|
||||
}
|
||||
*/
|
||||
|
||||
// print boolean expression for debug purposes
|
||||
void Expression::print(SafeBuf *sbuf) {
|
||||
/*
|
||||
if (m_hasNOT) sbuf->safePrintf("NOT ");
|
||||
if (m_operand){
|
||||
m_operand->print(sbuf);
|
||||
@ -3765,16 +3786,18 @@ void Expression::print(SafeBuf *sbuf) {
|
||||
}
|
||||
}
|
||||
sbuf->safePrintf(")");
|
||||
|
||||
*/
|
||||
}
|
||||
|
||||
/*
|
||||
void Operand::print(SafeBuf *sbuf) {
|
||||
// long shift = 0;
|
||||
// while (m_termBits >> shift) shift++;
|
||||
// sbuf->safePrintf("%i", 1<<(shift-1));
|
||||
if (m_hasNOT) sbuf->safePrintf("NOT 0x%lx", (long)m_termBits);
|
||||
else sbuf->safePrintf("0x%lx", (long)m_termBits);
|
||||
if (m_hasNOT) sbuf->safePrintf("NOT 0x%llx",*(long long *)m_opBits);
|
||||
else sbuf->safePrintf("0x%llx", *(long long *)m_opBits);
|
||||
}
|
||||
*/
|
||||
|
||||
// if any one query term is split, msg3a has to split the query
|
||||
bool Query::isSplit() {
|
||||
|
154
Query.h
154
Query.h
@ -49,6 +49,8 @@ typedef unsigned long long qvec_t;
|
||||
|
||||
#define MAX_EXPLICIT_BITS (sizeof(qvec_t)*8)
|
||||
|
||||
#define MAX_OVEC_SIZE 256
|
||||
|
||||
// only can use 16-bit since have to make a 64k truth table!
|
||||
#define MAX_EXPLICIT_BITS_BOOLEAN (16*8)
|
||||
|
||||
@ -166,6 +168,7 @@ extern struct QueryField g_fields[];
|
||||
////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////
|
||||
|
||||
/*
|
||||
// . creating a QueryBoolean class was unnecessary since it was only functional
|
||||
// and had nothing new it would store that the Query class doesn't store
|
||||
// . the entry point is the Query::setBitScoresBoolean() function below
|
||||
@ -181,76 +184,46 @@ public:
|
||||
long set ( long a , long b , class QueryWord *qwords , long level ,
|
||||
bool underNOT ) ;
|
||||
// . "bits" are 1-1 with the query terms in Query::m_qterms[] array
|
||||
// . Operand::m_termBits is the required bits for operand to be true
|
||||
// . Operand::m_opBits is the required bits for operand to be true
|
||||
// . does not include signless phrases
|
||||
bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
|
||||
//bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) {
|
||||
bool isTruth ( unsigned char *bitVec , long vecSize ) {
|
||||
// must always satisfy hard required terms (+ sign)
|
||||
//if ( (bits & m_forcedBits) != m_forcedBits )
|
||||
// return false;
|
||||
if (m_hasNOT) return (bits & m_termBits & mask) == 0;
|
||||
return ( (bits & m_termBits & mask) == (m_termBits & mask));
|
||||
//if (m_hasNOT) return (bits & m_opBits & mask) == 0;
|
||||
//return ( (bits & m_opBits & mask) == (m_opBits & mask));
|
||||
if ( m_hasNOT ) {
|
||||
for ( long i = 0 ; i < vecSize ; i++ )
|
||||
if ( m_opBits[i] & bitVec[i] ) return false;
|
||||
return true;
|
||||
}
|
||||
for ( long i = 0 ; i < vecSize ; i++ )
|
||||
if ( m_opBits[i] & bitVec[i] ) return true;
|
||||
return false;
|
||||
// . we are now back to good ol' default OR
|
||||
// . m_termBits should have been masked with
|
||||
// . m_opBits should have been masked with
|
||||
// m_requiredBits so as not to include signless phrases
|
||||
//return ( (bits & m_termBits) != 0 );
|
||||
//return ( (bits & m_opBits) != 0 );
|
||||
};
|
||||
void print (SafeBuf *sbuf);
|
||||
// we are a sequence of QueryWords
|
||||
//long m_startWordNum;
|
||||
//long m_lastWordNum;
|
||||
// . we treat the required term bits of those words as one unit (ANDed)
|
||||
// . unsigned phrases are not included in these term bits
|
||||
// . doc just needs one of these bits for this op to be considered true
|
||||
qvec_t m_termBits;
|
||||
// . terms under the same QueryTermInfo class should have the same
|
||||
// termbit here
|
||||
unsigned char m_opBits[MAX_OVEC_SIZE];
|
||||
//long m_vecSize;
|
||||
// does the word NOT precede the operand?
|
||||
bool m_hasNOT;
|
||||
class Expression *m_parent;
|
||||
//class Expression *m_parent;
|
||||
|
||||
// we MUST have these for this OPERAND to be true
|
||||
//unsigned short m_forcedBits;
|
||||
};
|
||||
*/
|
||||
|
||||
// operand1 AND operand2 OR ...
|
||||
// operand1 OR operand2 AND ...
|
||||
class Expression {
|
||||
public:
|
||||
long set (long start,
|
||||
long end,
|
||||
long pos, // current parsing position
|
||||
class Query *q,
|
||||
long level,
|
||||
class Expression *parent,
|
||||
class Expression *leftChild,
|
||||
bool hasNOT ,
|
||||
bool underNOT );
|
||||
|
||||
bool isTruth ( qvec_t bits, qvec_t mask=(qvec_t)-1 ) ;
|
||||
// . what QueryTerms are UNDER the influence of the NOT opcode?
|
||||
// . we read in the WHOLE termlist of those that are (like '-' sign)
|
||||
// . returned bit vector is 1-1 with m_qterms in Query class
|
||||
qvec_t getNOTBits ( bool hasNOT );
|
||||
void print (SafeBuf *sbuf);
|
||||
// . a list of operands separated by op codes (a AND b OR c ...)
|
||||
// . sometimes an operand is another expression: a AND (b OR c)
|
||||
// . use NULL in m_operands slot if we got an expression and vice versa
|
||||
// . m_opcodes[i] is the opcode after operand #i
|
||||
class Expression *m_parent;
|
||||
//class Operand *m_operands [ MAX_OPERANDS ];
|
||||
class Expression *m_children [ MAX_OPERANDS ];
|
||||
//char m_opcodes [ MAX_OPERANDS ];
|
||||
//long m_numOperands;
|
||||
// now expressions can have either child expressions or 1 operand
|
||||
long m_numChildren;
|
||||
// do we have a NOT operator before operand #i?
|
||||
//bool m_hasNOT [ MAX_OPERANDS ];
|
||||
// only one opcode, operand, hasNOT per expression now
|
||||
uint8_t m_opcode;
|
||||
class Operand *m_operand;
|
||||
bool m_hasNOT;
|
||||
// needed for nesting
|
||||
long m_start;
|
||||
long m_end;
|
||||
|
||||
};
|
||||
|
||||
////////////////////////////////////////////////////////
|
||||
////////////////////////////////////////////////////////
|
||||
@ -292,7 +265,7 @@ class QueryWord {
|
||||
long long m_phraseId;
|
||||
// hash of field name then collection, used to hash termId
|
||||
long long m_prefixHash;
|
||||
|
||||
long m_wordNum;
|
||||
// are we in a phrase in a wikipedia title?
|
||||
long m_wikiPhraseId;
|
||||
long m_wikiPhraseStart;
|
||||
@ -373,6 +346,10 @@ class QueryWord {
|
||||
float m_float;
|
||||
// for gbminint:99 etc. uses integers instead of floats for better res
|
||||
long m_int;
|
||||
// what operand bit # is it for doing boolean queries?
|
||||
//long m_opBitNum;
|
||||
// when an operand is an expression...
|
||||
class Expression *m_expressionPtr;
|
||||
};
|
||||
|
||||
// . we filter the QueryWords and turn them into QueryTerms
|
||||
@ -415,6 +392,13 @@ class QueryTerm {
|
||||
// expressions) and just use a hardCount to see how many hard required
|
||||
// terms are contained by a document. see IndexTable.cpp "hardCount"
|
||||
char m_hardCount;
|
||||
|
||||
// the "number" of the query term used for evaluating boolean
|
||||
// expressions in Expression::isTruth(). Basically just the
|
||||
// QueryTermInfo to which this query term belongs. each QueryTermInfo
|
||||
// is like a single query term and all its synonyms, etc.
|
||||
long m_bitNum;
|
||||
|
||||
// point to term, either m_word or m_phrase
|
||||
char *m_term;
|
||||
long m_termLen;
|
||||
@ -485,6 +469,14 @@ class QueryTerm {
|
||||
// we can be in? uses -1 to indicate none.
|
||||
long m_leftPhraseTermNum;
|
||||
long m_rightPhraseTermNum;
|
||||
// . what operand # are we a part of in a boolean query?
|
||||
// . like for (x AND y) x would have an opNum of 0 and y an
|
||||
// opNum of 1 for instance.
|
||||
// . for things like (x1 OR x2 OR x3 ... ) we try to give all
|
||||
// those query terms the same m_opNum for efficiency since
|
||||
// they all have the same effect
|
||||
//long m_opNum;
|
||||
|
||||
// same as above basically
|
||||
class QueryTerm *m_leftPhraseTerm;
|
||||
class QueryTerm *m_rightPhraseTerm;
|
||||
@ -501,6 +493,41 @@ class QueryTerm {
|
||||
|
||||
};
|
||||
|
||||
//#define MAX_OPSLOTS 256
|
||||
|
||||
#define MAX_EXPRESSIONS 10
|
||||
|
||||
// operand1 AND operand2 OR ...
|
||||
// operand1 OR operand2 AND ...
|
||||
// . one boolean (sub)expression of a query, described as a run of query words
// . covers m_numWordsInExpression words of m_q->m_qwords[] starting at word
//   number m_expressionStartWord
class Expression {
|
||||
public:
|
||||
// . NOTE(review): the definition of add() is not fully visible from here;
//   "start"/"end" are presumably a range of query word numbers in "q" and
//   "level" the paren nesting depth -- confirm against Query.cpp
bool add (long start,
|
||||
long end,
|
||||
class Query *q,
|
||||
long level );
|
||||
// . evaluate this expression against the bit vector "bitVec", which is
//   "vecSize" bytes long
// . Query::matchesBoolQuery() calls this on m_expressions[0]
bool isTruth ( unsigned char *bitVec , long vecSize );
|
||||
// . NOTE(review): the three comment lines that stood here described a
|
||||
//   function returning the bits under the influence of the NOT opcode.
|
||||
//   this class declares no such function, so they did not apply to print()
|
||||
// . print the boolean expression for debug purposes
void print (SafeBuf *sbuf);
|
||||
// . NOTE(review): earlier comments here described m_operands and m_opcodes
|
||||
//   arrays (operands separated by op codes, e.g. a AND (b OR c))
|
||||
// . this class declares neither array; an expression is now just the word
|
||||
//   range given by the members below
|
||||
//class Expression *m_parent;
|
||||
//bool m_hasNOT;
|
||||
//long m_start;
|
||||
//long m_end;
|
||||
// number of the first query word of this expression in m_q->m_qwords[]
long m_expressionStartWord;
|
||||
// how many query words, counting from m_expressionStartWord, belong to
// this expression
long m_numWordsInExpression;
|
||||
// the query whose m_qwords[] array this expression refers to
Query *m_q;
|
||||
// . opSlots can be operands operators or expressions
|
||||
// . m_opTypes tells which of the 3 they are
|
||||
//long m_opSlots[MAX_OPSLOTS];
|
||||
//char m_opTypes[MAX_OPSLOTS];
|
||||
//long m_cc;
|
||||
};
|
||||
|
||||
// . this is the main class for representing a query
|
||||
// . it contains array of QueryWords (m_qwords[]) and QueryTerms (m_qterms[])
|
||||
class Query {
|
||||
@ -589,11 +616,17 @@ class Query {
|
||||
|
||||
// sets m_bmap[][] so getImplicits() works
|
||||
void setBitMap ( );
|
||||
bool testBoolean(qvec_t bits, qvec_t bitmask=(qvec_t)-1);
|
||||
bool testBoolean(unsigned char *bits,long vecSize);
|
||||
// print to log
|
||||
void printBooleanTree();
|
||||
void printQueryTerms();
|
||||
|
||||
// the new way as of 3/12/2014. just determine if matches the bool
|
||||
// query or not. let's try to offload the scoring logic to other places
|
||||
// if possible.
|
||||
// bitVec is all the QueryWord::m_opBits some docid contains, so
|
||||
// does it match our boolean query or not?
|
||||
bool matchesBoolQuery ( unsigned char *bitVec , long vecSize ) ;
|
||||
|
||||
|
||||
// . call this before calling getBitScore() to set m_bitScores[] table
|
||||
@ -613,6 +646,7 @@ class Query {
|
||||
// through the phrase
|
||||
// . the greater the number of IMplicit SINGLE words a doc has the
|
||||
// bigger its bit score
|
||||
/*
|
||||
uint8_t getBitScore ( qvec_t ebits ) {
|
||||
// get implicit bits from explicit bits
|
||||
qvec_t ibits = getImplicits ( ebits );
|
||||
@ -661,6 +695,7 @@ class Query {
|
||||
if (ibits == m_requiredBits ) bscore|=0x20;
|
||||
return bscore;
|
||||
};
|
||||
*/
|
||||
|
||||
// return an implicit vector from an explicit which contains the explic
|
||||
qvec_t getImplicits ( qvec_t ebits ) {
|
||||
@ -716,7 +751,7 @@ class Query {
|
||||
bool isConnection ( char *s , long len ) ;
|
||||
|
||||
// set the QueryTerm::m_hasNOT members
|
||||
void setHasNOTs();
|
||||
//void setHasNOTs();
|
||||
|
||||
// . used by IndexTable.cpp to make a ptr map of the query terms
|
||||
// to make intersecting the termlists one at a time efficient
|
||||
@ -874,11 +909,12 @@ class Query {
|
||||
|
||||
// . we now contain the parsing components for boolean queries
|
||||
// . m_expressions points into m_gbuf or is allocated
|
||||
class Expression *m_expressions; // [ MAX_OPERANDS ];
|
||||
long m_expressionsAllocSize;
|
||||
//class Expression *m_expressions; // [ MAX_OPERANDS ];
|
||||
//long m_expressionsAllocSize;
|
||||
Expression m_expressions[MAX_EXPRESSIONS];
|
||||
long m_numExpressions;
|
||||
class Operand m_operands [ MAX_OPERANDS ];
|
||||
long m_numOperands ;
|
||||
//class Operand m_operands [ MAX_OPERANDS ];
|
||||
//long m_numOperands ;
|
||||
|
||||
// does query contain the pipe operator
|
||||
bool m_piped;
|
||||
|
@ -9920,7 +9920,8 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
|
||||
|
||||
char *row;
|
||||
bool checkedRow = false;
|
||||
SpiderColl *sc = cr->m_spiderColl;
|
||||
//SpiderColl *sc = cr->m_spiderColl;
|
||||
SpiderColl *sc = g_spiderCache.getSpiderColl(cr->m_collnum);
|
||||
|
||||
//if ( strstr(url,"http://www.vault.com/rankings-reviews/company-rankings/law/vault-law-100/.aspx?pg=2" ))
|
||||
// log("hey");
|
||||
|
12
Title.cpp
12
Title.cpp
@ -71,6 +71,7 @@ void Title::reset() {
|
||||
mfree ( m_title , m_titleAllocSize , "Title" );
|
||||
m_title = NULL;
|
||||
m_titleBytes = 0;
|
||||
m_titleAllocSize = 0;
|
||||
m_query = NULL;
|
||||
m_titleTagStart = -1;
|
||||
m_titleTagEnd = -1;
|
||||
@ -113,7 +114,7 @@ bool Title::setTitle ( XmlDoc *xd ,
|
||||
char *val = NULL;
|
||||
// look for the "title:" field in json then use that
|
||||
SafeBuf jsonTitle;
|
||||
long vlen;
|
||||
long vlen = 0;
|
||||
if ( xd->m_contentType == CT_JSON ) {
|
||||
char *jt;
|
||||
jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
|
||||
@ -124,7 +125,6 @@ bool Title::setTitle ( XmlDoc *xd ,
|
||||
val = jsonTitle.getBufStart();
|
||||
vlen = jsonTitle.length();
|
||||
}
|
||||
|
||||
}
|
||||
// if we had a title: field in the json...
|
||||
if ( val && vlen > 0 ) {
|
||||
@ -135,6 +135,7 @@ bool Title::setTitle ( XmlDoc *xd ,
|
||||
else {
|
||||
dst = (char *)mmalloc ( m_titleBytes+1,"titdst" );
|
||||
if ( ! dst ) return false;
|
||||
m_titleAllocSize = m_titleBytes+1;
|
||||
}
|
||||
m_title = dst;
|
||||
memcpy ( dst , val , m_titleBytes );
|
||||
@ -142,6 +143,13 @@ bool Title::setTitle ( XmlDoc *xd ,
|
||||
return true;
|
||||
}
|
||||
|
||||
// json content that has no explicit title field has no title at all
|
||||
if ( xd->m_contentType == CT_JSON ) {
|
||||
m_localBuf[0] = '\0';
|
||||
m_title = m_localBuf;
|
||||
m_titleBytes = 0;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool status = setTitle4 ( xd ,
|
||||
xml ,
|
||||
|
@ -9,7 +9,7 @@
|
||||
#define _TOPTREE_H_
|
||||
|
||||
#include "Clusterdb.h" // SAMPLE_VECTOR_SIZE, 48 bytes for now
|
||||
#include "IndexTable2.h" // score_t definition
|
||||
//#include "IndexTable2.h" // score_t definition
|
||||
#include "RdbTree.h"
|
||||
|
||||
class TopNode {
|
||||
|
39
XmlDoc.cpp
39
XmlDoc.cpp
@ -14474,6 +14474,7 @@ char **XmlDoc::getHttpReply2 ( ) {
|
||||
// turn off
|
||||
r->m_useCompressionProxy = false;
|
||||
r->m_compressReply = false;
|
||||
r->m_isCustomCrawl = cr->m_isCustomCrawl;
|
||||
|
||||
// set it for this too
|
||||
if ( g_conf.m_useCompressionProxy &&
|
||||
@ -17199,12 +17200,16 @@ long *XmlDoc::getContentHashJson32 ( ) {
|
||||
JsonItem *ji = jp->getFirstItem();
|
||||
long totalHash32 = 0;
|
||||
|
||||
//logf(LOG_DEBUG,"ch32: url=%s",m_firstUrl.m_url);
|
||||
|
||||
for ( ; ji ; ji = ji->m_next ) {
|
||||
QUICKPOLL(m_niceness);
|
||||
// skip if not number or string
|
||||
if ( ji->m_type != JT_NUMBER && ji->m_type != JT_STRING )
|
||||
continue;
|
||||
|
||||
char *topName = NULL;
|
||||
|
||||
// what name level are we?
|
||||
long numNames = 1;
|
||||
JsonItem *pi = ji->m_parent;
|
||||
@ -17212,6 +17217,7 @@ long *XmlDoc::getContentHashJson32 ( ) {
|
||||
// empty name?
|
||||
if ( ! pi->m_name ) continue;
|
||||
if ( ! pi->m_name[0] ) continue;
|
||||
topName = pi->m_name;
|
||||
numNames++;
|
||||
}
|
||||
|
||||
@ -17230,6 +17236,22 @@ long *XmlDoc::getContentHashJson32 ( ) {
|
||||
strcmp(ji->m_name,"resolved_url") == 0 )
|
||||
continue;
|
||||
|
||||
if ( topName && strcmp(topName,"stats") == 0 )
|
||||
continue;
|
||||
|
||||
if ( topName && strcmp(topName,"queryString") == 0 )
|
||||
continue;
|
||||
|
||||
if ( topName && strcmp(topName,"nextPages") == 0 )
|
||||
continue;
|
||||
|
||||
if ( topName && strcmp(topName,"textAnalysis") == 0 )
|
||||
continue;
|
||||
|
||||
if ( topName && strcmp(topName,"links") == 0 )
|
||||
continue;
|
||||
|
||||
|
||||
// hash the fully compound name
|
||||
long nameHash32 = 0;
|
||||
JsonItem *p = ji;
|
||||
@ -17275,6 +17297,11 @@ long *XmlDoc::getContentHashJson32 ( ) {
|
||||
long combined32 = hash32h ( nameHash32 , vh32 );
|
||||
// accumulate field/val pairs order independently
|
||||
totalHash32 ^= combined32;
|
||||
// debug note
|
||||
//logf(LOG_DEBUG,"ch32: field=%s nh32=%lu vallen=%li",
|
||||
// ji->m_name,
|
||||
// nameHash32,
|
||||
// vlen);
|
||||
}
|
||||
|
||||
m_contentHash32 = totalHash32;
|
||||
@ -29753,7 +29780,10 @@ bool XmlDoc::hashWords3 ( //long wordStart ,
|
||||
long plen = 0;
|
||||
if ( hi->m_prefix ) plen = gbstrlen ( hi->m_prefix );
|
||||
if ( hi->m_prefix && plen ) {
|
||||
prefixHash = hash64 ( hi->m_prefix , plen );
|
||||
// we gotta make this case insensitive, and skip spaces
|
||||
// because if it is 'focal length' we can't search
|
||||
// 'focal length:10' because that comes across as TWO terms.
|
||||
prefixHash = hash64Lower_utf8_nospaces ( hi->m_prefix , plen );
|
||||
// . sanity test, make sure it is in supported list
|
||||
// . hashing diffbot json output of course fails this so
|
||||
// skip in that case if diffbot
|
||||
@ -30287,6 +30317,9 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
|
||||
// . this now allows for commas in numbers like "1,500.62"
|
||||
float f = atof2 ( p , bufEnd - p );
|
||||
|
||||
// debug
|
||||
//log("build: hashing %s %f",hi->m_prefix,f);
|
||||
|
||||
if ( ! hashNumber2 ( f , hi , "gbsortby" ) )
|
||||
return false;
|
||||
|
||||
@ -30324,7 +30357,7 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
|
||||
long nameLen = 0;
|
||||
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
|
||||
if ( hi->m_prefix && nameLen )
|
||||
nameHash = hash64Lower_utf8 ( hi->m_prefix , nameLen );
|
||||
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
|
||||
// need a prefix for hashing numbers... for now
|
||||
else { char *xx=NULL; *xx=0; }
|
||||
|
||||
@ -30429,7 +30462,7 @@ bool XmlDoc::hashNumber3 ( long n , HashInfo *hi , char *sortByStr ) {
|
||||
long nameLen = 0;
|
||||
if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
|
||||
if ( hi->m_prefix && nameLen )
|
||||
nameHash = hash64Lower_utf8 ( hi->m_prefix , nameLen );
|
||||
nameHash = hash64Lower_utf8_nospaces( hi->m_prefix , nameLen );
|
||||
// need a prefix for hashing numbers... for now
|
||||
else { char *xx=NULL; *xx=0; }
|
||||
|
||||
|
@ -1,8 +1,8 @@
|
||||
# List of sites to spider, one per line. Gigablast uses the <a
|
||||
# href=/admin/scheduler#insitelist>insitelist</a> directive on the <a
|
||||
# href=/admin/scheduler>spider scheduler</a> page to make sure that the spider
|
||||
# only indexes urls that match the site patterns you specify here, other than
|
||||
# urls you add individually via the add urls or inject url tools. See <a
|
||||
# href=/admin/filters#insitelist>insitelist</a> directive on the <a
|
||||
# href=/admin/filters>url filters</a> page to make sure that the spider only
|
||||
# indexes urls that match the site patterns you specify here, other than urls
|
||||
# you add individually via the add urls or inject url tools. See <a
|
||||
# href=#examples>example site list</a> below. Limit list to 300MB. If you have
|
||||
# a lot of INDIVIDUAL URLS to add then consider using the <a
|
||||
# href=/admin/addurl>addurl</a> interface.
|
||||
@ -12,7 +12,7 @@
|
||||
# must be represented as &lt;, &gt;, &#34; and &#035; respectively.
|
||||
|
||||
# Controls just the spiders for this collection.
|
||||
<spideringEnabled>0</>
|
||||
<spideringEnabled>1</>
|
||||
|
||||
# What is the maximum number of web pages the spider is allowed to download
|
||||
# simultaneously PER HOST for THIS collection?
|
||||
|
10
gb.conf
10
gb.conf
@ -51,7 +51,7 @@
|
||||
<readOnlyMode>0</>
|
||||
|
||||
# Controls all spidering for all collections
|
||||
<spideringEnabled>0</>
|
||||
<spideringEnabled>1</>
|
||||
|
||||
# What is the maximum number of web pages the spider is allowed to download
|
||||
# simultaneously for ALL collections PER HOST?
|
||||
@ -144,7 +144,13 @@
|
||||
# Sends to email address 1 through email server 1 if any parm is changed.
|
||||
<sendParmChangeEmailAlertsToEmail1>0</>
|
||||
|
||||
# Connects to this server directly when sending email 1
|
||||
# Connects to this IP or hostname directly when sending email 1. Use
|
||||
# <i>apt-get install sendmail</i> to install sendmail on that IP or hostname.
|
||||
# Add <i>From:10.5 RELAY</i> to /etc/mail/access to allow sendmail to forward
|
||||
# email it receives from gigablast if gigablast hosts are on the 10.5.*.* IPs.
|
||||
# Then run <i>/etc/init.d/sendmail restart</i> as root to pick up those
|
||||
# changes so sendmail will forward Gigablast's mail to the address you give
|
||||
# below.
|
||||
<emailServer1><![CDATA[10.5.54.47]]></>
|
||||
|
||||
# Sends to this address when sending email 1
|
||||
|
@ -138,6 +138,7 @@ rather your current working directory, where the 'gb' binary resides.
|
||||
<li> Indexes JSON and XML natively. Provides ability to search individual structured fields.
|
||||
<li> Sorting. Sort the search results by meta tags or JSON fields that contain numbers, simply by adding something like gbsortby:price or gbrevsortby:price as a query term, assuming you have meta price tags.
|
||||
<li>Easy Scaling. Add new servers to the hosts.conf file then click 'rebalance shards' to automatically rebalance the sharded data.
|
||||
<li>Using &stream=1 can stream back millions of search results for a query without running out of memory.
|
||||
</ul>
|
||||
|
||||
<br>
|
||||
|
92
main.cpp
92
main.cpp
@ -411,8 +411,8 @@ int main ( int argc , char *argv[] ) {
|
||||
|
||||
"-h\tprint this help.\n\n"
|
||||
"-v\tprint version and exit.\n\n"
|
||||
"-o\tprint the overview documentation in HTML. "
|
||||
"Contains the format of hosts.conf.\n\n"
|
||||
//"-o\tprint the overview documentation in HTML. "
|
||||
//"Contains the format of hosts.conf.\n\n"
|
||||
"-r\tindicates recovery mode, "
|
||||
"sends email to addresses "
|
||||
"specified in Conf.h upon startup.\n\n"
|
||||
@ -440,6 +440,7 @@ int main ( int argc , char *argv[] ) {
|
||||
"\ttwo hostids with a hyphen in between indicates a "
|
||||
"range.\n\n"
|
||||
|
||||
/*
|
||||
"tmpstart [hostId]\n"
|
||||
"\tstart the gb process on all hosts or just on "
|
||||
"[hostId] if specified, but "
|
||||
@ -456,6 +457,7 @@ int main ( int argc , char *argv[] ) {
|
||||
"\tsaves and exits for all gb hosts or "
|
||||
"just on [hostId] if specified, for the "
|
||||
"tmpstart command.\n\n"
|
||||
*/
|
||||
|
||||
"spidersoff [hostId]\n"
|
||||
"\tdisables spidering for all gb hosts or "
|
||||
@ -465,6 +467,7 @@ int main ( int argc , char *argv[] ) {
|
||||
"\tensables spidering for all gb hosts or "
|
||||
"just on [hostId] if specified.\n\n"
|
||||
|
||||
/*
|
||||
"cacheoff [hostId]\n"
|
||||
"\tdisables all disk PAGE caches on all hosts or "
|
||||
"just on [hostId] if specified.\n\n"
|
||||
@ -472,11 +475,17 @@ int main ( int argc , char *argv[] ) {
|
||||
"freecache [maxShmid]\n"
|
||||
"\tfinds and frees all shared memory up to shmid "
|
||||
"maxShmid, default is 3000000.\n\n"
|
||||
*/
|
||||
|
||||
/*
|
||||
"ddump [hostId]\n"
|
||||
"\tdisk dump in memory trees to binary files "
|
||||
"just on [hostId] if specified.\n\n"
|
||||
"\tdump all b-trees in memory to sorted files on "
|
||||
"disk. "
|
||||
"Will likely trigger merges on files on disk. "
|
||||
"Restrict to just host [hostId] if given.\n\n"
|
||||
*/
|
||||
|
||||
/*
|
||||
"pmerge [hostId|hostId1-hostId2]\n"
|
||||
"\tforce merge of posdb files "
|
||||
"just on [hostId] if specified.\n\n"
|
||||
@ -492,16 +501,19 @@ int main ( int argc , char *argv[] ) {
|
||||
"merge [hostId|hostId1-hostId2]\n"
|
||||
"\tforce merge of all rdb files "
|
||||
"just on [hostId] if specified.\n\n"
|
||||
*/
|
||||
|
||||
"dsh <CMD>\n"
|
||||
"\trun this command on the primary IPs of "
|
||||
"all active hosts in hosts.conf. Example: "
|
||||
"gb dsh 'ps auxw; uptime'\n\n"
|
||||
|
||||
/*
|
||||
"dsh2 <CMD>\n"
|
||||
"\trun this command on the secondary IPs of "
|
||||
"all active hosts in hosts.conf. Example: "
|
||||
"gb dsh2 'ps auxw; uptime'\n\n"
|
||||
*/
|
||||
|
||||
"install [hostId]\n"
|
||||
"\tinstall all required files for gb from "
|
||||
@ -509,13 +521,16 @@ int main ( int argc , char *argv[] ) {
|
||||
"to [hostId]. If no [hostId] is specified install "
|
||||
"to ALL hosts.\n\n"
|
||||
|
||||
/*
|
||||
"install2 [hostId]\n"
|
||||
"\tlike above, but use the secondary IPs in the "
|
||||
"hosts.conf.\n\n"
|
||||
*/
|
||||
|
||||
"installgb [hostId]\n"
|
||||
"\tlike above, but install just the gb executable.\n\n"
|
||||
|
||||
/*
|
||||
"installgb2 [hostId]\n"
|
||||
"\tlike above, but use the secondary IPs in the "
|
||||
"hosts.conf.\n\n"
|
||||
@ -592,7 +607,9 @@ int main ( int argc , char *argv[] ) {
|
||||
"search for them on server2. If you do not want to"
|
||||
" use the proxy server "
|
||||
"on gk10, use -p\n\n"
|
||||
*/
|
||||
|
||||
/*
|
||||
"blaster [-l|-u|-i] <file> <maxNumThreads> <wait>\n"
|
||||
"\tget documents from the urls given in file. The "
|
||||
"-l argument is to "
|
||||
@ -606,7 +623,9 @@ int main ( int argc , char *argv[] ) {
|
||||
"\tmaxNumThreads is the"
|
||||
" number of concurrent threads at one time and wait "
|
||||
" is the time to wait between threads.\n\n"
|
||||
*/
|
||||
|
||||
/*
|
||||
"scale <newHosts.conf>\n"
|
||||
"\tGenerate a script to be called to migrate the "
|
||||
"data to the new places. Remaining hosts will "
|
||||
@ -647,7 +666,9 @@ int main ( int argc , char *argv[] ) {
|
||||
"ping <hostId> [clientport]\n"
|
||||
"\tperforms pings to <hostId>. [clientport] defaults "
|
||||
"to 2050.\n\n"
|
||||
*/
|
||||
|
||||
/*
|
||||
"spellcheck <file>\n"
|
||||
"\tspellchecks the the queries in <file>.\n\n"
|
||||
|
||||
@ -701,7 +722,9 @@ int main ( int argc , char *argv[] ) {
|
||||
|
||||
"parsetest <docIdToTest> [coll] [query]\n\t"
|
||||
"parser speed tests\n\n"
|
||||
*/
|
||||
|
||||
/*
|
||||
"thrutest [dir] [fileSize]\n\tdisk write/read speed "
|
||||
"test\n\n"
|
||||
|
||||
@ -711,6 +734,9 @@ int main ( int argc , char *argv[] ) {
|
||||
|
||||
"memtest\n"
|
||||
"\t Test how much memory we can use\n\n"
|
||||
*/
|
||||
|
||||
/*
|
||||
// Quality Tests
|
||||
"countdomains <coll> <X>\n"
|
||||
"\tCounts the domains and IPs in collection coll and "
|
||||
@ -738,33 +764,38 @@ int main ( int argc , char *argv[] ) {
|
||||
|
||||
"dump es <coll> <UTCtimestamp>\n\tdump stats for "
|
||||
"all events as if the time is UTCtimestamp.\n\n"
|
||||
*/
|
||||
|
||||
/*
|
||||
#ifdef _CLIENT_
|
||||
//there was <hostId> in this command but it
|
||||
// wasn't used in the program, so deleting it from
|
||||
// here
|
||||
"dump <V> [C [X [Y [Z]]]]\n\tdump a db in "
|
||||
#else
|
||||
*/
|
||||
|
||||
/*
|
||||
"dump <V> [C [X [Y [Z [T]]]]]\n\tdump a db in "
|
||||
#endif
|
||||
//#endif
|
||||
"working directory.\n"
|
||||
#ifndef _CLIENT_
|
||||
#ifndef _METALINCS_
|
||||
//#ifndef _CLIENT_
|
||||
//#ifndef _METALINCS_
|
||||
//"\tV is u to dump tfndb.\n"
|
||||
"\tV is d to dump datedb.\n"
|
||||
#endif
|
||||
#endif
|
||||
//#endif
|
||||
//#endif
|
||||
"\tV is s to dump spiderdb. set [T] to 1 to print "
|
||||
"new stats. 2 to print old stats. T is ip of firstip."
|
||||
"\n"
|
||||
"\tV is t to dump titledb.\n"
|
||||
"\tV is ts to dump sentences from events.\n"
|
||||
"\tV is tw to dump words from events.\n"
|
||||
//"\tV is ts to dump sentences from events.\n"
|
||||
//"\tV is tw to dump words from events.\n"
|
||||
"\tV is D to dump duplicate docids in titledb.\n"
|
||||
"\tV is c to dump checksumdb.\n"
|
||||
"\tV is S to dump tagdb.\n"
|
||||
"\tV is W to dump tagdb for wget.\n"
|
||||
"\tV is V to dump revdb.\n"
|
||||
//"\tV is V to dump revdb.\n"
|
||||
"\tV is x to dump doledb.\n"
|
||||
"\tV is w to dump waiting tree.\n"
|
||||
"\tV is B to dump sectiondb.\n"
|
||||
@ -779,13 +810,13 @@ int main ( int argc , char *argv[] ) {
|
||||
"\tX is start file num. (default 0)\n"
|
||||
"\tY is num files. (default -1)\n"
|
||||
"\tZ is 1 to include tree. (default 1)\n"
|
||||
#ifndef _CLIENT_
|
||||
#ifndef _METALINCS_
|
||||
#ifndef _GLOBALSPEC_
|
||||
//#ifndef _CLIENT_
|
||||
//#ifndef _METALINCS_
|
||||
//#ifndef _GLOBALSPEC_
|
||||
"\tT is the termid to dump. Applies only to indexdb.\n"
|
||||
#endif
|
||||
#endif
|
||||
#endif
|
||||
//#endif
|
||||
//#endif
|
||||
//#endif
|
||||
"\tT is the first docId to dump. Applies only to "
|
||||
"titledb. "
|
||||
//"(default none)\n\n"
|
||||
@ -806,22 +837,27 @@ int main ( int argc , char *argv[] ) {
|
||||
//"\tB is -1 to dump all priorities\n"
|
||||
"\tC is 1 to just show the stats. (default 0)\n"
|
||||
"\n"
|
||||
*/
|
||||
|
||||
|
||||
//"dump i X Y Z t\n\tdump indexdb termId t in working "
|
||||
//"directory.\n"
|
||||
//"\tX is start file num. (default 0)\n"
|
||||
//"\tY is num files. (default -1)\n"
|
||||
//"\tZ is 1 to include tree. (default 1)\n"
|
||||
//"\tt is the termid to dump. (default none)\n\n"
|
||||
#ifndef _CLIENT_
|
||||
#ifndef _METALINCS_
|
||||
//#ifndef _CLIENT_
|
||||
//#ifndef _METALINCS_
|
||||
/*
|
||||
"dump I [X [V]]\n\tdump indexdb in working "
|
||||
"directory at "
|
||||
"an offset.\n"
|
||||
#endif
|
||||
#endif
|
||||
//#endif
|
||||
//#endif
|
||||
"\tX is the file NAME. (default NULL)\n"
|
||||
"\tV is the start offset. (default 0)\n"
|
||||
|
||||
*/
|
||||
/*
|
||||
"\n"
|
||||
"dumpmissing <coll> [hostId]\n\t"
|
||||
"dump the docIds in indexdb but not "
|
||||
@ -867,6 +903,7 @@ int main ( int argc , char *argv[] ) {
|
||||
"in the current gb. Use synchost2 to use secondary "
|
||||
"IPs.\n"
|
||||
"\n"
|
||||
*/
|
||||
//#endif
|
||||
);
|
||||
SafeBuf sb2;
|
||||
@ -894,6 +931,7 @@ int main ( int argc , char *argv[] ) {
|
||||
if ( strcmp ( cmd , "-h" ) == 0 ) goto printHelp;
|
||||
// version
|
||||
if ( strcmp ( cmd , "-v" ) == 0 ) {
|
||||
fprintf(stdout,"Gigablast March-2014\n");
|
||||
// fprintf(stderr,"Gigablast %s\nMD5KEY: %s\n"
|
||||
// "TAG: %s\nPATH: %s\n",
|
||||
// GBVersion, GBCommitID, GBTag, GBBuildPath);
|
||||
@ -901,10 +939,10 @@ int main ( int argc , char *argv[] ) {
|
||||
}
|
||||
|
||||
// print overview
|
||||
if ( strcmp ( cmd , "-o" ) == 0 ) {
|
||||
//printOverview ( );
|
||||
return 0;
|
||||
}
|
||||
//if ( strcmp ( cmd , "-o" ) == 0 ) {
|
||||
// //printOverview ( );
|
||||
// return 0;
|
||||
//}
|
||||
|
||||
bool hadHostId = false;
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user