Merge branch 'diffbot-testing' of github.com:gigablast/open-source-search-engine into diffbot-testing

Matt Wells 2014-02-28 08:23:59 -08:00
commit 11efab9862
14 changed files with 227 additions and 152 deletions

View File

@ -452,7 +452,7 @@ bool Collectiondb::addNewColl ( char *coll ,
cr->m_diffbotOnlyProcessIfNewUrl = true;
// default respider to off
cr->m_collectiveRespiderFrequency = 0.0;
cr->m_restrictDomain = true;
//cr->m_restrictDomain = true;
// reset the crawl stats
// . this will core if a host was dead and then when it came
// back up host #0's parms.cpp told it to add a new coll
@ -2091,6 +2091,66 @@ bool CollectionRec::rebuildUrlFilters ( ) {
if ( ! upp ) upp = m_diffbotUrlProcessRegEx.getBufStart();
if ( upp && ! upp[0] ) upp = NULL;
///////
//
// recompile regular expressions
//
///////
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_upr );
m_hasupr = false;
}
// what diffbot url to use for processing
char *api = m_diffbotApiUrl.getBufStart();
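
The block above frees any previously compiled expressions, expands the shortcut syntax in the crawl/process patterns, and recompiles them with POSIX regcomp(). A minimal standalone sketch of that compile step follows; the helper name is an assumption, and it reports the failure with regerror() (regcomp() does not set errno, so the mstrerror(errno) in the log call above may print an unrelated error):

#include <regex.h>
#include <cstdio>

// sketch of the compile step for m_ucr / m_upr: returns true and leaves *re
// compiled on success; on failure it logs and returns false so the caller can
// leave its m_hasucr / m_hasupr flag off and skip regfree() later
static bool compileUrlRegex ( regex_t *re , const char *pattern ) {
	if ( ! pattern || ! pattern[0] ) return false;
	// same flags as above: extended syntax, newline-sensitive matching
	int err = regcomp ( re , pattern , REG_EXTENDED | REG_NEWLINE );
	if ( err != 0 ) {
		char msg[256];
		regerror ( err , re , msg , sizeof(msg) );
		fprintf ( stderr , "coll: regcomp %s failed: %s. Ignoring.\n" ,
		          pattern , msg );
		return false;
	}
	return true;
}

A successfully compiled pattern is later matched with regexec() and must be released with regfree() before the filters are rebuilt, which is exactly what the m_hasucr / m_hasupr flags track.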
@ -2139,11 +2199,18 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// 2nd default filter
// always turn this on for now. they need to add domains they want
// to crawl as seeds so they do not spider the web.
//if ( m_restrictDomain ) {
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
//}
// no, because FTB seeds with link pages that link to other
// domains. they just need to be sure to supply a crawl pattern
// to avoid spidering the whole web.
//
// if they did not EXPLICITLY provide a url crawl pattern or
// url crawl regex then restrict to seeds to prevent from spidering
// the entire internet
if ( ! ucp && ! m_hasucr ) { // m_restrictDomain ) {
m_regExs[i].set("!isonsamedomain && !ismanualadd");
m_spiderPriorities [i] = SPIDER_PRIORITY_FILTERED;
i++;
}
m_regExs[i].set("errorcount>=1 && !hastmperror");
m_spiderPriorities [i] = 15;
@ -2268,66 +2335,6 @@ bool CollectionRec::rebuildUrlFilters ( ) {
m_numRegExs8 = i;
//m_numRegExs11 = i;
///////
//
// recompile regular expressions
//
///////
if ( m_hasucr ) {
regfree ( &m_ucr );
m_hasucr = false;
}
if ( m_hasupr ) {
regfree ( &m_upr );
m_hasupr = false;
}
// copy into tmpbuf
SafeBuf tmp;
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
}
if ( rx && regcomp ( &m_ucr , tmp.getBufStart() ,
REG_EXTENDED| //REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_ucr );
m_hasucr = false;
}
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
}
if ( rx && regcomp ( &m_upr , tmp.getBufStart() ,
REG_EXTENDED| // REG_ICASE|
REG_NEWLINE ) ) { // |REG_NOSUB) ) {
// error!
log("coll: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
regfree ( &m_upr );
m_hasupr = false;
}
//char *x = "http://staticpages.diffbot.com/testCrawl/article1.html";
//if(m_hasupr && regexec(&m_upr,x,0,NULL,0) ) { char *xx=NULL;*xx=0; }

View File

@ -458,7 +458,7 @@ class CollectionRec {
char m_enforceNewQuotas ;
char m_doIpLookups ; // considered iff using proxy
char m_useRobotsTxt ;
char m_restrictDomain ; // say on same domain as seeds?
//char m_restrictDomain ; // say on same domain as seeds?
char m_doTuringTest ; // for addurl
char m_applyFilterToText ; // speeds us up
char m_allowHttps ; // read HTTPS using SSL

View File

@ -2315,10 +2315,10 @@ uint32_t Hostdb::getShardNum ( char rdbId,void *k ) { // ,bool split ) {
else if ( rdbId == RDB_LINKDB || rdbId == RDB2_LINKDB2 ) {
return m_map [(*(uint16_t *)((char *)k + 26))>>3];
}
else if ( rdbId == RDB_TFNDB || rdbId == RDB2_TFNDB2 ) {
unsigned long long d = g_tfndb.getDocId ( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
}
//else if ( rdbId == RDB_TFNDB || rdbId == RDB2_TFNDB2 ) {
// unsigned long long d = g_tfndb.getDocId ( (key_t *)k );
// return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
//}
else if ( rdbId == RDB_TITLEDB || rdbId == RDB2_TITLEDB2 ) {
unsigned long long d = g_titledb.getDocId ( (key_t *)k );
return m_map [ ((d>>14)^(d>>7)) & (MAX_KSLOTS-1) ];
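
For docid-keyed rdbs like titledb the shard is chosen by folding bits of the docid into an index into the precomputed m_map table, as the branch above shows. A tiny standalone illustration of that fold (types and parameters simplified; they stand in for Hostdb's m_map and MAX_KSLOTS):

#include <stdint.h>

// sketch: pick the shard for a docid the same way the titledb branch does,
// by xor-folding docid bits and masking down to a slot in the map
uint32_t shardForDocId ( unsigned long long d ,
                         const uint32_t *map ,   // Hostdb's m_map[]
                         long numSlots ) {       // MAX_KSLOTS, a power of 2
	return map [ ((d >> 14) ^ (d >> 7)) & (numSlots - 1) ];
}

The tfndb branch commented out above used the same fold; it goes away because tfndb itself is removed in this commit (see the Makefile, Rdb.cpp and RdbBase.cpp changes below).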

View File

@ -633,8 +633,8 @@ static void sendReplyWrapper ( void *state ) {
// steal this buffer
char *reply1 = info->getBufStart();
long replySize = info->length();
// sanity
if ( replySize <= 0 ) { char *xx=NULL;*xx=0; }
// sanity. no, if the collrec is not found it's 0!
if ( ! saved && replySize <= 0 ) { char *xx=NULL;*xx=0; }
// get original request
Msg25Request *req = (Msg25Request *)slot2->m_readBuf;
// sanity
@ -645,7 +645,10 @@ static void sendReplyWrapper ( void *state ) {
nextLink:
UdpSlot *udpSlot = req->m_udpSlot;
// update for next udpSlot
req = req->m_next;
// just dup the reply for each one
char *reply2 = (char *)mdup(reply1,replySize,"m25repd");
@ -666,7 +669,6 @@ static void sendReplyWrapper ( void *state ) {
}
// if we had a link
req = req->m_next;
if ( req ) goto nextLink;
// the destructor
@ -684,6 +686,10 @@ void handleRequest25 ( UdpSlot *slot , long netnice ) {
// make sure this always NULL for our linked list logic
req->m_next = NULL;
// udp socket for sending back the final linkInfo in m_linkInfoBuf
// used by sendReply()
req->m_udpSlot = slot;
// set up the hashtable if our first time
if ( ! g_lineTable.isInitialized() )
g_lineTable.set ( 8,4,256,NULL,0,false,MAX_NICENESS,"lht25");
@ -735,10 +741,6 @@ void handleRequest25 ( UdpSlot *slot , long netnice ) {
// point to a real safebuf here for populating with data
m25->m_linkInfoBuf = &m25->m_realBuf;
// udp socket for sending back the final linkInfo in m_linkInfoBuf
// used by sendReply()
req->m_udpSlot = slot;
// set some new stuff. should probably be set in getLinkInfo2()
// but we are trying to leave that as unaltered as possible to
// try to reduce debugging.
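
These hunks are part of a scheme that collapses identical in-flight link-info requests: handleRequest25() keys each incoming Msg25Request in g_lineTable, chains duplicates through req->m_next, and sendReplyWrapper() later walks that chain, mdup()'ing the reply once per waiting request. Moving the req->m_udpSlot assignment up means every queued duplicate, not just the request that triggered the computation, remembers which udp slot to answer on. A rough sketch of the fan-out (Msg25Request, UdpSlot and mdup() are the real names from the diff; the declarations and sendReplyOnSlot() are simplified stand-ins, not the actual code):

class UdpSlot;
struct Msg25Request { Msg25Request *m_next; UdpSlot *m_udpSlot; };
extern void *mdup ( void *src , long size , const char *note );
extern void  sendReplyOnSlot ( UdpSlot *slot , char *reply , long size );

// walk the linked list of duplicate requests and give each one its own
// private copy of the computed link-info reply
void fanOutReply ( Msg25Request *head , char *reply1 , long replySize ) {
	for ( Msg25Request *req = head ; req ; req = req->m_next ) {
		char *reply2 = (char *)mdup ( reply1 , replySize , "m25repd" );
		if ( ! reply2 ) continue;
		sendReplyOnSlot ( req->m_udpSlot , reply2 , replySize );
	}
}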

View File

@ -2,7 +2,7 @@ SHELL = /bin/bash
CC=g++
OBJS = Tfndb.o UdpSlot.o Rebalance.o \
OBJS = UdpSlot.o Rebalance.o \
Msg13.o Mime.o IndexReadInfo.o \
PageGet.o PageHosts.o PageIndexdb.o PageLogin.o \
PageParser.o PageInject.o PagePerf.o PageReindex.o PageResults.o \

View File

@ -14,6 +14,7 @@ void Msg20::constructor () {
m_r = NULL;
m_inProgress = false;
m_launched = false;
m_i = -1;
reset();
m_mcast.constructor();
}

View File

@ -881,19 +881,20 @@ bool Msg40::reallocMsg20Buf ( ) {
return true;
}
m_buf2 = NULL;
m_bufMaxSize2 = need;
m_numMsg20s = m_msg3a.m_numDocIds;
// when streaming because we can have hundreds of thousands of
// search results we recycle a few msg20s to save mem
if ( m_si->m_streamResults ) {
long max = MAX_OUTSTANDING_MSG20S;
long max = MAX_OUTSTANDING_MSG20S * 2;
if ( m_msg3a.m_numDocIds < max ) max = m_msg3a.m_numDocIds;
need = max * (4+sizeof(Msg20));
m_numMsg20s = max;
}
m_buf2 = NULL;
m_bufMaxSize2 = need;
// do the alloc
if ( need ) m_buf2 = (char *)mmalloc ( need ,"Msg40msg20");
if ( need && ! m_buf2 ) { m_errno = g_errno; return false; }
@ -1033,6 +1034,11 @@ bool Msg40::launchMsg20s ( bool recalled ) {
//if ( m_numRequests-m_numReplies >= need ) break;
// hard limit
if ( m_numRequests-m_numReplies >= maxOut ) break;
// do not launch another until m_printi comes back because
// all summaries are bottlenecked on printing him out now
if ( m_si->m_streamResults &&
i >= m_printi + MAX_OUTSTANDING_MSG20S - 1 )
break;
// do not double count!
//if ( i <= m_lastProcessedi ) continue;
// do not repeat for this i
@ -1238,7 +1244,8 @@ Msg20 *Msg40::getAvailMsg20 ( ) {
// m_inProgress is set to false right before it
// calls Msg20::m_callback which is gotSummaryWrapper()
// so we should be ok with this
if ( ! m_msg20[i]->m_inProgress ) return m_msg20[i];
if ( m_msg20[i]->m_launched ) continue;
return m_msg20[i];
}
// how can this happen???
char *xx=NULL;*xx=0;
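
Taken together, the Msg40 changes bound memory while streaming: the buffer holds at most 2 * MAX_OUTSTANDING_MSG20S Msg20s instead of one per docid, launchMsg20s() only issues summary requests inside a window just ahead of m_printi (the next result to be printed), and getAvailMsg20() hands out any pool slot that is not currently launched. A condensed sketch of the launch-window check (names from the diff, request setup elided):

// sketch: when streaming, launch a summary request only if it can be printed
// soon; anything past the window waits until m_printi advances
for ( long i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
	// hard limit on requests in flight
	if ( m_numRequests - m_numReplies >= maxOut ) break;
	// do not run ahead of the printer when streaming
	if ( m_si->m_streamResults &&
	     i >= m_printi + MAX_OUTSTANDING_MSG20S - 1 ) break;
	Msg20 *m20 = getAvailMsg20();   // any pool slot with m_launched false
	// ... set up the Msg20 request for docid #i on m20 and launch it ...
}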
@ -1377,27 +1384,42 @@ bool Msg40::gotSummary ( ) {
// otherwise, get the summary for result #m_printi
//Msg20 *m20 = m_msg20[m_printi];
if ( ! m20 ) {
log("msg40: m20 NULL #%li",m_printi);
continue;
}
//if ( ! m20 ) {
// log("msg40: m20 NULL #%li",m_printi);
// continue;
//}
// if result summary #i not yet in, wait...
if ( ! m20 )
break;
// wait if no reply for it yet
//if ( m20->m_inProgress )
// break;
if ( m20->m_errno ) {
log("msg40: sum #%li error: %s",
m_printi,mstrerror(m20->m_errno));
// make it available to be reused
m20->reset();
continue;
}
// get the next reply we are waiting on so we print results in order
Msg20Reply *mr = m20->m_r;
if ( ! mr ) break;
//if ( ! mr ) { char *xx=NULL;*xx=0; }
// primitive deduping. for diffbot json urls we dedup on the
// XmlDoc::m_contentHash32.. it will be zero if invalid i guess
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
m_dedupTable.isInTable ( &mr->m_contentHash32 ) ) {
//if ( g_conf.m_logDebugQuery )
log("msg40: dup sum #%li (%lu)",m_printi,
mr->m_contentHash32);
// make it available to be reused
m20->reset();
continue;
}
@ -1418,8 +1440,12 @@ bool Msg40::gotSummary ( ) {
printSearchResult9 ( m_printi );
// now free the reply to save memory since we could be
// streaming back 1M+
m20->freeReply();
// streaming back 1M+. we call reset below, no need for this.
//m20->freeReply();
// return it so getAvailMsg20() can use it again
// this will set m_launched to false
m20->reset();
}
// set it to true on all but the last thing we send!
@ -1477,6 +1503,9 @@ bool Msg40::gotSummary ( ) {
// do a recursive stack explosion
// . this returns false if still waiting on more to come back
if ( ! launchMsg20s ( true ) ) return false;
// it won't launch now if we are bottlenecked waiting for
// m_printi's summary to come in
if ( m_si->m_streamResults ) return false;
// maybe some were cached?
//goto refilter;
// it returned true, so m_numRequests == m_numReplies and
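
The printing side is what lets the small pool recycle: gotSummary() prints results strictly in m_printi order, breaks out as soon as the next summary has not arrived, drops a result whose XmlDoc::m_contentHash32 is already in m_dedupTable, and reset()s every handled Msg20 so getAvailMsg20() can hand it out again. A condensed sketch of that loop, using getCompletedSummary() (from the PageResults.cpp change below) as the lookup; an illustration of the flow, not the literal code:

// sketch of the in-order streaming print loop in gotSummary()
for ( ; m_printi < m_msg3a.m_numDocIds ; m_printi++ ) {
	Msg20 *m20 = getCompletedSummary ( m_printi );
	if ( ! m20 ) break;                 // summary not in yet, wait
	if ( m20->m_errno ) { m20->reset(); continue; }
	Msg20Reply *mr = m20->m_r;
	if ( ! mr ) break;
	// primitive dedup on the page/json 32-bit content hash
	if ( m_si->m_doDupContentRemoval && mr->m_contentHash32 &&
	     m_dedupTable.isInTable ( &mr->m_contentHash32 ) ) {
		m20->reset();               // drop the dup, recycle the slot
		continue;
	}
	printSearchResult9 ( m_printi );    // stream this result out now
	m20->reset();                       // return the slot to the pool
}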

View File

@ -784,6 +784,12 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
// do not print "Fake First Ip"...
if ( m_prevReplyError == EFAKEFIRSTIP )
msg = "Initial crawl request";
// if the initial crawl request got a reply then that
// means the spiderrequest was added under the correct
// firstip... so skip it. i am assuming that the
// correct spiderrequest got added ok here...
if ( m_prevReplyError == EFAKEFIRSTIP )
continue;
}
if ( srep && srep->m_hadDiffbotError )
@ -1533,7 +1539,7 @@ static class HelpItem s_his[] = {
"the maxtocrawl or maxtoprocess limit, or when the crawl "
"completes."},
{"obeyRobots","Obey robots.txt files?"},
{"restrictDomain","Restrict downloaded urls to domains of seeds?"},
//{"restrictDomain","Restrict downloaded urls to domains of seeds?"},
{"urlCrawlPattern","List of || separated strings. If the url "
"contains any of these then we crawl the url, otherwise, we do not. "
@ -2365,11 +2371,11 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
// settable parms
"\"maxToCrawl\":%lli,\n"
"\"maxToProcess\":%lli,\n"
"\"restrictDomain\":%li,\n"
//"\"restrictDomain\":%li,\n"
"\"onlyProcessIfNew\":%li,\n"
, cx->m_maxToCrawl
, cx->m_maxToProcess
, (long)cx->m_restrictDomain
//, (long)cx->m_restrictDomain
, (long)cx->m_diffbotOnlyProcessIfNewUrl
);
sb.safePrintf("\"seeds\":\"");
@ -3344,13 +3350,15 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
urtYes = "";
urtNo = " checked";
}
/*
char *rdomYes = " checked";
char *rdomNo = "";
if ( ! cr->m_restrictDomain ) {
rdomYes = "";
rdomNo = " checked";
}
*/
char *isNewYes = "";
char *isNewNo = " checked";
@ -3541,15 +3549,15 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
"</td>"
"</tr>"
"<tr><td>"
"<b>Restrict domain to seeds?</b> "
"</td><td>"
"<input type=radio name=restrictDomain "
"value=1%s> yes &nbsp; "
"<input type=radio name=restrictDomain "
"value=0%s> no &nbsp; "
"</td>"
"</tr>"
//"<tr><td>"
//"<b>Restrict domain to seeds?</b> "
//"</td><td>"
//"<input type=radio name=restrictDomain "
//"value=1%s> yes &nbsp; "
//"<input type=radio name=restrictDomain "
//"value=0%s> no &nbsp; "
//"</td>"
//"</tr>"
//"<tr><td>"
//"Use spider proxies on AWS? "
@ -3592,8 +3600,8 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
, urtYes
, urtNo
, rdomYes
, rdomNo
//, rdomYes
//, rdomNo
);
}

View File

@ -2057,8 +2057,15 @@ bool printResult ( State0 *st, long ix ) {
}
Msg20 *m20 = msg40->m_msg20[ix];
Msg20Reply *mr = m20->m_r;
Msg20 *m20 ;
if ( si->m_streamResults )
m20 = msg40->getCompletedSummary(ix);
else
m20 = msg40->m_msg20[ix];
// get the reply
Msg20Reply *mr = m20->m_r;
// . sometimes the msg20reply is NULL so prevent it coring
// . i think this happens if all hosts in a shard are down or timeout
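
When streaming, the Msg20 pool in Msg40 is recycled, so result #ix no longer lives at m_msg20[ix]; getCompletedSummary(ix) has to find whichever pool slot currently holds that result, which is also why Msg20 gained the m_i member (initialized to -1 in Msg20::constructor() above). A guess at what such a lookup does, purely illustrative and not the actual implementation:

// hypothetical sketch of Msg40::getCompletedSummary(): scan the small pool
// for the slot whose m_i matches the requested result index
Msg20 *Msg40::getCompletedSummary ( long ix ) {
	for ( long j = 0 ; j < m_numMsg20s ; j++ ) {
		Msg20 *m20 = m_msg20[j];
		if ( m20 && m20->m_i == ix ) return m20;
	}
	return NULL;
}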
@ -5302,6 +5309,14 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
ji = ptrs[i];
// skip if none
if ( ! ji ) continue;
// skip "html" field... too spammy for csv and > 32k causes
// libreoffice calc to truncate it and break its parsing
if ( ji->m_name &&
//! ji->m_parent &&
strcmp(ji->m_name,"html")==0)
continue;
//
// get value and print otherwise
//

View File

@ -9963,6 +9963,7 @@ void Parms::init ( ) {
m++;
// use url filters for this. this is a crawlbot parm really.
/*
m->m_title = "restrict domain";
m->m_desc = "Keep crawler on same domain as seed urls?";
m->m_cgi = "restrictDomain";
@ -9972,6 +9973,7 @@ void Parms::init ( ) {
// we need to save this it is a diffbot parm
m->m_flags = PF_HIDDEN | PF_DIFFBOT;// | PF_NOSAVE;
m++;
*/
m->m_title = "do url sporn checking";
m->m_desc = "If this is true and the spider finds "

Rdb.cpp
View File

@ -13,7 +13,7 @@
#include "Datedb.h"
#include "Titledb.h"
#include "Spider.h"
#include "Tfndb.h"
//#include "Tfndb.h"
//#include "Sync.h"
#include "Spider.h"
#include "Repair.h"
@ -2648,7 +2648,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB_SYNCDB ] = g_syncdb.getRdb();
s_table9 [ RDB_SPIDERDB ] = g_spiderdb.getRdb();
s_table9 [ RDB_DOLEDB ] = g_doledb.getRdb();
s_table9 [ RDB_TFNDB ] = g_tfndb.getRdb();
//s_table9 [ RDB_TFNDB ] = g_tfndb.getRdb();
s_table9 [ RDB_CLUSTERDB ] = g_clusterdb.getRdb();
s_table9 [ RDB_CATDB ] = g_catdb.getRdb();
s_table9 [ RDB_DATEDB ] = g_datedb.getRdb();
@ -2667,7 +2667,7 @@ Rdb *getRdbFromId ( uint8_t rdbId ) {
s_table9 [ RDB2_SECTIONDB2 ] = g_sectiondb2.getRdb();
s_table9 [ RDB2_PLACEDB2 ] = g_placedb2.getRdb();
s_table9 [ RDB2_SPIDERDB2 ] = g_spiderdb2.getRdb();
s_table9 [ RDB2_TFNDB2 ] = g_tfndb2.getRdb();
//s_table9 [ RDB2_TFNDB2 ] = g_tfndb2.getRdb();
s_table9 [ RDB2_CLUSTERDB2 ] = g_clusterdb2.getRdb();
s_table9 [ RDB2_DATEDB2 ] = g_datedb2.getRdb();
s_table9 [ RDB2_LINKDB2 ] = g_linkdb2.getRdb();
@ -2691,7 +2691,7 @@ char getIdFromRdb ( Rdb *rdb ) {
//if ( rdb == g_checksumdb.getRdb() ) return RDB_CHECKSUMDB;
if ( rdb == g_spiderdb.getRdb () ) return RDB_SPIDERDB;
if ( rdb == g_doledb.getRdb () ) return RDB_DOLEDB;
if ( rdb == g_tfndb.getRdb () ) return RDB_TFNDB;
//if ( rdb == g_tfndb.getRdb () ) return RDB_TFNDB;
if ( rdb == g_clusterdb.getRdb () ) return RDB_CLUSTERDB;
if ( rdb == g_statsdb.getRdb () ) return RDB_STATSDB;
if ( rdb == g_linkdb.getRdb () ) return RDB_LINKDB;
@ -2712,7 +2712,7 @@ char getIdFromRdb ( Rdb *rdb ) {
if ( rdb == g_placedb2.getRdb () ) return RDB2_PLACEDB2;
//if ( rdb == g_checksumdb2.getRdb() ) return RDB2_CHECKSUMDB2;
if ( rdb == g_spiderdb2.getRdb () ) return RDB2_SPIDERDB2;
if ( rdb == g_tfndb2.getRdb () ) return RDB2_TFNDB2;
//if ( rdb == g_tfndb2.getRdb () ) return RDB2_TFNDB2;
if ( rdb == g_clusterdb2.getRdb () ) return RDB2_CLUSTERDB2;
//if ( rdb == g_statsdb2.getRdb () ) return RDB2_STATSDB2;
if ( rdb == g_linkdb2.getRdb () ) return RDB2_LINKDB2;

View File

@ -2,7 +2,7 @@
#include "Rdb.h"
#include "Msg35.h"
#include "Tfndb.h"
//#include "Tfndb.h"
//#include "Checksumdb.h"
#include "Clusterdb.h"
#include "Hostdb.h"
@ -966,7 +966,7 @@ bool RdbBase::incorporateMerge ( ) {
// tfndb has his own merge class since titledb merges write tfndb recs
RdbMerge *m = &g_merge;
if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
//if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
// print out info of newly merged file
long long tp = m_maps[x]->getNumPositiveRecs();
@ -974,7 +974,7 @@ bool RdbBase::incorporateMerge ( ) {
log(LOG_INFO,
"merge: Merge succeeded. %s (#%li) has %lli positive "
"and %lli negative recs.", m_files[x]->getFilename(), x, tp, tn);
if ( m_rdb == g_posdb.getRdb() || m_rdb == g_tfndb.getRdb() )
if ( m_rdb == g_posdb.getRdb() ) // || m_rdb == g_tfndb.getRdb() )
log(LOG_INFO,"merge: Removed %lli dup keys.",
m->getDupsRemoved() );
// . bitch if bad news
@ -1470,8 +1470,8 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
// if we are tfndb and someone else is merging, do not merge unless
// we have 3 or more files
long minToMerge = m_minToMerge;
if (g_tfndb.getRdb()==m_rdb&& g_merge.isMerging() && minToMerge <=2 )
minToMerge = 3;
//if (g_tfndb.getRdb()==m_rdb&& g_merge.isMerging() && minToMerge <=2 )
// minToMerge = 3;
// do not start a tfndb merge while someone is dumping because the
// dump starves the tfndb merge and we clog up adding links. i think
// this is mainly just indexdb dumps, but we'll see.
@ -1565,7 +1565,7 @@ void RdbBase::attemptMerge ( long niceness, bool forceMergeAll, bool doLog ,
//if ( m_mergeUrgent ) priority = 2;
//else priority = 0;
// tfndb doesn't need token, since titledb merge writes tfndb recs
if ( m_rdb != g_tfndb.getRdb() &&
if ( //m_rdb != g_tfndb.getRdb() &&
! g_msg35.getToken ( this , gotTokenForMergeWrapper, priority ) )
return ;
// bitch if we got token because there was an error somewhere
@ -1616,7 +1616,7 @@ void RdbBase::gotTokenForMerge ( ) {
}
// tfndb has his own merge class since titledb merges write tfndb recs
RdbMerge *m = &g_merge;
if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
//if ( m_rdb == g_tfndb.getRdb() ) m = &g_merge2;
// sanity check
if ( m_isMerging || m->isMerging() ) {
//if ( m_doLog )
@ -1724,8 +1724,8 @@ void RdbBase::gotTokenForMerge ( ) {
}
minToMerge = m_minToMerge;
if (m_rdb==g_tfndb.getRdb()&& g_merge.isMerging() && minToMerge <=2 )
minToMerge = 3;
//if (m_rdb==g_tfndb.getRdb()&& g_merge.isMerging() && minToMerge <=2 )
// minToMerge = 3;
// look at this merge:
// indexdb0003.dat.part1

View File

@ -2101,8 +2101,15 @@ bool XmlDoc::indexDoc ( ) {
// cr->m_localCrawlInfo.m_pageDownloadAttempts);
// this is just how many urls we tried to index
//cr->m_localCrawlInfo.m_urlsConsidered++;
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
// avoid counting if it is a fake first ip
bool countIt = true;
// pagereindex.cpp sets this as does any add url (bulk job)
if ( m_sreqValid && m_sreq.m_fakeFirstIp )
countIt = false;
if ( countIt ) {
cr->m_localCrawlInfo.m_pageDownloadAttempts++;
cr->m_globalCrawlInfo.m_pageDownloadAttempts++;
}
// need to save collection rec now during auto save
cr->m_needsSave = true;
// update this just in case we are the last url crawled
@ -2358,7 +2365,8 @@ bool XmlDoc::indexDoc2 ( ) {
// return false;
// MDW: we do this in indexDoc() above why do we need it here?
/*
// even if not using diffbot, keep track of these counts
if ( ! m_isDiffbotJSONObject &&
! m_incrementedAttemptsCount ) {
@ -2374,7 +2382,7 @@ bool XmlDoc::indexDoc2 ( ) {
long long now = gettimeofdayInMillisecondsGlobal();
cr->m_diffbotCrawlEndTime = now;
}
*/
/*
// if we are being called from Spider.cpp and we met our max
// to crawl requirement, then bail out on this. this might
@ -12973,11 +12981,13 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
// because we need the anchor text to pass in to diffbot
bool doLinkSpamCheck = cr->m_doLinkSpamCheck;
bool oneVotePerIpDom = cr->m_oneVotePerIpDom;
if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
doLinkSpamCheck = false;
oneVotePerIpDom = false;
onlyNeedGoodInlinks = false;
}
// this seems to overdo it when we have a ton of linktext
// perhaps, so take this out...
//if ( cr->m_isCustomCrawl && cr->m_restrictDomain ) {
// doLinkSpamCheck = false;
// oneVotePerIpDom = false;
// onlyNeedGoodInlinks = false;
//}
// call it
char *url = getFirstUrl()->getUrl();
@ -13764,7 +13774,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
// we make a "fake" url for the diffbot reply when indexing it
// by appending -diffbotxyz%lu. see "fakeUrl" below.
if ( m_firstUrl.getUrlLen() + 15 >= MAX_URL_LEN ) {
if ( m_firstUrl.getUrlLen() + 24 >= MAX_URL_LEN ) {
if ( m_firstUrlValid )
log("build: diffbot url would be too long for "
"%s", m_firstUrl.getUrl() );

View File

@ -25,7 +25,7 @@
#include "Tagdb.h"
#include "Catdb.h"
#include "Users.h"
#include "Tfndb.h"
//#include "Tfndb.h"
#include "Spider.h"
//#include "Doledb.h"
//#include "Checksumdb.h"
@ -150,8 +150,8 @@ static void dumpTitledb ( char *coll,long sfn,long numFiles,bool includeTree,
long long docId , char justPrintDups ,
bool dumpSentences ,
bool dumpWords );
static void dumpTfndb ( char *coll,long sfn,long numFiles,bool includeTree,
bool verify);
//static void dumpTfndb (char *coll,long sfn,long numFiles,bool includeTree,
// bool verify);
static long dumpSpiderdb ( char *coll,long sfn,long numFiles,bool includeTree,
char printStats , long firstIp );
static void dumpSectiondb( char *coll,long sfn,long numFiles,bool includeTree);
@ -773,8 +773,8 @@ int main ( int argc , char *argv[] ) {
"\tV is z to dump statsdb all keys.\n"
"\tV is Z to dump statsdb all keys and data samples.\n"
"\tV is L to dump linkdb.\n"
"\tV is u to dump tfndb.\n"
"\tV is vu to verify tfndb.\n"
//"\tV is u to dump tfndb.\n"
//"\tV is vu to verify tfndb.\n"
"\tC is the name of the collection.\n"
"\tX is start file num. (default 0)\n"
"\tY is num files. (default -1)\n"
@ -2420,10 +2420,10 @@ int main ( int argc , char *argv[] ) {
dumpTitledb(coll,startFileNum,numFiles,includeTree,
docId,1,false,false);
}
else if ( argv[cmdarg+1][0] == 'v' && argv[cmdarg+1][1] =='u' )
dumpTfndb (coll,startFileNum,numFiles,includeTree,1);
else if ( argv[cmdarg+1][0] == 'u' )
dumpTfndb (coll,startFileNum,numFiles,includeTree,0);
//else if(argv[cmdarg+1][0] == 'v' && argv[cmdarg+1][1] =='u' )
// dumpTfndb (coll,startFileNum,numFiles,includeTree,1);
//else if ( argv[cmdarg+1][0] == 'u' )
// dumpTfndb (coll,startFileNum,numFiles,includeTree,0);
else if ( argv[cmdarg+1][0] == 'w' )
dumpWaitingTree(coll);
else if ( argv[cmdarg+1][0] == 'x' )
@ -5652,7 +5652,7 @@ void zlibtest() {
#include "Rdb.h"
#include "Xml.h"
#include "Tfndb.h"
//#include "Tfndb.h"
//#include "Checksumdb.h"
#include "Threads.h"
@ -5988,7 +5988,7 @@ void dumpTitledb (char *coll,long startFileNum,long numFiles,bool includeTree,
if ( startKey < *(key_t *)list.getLastKey() ) return;
goto loop;
}
/*
void dumpTfndb (char *coll,long startFileNum,long numFiles,bool includeTree ,
bool verify) {
//g_conf.m_spiderdbMaxTreeMem = 1024*1024*30;
@ -6060,7 +6060,7 @@ void dumpTfndb (char *coll,long startFileNum,long numFiles,bool includeTree ,
if ( startKey < *(key_t *)list.getLastKey() ) return;
goto loop;
}
*/
void dumpWaitingTree (char *coll ) {
RdbTree wt;
if (!wt.set(0,-1,true,20000000,true,"waittree2",
@ -7895,9 +7895,9 @@ void dumpMissing ( char *coll ) {
g_conf.m_indexdbMaxCacheMem = 0;
//g_conf.m_clusterdbMaxDiskPageCacheMem = 0;
g_tfndb.init ();
//g_tfndb.init ();
//g_collectiondb.init(true); // isDump?
g_tfndb.getRdb()->addRdbBase1 ( coll );
//g_tfndb.getRdb()->addRdbBase1 ( coll );
g_titledb.init();
g_titledb.getRdb()->addRdbBase1 ( coll );
// if titledb has stuff in memory, do not do this, it needs to
@ -7911,7 +7911,8 @@ void dumpMissing ( char *coll ) {
}
// . just get the docids from tfndb...
// . this tfndb rec count is for ALL colls!! DOH!
long long numRecs = g_tfndb.getRdb()->getNumTotalRecs();
// MDW FIX THIS RIGHT!
long long numRecs = 12345;//g_tfndb.getRdb()->getNumTotalRecs();
long long oldNumSlots = (numRecs * 100) / 80;
// make a power of 2
// make it a power of 2
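
The table is sized at numRecs / 0.80 slots so it never runs more than about 80% full, then rounded up to a power of two so a docid can be mapped to a slot with a mask (or with the modulo used in the scan loop below). A minimal sketch of the rounding step the diff elides:

// round the slot count up to the next power of two so that either
// "d & (numSlots-1)" or "d % numSlots" lands inside the table
long long numSlots = 1;
while ( numSlots < oldNumSlots ) numSlots <<= 1;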
@ -7980,10 +7981,10 @@ void dumpMissing ( char *coll ) {
if ( (k.n0 & 0x01LL) == 0x00 ) continue;
// titledb tree is empty, so this must indicate it is in
// spiderdb only
long tfn = g_tfndb.getTfn(&k);
long tfn = 0;//g_tfndb.getTfn(&k);
if ( tfn == 255 ) continue;
// get docid
unsigned long long d = g_tfndb.getDocId ( &k );
unsigned long long d = 0LL;//g_tfndb.getDocId ( &k );
// add to hash table
//long n = (unsigned long)d & mask;
long n = (unsigned long)d % numSlots;
@ -8664,12 +8665,12 @@ void removeDocIds ( char *coll , char *filename ) {
//g_conf.m_checksumdbMaxCacheMem = 0;
//g_conf.m_clusterdbMaxCacheMem = 0;
g_tfndb.init();
//g_tfndb.init();
g_indexdb.init ();
//g_checksumdb.init();
g_clusterdb.init();
//g_collectiondb.init(true);
g_tfndb.getRdb()->addRdbBase1 ( coll );
//g_tfndb.getRdb()->addRdbBase1 ( coll );
g_indexdb.getRdb()->addRdbBase1 ( coll );
//g_checksumdb.getRdb()->addRdbBase1 ( coll );
g_clusterdb.getRdb()->addRdbBase1 ( coll );
@ -9044,7 +9045,7 @@ void removeDocIds ( char *coll , char *filename ) {
//
logf(LOG_INFO,"db: Scanning tfndb and removing recs.");
r = g_tfndb.getRdb();
r = 0;//g_tfndb.getRdb();
count = 0;
scanned = 0;
recs = 0;
@ -9089,7 +9090,7 @@ void removeDocIds ( char *coll , char *filename ) {
key_t k = list.getCurrentKey();
// skip deletes
if ( (k.n0 & 0x01) == 0x00 ) continue;
unsigned long long d = g_tfndb.getDocId(&k);
unsigned long long d = 0;//g_tfndb.getDocId(&k);
// see if docid is in delete list
long n = (unsigned long)d & mask;
while ( slots[n] && slots[n] != d )