Mirror of https://github.com/gigablast/open-source-search-engine.git (synced 2024-10-04 20:27:43 +03:00)

Commit 08b103f3a4: Merge branch 'diffbot-testing' into diffbot

Conflicts: Spider.cpp
@@ -449,7 +449,7 @@ bool Collectiondb::addNewColl ( char *coll ,
// show the ban links in the search results. the
// collection name is cryptographic enough to show that
cr->m_isCustomCrawl = customCrawl;
cr->m_diffbotOnlyProcessIfNew = true;
cr->m_diffbotOnlyProcessIfNewUrl = true;
// default respider to off
cr->m_collectiveRespiderFrequency = 0.0;
cr->m_restrictDomain = true;
@@ -1459,6 +1459,9 @@ void CollectionRec::reset() {
if ( m_hasucr ) regfree ( &m_ucr );
if ( m_hasupr ) regfree ( &m_upr );

m_hasucr = false;
m_hasupr = false;

// make sure we do not leave spiders "hanging" waiting for their
// callback to be called... and it never gets called
//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
@@ -2264,6 +2267,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasucr = true;
@@ -2284,6 +2288,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
if ( rx && ! rx[0] ) rx = NULL;
if ( rx ) m_hasupr = true;
if ( rx ) {
tmp.reset();
tmp.safeStrcpy ( rx );
expandRegExShortcuts ( &tmp );
m_hasupr = true;
@@ -2299,6 +2304,10 @@ bool CollectionRec::rebuildUrlFilters ( ) {
m_hasupr = false;
}

//char *x = "http://staticpages.diffbot.com/testCrawl/article1.html";
//if(m_hasupr && regexec(&m_upr,x,0,NULL,0) ) { char *xx=NULL;*xx=0; }

return true;
}

@@ -640,7 +640,7 @@ class CollectionRec {
long m_hasucr:1;
long m_hasupr:1;

char m_diffbotOnlyProcessIfNew;
char m_diffbotOnlyProcessIfNewUrl;

//SafeBuf m_diffbotClassify;
//char m_diffbotClassify;
@@ -986,6 +986,7 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
if ( strncmp(path,"/crawlbot",9) == 0 ) n = PAGE_CRAWLBOT;
if ( strncmp(path,"/v2/crawl",9) == 0 ) n = PAGE_CRAWLBOT;
if ( strncmp(path,"/v2/bulk" ,8) == 0 ) n = PAGE_CRAWLBOT;
if ( strncmp(path,"/v2/search" ,8) == 0 ) n = PAGE_RESULTS;

bool isProxy = g_proxy.isProxy();
// . prevent coring
@@ -1096,7 +1096,7 @@ void Msg39::estimateHits ( ) {

// convenience ptrs. we will store the docids/scores into these arrays
long long *topDocIds;
float *topScores;
double *topScores;
key_t *topRecs;

// numDocIds counts docs in all tiers when using toptree.
@@ -1163,7 +1163,7 @@ void Msg39::estimateHits ( ) {
mr.ptr_clusterRecs = NULL;
// this is how much space to reserve
mr.size_docIds = 8 * numDocIds; // long long
mr.size_scores = 4 * numDocIds; // float
mr.size_scores = sizeof(double) * numDocIds; // float
// if not doing site clustering, we won't have these perhaps...
if ( m_gotClusterRecs )
mr.size_clusterRecs = sizeof(key_t) *numDocIds;
@@ -1191,7 +1191,7 @@ void Msg39::estimateHits ( ) {
return ;
}
topDocIds = (long long *) mr.ptr_docIds;
topScores = (float *) mr.ptr_scores;
topScores = (double *) mr.ptr_scores;
topRecs = (key_t *) mr.ptr_clusterRecs;
}

@@ -1225,6 +1225,8 @@ void Msg39::estimateHits ( ) {
//add it to the reply
topDocIds [docCount] = t->m_docId;
topScores [docCount] = t->m_score;
if ( m_tt.m_useIntScores )
topScores[docCount] = (double)t->m_intScore;
// supply clusterdb rec? only for full splits
if ( m_gotClusterRecs )
topRecs [docCount] = t->m_clusterRec;
Msg39.h (2 changed lines)
@@ -158,7 +158,7 @@ public:
long m_errno;

char *ptr_docIds ; // the results, long long
char *ptr_scores; ; // floats
char *ptr_scores; ; // now doubles! so we can have intScores
char *ptr_scoreInfo ; // transparency info
char *ptr_pairScoreBuf ; // transparency info
char *ptr_singleScoreBuf ; // transparency info
Msg3a.cpp (31 changed lines)
@@ -277,8 +277,8 @@ bool Msg3a::gotCacheReply ( ) {
m_docIds = (long long *)p;
p += 8 * m_numDocIds;
// scores
m_scores = (float *)p;
p += sizeof(float) * m_numDocIds;
m_scores = (double *)p;
p += sizeof(double) * m_numDocIds;
// site hashes
m_siteHashes26 = (long *)p;
p += 4 * m_numDocIds;
@@ -727,20 +727,20 @@ bool Msg3a::gotAllSplitReplies ( ) {
if ( ! m_debug ) continue;
// cast these for printing out
long long *docIds = (long long *)mr->ptr_docIds;
score_t *scores = (score_t *)mr->ptr_scores;
double *scores = (double *)mr->ptr_scores;
// print out every docid in this split reply
for ( long j = 0; j < mr->m_numDocIds ; j++ ) {
// print out score_t
logf( LOG_DEBUG,
"query: msg3a: [%lu] %03li) "
"split=%li docId=%012llu domHash=0x%02lx "
"score=%lu" ,
"score=%f" ,
(unsigned long)this ,
j ,
i ,
docIds [j] ,
(long)g_titledb.getDomHash8FromDocId(docIds[j]),
(long)scores[j] );
(float)scores[j] );
}
}

@@ -772,7 +772,7 @@ bool Msg3a::gotAllSplitReplies ( ) {
for ( long i = 0 ; i < max ; i++ )
cr.pushLongLong(m_docIds[i] );
for ( long i = 0 ; i < max ; i++ )
cr.pushFloat(m_scores[i]);
cr.pushDouble(m_scores[i]);
for ( long i = 0 ; i < max ; i++ )
cr.pushLong(getSiteHash26(i));
// sanity
@@ -849,7 +849,7 @@ bool Msg3a::mergeLists ( ) {
// . tcPtr = term count. how many required query terms does the doc
// have? formerly called topExplicits in IndexTable2.cpp
long long *diPtr [MAX_INDEXDB_SPLIT];
float *rsPtr [MAX_INDEXDB_SPLIT];
double *rsPtr [MAX_INDEXDB_SPLIT];
key_t *ksPtr [MAX_INDEXDB_SPLIT];
long long *diEnd [MAX_INDEXDB_SPLIT];
for ( long j = 0; j < m_numHosts ; j++ ) {
@@ -863,7 +863,7 @@ bool Msg3a::mergeLists ( ) {
continue;
}
diPtr [j] = (long long *)mr->ptr_docIds;
rsPtr [j] = (float *)mr->ptr_scores;
rsPtr [j] = (double *)mr->ptr_scores;
ksPtr [j] = (key_t *)mr->ptr_clusterRecs;
diEnd [j] = (long long *)(mr->ptr_docIds +
mr->m_numDocIds * 8);
@@ -919,7 +919,8 @@ bool Msg3a::mergeLists ( ) {

// . how much do we need to store final merged docids, etc.?
// . docid=8 score=4 bitScore=1 clusterRecs=key_t clusterLevls=1
long need = m_docsToGet * (8+4+sizeof(key_t)+sizeof(DocIdScore *)+1);
long need = m_docsToGet * (8+sizeof(double)+
sizeof(key_t)+sizeof(DocIdScore *)+1);
// allocate it
m_finalBuf = (char *)mmalloc ( need , "finalBuf" );
m_finalBufSize = need;
@@ -928,7 +929,7 @@ bool Msg3a::mergeLists ( ) {
// hook into it
char *p = m_finalBuf;
m_docIds = (long long *)p; p += m_docsToGet * 8;
m_scores = (float *)p; p += m_docsToGet * sizeof(float);
m_scores = (double *)p; p += m_docsToGet * sizeof(double);
m_clusterRecs = (key_t *)p; p += m_docsToGet * sizeof(key_t);
m_clusterLevels = (char *)p; p += m_docsToGet * 1;
m_scoreInfos = (DocIdScore **)p;p+=m_docsToGet*sizeof(DocIdScore *);
@@ -1078,7 +1079,7 @@ bool Msg3a::mergeLists ( ) {

// turn it into a float, that is what rscore_t is.
// we do this to make it easier for PostQueryRerank.cpp
m_scores [m_numDocIds]=(float)*rsPtr[maxj];
m_scores [m_numDocIds]=(double)*rsPtr[maxj];
if ( m_r->m_doSiteClustering )
m_clusterRecs[m_numDocIds]= *ksPtr[maxj];
// clear this out
@@ -1142,7 +1143,7 @@ bool Msg3a::mergeLists ( ) {
long Msg3a::getStoredSize ( ) {
// docId=8, scores=sizeof(rscore_t), clusterLevel=1 bitScores=1
// eventIds=1
long need = m_numDocIds * ( 8 + sizeof(rscore_t) + 1 ) +
long need = m_numDocIds * ( 8 + sizeof(double) + 1 ) +
4 + // m_numDocIds
8 ; // m_numTotalEstimatedHits (estimated # of results)
return need;
@@ -1158,8 +1159,8 @@ long Msg3a::serialize ( char *buf , char *bufEnd ) {
// store each docid, 8 bytes each
memcpy ( p , m_docIds , m_numDocIds * 8 ); p += m_numDocIds * 8;
// store scores
memcpy ( p , m_scores , m_numDocIds * sizeof(rscore_t) );
p += m_numDocIds * sizeof(rscore_t) ;
memcpy ( p , m_scores , m_numDocIds * sizeof(double) );
p += m_numDocIds * sizeof(double) ;
// store cluster levels
memcpy ( p , m_clusterLevels , m_numDocIds ); p += m_numDocIds;
// sanity check
@@ -1178,7 +1179,7 @@ long Msg3a::deserialize ( char *buf , char *bufEnd ) {
// get each docid, 8 bytes each
m_docIds = (long long *)p; p += m_numDocIds * 8;
// get scores
m_scores = (rscore_t *)p; p += m_numDocIds * sizeof(rscore_t) ;
m_scores = (double *)p; p += m_numDocIds * sizeof(double) ;
// get cluster levels
m_clusterLevels = (char *)p; p += m_numDocIds;
// sanity check
Msg3a.h (4 changed lines)
@@ -61,7 +61,7 @@ public:
// we basically turn the scores we get from each msg39 split into
// floats (rscore_t) and store them as floats so that PostQueryRerank
// has an easier time
float *getScores ( ) { return m_scores; };
double *getScores ( ) { return m_scores; };
long getNumDocIds ( ) { return m_numDocIds; };

long getSiteHash26 ( long i ) {
@@ -160,7 +160,7 @@ public:

// final merged lists go here
long long *m_docIds ;
float *m_scores ;
double *m_scores ;
class DocIdScore **m_scoreInfos ;
//key_t *m_recs ; // clusterdb recs
key_t *m_clusterRecs ;
Msg40.cpp (26 changed lines)
@@ -162,6 +162,7 @@ bool Msg40::getResults ( SearchInput *si ,
// we need this info for caching as well
//m_numGigabitInfos = 0;

m_lastHeartbeat = getTimeLocal();

//just getfrom searchinput
//.... m_catId = hr->getLong("catid",0);m_si->m_catId;
@@ -1274,6 +1275,21 @@ bool Msg40::gotSummary ( ) {
log("query: error initializing dedup table: %s",
mstrerror(g_errno));

State0 *st = (State0 *)m_state;

// keep socket alive if not streaming. like downloading csv...
long now2 = getTimeLocal();
if ( now2 - m_lastHeartbeat >= 10 && ! m_si->m_streamResults &&
// incase socket is closed and recycled for another connection
st->m_socket->m_numDestroys == st->m_numDestroys ) {
m_lastHeartbeat = now2;
int n = ::send ( st->m_socket->m_sd , " " , 1 , 0 );
log("msg40: sent heartbeat of %li bytes on sd=%li",
(long)n,(long)st->m_socket->m_sd);
}

/*
// sanity check
for ( long i = 0 ; i < m_msg3a.m_numDocIds ; i++ ) {
@@ -1295,8 +1311,6 @@ bool Msg40::gotSummary ( ) {

doAgain:

State0 *st = (State0 *)m_state;

SafeBuf *sb = &st->m_sb;

sb->reset();
@@ -1332,14 +1346,16 @@ bool Msg40::gotSummary ( ) {

// primitive deduping. for diffbot json exclude url's from the
// XmlDoc::m_contentHash32.. it will be zero if invalid i guess
if ( mr->m_contentHash32 &&
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
m_dedupTable.isInTable ( &mr->m_contentHash32 ) ) {
log("msg40: dup sum #%li",m_printi);
continue;
}

// return true with g_errno set on error
if ( mr->m_contentHash32 &&
if ( m_si && m_si->m_doDupContentRemoval && // &dr=1
mr->m_contentHash32 &&
! m_dedupTable.addKey ( &mr->m_contentHash32 ) ) {
m_hadPrintError = true;
log("msg40: error adding to dedup table: %s",
@@ -1627,6 +1643,8 @@ bool Msg40::gotSummary ( ) {
long dedupPercent = 0;
if ( m_si->m_doDupContentRemoval && m_si->m_percentSimilarSummary )
dedupPercent = m_si->m_percentSimilarSummary;
// icc=1 turns this off too i think
if ( m_si->m_includeCachedCopy ) dedupPercent = 0;
// if the user only requested docids, we have no summaries
if ( m_si->m_docIdsOnly ) dedupPercent = 0;
Msg40.h (2 changed lines)
@@ -201,6 +201,8 @@ class Msg40 {
// Msg39 and all Msg20s must use the same clock timestamp
time_t m_nowUTC;

long m_lastHeartbeat;

bool printSearchResult9 ( long ix ) ;

HashTableX m_dedupTable;
@@ -142,6 +142,14 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return true;
}

// when downloading csv socket closes because we can take minutes
// before we send over the first byte, so try to keep open
//int parm = 1;
//if(setsockopt(sock->m_sd,SOL_TCP,SO_KEEPALIVE,&parm,sizeof(int))<0){
// log("crawlbot: setsockopt: %s",mstrerror(errno));
// errno = 0;
//}

//long pathLen = hr->getPathLen();
char rdbId = RDB_NONE;
bool downloadJSON = false;
@@ -203,13 +211,20 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {

// . if doing download of json, make it search results now!
// . if doing download of csv, make it search results now!
// . make an httprequest on stack and call it
if ( fmt == FMT_CSV && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
sb2.safePrintf("GET /search.csv?icc=1&format=csv&sc=0&dr=0&"
"c=%s&n=1000000&"
sb2.safePrintf("GET /search.csv?icc=1&format=csv&sc=0&"
// dedup. since stream=1 and pss=0 below
// this will dedup on page content hash only
// which is super fast.
"dr=1&"
"c=%s&n=1000000&"
// no summary similarity dedup, only exact
// doc content hash. otherwise too slow!!
"pss=0&"
// no gigabits
"dsrt=0&"
// do not compute summary. 0 lines.
@@ -224,6 +239,39 @@ bool sendBackDump ( TcpSocket *sock, HttpRequest *hr ) {
return sendPageResults ( sock , &hr2 );
}

// . if doing download of json, make it search results now!
// . make an httprequest on stack and call it
if ( fmt == FMT_JSON && rdbId == RDB_TITLEDB ) {
char tmp2[5000];
SafeBuf sb2(tmp2,5000);
sb2.safePrintf("GET /search.csv?icc=1&format=json&sc=0&"
// dedup. since stream=1 and pss=0 below
// this will dedup on page content hash only
// which is super fast.
"dr=1&"
"c=%s&n=1000000&"
// we can stream this because unlink csv it
// has no header row that needs to be
// computed from all results.
"stream=1&"
// no summary similarity dedup, only exact
// doc content hash. otherwise too slow!!
"pss=0&"
// no gigabits
"dsrt=0&"
// do not compute summary. 0 lines.
"ns=0&"
"q=gbsortby%%3Agbspiderdate&"
"prepend=type%%3Ajson"
"\r\n\r\n"
, cr->m_coll
);
HttpRequest hr2;
hr2.set ( sb2.getBufStart() , sb2.length() , sock );
return sendPageResults ( sock , &hr2 );
}

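For reference, with a hypothetical collection name of "testcoll", the JSON branch above ends up synthesizing an internal request roughly like this (the %s is filled with the collection name and the doubled %% collapses to a literal percent sign):

    GET /search.csv?icc=1&format=json&sc=0&dr=1&c=testcoll&n=1000000&stream=1&pss=0&dsrt=0&ns=0&q=gbsortby%3Agbspiderdate&prepend=type%3Ajson

The request is then handed straight to sendPageResults(), so the bulk download is served by the normal result-streaming path.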
//if ( strncmp ( path ,"/crawlbot/downloadurls",22 ) == 0 )
// rdbId = RDB_SPIDERDB;
@@ -596,9 +644,9 @@ bool StateCD::sendList ( ) {

// TcpServer.cpp calls this when done sending TcpSocket's m_sendBuf
void doneSendingWrapper ( void *state , TcpSocket *sock ) {

StateCD *st = (StateCD *)state;

// error on socket?
//if ( g_errno ) st->m_socketError = g_errno;
//TcpSocket *socket = st->m_socket;
st->m_accumulated += sock->m_totalSent;

@@ -2280,7 +2328,7 @@ bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) {
, cx->m_maxToCrawl
, cx->m_maxToProcess
, (long)cx->m_restrictDomain
, (long)cx->m_diffbotOnlyProcessIfNew
, (long)cx->m_diffbotOnlyProcessIfNewUrl
);
sb.safePrintf("\"seeds\":\"");
sb.safeUtf8ToJSON ( cx->m_diffbotSeeds.getBufStart());
@@ -3038,16 +3086,20 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
" "

// newest json on top of results, last 10 mins
"<a href=/search?icc=1&format=json&sc=0&dr=0&"
"<a href=/search?icc=1&format=json&"
// disable site clustering
"sc=0&"
// dodupcontentremoval:
"dr=1&"
"c=%s&n=10000000&rand=%llu&scores=0&id=1&"
"stream=1&" // stream results back as we get them
"q="
// put NEWEST on top
"gbsortby%%3Agbspiderdate+"
"gbsortbyint%%3Agbspiderdate+"
// min spider date = now - 10 mins
"gbmin%%3Agbspiderdate%%3A%li&"
"debug=1"
//"prepend=type%%3Ajson"
"gbminint%%3Agbspiderdate%%3A%li&"
//"debug=1"
"prepend=type%%3Ajson"
">"
"json search (last 30 seconds)</a>"

@@ -3260,7 +3312,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,

char *isNewYes = "";
char *isNewNo = " checked";
if ( cr->m_diffbotOnlyProcessIfNew ) {
if ( cr->m_diffbotOnlyProcessIfNewUrl ) {
isNewYes = " checked";
isNewNo = "";
}
@@ -451,6 +451,9 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
// set this in case SearchInput::set fails!
st->m_socket = s;

// save this count so we know if TcpServer.cpp calls destroySocket(s)
st->m_numDestroys = s->m_numDestroys;

// . parse it up
// . this returns false and sets g_errno and, maybe, g_msg on error
SearchInput *si = &st->m_si;
@@ -501,7 +504,6 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
log ( LOG_DEBUG , "query: Getting search results for q=%s",
st->m_si.m_displayQuery);

st->m_socket = s;
// assume we'll block
st->m_gotResults = false;
st->m_gotAds = false;
@@ -936,7 +938,7 @@ bool printSearchResultsHeader ( State0 *st ) {
SafeBuf *sb = &st->m_sb;
// reserve 1.5MB now!
if ( ! sb->reserve(1500000 ,"pgresbuf" ) ) // 128000) )
return true;
return false;
// just in case it is empty, make it null terminated
sb->nullTerm();

@@ -982,7 +984,8 @@ bool printSearchResultsHeader ( State0 *st ) {
log("query: Query failed. Had error processing query: %s",
mstrerror(st->m_errno));
g_errno = st->m_errno;
return sendReply(st,sb->getBufStart());
//return sendReply(st,sb->getBufStart());
return false;
}

@@ -1077,7 +1080,7 @@ bool printSearchResultsHeader ( State0 *st ) {
// si->m_boolFlag,
// true ); // keepAllSingles?

if ( g_errno ) return sendReply (st,NULL);
if ( g_errno ) return false;//sendReply (st,NULL);

DocIdScore *dpx = NULL;
if ( numResults > 0 ) dpx = msg40->getScoreInfo(0);
@@ -2089,6 +2092,23 @@ bool printResult ( State0 *st, long ix ) {
// just print cached web page?
if ( mr->ptr_content ) {
sb->safeStrcpy ( mr->ptr_content );
// . let's hack the spidertime onto the end
// . so when we sort by that using gbsortby:spiderdate
// we can ensure it is ordered correctly
char *end = sb->getBuf() -1;
if ( si->m_format == FORMAT_JSON &&
end > sb->getBufStart() &&
*end == '}' ) {
// replace trailing } with spidertime}
sb->incrementLength(-1);
// crap, we lose resolution storing as a float
// so fix that shit here...
//float f = mr->m_lastSpidered;
//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
sb->safePrintf(",\"lastCrawlTimeUTC\":%li}",
mr->m_lastSpidered);
}

//mr->size_content );
if ( si->m_format == FORMAT_HTML )
sb->safePrintf("\n\n<br><br>\n\n");

@@ -12,6 +12,11 @@ public:
// store results page in this safebuf
SafeBuf m_sb;

// if socket closes before we get a chance to send back
// search results, we will know by comparing this to
// m_socket->m_numDestroys
long m_numDestroys;

collnum_t m_collnum;
Query m_q;
SearchInput m_si;
Parms.cpp (14 changed lines)
@@ -9299,7 +9299,7 @@ void Parms::init ( ) {
m->m_cgi = "onlyProcessIfNew";
m->m_xml = "diffbotOnlyProcessIfNew";
m->m_title = "onlyProcessIfNew";
m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNew - x;
m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNewUrl - x;
m->m_type = TYPE_BOOL;
m->m_page = PAGE_NONE;
m->m_def = "1";
@@ -11611,9 +11611,9 @@ void Parms::init ( ) {

m->m_title = "percent similar dedup summary";
m->m_desc = "If document summary is this percent similar "
"to a document summary above it, then remove it from the search "
"results. 100 means only to remove if exactly the same. 0 means"
" no summary deduping.";
"to a document summary above it, then remove it from the "
"search results. 100 means only to remove if exactly the "
"same. 0 means no summary deduping.";
m->m_cgi = "psds";
m->m_off = (char *)&cr.m_percentSimilarSummary - x;
m->m_soff = (char *)&si.m_percentSimilarSummary - y;
@@ -18903,13 +18903,17 @@ bool Parms::updateParm ( char *rec , WaitEntry *we ) {
if ( strcmp ( val1.getBufStart() , val2.getBufStart() ) == 0 )
return true;

char *coll = "";
if ( cr ) coll = cr->m_coll;

// show it
log("parms: updating parm \"%s\" "
"(%s[%li]) (collnum=%li) from \"%s\" -> \"%s\"",
"(%s[%li]) (collnum=%li) (coll=%s) from \"%s\" -> \"%s\"",
parm->m_title,
parm->m_cgi,
occNum,
(long)collnum,
coll,
val1.getBufStart(),
val2.getBufStart());
Posdb.cpp (69 changed lines)
@@ -4118,11 +4118,16 @@ bool PosdbTable::setQueryTermInfo ( ) {

// assume not sorting by a numeric termlist
m_sortByTermNum = -1;
m_sortByTermNumInt = -1;

// now we have score ranges for gbmin:price:1.99 etc.
m_minScoreTermNum = -1;
m_maxScoreTermNum = -1;

// for gbminint:count:99 etc.
m_minScoreTermNumInt = -1;
m_maxScoreTermNumInt = -1;

//for ( long i = 0 ; i < m_msg2->getNumLists() ; i++ ) {
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
QueryTerm *qt = &m_q->m_qterms[i];
@@ -4141,6 +4146,14 @@ bool PosdbTable::setQueryTermInfo ( ) {
if ( qt->m_fieldCode == FIELD_GBSORTBY ||
qt->m_fieldCode == FIELD_GBREVSORTBY )
m_sortByTermNum = i;

if ( qt->m_fieldCode == FIELD_GBSORTBYINT ||
qt->m_fieldCode == FIELD_GBREVSORTBYINT ) {
m_sortByTermNumInt = i;
// tell topTree to use int scores
m_topTree->m_useIntScores = true;
}

// is it gbmin:price:1.99?
if ( qt->m_fieldCode == FIELD_GBNUMBERMIN ) {
m_minScoreTermNum = i;
@@ -4150,6 +4163,14 @@ bool PosdbTable::setQueryTermInfo ( ) {
m_maxScoreTermNum = i;
m_maxScoreVal = qt->m_qword->m_float;
}
if ( qt->m_fieldCode == FIELD_GBNUMBERMININT ) {
m_minScoreTermNumInt = i;
m_minScoreValInt = qt->m_qword->m_int;
}
if ( qt->m_fieldCode == FIELD_GBNUMBERMAXINT ) {
m_maxScoreTermNumInt = i;
m_maxScoreValInt = qt->m_qword->m_int;
}
// count
long nn = 0;
// also add in bigram lists
@@ -4277,6 +4298,15 @@ bool PosdbTable::setQueryTermInfo ( ) {
if (qt->m_fieldCode == FIELD_GBNUMBERMAX )
qti->m_bigramFlags[nn]|=BF_NUMBER;

if (qt->m_fieldCode == FIELD_GBSORTBYINT )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBREVSORTBYINT )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBNUMBERMININT )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBNUMBERMAXINT )
qti->m_bigramFlags[nn]|=BF_NUMBER;

// only really add if useful
// no, because when inserting NEW (related) terms that are
// not currently in the document, this list may initially
@@ -5295,6 +5325,7 @@ void PosdbTable::intersectLists10_r ( ) {
char siteRank =0;
char docLang =0;
float score;
long intScore;
float minScore;
float minPairScore;
float minSingleScore;
@@ -5365,6 +5396,7 @@ void PosdbTable::intersectLists10_r ( ) {

// do not do it if we got a gbsortby: field
if ( m_sortByTermNum >= 0 ) nnn = 0;
if ( m_sortByTermNumInt >= 0 ) nnn = 0;

/*
// skip all this if getting score of just one docid on special
@@ -5653,6 +5685,7 @@ void PosdbTable::intersectLists10_r ( ) {
pass0++;

if ( m_sortByTermNum >= 0 ) goto skipScoringFilter;
if ( m_sortByTermNumInt >= 0 ) goto skipScoringFilter;

// test why we are slow
//if ( (s_sss++ % 8) != 0 ) { docIdPtr += 6; fail0++; goto docIdLoop;}
@@ -6493,11 +6526,18 @@ void PosdbTable::intersectLists10_r ( ) {
score = g_posdb.getFloat ( miniMergedList[m_sortByTermNum] );
}

if ( m_sortByTermNumInt >= 0 ) {
// no term?
if ( ! miniMergedList[m_sortByTermNumInt] ) goto advance;
intScore = g_posdb.getInt( miniMergedList[m_sortByTermNumInt]);
}

// skip docid if outside of range
if ( m_minScoreTermNum >= 0 ) {
// no term?
if ( ! miniMergedList[m_minScoreTermNum] ) goto advance;
float score2 = g_posdb.getFloat ( miniMergedList[m_minScoreTermNum] );
float score2 ;
score2= g_posdb.getFloat ( miniMergedList[m_minScoreTermNum] );
if ( score2 < m_minScoreVal ) goto advance;
}

@@ -6505,10 +6545,29 @@ void PosdbTable::intersectLists10_r ( ) {
if ( m_maxScoreTermNum >= 0 ) {
// no term?
if ( ! miniMergedList[m_maxScoreTermNum] ) goto advance;
float score2 = g_posdb.getFloat ( miniMergedList[m_maxScoreTermNum] );
float score2 ;
score2= g_posdb.getFloat ( miniMergedList[m_maxScoreTermNum] );
if ( score2 > m_maxScoreVal ) goto advance;
}

// skip docid if outside of range
if ( m_minScoreTermNumInt >= 0 ) {
// no term?
if ( ! miniMergedList[m_minScoreTermNumInt] ) goto advance;
long score3;
score3=g_posdb.getInt(miniMergedList[m_minScoreTermNumInt]);
if ( score3 < m_minScoreValInt ) goto advance;
}

// skip docid if outside of range
if ( m_maxScoreTermNumInt >= 0 ) {
// no term?
if ( ! miniMergedList[m_maxScoreTermNumInt] ) goto advance;
long score3 ;
score3= g_posdb.getInt ( miniMergedList[m_maxScoreTermNumInt]);
if ( score3 > m_maxScoreValInt ) goto advance;
}

// . seoDebug hack so we can set "dcs"
// . we only come here if we actually made it into m_topTree
@@ -6606,6 +6665,12 @@ void PosdbTable::intersectLists10_r ( ) {
// set the score and docid ptr
t->m_score = score;
t->m_docId = m_docId;
// use an integer score like lastSpidered timestamp?
if ( m_sortByTermNumInt >= 0 ) {
t->m_intScore = intScore;
t->m_score = 0.0;
if ( ! m_topTree->m_useIntScores){char *xx=NULL;*xx=0;}
}
// . this will not add if tree is full and it is less than the
// m_lowNode in score
// . if it does get added to a full tree, lowNode will be
Posdb.h (15 changed lines)
@@ -208,10 +208,16 @@ class Posdb {
void setFloat ( void *vkp , float f ) {
*(float *)(((char *)vkp) + 2) = f; };

void setInt ( void *vkp , long x ) {
*(long *)(((char *)vkp) + 2) = x; };

// and read the float as well
float getFloat ( void *vkp ) {
return *(float *)(((char *)vkp) + 2); };

long getInt ( void *vkp ) {
return *(long *)(((char *)vkp) + 2); };

void setAlignmentBit ( void *vkp , char val ) {
char *p = (char *)vkp;
if ( val ) p[1] = p[1] | 0x02;
@@ -610,6 +616,7 @@ class PosdbTable {

// for gbsortby:item.price ...
long m_sortByTermNum;
long m_sortByTermNumInt;

// for gbmin:price:1.99
long m_minScoreTermNum;
@@ -619,6 +626,14 @@ class PosdbTable {
float m_minScoreVal;
float m_maxScoreVal;

// for gbmin:count:99
long m_minScoreTermNumInt;
long m_maxScoreTermNumInt;

// for gbmin:count:99
long m_minScoreValInt;
long m_maxScoreValInt;

// the new intersection/scoring algo
void intersectLists10_r ( );
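The new setInt()/getInt() accessors above overlay a 32-bit integer score on the same two-byte offset of the key that setFloat()/getFloat() use, so a posdb key can carry either a float score or an integer score (for example a spider timestamp) in the same slot. A minimal standalone sketch of that layout, with a made-up stand-in for the real key type (the 18-byte size and the names here are assumptions for illustration, not taken from this diff):

    #include <cstdint>
    #include <cstring>

    // hypothetical stand-in for a posdb key buffer; the real layout lives in Posdb.h
    struct FakePosdbKey { char bytes[18]; };

    // mirrors Posdb::setInt()/getInt(): the numeric score overlays byte offset 2
    // (int32_t here; the original code stores a 4-byte long)
    static void setIntScore ( FakePosdbKey *k , int32_t x ) {
            memcpy ( k->bytes + 2 , &x , sizeof(x) );
    }
    static int32_t getIntScore ( const FakePosdbKey *k ) {
            int32_t x ; memcpy ( &x , k->bytes + 2 , sizeof(x) ) ; return x ;
    }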
Query.cpp (78 changed lines)
@@ -2187,6 +2187,11 @@ bool Query::setQWords ( char boolFlag ,
if ( fieldCode == FIELD_GBNUMBERMAX )
ph = hash64 ("gbsortby", 8);

if ( fieldCode == FIELD_GBNUMBERMININT )
ph = hash64 ("gbsortbyint", 11);
if ( fieldCode == FIELD_GBNUMBERMAXINT )
ph = hash64 ("gbsortbyint", 11);

// ptr to field, if any

qw->m_fieldCode = fieldCode;
@@ -2213,8 +2218,14 @@ bool Query::setQWords ( char boolFlag ,
// gbmin:price:1.23
fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX ||

fieldCode == FIELD_GBSORTBYINT ||
fieldCode == FIELD_GBREVSORTBYINT ||
fieldCode == FIELD_GBNUMBERMININT ||
fieldCode == FIELD_GBNUMBERMAXINT ||

fieldCode == FIELD_GBAD ) {
// . find first space -- that terminates the field value
// . find 1st space -- that terminates the field value
// . make "end" point to the end of the entire query
char *end =
(words.m_words[words.m_numWords-1] +
@@ -2222,13 +2233,14 @@ bool Query::setQWords ( char boolFlag ,
// use this for gbmin:price:1.99 etc.
long firstColonLen = -1;
// "w" points to the first alnumword after the field,
// so for site:xyz.com "w" points to the 'x' and wlen would
// be 3 in that case sinze xyz is a word of 3 chars. so advance
// so for site:xyz.com "w" points to the 'x' and wlen
// would be 3 in that case sinze xyz is a word of 3
// chars. so advance
// wlen until we hit a space.
while ( w + wlen < end ) {
// stop at first white space
if ( is_wspace_utf8(w+wlen) ) break;
// in the case of gbmin:price:1.99 record first ':'
// in case of gbmin:price:1.99 record first ':'
if ( w[wlen]==':' ) firstColonLen = wlen;
wlen++;
}
@@ -2238,21 +2250,28 @@ bool Query::setQWords ( char boolFlag ,
unsigned long long wid = hash64 ( w , wlen, 0LL );

// i've decided not to make
// gbsortby:products.offerPrice gbmin:price:1.23 case insensitive
// gbsortby:products.offerPrice
// gbmin:price:1.23 case insensitive
if ( fieldCode == FIELD_GBSORTBY ||
fieldCode == FIELD_GBREVSORTBY )
fieldCode == FIELD_GBREVSORTBY ||
fieldCode == FIELD_GBSORTBYINT ||
fieldCode == FIELD_GBREVSORTBYINT )
wid = hash64Lower_utf8 ( w , wlen , 0LL );

// gbmin:price:1.23
if ( firstColonLen>0 &&
( fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX ) ) {
fieldCode == FIELD_GBNUMBERMAX ||
fieldCode == FIELD_GBNUMBERMININT ||
fieldCode == FIELD_GBNUMBERMAXINT ) ) {
// record the field
wid = hash64Lower_utf8 ( w , firstColonLen , 0LL );
wid = hash64Lower_utf8(w,firstColonLen , 0LL );
// and also the floating point after that
qw->m_float = atof ( w + firstColonLen + 1 );
qw->m_int = (long)atoll( w + firstColonLen+1);
}

// should we have normalized before hashing?
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_GBPARENTURL ||
@@ -3078,9 +3097,12 @@ struct QueryField g_fields[] = {
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""},
{"gbcontenthash", FIELD_GBCONTENTHASH, false,""},
{"gbsortby", FIELD_GBSORTBY, false,"Example: gbsortby:price. Fields can be "

{"gbsortby", FIELD_GBSORTBY, false,
"Example: gbsortby:price. Fields can be "
"in JSON or in meta tag."},
{"gbrevsortby", FIELD_GBREVSORTBY, false,"Example: gbrevsortby:item.price . "
{"gbrevsortby", FIELD_GBREVSORTBY, false,
"Example: gbrevsortby:item.price . "
"Fields can be in JSON or in meta tag."},

// gbmin:price:1.23
@@ -3088,6 +3110,38 @@ struct QueryField g_fields[] = {
"fields can be in JSON or in meta tag."},
{"gbmax", FIELD_GBNUMBERMAX, false,"Usage: gbmax:price:1.99"},

{"gbsortbyint", FIELD_GBSORTBYINT, false,
"Example: gbsortbyint:intfield . Fields can be "
"in JSON or in meta tag. "
"Use 'gbspiderdate' field for the last time the page was "
"spidered in seconds since the epoch in UTC."
},

{"gbrevsortbyint", FIELD_GBREVSORTBYINT, false,
"Example: 'gbrevsortbyint:item.count'. "
"Fields can be in JSON or in meta tag. "
"Use 'gbspiderdate' field for the last time the page was "
"spidered in seconds since the epoch in UTC."
},

{"gbminint", FIELD_GBNUMBERMININT, false,
"Example: 'gbminint:spiderdate:1391749680' "
"'gbminint:count:99'. Numeric "
"fields can be in JSON or in meta tag. "
"Use 'gbspiderdate' field for the last time the page was "
"spidered in seconds since the epoch in UTC."
},

{"gbmaxint", FIELD_GBNUMBERMAXINT, false,
"Example: 'gbmaxint:spiderdate:1391749680' "
"'gbmaxint:count:99'. Numeric "
"fields can be in JSON or in meta tag. "
"Use 'gbspiderdate' field for the last time the page was "
"spidered in seconds since the epoch in UTC."
},

{"gbcountry",FIELD_GBCOUNTRY,false,""},
{"gbad",FIELD_GBAD,false,""},

@@ -3108,7 +3162,9 @@ struct QueryField g_fields[] = {

{"gbpermalink",FIELD_GBPERMALINK,false,""},
//{"gbcsenum",FIELD_GBCSENUM,false,""},
{"gbparenturl", FIELD_GBPARENTURL, true,"Match the json urls that were extract from this parent url. Example: gbparenturl:www.gigablast.com/addurl.htm"},
{"gbparenturl", FIELD_GBPARENTURL, true,"Match the json urls that "
"were extract from this parent url. Example: "
"gbparenturl:www.gigablast.com/addurl.htm"},
{"gbdocid",FIELD_GBDOCID,false,"restrict results to this docid"}

};
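Taken together, the gbsortbyint/gbrevsortbyint and gbminint/gbmaxint entries above mirror the existing float operators but keep full integer resolution. A rough usage sketch built only from the examples in this hunk (the epoch value is the one used in the help text; any other integer field such as item.count works the same way):

    gbsortbyint:gbspiderdate gbminint:gbspiderdate:1391749680 type:json

Such a query sorts newest-first on the spider timestamp, drops anything spidered before the given UTC epoch second, and, via the type:json term that the crawlbot pages prepend, restricts results to JSON documents.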
Query.h (8 changed lines)
@@ -110,6 +110,12 @@ typedef unsigned long long qvec_t;
#define FIELD_GBNUMBERMAX 57
#define FIELD_GBPARENTURL 58

#define FIELD_GBSORTBYINT 59
#define FIELD_GBREVSORTBYINT 60
#define FIELD_GBNUMBERMININT 61
#define FIELD_GBNUMBERMAXINT 62

#define FIELD_GBOTHER 92

// returns a FIELD_* code above, or FIELD_GENERIC if not in the list
@@ -365,6 +371,8 @@ class QueryWord {

// for min/max score ranges like gbmin:price:1.99
float m_float;
// for gbminint:99 etc. uses integers instead of floats for better res
long m_int;
};

// . we filter the QueryWords and turn them into QueryTerms
@@ -2144,6 +2144,8 @@ long long RdbBase::getNumTotalRecs ( ) {
numNegativeRecs += m_tree->getNumNegativeKeys(m_collnum);
}
else {
// i've seen this happen when adding a new coll i guess
if ( ! m_buckets ) return 0;
//these routines are slow because they count every time.
numPositiveRecs += m_buckets->getNumKeys(m_collnum);
//numPositiveRecs += m_buckets->getNumPositiveKeys(m_collnum);

RdbCache.cpp (11 changed lines)
@@ -983,8 +983,11 @@ bool RdbCache::deleteRec ( ) {
// find the key even after going through all the records
// I think that the data here is corrupted or not pointed right

// collnum can be 0 in case we have to go to next buffer
if ( collnum != 0 && ( collnum >= m_maxColls || collnum < 0
// . collnum can be 0 in case we have to go to next buffer
// . allow -1 collnum to exist, seems to happen in robots.txt cache
// sometimes, maybe for delete collnum... not sure, but the timestamp
// seems to be legit
if ( collnum >= m_maxColls || collnum < -1
// we now call ::reset(oldcollnum)
// when resetting a collection in
// Collectiondb::resetColl() which calls
@@ -993,7 +996,7 @@ bool RdbCache::deleteRec ( ) {
// and then we nuke the collrec so it was
// triggering this. so check m_ptrs[i]==-1
//|| !g_collectiondb.m_recs[collnum]
) ) {
) {
log (LOG_WARN,"db: cache: deleteRec: possible "
"corruption, start=%lx collNum=%li "
"maxCollNum=%li dbname=%s", (long)start,
@@ -1002,7 +1005,7 @@ bool RdbCache::deleteRec ( ) {
char *xx=NULL;*xx=0;
// exception for gourav's bug (dbname=Users)
// i am tired of it craping out every 2-3 wks
if ( m_dbname[0]=='U' ) return true;
//if ( m_dbname[0]=='U' ) return true;
// some records might have been deleted
m_needsSave = true;
// but its corrupt so don't save to disk
@@ -220,6 +220,15 @@ bool SafeBuf::pushFloat ( float i) {
return true;
}

bool SafeBuf::pushDouble ( double i) {
if ( m_length + (long)sizeof(double) > m_capacity )
if(!reserve(sizeof(double)))
return false;
*(double *)(m_buf+m_length) = i;
m_length += sizeof(double);
return true;
}

long SafeBuf::popLong ( ) {
if ( m_length < 4 ) { char *xx=NULL;*xx=0; }
long ret = *(long *)(m_buf+m_length-4);

@@ -306,6 +306,7 @@ struct SafeBuf {
bool pushLong (long i);
bool pushLongLong (long long i);
bool pushFloat (float i);
bool pushDouble (double i);
long popLong();
float popFloat();
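Since Msg3a's cache path now calls pushDouble() instead of pushFloat() when serializing scores, a minimal usage sketch of the new call follows (a hypothetical helper; the SafeBuf API is as declared in this diff and the include path is assumed):

    #include "SafeBuf.h"

    // store n double scores into a SafeBuf, the way Msg3a::gotAllSplitReplies()
    // does with cr.pushDouble(m_scores[i]); returns false if a reserve fails
    static bool storeScores ( SafeBuf *cr , double *scores , long n ) {
            for ( long i = 0 ; i < n ; i++ )
                    if ( ! cr->pushDouble ( scores[i] ) ) return false ;
            return true ;
    }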
Spider.cpp (60 changed lines)
@@ -989,6 +989,8 @@ void SpiderCache::reset ( ) {
}

SpiderColl *SpiderCache::getSpiderCollIffNonNull ( collnum_t collnum ) {
// "coll" must be invalid
if ( collnum < 0 ) return NULL;
// shortcut
CollectionRec *cr = g_collectiondb.m_recs[collnum];
// empty?
@@ -1010,6 +1012,8 @@ bool tryToDeleteSpiderColl ( SpiderColl *sc ) {
// . get SpiderColl for a collection
// . if it is NULL for that collection then make a new one
SpiderColl *SpiderCache::getSpiderColl ( collnum_t collnum ) {
// "coll" must be invalid
if ( collnum < 0 ) return NULL;
// return it if non-NULL
//if ( m_spiderColls [ collnum ] ) return m_spiderColls [ collnum ];
// if spidering disabled, do not bother creating this!
@@ -3987,20 +3991,24 @@ bool SpiderColl::scanListForWinners ( ) {
spiderTimeMS ,
uh48 );

// if this url is already in the winnerTree then either we replace it
// or we skip ourselves.
// if this url is already in the winnerTree then either we
// replace it or we skip ourselves.
//
// watch out for dups in winner tree, the same url can have multiple
// spiderTimeMses somehow... i guess it could have different hop counts
// watch out for dups in winner tree, the same url can have
// multiple spiderTimeMses somehow... i guess it could have
// different hop counts
// as well, resulting in different priorities...
// actually the dedup table could map to a priority and a node so
// we can kick out a lower priority version of the same url...
// actually the dedup table could map to a priority and a node
// so we can kick out a lower priority version of the same url.
long winSlot = m_winnerTable.getSlot ( &uh48 );
if ( winSlot >= 0 ) {
key192_t *oldwk ;
oldwk = (key192_t *)m_winnerTable.getDataFromSlot ( winSlot );
oldwk = (key192_t *)m_winnerTable.
getDataFromSlot ( winSlot );
// are we lower priority? (or equal)
if(KEYCMP((char *)&wk,(char *)oldwk,sizeof(key192_t))<=0) continue;
if(KEYCMP((char *)&wk,(char *)oldwk,
sizeof(key192_t))<=0)
continue;
// from table too. no it's a dup uh48!
//m_winnerTable.deleteKey ( &uh48 );
// otherwise we supplant it. remove old key from tree.
@@ -4014,7 +4022,8 @@ bool SpiderColl::scanListForWinners ( ) {
// only put 40 urls from the same firstIp into doledb if
// we have a lot of urls in our spiderdb already.
if ( m_totalBytesScanned < 200000 ) maxWinners = 1;
// sanity. make sure read is somewhat hefty for our maxWinners=1 thing
// sanity. make sure read is somewhat hefty for our
// maxWinners=1 thing
if ( (long)SR_READ_SIZE < 500000 ) { char *xx=NULL;*xx=0; }

@@ -4086,13 +4095,15 @@ bool SpiderColl::scanListForWinners ( ) {
//}
*/

// . add to table which allows us to ensure same url not repeated in tree
// . add to table which allows us to ensure same url not
// repeated in tree
// . just skip if fail to add...
if ( m_winnerTable.addKey ( &uh48 , &wk ) < 0 ) continue;

// use an individually allocated buffer for each spiderrequest so if
// it gets removed from tree the memory can be freed by the tree
// which "owns" the data because m_winnerTree.set() above set ownsData
// use an individually allocated buffer for each spiderrequest
// so if it gets removed from tree the memory can be freed by
// the tree which "owns" the data because m_winnerTree.set()
// above set ownsData
// to true above.
long need = sreq->getRecSize();
char *newMem = (char *)mdup ( sreq , need , "sreqbuf" );
@@ -4105,7 +4116,8 @@ bool SpiderColl::scanListForWinners ( ) {
need );

// sanity
//SpiderRequest *sreq2 = (SpiderRequest *)m_winnerTree.getData ( nn );
//SpiderRequest *sreq2 = (SpiderRequest *)m_winnerTree.
//getData ( nn );

// set new tail priority and time for next compare
if ( m_winnerTree.getNumUsedNodes() >= maxWinners ) {
@@ -4299,10 +4311,11 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {

// ok, all done if nothing to add to doledb. i guess we were misled
// that firstIp had something ready for us. maybe the url filters
// table changed to filter/ban them all. if a new request/reply comes in for
// this firstIp then it will re-add an entry to waitingtree and we will
// re-scan spiderdb. if we had something to spider but it was in the future
// the m_minFutureTimeMS will be non-zero, and we deal with that below...
// table changed to filter/ban them all. if a new request/reply comes
// in for this firstIp then it will re-add an entry to waitingtree and
// we will re-scan spiderdb. if we had something to spider but it was
// in the future the m_minFutureTimeMS will be non-zero, and we deal
// with that below...
if ( m_winnerTree.isEmpty() && ! m_minFutureTimeMS ) {
// if we received new incoming requests while we were
// scanning, which is happening for some crawls, then do
@@ -4380,13 +4393,16 @@ bool SpiderColl::addWinnersIntoDoledb ( ) {
// convert to seconds from ms
winSpiderTimeMS / 1000 ,
winUh48 ,
false );
false );
// store doledb key first
if ( ! m_doleBuf.safeMemcpy ( &doleKey, sizeof(key_t) ) ) hadError = true;
if ( ! m_doleBuf.safeMemcpy ( &doleKey, sizeof(key_t) ) )
hadError = true;
// then size of spiderrequest
if ( ! m_doleBuf.pushLong ( sreq2->getRecSize() ) ) hadError = true;
if ( ! m_doleBuf.pushLong ( sreq2->getRecSize() ) )
hadError = true;
// then the spiderrequest encapsulated
if ( ! m_doleBuf.safeMemcpy ( sreq2 , sreq2->getRecSize() )) hadError=true;
if ( ! m_doleBuf.safeMemcpy ( sreq2 , sreq2->getRecSize() ))
hadError=true;
// note and error
if ( hadError ) {
log("spider: error making doledb list: %s",
@@ -901,8 +901,12 @@ TcpSocket *TcpServer::wrapSocket ( int sd , long niceness , bool isIncoming ) {
//sleep(10000);
return NULL;
}
// save this i guess
long saved = s->m_numDestroys;
// clear it
memset ( s , 0 , sizeof(TcpSocket) );
// restore
s->m_numDestroys = saved;
// store sd in our TcpSocket
s->m_sd = sd;
// store the last action time as now (used for timeout'ing sockets)
@@ -1852,6 +1856,14 @@ void TcpServer::destroySocket ( TcpSocket *s ) {
if ( s->m_isIncoming ) m_numIncomingUsed--;
// clear it, this means no longer in use
s->m_startTime = 0LL;

// count # of destroys in case a function is still referencing
// this socket and streaming back data on it or something. it won't
// know we've destroyed it? we do call makeCallback before
// calling destroySocket() it seems, but that might not help
// for Msg40.cpp sending back search results.
s->m_numDestroys++;

// free TcpSocket from the array
//mfree ( s , sizeof(TcpSocket) ,"TcpServer");
m_tcpSockets [ sd ] = NULL;

@@ -75,6 +75,8 @@ class TcpSocket {
// userid that is logged in
//long m_userId32;

long m_numDestroys;

// . getMsgPiece() is called when we need more to send
char *m_sendBuf;
long m_sendBufSize;
TopTree.cpp (52 changed lines)
@@ -36,6 +36,7 @@ TopTree::~TopTree() { reset(); }
void TopTree::reset ( ) {
if ( m_nodes ) mfree(m_nodes,m_allocSize,"TopTree");
m_nodes = NULL;
m_useIntScores = false;
//m_sampleVectors = NULL;
m_numNodes = 0;
m_numUsedNodes = 0;
@@ -200,9 +201,18 @@ bool TopTree::addNode ( TopNode *t , long tnn ) {
if ( m_vcount >= m_docsWanted ) {
long i = m_lowNode;

if ( t->m_score < m_nodes[i].m_score ) {
m_kickedOutDocIds = true; return false; }
if ( t->m_score > m_nodes[i].m_score ) goto addIt;
if ( m_useIntScores ) {
if ( t->m_intScore < m_nodes[i].m_intScore ) {
m_kickedOutDocIds = true; return false; }
if ( t->m_intScore > m_nodes[i].m_intScore) goto addIt;
}

else {
if ( t->m_score < m_nodes[i].m_score ) {
m_kickedOutDocIds = true; return false; }
if ( t->m_score > m_nodes[i].m_score ) goto addIt;
}

// . finally, compare docids, store lower ones first
// . docids should not tie...
if ( t->m_docId >= m_nodes[i].m_docId ) {
@@ -243,11 +253,23 @@ bool TopTree::addNode ( TopNode *t , long tnn ) {
// . if a node exists with our key then do NOT replace it
else while ( i >= 0 ) {
iparent = i;

// . compare to the ith node
if ( t->m_score < m_nodes[i].m_score ) {
i = LEFT(i); dir = 0; continue; }
if ( t->m_score > m_nodes[i].m_score ) {
i = RIGHT(i); dir = 1; continue; }
if ( m_useIntScores ) {
if ( t->m_intScore < m_nodes[i].m_intScore ) {
i = LEFT(i); dir = 0; continue; }
if ( t->m_intScore > m_nodes[i].m_intScore ) {
i = RIGHT(i); dir = 1; continue; }

}
else {
if ( t->m_score < m_nodes[i].m_score ) {
i = LEFT(i); dir = 0; continue; }
if ( t->m_score > m_nodes[i].m_score ) {
i = RIGHT(i); dir = 1; continue; }
}

// . finally, compare docids, store lower ones first
// . docids should not tie...
if ( t->m_docId > m_nodes[i].m_docId ) {
@@ -293,7 +315,13 @@ bool TopTree::addNode ( TopNode *t , long tnn ) {
// . WARNING: if t->m_score is fractional, the fraction will be
// dropped and could result in the lower scoring of the two docids
// being kept.
uint32_t cs = ((uint32_t)t->m_score);
uint32_t cs ;

if ( m_useIntScores )
cs = (uint32_t) t->m_intScore;
else
cs = ((uint32_t)t->m_score);

key_t k;
k.n1 = domHash << 24; // 1 byte domHash
//k.n1 |= (t->m_bscore & ~0xc0) << 16; // 1 byte bscore
@@ -421,7 +449,13 @@ bool TopTree::addNode ( TopNode *t , long tnn ) {
// WARNING: if t->m_score is fractional, the fraction will be
// dropped and could result in the lower scoring of the two
// docids being kept.
uint32_t cs = ((uint32_t)t->m_score);
uint32_t cs ;

if ( m_useIntScores )
cs = (uint32_t) t->m_intScore;
else
cs = ((uint32_t)t->m_score);

k.n1 = domHash2 << 24; // 1 byte domHash
//k.n1 |= (t->m_bscore & ~0xc0) << 16; // 1 byte bscore
k.n1 |= cs >> 16; // 4 byte score

@@ -30,6 +30,10 @@ class TopNode {
//unsigned char m_tier ;
float m_score ;
long long m_docId;

// option for using int scores
long m_intScore;

// clustering info
//long m_kid ; // result from our same site below us
//unsigned long m_siteHash ;
@@ -124,6 +128,7 @@ class TopTree {
long m_cap ;
float m_partial ;
bool m_doSiteClustering;
bool m_useIntScores;
long m_docsWanted;
long m_ridiculousMax;
char m_kickedOutDocIds;
448
XmlDoc.cpp
448
XmlDoc.cpp
@ -1197,8 +1197,11 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
||||
// either way, validate it
|
||||
m_downloadEndTimeValid = true;
|
||||
// and need a legit mime
|
||||
m_mime.m_bufLen = 1;
|
||||
m_mimeValid = true;
|
||||
if ( ! m_mimeValid ) {
|
||||
m_mime.m_bufLen = 1;
|
||||
m_mimeValid = true;
|
||||
m_mime.m_contentType = contentType;
|
||||
}
|
||||
m_isContentTruncated = false;
|
||||
m_isContentTruncatedValid = true;
|
||||
// no redir
|
||||
@ -1213,6 +1216,12 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
||||
m_crawlDelayValid = true;
|
||||
}
|
||||
|
||||
// override content type based on mime for application/json
|
||||
if ( m_mimeValid ) {
|
||||
m_contentType = m_mime.m_contentType;
|
||||
m_contentTypeValid = true;
|
||||
}
|
||||
|
||||
|
||||
//m_coll = coll;
|
||||
m_pbuf = pbuf;
|
||||
@ -1661,6 +1670,9 @@ bool XmlDoc::set2 ( char *titleRec ,
|
||||
//m_skipIndexingValid = true;
|
||||
m_isSiteRootValid = true;
|
||||
|
||||
// ptr_linkInfo2 is valid. so getDiffbotTitleHashes() works.
|
||||
m_diffbotTitleHashBufValid = true;
|
||||
|
||||
// set "m_oldTagRec" from ptr_tagRecData
|
||||
//memcpy ( &m_oldTagRec , ptr_tagRecData , size_tagRecData );
|
||||
//m_oldTagRecValid = true;
|
||||
@ -2897,7 +2909,7 @@ long *XmlDoc::getIndexCode2 ( ) {
|
||||
}
|
||||
*/
|
||||
|
||||
// . TCPTIMEDOUT, NOROUTETOHOST, etc.
|
||||
// . TCPTIMEDOUT, NOROUTETOHOST, EDOCUNCHANGED, etc.
|
||||
// . this will be the reply from diffbot.com if using diffbot
|
||||
long *dstatus = getDownloadStatus();
|
||||
if ( ! dstatus || dstatus == (void *)-1 ) return (long *)dstatus;
|
||||
@ -3033,8 +3045,8 @@ long *XmlDoc::getIndexCode2 ( ) {
|
||||
return &m_indexCode;
|
||||
}
|
||||
|
||||
// . i moved this up to perhaps fix problems of two dup pages being downloaded
|
||||
// at about the same time
|
||||
// . i moved this up to perhaps fix problems of two dup pages being
|
||||
// downloaded at about the same time
|
||||
// . are we a dup of another doc from any other site already indexed?
|
||||
char *isDup = getIsDup();
|
||||
if ( ! isDup || isDup == (char *)-1 ) return (long *)isDup;
|
||||
@ -3057,6 +3069,33 @@ long *XmlDoc::getIndexCode2 ( ) {
|
||||
return &m_indexCode;
|
||||
}
|
||||
|
||||
// was page unchanged since last time we downloaded it?
|
||||
XmlDoc **pod = getOldXmlDoc ( );
|
||||
if ( ! pod || pod == (XmlDoc **)-1 ) return (long *)pod;
|
||||
XmlDoc *od = NULL;
|
||||
if ( *pod ) od = *pod;
|
||||
bool check = true;
|
||||
if ( ! od ) check = false;
|
||||
// do not do this logic for diffbot because it might want to get
|
||||
// the diffbot reply even if page content is the same, because it
|
||||
// might have an ajax call that updates the product price.
|
||||
// onlyProcessIfNewUrl defaults to true, so typically even diffbot
|
||||
// crawls will do this check.
|
||||
if ( cr->m_isCustomCrawl && ! cr->m_diffbotOnlyProcessIfNewUrl &&
|
||||
// but allow urls like *-diffbotxyz2445187448 to be deduped,
|
||||
// that is the whole point of this line
|
||||
! m_isDiffbotJSONObject )
|
||||
check = false;
|
||||
if ( check ) {
|
||||
long *ch32 = getContentHash32();
|
||||
if ( ! ch32 || ch32 == (void *)-1 ) return (long *)ch32;
|
||||
if ( *ch32 == od->m_contentHash32 ) {
|
||||
m_indexCode = EDOCUNCHANGED;
|
||||
m_indexCodeValid = true;
|
||||
return &m_indexCode;
|
||||
}
|
||||
}
|
||||
|
||||
// words
|
||||
Words *words = getWords();
|
||||
if ( ! words || words == (Words *)-1 ) return (long *)words;
|
||||
@ -12973,11 +13012,18 @@ LinkInfo *XmlDoc::getLinkInfo1 ( ) {
|
||||
}
|
||||
|
||||
|
||||
|
||||
static void *s_null = NULL;
|
||||
|
||||
// . returns NULL and sets g_errno on error
|
||||
// . returns -1 if blocked, will re-call m_callback
|
||||
LinkInfo **XmlDoc::getLinkInfo2 ( ) {
|
||||
|
||||
// this can now be title hashes for XmlDoc::m_diffbotTitleHashes
|
||||
// but otherwise, we don't use it for link info from another cluster
|
||||
// any more.
|
||||
m_linkInfo2Valid = true;
|
||||
return (LinkInfo **)&s_null;
|
||||
|
||||
// return if we got it
|
||||
if ( m_linkInfo2Valid ) return &ptr_linkInfo2;
|
||||
|
||||
@ -13375,7 +13421,8 @@ SafeBuf *XmlDoc::getDiffbotApiUrl ( ) {
|
||||
return &m_diffbotApiUrl;
|
||||
}
|
||||
|
||||
// if only processing NEW is enabled, then do not
|
||||
// if only processing NEW URLs is enabled, then do not get diffbot reply
|
||||
// if we already got one before
|
||||
bool *XmlDoc::getRecycleDiffbotReply ( ) {
|
||||
|
||||
if ( m_recycleDiffbotReplyValid )
|
||||
@ -13399,7 +13446,7 @@ bool *XmlDoc::getRecycleDiffbotReply ( ) {
|
||||
// ***RECYCLE*** the diffbot reply!
|
||||
m_recycleDiffbotReply = false;
|
||||
|
||||
if ( cr->m_diffbotOnlyProcessIfNew &&
|
||||
if ( cr->m_diffbotOnlyProcessIfNewUrl &&
|
||||
od && od->m_gotDiffbotSuccessfulReply )
|
||||
m_recycleDiffbotReply = true;
|
||||
|
||||
@ -13408,6 +13455,63 @@ bool *XmlDoc::getRecycleDiffbotReply ( ) {
|
||||
return &m_recycleDiffbotReply;
|
||||
}

// get hashes of the json objects in the diffbotreply
long *XmlDoc::getDiffbotTitleHashes ( long *numHashes ) {

	*numHashes = size_linkInfo2 / 4;

	if ( ! ptr_linkInfo2 ) *numHashes = 0;

	// hack: use linkdbdata2 field
	if ( m_diffbotTitleHashBufValid ) return (long *)ptr_linkInfo2;

	SafeBuf *tdbr = getTokenizedDiffbotReply();
	if ( ! tdbr || tdbr == (void *)-1 ) return (long *)tdbr;

	HashTableX dedup;
	if ( ! dedup.set ( 4,0,1024,NULL,0,false,m_niceness,"ddthbuf") )
		return NULL;

	// parse out the json items in the reply
	char *p = tdbr->getBufStart();
	char *pend = p + tdbr->length();

	long plen;

	for ( ; p < pend ; p += plen + 1 ) {
		// set this
		plen = gbstrlen(p);
		// get title from it
		long valLen;
		char *val = getJSONFieldValue ( p , "title", &valLen );
		long th32 = 0;
		// hash the title
		if ( val && valLen ) {
			th32 = hash32 ( val , valLen );
			// avoid 0
			if ( th32 == 0 ) th32 = 1;
		}
		// if no title, use hash of body
		if ( th32 == 0 ) {
			th32 = hash32 ( p , plen );
			// avoid 0
			if ( th32 == 0 ) th32 = 2;
		}
		// if our hash is duplicated then increment until unique
		while ( dedup.isInTable ( &th32 ) ) th32++;
		// store it for deduping
		dedup.addKey ( &th32 );
		// store it
		m_diffbotTitleHashBuf.pushLong(th32);
	}

	ptr_linkInfo2 = (LinkInfo *)m_diffbotTitleHashBuf.getBufStart();
	size_linkInfo2 = m_diffbotTitleHashBuf.length();
	*numHashes = size_linkInfo2 / 4;
	m_diffbotTitleHashBufValid = true;

	return (long *)ptr_linkInfo2;
}
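
getDiffbotTitleHashes() gives every JSON item in the tokenized Diffbot reply a stable 32-bit identifier: hash the item's "title" field, fall back to hashing the whole object when there is no title, never emit 0, and bump the value until it is unique within the reply. A rough standalone equivalent with std:: containers in place of HashTableX and SafeBuf (a sketch, not the engine's code):

#include <cstdint>
#include <functional>
#include <string>
#include <unordered_set>
#include <vector>

struct JsonItem { std::string title; std::string body; };

// one unique, non-zero 32-bit hash per item, in item order
static std::vector<uint32_t> makeTitleHashes ( const std::vector<JsonItem> &items ) {
	std::unordered_set<uint32_t> seen;
	std::vector<uint32_t> out;
	for ( const JsonItem &it : items ) {
		uint32_t h = 0;
		if ( ! it.title.empty() ) {
			h = (uint32_t)std::hash<std::string>{}( it.title );
			if ( h == 0 ) h = 1;             // avoid 0
		}
		if ( h == 0 ) {                          // no usable title: hash the body
			h = (uint32_t)std::hash<std::string>{}( it.body );
			if ( h == 0 ) h = 2;             // avoid 0 here too
		}
		while ( seen.count ( h ) ) h++;          // bump duplicates until unique
		seen.insert ( h );
		out.push_back ( h );
	}
	return out;
}

The bump-until-unique step matters because the hash later becomes part of the child document's fake URL, so two items with identical titles must still end up with distinct URLs.
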

// . we now get the TOKENIZED diffbot reply.
// . that converts a single diffbot reply into multiple \0 separated

@ -13619,7 +13723,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {

	// we make a "fake" url for the diffbot reply when indexing it
	// by appending -diffbotxyz%li. see "fakeUrl" below.
	// by appending -diffbotxyz%lu. see "fakeUrl" below.
	if ( m_firstUrl.getUrlLen() + 15 >= MAX_URL_LEN ) {
		log("build: diffbot url would be too long for "
		    "%s", m_firstUrl.getUrl() );

@ -13657,9 +13761,9 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
	if ( ucp && ! ucp[0] ) ucp = NULL;
	// do we match the url process pattern or regex?
	// get the compiled regular expressions
	regex_t *ucr = &cr->m_ucr;
	//regex_t *ucr = &cr->m_ucr;
	regex_t *upr = &cr->m_upr;
	if ( ! cr->m_hasucr ) ucr = NULL;
	//if ( ! cr->m_hasucr ) ucr = NULL;
	if ( ! cr->m_hasupr ) upr = NULL;
	// get the url
	Url *f = getFirstUrl();

@ -13681,7 +13785,7 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {

	// if already processed and onlyprocessifnew is enabled then
	// if already processed and onlyprocessifnewurl is enabled then
	// we recycle and do not bother with this, we also do not nuke
	// the diffbot json objects we have already indexed by calling
	// nukeJSONObjects()

@ -13955,6 +14059,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
	if ( headers.length() > 0 )
		additionalHeaders = headers.getBufStart();

	// if did not get the web page first and we are crawling, not
	// doing a bulk, then core. we need the webpage to harvest links
	// and sometimes to check the pageprocesspattern to see if we should
	// process.
	if ( cr->m_isCustomCrawl ==1 && ! m_downloadStatusValid ) {
		char *xx=NULL;*xx=0; }

	log("diffbot: getting %s headers=%s",m_diffbotUrl.getBufStart(),
	    additionalHeaders);

@ -14270,6 +14381,10 @@ char **XmlDoc::getHttpReply2 ( ) {
	if ( m_sreqValid )
		r->m_contentHash32 = m_sreq.m_contentHash32;

	// if we have the old doc already set use that
	if ( od )
		r->m_contentHash32 = od->m_contentHash32;

	// eventgurubot is the max
	//char *userAgent = g_conf.m_spiderUserAgent;
	// hardcode it

@ -16949,10 +17064,12 @@ long *XmlDoc::getContentHash32 ( ) {

	// if we are a diffbot json object, fake this for now, it will
	// be set for real in hashJSON()
	if ( m_isDiffbotJSONObject ) {
		m_contentHash32 = 0;
		return &m_contentHash32;
	}
	// no, because we call this before hashJSON() to set
	// EDOCUNCHANGED above... so just hash the json normally for now
	//if ( m_isDiffbotJSONObject ) {
	// m_contentHash32 = 0;
	// return &m_contentHash32;
	//}

	// . get the content. get the pure untouched content!!!
	// . gotta be pure since that is what Msg13.cpp computes right

@ -16979,7 +17096,7 @@ long *XmlDoc::getContentHash32 ( ) {
	// we set m_contentHash32 in ::hashJSON() below because it is special
	// for diffbot since it ignores certain json fields like url: and the
	// fields are independent, and numbers matter, like prices
	if ( m_isDiffbotJSONObject ) { char *xx=NULL; *xx=0; }
	//if ( m_isDiffbotJSONObject ) { char *xx=NULL; *xx=0; }

	// *pend should be \0
	m_contentHash32 = getContentHash32Fast ( p , plen , m_niceness );

@ -18102,7 +18219,11 @@ bool XmlDoc::logIt ( ) {

	// just use the oldurlfilternum for grepping i guess
	//if ( m_oldDocValid && m_oldDoc )
	if ( m_sreqValid && m_sreq.m_hadReply )

	// when injecting a request we have no idea if it had a reply or not
	if ( m_sreqValid && m_sreq.m_isInjecting )
		sb.safePrintf("firsttime=? ");
	else if ( m_sreqValid && m_sreq.m_hadReply )
		sb.safePrintf("firsttime=0 ");
	else if ( m_sreqValid )
		sb.safePrintf("firsttime=1 ");

@ -19388,10 +19509,10 @@ bool XmlDoc::doesPageContentMatchDiffbotProcessPattern() {
// . returns ptr to status
// . diffbot uses this to remove the indexed json pages associated with
// a url. each json object is basically its own url. a json object
// url is the parent page's url with a -diffbotxyz-%li appended to it
// url is the parent page's url with a -diffbotxyz-%lu appended to it
// where %li is the object # starting at 0 and incrementing from there.
// . XmlDoc::m_diffbotJSONCount is how many json objects the parent url had.
long *XmlDoc::nukeJSONObjects ( ) {
long *XmlDoc::nukeJSONObjects ( long *newTitleHashes , long numNewHashes ) {
	// use this
	static long s_return = 1;
	// if none, we are done

@ -19414,15 +19535,39 @@ long *XmlDoc::nukeJSONObjects ( ) {
	CollectionRec *cr = getCollRec();
	if ( ! cr ) return NULL;

	//
	// index the hashes of the latest diffbot json items for this parent
	//
	HashTableX dedup;
	if ( ! dedup.set(4,0,numNewHashes*4,NULL,0,false,m_niceness,"njodt") )
		return NULL;
	for ( long i = 0 ; i < numNewHashes ; i++ )
		dedup.addKey ( &newTitleHashes[i] );

	// get this old doc's current title hashes
	long numOldHashes;
	long *oldTitleHashes = getDiffbotTitleHashes ( &numOldHashes );
	// sanity. should return right away without having to block
	if ( oldTitleHashes == (void *)-1 ) { char *xx=NULL;*xx=0; }
	// sanity again
	if ( numOldHashes != m_diffbotJSONCount ) { char *xx=NULL;*xx=0; }

	// scan down each
	for ( ; m_joc < m_diffbotJSONCount ; ) {
		// only NUKE the json items for which title hashes we lost
		long th32 = oldTitleHashes[m_joc];
		// . if still in the new diffbot reply, do not DELETE!!!
		// . if there was no title, it uses hash of entire object
		if ( dedup.isInTable(&th32) ) continue;
		// if m_dx has no url set, call set4 i guess
		if ( ! m_dx->m_firstUrlValid ) {
			// make the fake url for this json object for indexing
			SafeBuf fakeUrl;
			fakeUrl.set ( m_firstUrl.getUrl() );
			// append -diffbot0 etc. for fake url
			fakeUrl.safePrintf("-diffbotxyz%li",m_joc);
			// get his title hash32
			//long jsonTitleHash32 = titleHashes[m_joc];
			// append -diffbotxyz%lu for fake url
			fakeUrl.safePrintf("-diffbotxyz%lu",th32);
			// set url of new xmldoc
			if ( ! m_dx->set1 ( fakeUrl.getBufStart(),
					    cr->m_coll ,

@ -19441,6 +19586,8 @@ long *XmlDoc::nukeJSONObjects ( ) {
			// we need this because only m_dx->m_oldDoc will
			// load from titledb and have it set
			m_dx->m_isDiffbotJSONObject = true;
			// for debug
			log("xmldoc: nuking %s",fakeUrl.getBufStart());
		}

		// when the indexdoc completes, or if it blocks, call us!
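
The rewritten nukeJSONObjects() only deletes the child JSON documents whose title hash appears in the old parent but not in the new Diffbot reply; items whose hash survives keep their fake URL (parent URL plus "-diffbotxyz" plus title hash) and are simply reindexed later. A condensed sketch of that selection step, with std:: containers standing in for HashTableX and SafeBuf (illustrative names, not the engine's API):

#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_set>
#include <vector>

// returns the fake urls of the child documents that should be deleted
static std::vector<std::string> urlsToNuke ( const std::string &parentUrl ,
                                             const std::vector<uint32_t> &oldHashes ,
                                             const std::vector<uint32_t> &newHashes ) {
	std::unordered_set<uint32_t> keep ( newHashes.begin() , newHashes.end() );
	std::vector<std::string> doomed;
	for ( uint32_t h : oldHashes ) {
		// still present in the new reply -> leave it in the index
		if ( keep.count ( h ) ) continue;
		char suffix[32];
		snprintf ( suffix , sizeof(suffix) , "-diffbotxyz%u" , (unsigned)h );
		doomed.push_back ( parentUrl + suffix );
	}
	return doomed;
}
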

@ -19691,10 +19838,10 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
	// . then just add the SpiderReply to avoid respidering
	// . NO! still need to add outlinks
	//|| diffbotEmptyReply
	// . treat this as a temporary error i guess
	// . getNewSpiderReply() below will clear the error in it and
	// copy stuff over from m_sreq and m_oldDoc for this case
	//|| *indexCode == EDOCUNCHANGED
	// . treat this as a temporary error i guess
	// . getNewSpiderReply() below will clear the error in it and
	// copy stuff over from m_sreq and m_oldDoc for this case
	|| *indexCode == EDOCUNCHANGED
	     ) {
		// sanity - in repair mode?
		if ( m_useSecondaryRdbs ) { char *xx=NULL;*xx=0; }

@ -19725,6 +19872,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
		m_metaList = (char *)0x1;
		return m_metaList;
	}
	// save this
	long savedCode = *indexCode;
	// before getting our spider reply, assign crap from the old
	// doc to us since we are unchanged! this will allow us to
	// call getNewSpiderReply() without doing any processing, like

@ -19732,12 +19881,16 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
	copyFromOldDoc ( od );
	// need this though! i don't want to print out "Success"
	// in the log in the logIt() function
	m_indexCode = *indexCode;
	m_indexCode = savedCode;
	m_indexCodeValid = true;
	// but set our m_contentHash32 from the spider request
	// which got it from the spiderreply in the case of
	// EDOCUNCHANGED. this way ch32=xxx will log correctly.
	if ( *indexCode == EDOCUNCHANGED && m_sreqValid ) {
	// I think this is only when EDOCUNCHANGED is set in the
	// Msg13.cpp code, when we have a spider compression proxy.
	if ( *indexCode == EDOCUNCHANGED &&
	     m_sreqValid &&
	     ! m_contentHash32Valid ) {
		m_contentHash32 = m_sreq.m_contentHash32;
		m_contentHash32Valid = true;
	}

@ -19747,6 +19900,20 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
	m_isInIndex = m_wasInIndex;
	m_wasInIndexValid = true;
	m_isInIndexValid = true;

	// unset our ptr_linkInfo1 so we do not free it and core
	// since we might have set it in copyFromOldDoc() above
	ptr_linkInfo1 = NULL;
	size_linkInfo1 = 0;

	// . if not using spiderdb we are done at this point
	// . this happens for diffbot json replies (m_dx)
	if ( ! m_useSpiderdb ) {
		m_metaList = NULL;
		m_metaListSize = 0;
		return (char *)0x01;
	}

	// get our spider reply
	SpiderReply *newsr = getNewSpiderReply();
	// return on error

@ -19754,10 +19921,6 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
	// . panic on blocking! this is supposed to be fast!
	// . it might still have to lookup the tagdb rec?????
	if ( newsr == (void *)-1 ) { char *xx=NULL;*xx=0; }
	// unset our ptr_linkInfo1 so we do not free it and core
	// since we might have set it in copyFromOldDoc() above
	ptr_linkInfo1 = NULL;
	size_linkInfo1 = 0;
	// how much we need
	long needx = sizeof(SpiderReply) + 1;
	// doledb key?

@ -19900,7 +20063,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
	}

	// . should we recycle the diffbot reply for this url?
	// . if m_diffbotOnlyProcessIfNew is true then we want to keep
	// . if m_diffbotOnlyProcessIfNewUrl is true then we want to keep
	// our existing diffbot reply, i.e. recycle it, even though we
	// respidered this page.
	bool *recycle = getRecycleDiffbotReply();

@ -19910,20 +20073,46 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
		m_diffbotJSONCount = od->m_diffbotJSONCount;
		m_sentToDiffbot = od->m_sentToDiffbot;
		m_gotDiffbotSuccessfulReply = od->m_gotDiffbotSuccessfulReply;
		// copy title hashes info. it goes hand in hand with the
		// NUMBER of diffbot items we have.
		if(!m_diffbotTitleHashBuf.
		   safeMemcpy(&od->m_diffbotTitleHashBuf) )
			return NULL;
		ptr_linkInfo2 =(LinkInfo *)m_diffbotTitleHashBuf.getBufStart();
		size_linkInfo2=m_diffbotTitleHashBuf.length();
	}

	// we can't really get the meta list of each json object we may
	// have indexed in od's diffbot reply buffer because they all
	// were indexed with their own docids in the "m_dx" code below. so
	// just delete them and we'll re-add from this doc's diffbot reply.
	if ( od && od->m_diffbotJSONCount && ! *recycle &&
	     cr->m_isCustomCrawl &&
	     // do not remove old json objects if pageparser.cpp test
	     // because that can not change the index, etc.
	     ! getIsPageParser() ) {
	// just delete the json items whose "title hashes" are present
	// in the "old doc" but NOT in the "new doc".
	// we use the title hash to construct a unique url for each json item.
	// if the title hash is present in both the old and new docs then
	// do not delete it here, but we will reindex it later in
	// getMetaList() below when we call indexDoc() on each one after
	// setting m_dx to each one.
	bool nukeJson = true;
	if ( ! od ) nukeJson = false;
	if ( od && od->m_diffbotJSONCount <= 0 ) nukeJson = false;
	// if recycling json objects, leave them there!
	if ( *recycle ) nukeJson = false;
	// you have to be a diffbot crawl to do this
	if ( ! cr->m_isCustomCrawl ) nukeJson = false;
	// do not remove old json objects if pageparser.cpp test
	// because that can not change the index, etc.
	if ( getIsPageParser() ) nukeJson = false;

	if ( nukeJson ) {
		// it should only nuke/delete the json items that we LOST,
		// so if we still have the title hash in our latest
		// diffbot reply, then do not nuke that json item, which
		// will have a url ending in -diffbotxyz%lu (where %lu
		// is the json item title hash). This will download the
		// diffbot reply if not already there.
		long numHashes;
		long *th = getDiffbotTitleHashes(&numHashes);
		if ( ! th || th == (void *)-1 ) return (char *)th;
		// this returns false if it blocks
		long *status = od->nukeJSONObjects();
		long *status = od->nukeJSONObjects( th , numHashes );
		if ( ! status || status == (void *)-1) return (char *)status;
	}

@ -20292,18 +20481,32 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
	// we create from the original diffbot reply
	SafeBuf *tdbr = getTokenizedDiffbotReply();
	if ( ! tdbr || tdbr == (void *)-1 ) return (char *)tdbr;

	long tdbrLen = tdbr->length();

	// do not index json items as separate docs if we are page parser
	if ( getIsPageParser() ) tdbrLen = 0;

	// once we have tokenized diffbot reply we can get a unique
	// hash of the title of each json item. that way, if a page changes
	// and it gains or loses a diffbot item, the old items will still
	// have the same url and we can set their m_indexCode to EDOCUNCHANGED
	// if the individual json item itself has not changed when we
	// call m_dx->indexDoc() below.
	long numHashes = 0;
	long *titleHashBuf = NULL;

	//
	// if we got a json object or two from diffbot, index them
	// as their own child xmldocs.
	// watch out for reply from diffbot of "-1" indicating error!
	//
	if ( tdbrLen > 3 ) {

		// get title hashes of the json items
		titleHashBuf = getDiffbotTitleHashes(&numHashes);
		if (!titleHashBuf || titleHashBuf == (void *)-1){
			char *xx=NULL;*xx=0;}

		// make sure diffbot reply is valid for sure
		if ( ! m_diffbotReplyValid ) { char *xx=NULL;*xx=0; }
		// set status for this

@ -20326,12 +20529,21 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
	jsonloop:
		// if m_dx has no url set, call set4 i guess
		if ( ! m_dx->m_contentValid ) {

			// sanity. ensure the json item we are trying to
			// index has a title hash in this buf
			if(m_diffbotJSONCount>=numHashes){char *xx=NULL;*xx=0;}

			// get the title of the json we are indexing
			long jth = titleHashBuf [ m_diffbotJSONCount ];

			// make the fake url for this json object for indexing
			SafeBuf fakeUrl;
			fakeUrl.set ( m_firstUrl.getUrl() );
			// append -diffbot-0 etc. for fake url
			fakeUrl.safePrintf("-diffbotxyz%li",
					   (long)m_diffbotJSONCount);
			fakeUrl.safePrintf("-diffbotxyz%lu",
					   //(long)m_diffbotJSONCount);
					   jth);
			m_diffbotJSONCount++;
			// this can go on the stack since set4() copies it
			SpiderRequest sreq;

@ -20350,7 +20562,6 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
			sreq.m_hopCountValid = 1;
			sreq.m_fakeFirstIp = 1;
			sreq.m_firstIp = firstIp;

			// set this
			if (!m_dx->set4 ( &sreq ,
					  NULL ,

@ -20392,17 +20603,21 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
		//xd->setCallback ( this , getMetaListWrapper );
		m_dx->setCallback ( m_masterState , m_masterLoop );

		///////////////
		// . inject the content of the json using this fake url
		// . return -1 if this blocks
		// . if m_dx got its msg4 reply it ends up here, in which
		// case do NOT re-call indexDoc() so check for
		// m_listAdded.
		///////////////
		if ( ! m_dx->m_listAdded && ! m_dx->indexDoc ( ) )
			return (char *)-1;

		// critical error on our part trying to index it?
		// does not include timeouts or 404s, etc. mostly just
		// OOM errors.
		if ( g_errno ) return NULL;

		CollectionRec *cr = getCollRec();
		if ( ! cr ) return NULL;
		// count as deleted

@ -25019,7 +25234,7 @@ bool XmlDoc::hashUrl ( HashTableX *tt ) {
	if ( m_isDiffbotJSONObject ) {
		setStatus ( "hashing gbparenturl term");
		char *p = fu->getUrl() + fu->getUrlLen() - 1;
		// back up to - as in "http://xyz.com/foo-diffbotxyz13"
		// back up to - as in "http://xyz.com/foo-diffbotxyz123456"
		for ( ; *p && *p != '-' ; p-- );
		// set up the hashing parms
		hi.m_hashGroup = HASHGROUP_INTAG;

@ -29871,6 +30086,23 @@ bool XmlDoc::hashNumber ( char *beginBuf ,
	if ( ! hashNumber2 ( f , hi , "gbrevsortby" ) )
		return false;

	//
	// also hash as an int, 4 byte-integer so our lastSpidered timestamps
	// dont lose 128 seconds of resolution
	//

	long i = (long) atoll2 ( p , bufEnd - p );

	if ( ! hashNumber3 ( i , hi , "gbsortbyint" ) )
		return false;

	// also hash in reverse order for sorting from low to high
	i = -1 * i;

	if ( ! hashNumber3 ( i , hi , "gbrevsortbyint" ) )
		return false;

	return true;
}
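
Hashing the value a second time as a 4-byte integer matters for fields like lastSpidered: a 32-bit float carries only a 24-bit mantissa, so timestamps around 1.4 billion seconds get rounded to multiples of 128 seconds, while a 32-bit int keeps full one-second resolution. Negating the value before hashing it under a second term ("gbrevsortbyint") is what lets one sorted termlist serve the reverse sort direction. A small illustration of the precision point (illustrative only, not engine code):

#include <cstdio>

int main () {
	long t = 1386000123L;          // a spider timestamp, in seconds
	float f = (float)t;            // float32 keeps ~24 bits of mantissa
	long back = (long)f;           // rounded to a multiple of 128 at this magnitude
	printf ( "int: %ld  via float: %ld  difference: %ld seconds\n" ,
	         t , back , t - back );
	// storing the value as a 4-byte int (and its negation for the
	// reverse-sort termlist) avoids that loss entirely
	return 0;
}
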

@ -29979,6 +30211,113 @@ bool XmlDoc::hashNumber2 ( float f , HashInfo *hi , char *sortByStr ) {
	return true;
}

bool XmlDoc::hashNumber3 ( long n , HashInfo *hi , char *sortByStr ) {

	// prefix is something like price. like the meta "name" or
	// the json name with dots in it like "product.info.price" or something
	long long nameHash = 0LL;
	long nameLen = 0;
	if ( hi->m_prefix ) nameLen = gbstrlen ( hi->m_prefix );
	if ( hi->m_prefix && nameLen )
		nameHash = hash64Lower_utf8 ( hi->m_prefix , nameLen );
	// need a prefix for hashing numbers... for now
	else { char *xx=NULL; *xx=0; }

	// combine prefix hash with a special hash to make it unique to avoid
	// collisions. this is the "TRUE" prefix.
	long long truePrefix64 = hash64n ( sortByStr ); // "gbsortby");
	// hash with the "TRUE" prefix
	long long ph2 = hash64 ( nameHash , truePrefix64 );

	// . now store it
	// . use field hash as the termid. normally this would just be
	// a prefix hash
	// . use mostly fake value otherwise
	key144_t k;
	g_posdb.makeKey ( &k ,
			  ph2 ,
			  0,//docid
			  0,// word pos #
			  0,// densityRank , // 0-15
			  0 , // MAXDIVERSITYRANK
			  0 , // wordSpamRank ,
			  0 , //siterank
			  0 , // hashGroup,
			  // we set to docLang final hash loop
			  //langUnknown, // langid
			  // unless already set. so set to english here
			  // so it will not be set to something else
			  // otherwise our floats would be ordered by langid!
			  // somehow we have to indicate that this is a float
			  // termlist so it will not be mangled any more.
			  //langEnglish,
			  langUnknown,
			  0 , // multiplier
			  false, // syn?
			  false , // delkey?
			  hi->m_shardByTermId );

	//long long final = hash64n("products.offerprice",0);
	//long long prefix = hash64n("gbsortby",0);
	//long long h64 = hash64 ( final , prefix);
	//if ( ph2 == h64 )
	// log("hey: got offer price");

	// now set the float in that key
	//g_posdb.setFloat ( &k , f );
	g_posdb.setInt ( &k , n );

	// HACK: this bit is ALWAYS set by Posdb::makeKey() to 1
	// so that we can b-step into a posdb list and make sure
	// we are aligned on a 6 byte or 12 byte key, since they come
	// in both sizes. but for this, hack it off to tell
	// addTable144() that we are a special posdb key, a "numeric"
	// key that has a float stored in it. then it will NOT
	// set the siterank and langid bits which throw our sorting
	// off!!
	g_posdb.setAlignmentBit ( &k , 0 );

	// sanity
	//float t = g_posdb.getFloat ( &k );
	long x = g_posdb.getInt ( &k );
	if ( x != n ) { char *xx=NULL;*xx=0; }

	HashTableX *dt = hi->m_tt;

	// the key may indeed collide, but that's ok for this application
	if ( ! dt->addTerm144 ( &k ) )
		return false;

	if ( ! m_wts )
		return true;

	// store in buffer
	char buf[128];
	long bufLen = sprintf(buf,"%li",n);

	// add to wts for PageParser.cpp display
	// store it
	if ( ! storeTerm ( buf,
			   bufLen,
			   truePrefix64,
			   hi,
			   0, // word#, i,
			   0, // wordPos
			   0,// densityRank , // 0-15
			   0, // MAXDIVERSITYRANK,//phrase
			   0, // ws,
			   0, // hashGroup,
			   //true,
			   &m_wbuf,
			   m_wts,
			   // a hack for display in wts:
			   SOURCE_NUMBER, // SOURCE_BIGRAM, // synsrc
			   langUnknown ) )
		return false;

	return true;
}
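
The termid for these numeric posdb keys comes from hashing the field name (e.g. "products.offerprice") and combining it with a hash of the sort operator string ("gbsortbyint" / "gbrevsortbyint"), so the same field gets a distinct termlist per sort direction. A hedged sketch of that two-level combination, with a simple 64-bit FNV-1a and mix step standing in for Gigablast's hash64Lower_utf8()/hash64n()/hash64() (not the engine's actual hash functions):

#include <cctype>
#include <cstdint>
#include <string>

// stand-in for hash64n()/hash64Lower_utf8(): FNV-1a over (optionally lowercased) bytes
static uint64_t h64 ( const std::string &s , bool lower ) {
	uint64_t h = 1469598103934665603ULL;
	for ( unsigned char c : s ) {
		h ^= lower ? (unsigned char)tolower(c) : c;
		h *= 1099511628211ULL;
	}
	return h;
}

// stand-in for hash64(a,b): mix two 64-bit hashes into one
static uint64_t mix64 ( uint64_t a , uint64_t b ) {
	return a ^ ( b + 0x9e3779b97f4a7c15ULL + ( a << 6 ) + ( a >> 2 ) );
}

// termid for a numeric field under a given sort operator, e.g. "gbsortbyint"
static uint64_t numericTermId ( const std::string &fieldName ,
                                const std::string &sortOp ) {
	uint64_t nameHash   = h64 ( fieldName , true  );   // field name, lowercased
	uint64_t truePrefix = h64 ( sortOp    , false );    // operator string
	return mix64 ( nameHash , truePrefix );              // combined "true prefix" termid
}

With this layout numericTermId("products.offerprice","gbsortbyint") and numericTermId("products.offerprice","gbrevsortbyint") land in separate termlists, which is what lets the negated values coexist with the forward-sorted ones.
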

// . many many websites got hijacked pages in them...
// . revkim.org/mcdrt/mgntf/sata/sata.htm
// . collegefootballweekly.net/hswsj/riime/sata/sata.htm

@ -45048,7 +45387,7 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {
	char nb[1024];
	SafeBuf nameBuf(nb,1024);

	long totalHash32 = 0;
	//long totalHash32 = 0;

	for ( ; ji ; ji = ji->m_next ) {
		QUICKPOLL(m_niceness);

@ -45122,8 +45461,12 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {

		//
		// for deduping search results we set m_contentHash32 here for
		// diffbot json objects
		// diffbot json objects.
		// we can't do this here anymore, we have to set the
		// contenthash in ::getContentHash32() because we need it to
		// set EDOCUNCHANGED in ::getIndexCode() above.
		//
		/*
		if ( hi.m_hashGroup != HASHGROUP_INURL ) {
			// make the content hash so we can set m_contentHash32
			// for deduping

@ -45134,6 +45477,7 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {
			totalHash32 ^= nh32;
			totalHash32 ^= vh32;
		}
		*/

		// index like "title:whatever"
		hi.m_prefix = name;

@ -45157,8 +45501,8 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {
	*/
	}

	m_contentHash32 = totalHash32;
	m_contentHash32Valid = true;
	//m_contentHash32 = totalHash32;
	//m_contentHash32Valid = true;

	return (char *)0x01;
}

XmlDoc.h
@ -863,6 +863,10 @@ class XmlDoc {
			   class HashInfo *hi ,
			   char *gbsortByStr ) ;

	bool hashNumber3 ( long x,
			   class HashInfo *hi ,
			   char *gbsortByStr ) ;

	// print out for PageTitledb.cpp and PageParser.cpp
	bool printDoc ( class SafeBuf *pbuf );
	bool printMenu ( class SafeBuf *pbuf );

@ -1311,6 +1315,7 @@ class XmlDoc {
	//bool m_useDiffbotValid;
	//bool m_diffbotApiNumValid;
	bool m_diffbotApiUrlValid;
	bool m_diffbotTitleHashBufValid;
	bool m_crawlInfoValid;
	bool m_isPageParserValid;
	bool m_imageUrlValid;

@ -1603,9 +1608,13 @@ class XmlDoc {
	//bool doesUrlMatchDiffbotCrawlPattern() ;
	//bool doesUrlMatchDiffbotProcessPattern() ;
	bool doesPageContentMatchDiffbotProcessPattern() ;
	long *getDiffbotTitleHashes ( long *numHashes ) ;
	char *hashJSON ( HashTableX *table );
	long *nukeJSONObjects ( ) ;
	long *nukeJSONObjects ( long *newTitleHashes , long numNewHashes ) ;

	long m_joc;
	SafeBuf m_diffbotTitleHashBuf;

	//EmailInfo m_emailInfo;