mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
The new urls.csv format is ready.
Added the URL discovered time to gbssdocs so we know when we first found a URL; also added it to the new urls.csv. Fixed spiderdb list deduping so that it no longer discards the oldest spider request, which keeps our discovered time intact.
This commit is contained in:
parent
f0f8f0a967
commit
3191980f49
2
File.cpp
2
File.cpp
@ -132,10 +132,10 @@ bool File::rename ( char *newFilename ) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/*
|
||||||
static File *s_activeHead = NULL;
|
static File *s_activeHead = NULL;
|
||||||
static File *s_activeTail = NULL;
|
static File *s_activeTail = NULL;
|
||||||
|
|
||||||
/*
|
|
||||||
void rmFileFromLinkedList ( File *f ) {
|
void rmFileFromLinkedList ( File *f ) {
|
||||||
// excise from linked list of active files
|
// excise from linked list of active files
|
||||||
if ( s_activeHead == f )
|
if ( s_activeHead == f )
|
||||||
|
116
PageResults.cpp
116
PageResults.cpp
@ -7926,15 +7926,18 @@ int csvPtrCmp ( const void *a, const void *b ) {
|
|||||||
if ( strcmp(pa,"title") == 0 ) return -1;
|
if ( strcmp(pa,"title") == 0 ) return -1;
|
||||||
if ( strcmp(pb,"title") == 0 ) return 1;
|
if ( strcmp(pb,"title") == 0 ) return 1;
|
||||||
|
|
||||||
|
// this is now taken care of from the 'supps[]' array below
|
||||||
|
// by prepending two digits before each field name
|
||||||
|
|
||||||
// put url first for spider status docs
|
// put url first for spider status docs
|
||||||
if ( strcmp(pa,"gbssUrl") == 0 ) return -1;
|
// if ( strcmp(pa,"gbssUrl") == 0 ) return -1;
|
||||||
if ( strcmp(pb,"gbssUrl") == 0 ) return 1;
|
// if ( strcmp(pb,"gbssUrl") == 0 ) return 1;
|
||||||
|
|
||||||
if ( strcmp(pa,"gbssStatusMsg") == 0 ) return -1;
|
// if ( strcmp(pa,"gbssStatusMsg") == 0 ) return -1;
|
||||||
if ( strcmp(pb,"gbssStatusMsg") == 0 ) return 1;
|
// if ( strcmp(pb,"gbssStatusMsg") == 0 ) return 1;
|
||||||
|
|
||||||
if ( strcmp(pa,"gbssStatusCode") == 0 ) return -1;
|
// if ( strcmp(pa,"gbssStatusCode") == 0 ) return -1;
|
||||||
if ( strcmp(pb,"gbssStatusCode") == 0 ) return 1;
|
// if ( strcmp(pb,"gbssStatusCode") == 0 ) return 1;
|
||||||
|
|
||||||
|
|
||||||
// otherwise string compare
|
// otherwise string compare
|
||||||
@ -8052,30 +8055,40 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
|
|||||||
|
|
||||||
// if doing spider status docs not all will have dupofdocid field
|
// if doing spider status docs not all will have dupofdocid field
|
||||||
char *supps [] = {
|
char *supps [] = {
|
||||||
"gbssFinalRedirectUrl",
|
"00gbssUrl",
|
||||||
|
"01gbssDocId",
|
||||||
|
"02gbssDiscoveredTime",
|
||||||
|
"03gbssDownloadStartTime",
|
||||||
|
"04gbssDownloadEndTime",
|
||||||
|
"05gbssContentType",
|
||||||
|
"06gbssContentLen",
|
||||||
|
"07gbssDupOfDocId" ,
|
||||||
|
"08gbssNumRedirects",
|
||||||
|
"09gbssFinalRedirectUrl",
|
||||||
|
"10gbssPercentContentChanged",
|
||||||
|
"11gbssCrawlRound",
|
||||||
|
"12gbssHopCount",
|
||||||
|
"13gbssIp",
|
||||||
|
"14gbssSentToDiffbotThisTime",
|
||||||
|
"15gbssDiffbotReplyMsg",
|
||||||
|
"16gbssStatusMsg",
|
||||||
|
|
||||||
|
|
||||||
"gbssHttpStatus",
|
"gbssHttpStatus",
|
||||||
"gbssWasIndexed",
|
"gbssWasIndexed",
|
||||||
"gbssAgeInIndex",
|
"gbssAgeInIndex",
|
||||||
"gbssDupOfDocId" ,
|
|
||||||
"gbssPrevTotalNumIndexAttempts",
|
"gbssPrevTotalNumIndexAttempts",
|
||||||
"gbssPrevTotalNumIndexSuccesses",
|
"gbssPrevTotalNumIndexSuccesses",
|
||||||
"gbssPrevTotalNumIndexFailures",
|
"gbssPrevTotalNumIndexFailures",
|
||||||
"gbssDownloadStartTime",
|
|
||||||
"gbssDownloadEndTime",
|
|
||||||
"gbssDownloadStartTimeMS",
|
"gbssDownloadStartTimeMS",
|
||||||
"gbssDownloadEndTimeMS",
|
"gbssDownloadEndTimeMS",
|
||||||
"gbssDownloadDurationMS",
|
"gbssDownloadDurationMS",
|
||||||
"gbssIp",
|
|
||||||
"gbssIpLookupTimeMS",
|
"gbssIpLookupTimeMS",
|
||||||
"gbssSiteNumInlinks",
|
"gbssSiteNumInlinks",
|
||||||
"gbssSiteRank",
|
"gbssSiteRank",
|
||||||
"gbssPercentContentChanged",
|
|
||||||
"gbssLanguage",
|
"gbssLanguage",
|
||||||
"gbssContentType",
|
|
||||||
"gbssContentLen",
|
|
||||||
"gbssCrawlDelayMS",
|
"gbssCrawlDelayMS",
|
||||||
"gbssDiffbotReplyCode",
|
"gbssDiffbotReplyCode",
|
||||||
"gbssDiffbotReplyMsg",
|
|
||||||
"gbssDiffbotLen",
|
"gbssDiffbotLen",
|
||||||
"gbssDiffbotReplyResponseTimeMS",
|
"gbssDiffbotReplyResponseTimeMS",
|
||||||
"gbssDiffbotReplyRetries",
|
"gbssDiffbotReplyRetries",
|
||||||
@ -8085,10 +8098,14 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
|
|||||||
for ( int32_t i = 0 ; supps[i] ; i++ ) {
|
for ( int32_t i = 0 ; supps[i] ; i++ ) {
|
||||||
// don't add these column headers to non spider status docs
|
// don't add these column headers to non spider status docs
|
||||||
if ( ct != CT_STATUS ) break;
|
if ( ct != CT_STATUS ) break;
|
||||||
int64_t h64 = hash64n ( supps[i] );
|
char *skip = supps[i];
|
||||||
|
// skip over the two order digits
|
||||||
|
if ( is_digit(skip[0]) ) skip += 2;
|
||||||
|
// don't include the order digits in the hash
|
||||||
|
int64_t h64 = hash64n ( skip );
|
||||||
if ( nameTable.isInTable ( &h64 ) ) continue;
|
if ( nameTable.isInTable ( &h64 ) ) continue;
|
||||||
// only show diffbot column headers for custom (diffbot) crawls
|
// only show diffbot column headers for custom (diffbot) crawls
|
||||||
if ( strncmp(supps[i],"gbssDiffbot",11) == 0 &&
|
if ( strncmp(skip,"gbssDiffbot",11) == 0 &&
|
||||||
( ! cr || ! cr->m_isCustomCrawl ) )
|
( ! cr || ! cr->m_isCustomCrawl ) )
|
||||||
break;
|
break;
|
||||||
// record offset of the name for our hash table
|
// record offset of the name for our hash table
|
||||||
@ -8124,7 +8141,65 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
|
|||||||
// now print them out as the header row
|
// now print them out as the header row
|
||||||
for ( int32_t i = 0 ; i < numPtrs ; i++ ) {
|
for ( int32_t i = 0 ; i < numPtrs ; i++ ) {
|
||||||
if ( i > 0 && ! sb->pushChar(',') ) return false;
|
if ( i > 0 && ! sb->pushChar(',') ) return false;
|
||||||
if ( ! sb->safeStrcpy ( ptrs[i] ) ) return false;
|
|
||||||
|
char *hdr = ptrs[i];
|
||||||
|
|
||||||
|
// skip the two order digits
|
||||||
|
if ( ct == CT_STATUS && is_digit(hdr[0]) ) hdr += 2;
|
||||||
|
|
||||||
|
// now transform the hdr from gbss* into the old way
|
||||||
|
if ( ! cr->m_isCustomCrawl )
|
||||||
|
goto skipTransform;
|
||||||
|
|
||||||
|
if ( ! strcmp(hdr,"gbssUrl") )
|
||||||
|
hdr = "Url";
|
||||||
|
if ( ! strcmp(hdr,"gbssDocId") )
|
||||||
|
hdr = "Doc ID";
|
||||||
|
// when url was first discovered
|
||||||
|
if ( ! strcmp(hdr,"gbssDiscoveredTime") ) // need this!
|
||||||
|
hdr = "Url Discovered";
|
||||||
|
// when it was crawled this time
|
||||||
|
if ( ! strcmp(hdr,"gbssDownloadStartTime") )
|
||||||
|
hdr = "Crawled";
|
||||||
|
if ( ! strcmp(hdr,"gbssContentLen") )
|
||||||
|
hdr = "Page Length";
|
||||||
|
if ( ! strcmp(hdr,"gbssDupOfDocId") )
|
||||||
|
hdr = "Duplicate Of";
|
||||||
|
if ( ! strcmp(hdr,"gbssNumRedirects") )
|
||||||
|
hdr = "Redirects";
|
||||||
|
if ( ! strcmp(hdr,"gbssFinalRedirectUrl") )
|
||||||
|
hdr = "Redirected To";
|
||||||
|
if ( ! strcmp(hdr,"gbssCrawlRound") )
|
||||||
|
hdr = "Crawl Round";
|
||||||
|
if ( ! strcmp(hdr,"gbssHopCount") )
|
||||||
|
hdr = "Hop Count";
|
||||||
|
if ( ! strcmp(hdr,"gbssIp") )
|
||||||
|
hdr = "IP";
|
||||||
|
if ( ! strcmp(hdr,"gbssSentToDiffbotThisTime") )
|
||||||
|
hdr = "Process Attempted";
|
||||||
|
if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )
|
||||||
|
hdr = "Process Response";
|
||||||
|
if ( ! strcmp(hdr,"gbssStatusMsg") )
|
||||||
|
hdr = "Status";
|
||||||
|
|
||||||
|
//if ( ! strcmp(hdr,"gbssMatchingUrlFilter") )
|
||||||
|
// hdr = "Matching Expression";
|
||||||
|
// value is 'url ignored', 'will spider next round', 'error' or
|
||||||
|
// a numeric priority
|
||||||
|
// if ( ! strcmp(hdr,"gbssSpiderPriority") )
|
||||||
|
// hdr = "Matching Action";
|
||||||
|
|
||||||
|
// new columns
|
||||||
|
// if ( ! strcmp(hdr,"gbssAgeInIndex") )
|
||||||
|
// hdr = "Age in Index";
|
||||||
|
|
||||||
|
// if not transformed, then do not print it out
|
||||||
|
if ( ! strncmp(hdr,"gbss",4) )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
skipTransform:
|
||||||
|
if ( ! sb->safeStrcpy ( hdr ) ) return false;
|
||||||
|
|
||||||
// record the hash of each one for printing out further json
|
// record the hash of each one for printing out further json
|
||||||
// objects in the same order so columns are aligned!
|
// objects in the same order so columns are aligned!
|
||||||
int64_t h64 = hash64n ( ptrs[i] );
|
int64_t h64 = hash64n ( ptrs[i] );
|
||||||
@ -8145,6 +8220,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
|
|||||||
// returns false and sets g_errno on error
|
// returns false and sets g_errno on error
|
||||||
bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
|
bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
|
||||||
|
|
||||||
|
CollectionRec *cr = g_collectiondb.getRec ( st->m_collnum );
|
||||||
|
|
||||||
int32_t niceness = 0;
|
int32_t niceness = 0;
|
||||||
|
|
||||||
// parse the json
|
// parse the json
|
||||||
@ -8203,6 +8280,9 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
|
|||||||
int32_t slot = columnTable->getSlot ( &h64 ) ;
|
int32_t slot = columnTable->getSlot ( &h64 ) ;
|
||||||
// MUST be in there
|
// MUST be in there
|
||||||
if ( slot < 0 ) {
|
if ( slot < 0 ) {
|
||||||
|
// we do not transform all gbss fields any more for
|
||||||
|
// diffbot to avoid overpopulating the csv
|
||||||
|
if ( cr && cr->m_isCustomCrawl ) continue;
|
||||||
// do not core on this anymore...
|
// do not core on this anymore...
|
||||||
log("serps: json column not in table : %s",ji->m_name);
|
log("serps: json column not in table : %s",ji->m_name);
|
||||||
continue;
|
continue;
|
||||||
|
14
Spider.cpp
14
Spider.cpp
@ -4540,8 +4540,17 @@ bool SpiderColl::scanListForWinners ( ) {
|
|||||||
wsreq->m_hopCount = sreq->m_hopCount;
|
wsreq->m_hopCount = sreq->m_hopCount;
|
||||||
if ( wsreq->m_hopCount < sreq->m_hopCount )
|
if ( wsreq->m_hopCount < sreq->m_hopCount )
|
||||||
sreq->m_hopCount = wsreq->m_hopCount;
|
sreq->m_hopCount = wsreq->m_hopCount;
|
||||||
|
// and the min added time as well!
|
||||||
|
// get the oldest timestamp so
|
||||||
|
// gbssDiscoveredTime will be accurate.
|
||||||
|
if ( sreq->m_addedTime < wsreq->m_addedTime )
|
||||||
|
wsreq->m_addedTime = sreq->m_addedTime;
|
||||||
|
if ( wsreq->m_addedTime < sreq->m_addedTime )
|
||||||
|
sreq->m_addedTime = wsreq->m_addedTime;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
// are we lower priority? (or equal)
|
// are we lower priority? (or equal)
|
||||||
// smaller keys are HIGHER priority.
|
// smaller keys are HIGHER priority.
|
||||||
if(KEYCMP((char *)&wk,(char *)oldwk,
|
if(KEYCMP((char *)&wk,(char *)oldwk,
|
||||||
@ -12950,6 +12959,8 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
|
|||||||
// url to a different url priority!
|
// url to a different url priority!
|
||||||
if ( oldReq->m_siteHash32 != sreq->m_siteHash32 ||
|
if ( oldReq->m_siteHash32 != sreq->m_siteHash32 ||
|
||||||
oldReq->m_isNewOutlink != sreq->m_isNewOutlink ||
|
oldReq->m_isNewOutlink != sreq->m_isNewOutlink ||
|
||||||
|
// use hopcount now too!
|
||||||
|
oldReq->m_hopCount != sreq->m_hopCount ||
|
||||||
// makes a difference as far a m_minPubDate goes, because
|
// makes a difference as far a m_minPubDate goes, because
|
||||||
// we want to make sure not to delete that request that
|
// we want to make sure not to delete that request that
|
||||||
// has m_parentPrevSpiderTime
|
// has m_parentPrevSpiderTime
|
||||||
@ -12966,7 +12977,8 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
|
|||||||
goto addIt;
|
goto addIt;
|
||||||
// . if the same check who has the most recent added time
|
// . if the same check who has the most recent added time
|
||||||
// . if we are not the most recent, just do not add us
|
// . if we are not the most recent, just do not add us
|
||||||
if ( sreq->m_addedTime <= oldReq->m_addedTime ) continue;
|
// . no, now i want the oldest so we can do gbssDiscoveredTime
|
||||||
|
if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;
|
||||||
// otherwise, erase over him
|
// otherwise, erase over him
|
||||||
dst = restorePoint;
|
dst = restorePoint;
|
||||||
lastKey = prevLastKey;
|
lastKey = prevLastKey;
|
||||||
|
@ -27280,6 +27280,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
|
|||||||
jd.safePrintf("\"gbssCrawlRound\":%"INT32",\n",
|
jd.safePrintf("\"gbssCrawlRound\":%"INT32",\n",
|
||||||
cr->m_spiderRoundNum);
|
cr->m_spiderRoundNum);
|
||||||
|
|
||||||
|
if ( m_sreqValid ) {
|
||||||
|
// in Spider.cpp we try to set m_sreq's m_addedTime to the
|
||||||
|
// min of all the spider requests, and we try to ensure
|
||||||
|
// that in the case of deduping we preserve the one with
|
||||||
|
// the oldest time.
|
||||||
|
jd.safePrintf("\"gbssDiscoveredTime\":%"INT32",\n",
|
||||||
|
m_sreq.m_addedTime);
|
||||||
|
}
|
||||||
|
|
||||||
if ( m_isDupValid && m_isDup )
|
if ( m_isDupValid && m_isDup )
|
||||||
jd.safePrintf("\"gbssDupOfDocId\":%"INT64",\n",
|
jd.safePrintf("\"gbssDupOfDocId\":%"INT64",\n",
|
||||||
m_docIdWeAreADupOf);
|
m_docIdWeAreADupOf);
|
||||||
|
Loading…
Reference in New Issue
Block a user