the new urls.csv format is ready.

Added the URL discovered time to gbssdocs so we know when we first found a URL, and also added it to the new urls.csv. Fixed spiderdb list deduping so that it no longer discards the oldest spider request, which keeps our discovered time intact.
2024-10-04 12:17:35 +03:00 · 2015-04-15 12:13:27 -06:00 · 2015-04-15 12:13:27 -06:00 · 3191980f49
commit 3191980f49
parent f0f8f0a967
4 changed files with 121 additions and 20 deletions
--- a/File.cpp
+++ b/File.cpp
@ -132,10 +132,10 @@ bool File::rename ( char *newFilename ) {
 }
 /*
 static File *s_activeHead = NULL;
 static File *s_activeTail = NULL;
 /*
 void rmFileFromLinkedList ( File *f ) {
 	// excise from linked list of active files
 	if ( s_activeHead == f )
--- a/PageResults.cpp
+++ b/PageResults.cpp
@ -7926,15 +7926,18 @@ int csvPtrCmp ( const void *a, const void *b ) {
 	if ( strcmp(pa,"title") == 0 ) return -1;
 	if ( strcmp(pb,"title") == 0 ) return  1;
 	// this is now taken care of from the 'supps[]' array below
 	// by prepending two digits before each field name
 	// put url first for spider status docs
-	if ( strcmp(pa,"gbssUrl") == 0 ) return -1;
+	// if ( strcmp(pa,"gbssUrl") == 0 ) return -1;
-	if ( strcmp(pb,"gbssUrl") == 0 ) return  1;
+	// if ( strcmp(pb,"gbssUrl") == 0 ) return  1;
-	if ( strcmp(pa,"gbssStatusMsg") == 0 ) return -1;
+	// if ( strcmp(pa,"gbssStatusMsg") == 0 ) return -1;
-	if ( strcmp(pb,"gbssStatusMsg") == 0 ) return  1;
+	// if ( strcmp(pb,"gbssStatusMsg") == 0 ) return  1;
-	if ( strcmp(pa,"gbssStatusCode") == 0 ) return -1;
+	// if ( strcmp(pa,"gbssStatusCode") == 0 ) return -1;
-	if ( strcmp(pb,"gbssStatusCode") == 0 ) return  1;
+	// if ( strcmp(pb,"gbssStatusCode") == 0 ) return  1;
 	// otherwise string compare
@ -8052,30 +8055,40 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
 	// if doing spider status docs not all will have dupofdocid field
 	char *supps [] = { 
-		"gbssFinalRedirectUrl",
+		"00gbssUrl",
 		"01gbssDocId",
 		"02gbssDiscoveredTime",
 		"03gbssDownloadStartTime",
 		"04gbssDownloadEndTime",
 		"05gbssContentType",
 		"06gbssContentLen",
 		"07gbssDupOfDocId" ,
 		"08gbssNumRedirects",
 		"09gbssFinalRedirectUrl",
 		"10gbssPercentContentChanged",
 		"11gbssCrawlRound",
 		"12gbssHopCount",
 		"13gbssIp",
 		"14gbssSentToDiffbotThisTime",
 		"15gbssDiffbotReplyMsg",
 		"16gbssStatusMsg",
 		"gbssHttpStatus",
 		"gbssWasIndexed",
 		"gbssAgeInIndex",
 		"gbssDupOfDocId" ,
 		"gbssPrevTotalNumIndexAttempts",
 		"gbssPrevTotalNumIndexSuccesses",
 		"gbssPrevTotalNumIndexFailures",
 		"gbssDownloadStartTime",
 		"gbssDownloadEndTime",
 		"gbssDownloadStartTimeMS",
 		"gbssDownloadEndTimeMS",
 		"gbssDownloadDurationMS",
 		"gbssIp",
 		"gbssIpLookupTimeMS",
 		"gbssSiteNumInlinks",
 		"gbssSiteRank",
 		"gbssPercentContentChanged",
 		"gbssLanguage",
 		"gbssContentType",
 		"gbssContentLen",
 		"gbssCrawlDelayMS",
 		"gbssDiffbotReplyCode",
 		"gbssDiffbotReplyMsg",
 		"gbssDiffbotLen",
 		"gbssDiffbotReplyResponseTimeMS",
 		"gbssDiffbotReplyRetries",
@ -8085,10 +8098,14 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
 	for ( int32_t i = 0 ; supps[i] ; i++ ) {
 		// don't add these column headers to non spider status docs
 		if ( ct != CT_STATUS ) break;
-		int64_t h64 = hash64n ( supps[i] );
+		char *skip = supps[i];
 		// skip over the two order digits
 		if ( is_digit(skip[0]) ) skip += 2;
 		// don't include the order digits in the hash
 		int64_t h64 = hash64n ( skip );
 		if ( nameTable.isInTable ( &h64 ) ) continue;
 		// only show diffbot column headers for custom (diffbot) crawls
-		if ( strncmp(supps[i],"gbssDiffbot",11) == 0 &&
+		if ( strncmp(skip,"gbssDiffbot",11) == 0 &&
 		     ( ! cr || ! cr->m_isCustomCrawl ) )
 			break;
 		// record offset of the name for our hash table
@ -8124,7 +8141,65 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
 	// now print them out as the header row
 	for ( int32_t i = 0 ; i < numPtrs ; i++ ) {
 		if ( i > 0 && ! sb->pushChar(',') ) return false;
-		if ( ! sb->safeStrcpy ( ptrs[i] ) ) return false;
+
 		char *hdr = ptrs[i];
 		// skip the two order digits
 		if ( ct == CT_STATUS && is_digit(hdr[0]) ) hdr += 2;
 		// now transform the hdr from gbss* into the old way
 		if ( ! cr->m_isCustomCrawl )
 			goto skipTransform;
 		if ( ! strcmp(hdr,"gbssUrl") ) 
 			hdr = "Url";
 		if ( ! strcmp(hdr,"gbssDocId") ) 
 			hdr = "Doc ID";
 		// when url was first discovered
 		if ( ! strcmp(hdr,"gbssDiscoveredTime") ) // need this!
 			hdr = "Url Discovered";
 		// when it was crawled this time
 		if ( ! strcmp(hdr,"gbssDownloadStartTime") ) 
 			hdr = "Crawled";
 		if ( ! strcmp(hdr,"gbssContentLen") ) 
 			hdr = "Page Length";
 		if ( ! strcmp(hdr,"gbssDupOfDocId") ) 
 			hdr = "Duplicate Of";
 		if ( ! strcmp(hdr,"gbssNumRedirects") ) 
 			hdr = "Redirects";
 		if ( ! strcmp(hdr,"gbssFinalRedirectUrl") )
 			hdr = "Redirected To";
 		if ( ! strcmp(hdr,"gbssCrawlRound") ) 
 			hdr = "Crawl Round";
 		if ( ! strcmp(hdr,"gbssHopCount") ) 
 			hdr = "Hop Count";
 		if ( ! strcmp(hdr,"gbssIp") ) 
 			hdr = "IP";
 		if ( ! strcmp(hdr,"gbssSentToDiffbotThisTime") ) 
 			hdr = "Process Attempted";
 		if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )
 			hdr = "Process Response";
 		if ( ! strcmp(hdr,"gbssStatusMsg") ) 
 			hdr = "Status";
 		//if ( ! strcmp(hdr,"gbssMatchingUrlFilter") ) 
 		//	hdr = "Matching Expression";
 		// value is 'url ignored', 'will spider next round', 'error' or 
 		// a numeric priority
 		// if ( ! strcmp(hdr,"gbssSpiderPriority") ) 
 		// 	hdr = "Matching Action";
 		// new columns
 		// if ( ! strcmp(hdr,"gbssAgeInIndex") ) 
 		// 	hdr = "Age in Index";
 		// if not transformed, then do not print it out
 		if ( ! strncmp(hdr,"gbss",4) )
 			continue;
 	skipTransform:
 		if ( ! sb->safeStrcpy ( hdr ) ) return false;
 		// record the hash of each one for printing out further json
 		// objects in the same order so columns are aligned!
 		int64_t h64 = hash64n ( ptrs[i] );
@ -8145,6 +8220,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
 // returns false and sets g_errno on error
 bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
 	CollectionRec *cr = g_collectiondb.getRec ( st->m_collnum );
 	int32_t niceness = 0;
 	// parse the json
@ -8203,6 +8280,9 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
 		int32_t slot = columnTable->getSlot ( &h64 ) ;
 		// MUST be in there
 		if ( slot < 0 ) { 
 			// we do not transform all gbss fields any more for
 			// diffbot to avoid overpopulating the csv
 			if ( cr && cr->m_isCustomCrawl ) continue;
 			// do not core on this anymore...
 			log("serps: json column not in table : %s",ji->m_name);
 			continue;
--- a/Spider.cpp
+++ b/Spider.cpp
@ -4540,8 +4540,17 @@ bool SpiderColl::scanListForWinners ( ) {
 					wsreq->m_hopCount = sreq->m_hopCount;
 				if ( wsreq->m_hopCount < sreq->m_hopCount )
 					sreq->m_hopCount = wsreq->m_hopCount;
 				// and the min added time as well!
 				// get the oldest timestamp so
 				// gbssDiscoveryTime will be accurate.
 				if ( sreq->m_addedTime < wsreq->m_addedTime )
 					wsreq->m_addedTime = sreq->m_addedTime;
 				if ( wsreq->m_addedTime < sreq->m_addedTime )
 					sreq->m_addedTime = wsreq->m_addedTime;
 			}
 			// are we lower priority? (or equal)
 			// smaller keys are HIGHER priority.
 			if(KEYCMP((char *)&wk,(char *)oldwk,
@ -12950,6 +12959,8 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
 		// url to a different url priority!
 		if ( oldReq->m_siteHash32    != sreq->m_siteHash32    ||
 		     oldReq->m_isNewOutlink  != sreq->m_isNewOutlink  ||
 		     //  use hopcount now too!
 		     oldReq->m_hopCount      != sreq->m_hopCount      ||
 		     // makes a difference as far a m_minPubDate goes, because
 		     // we want to make sure not to delete that request that
 		     // has m_parentPrevSpiderTime
@ -12966,7 +12977,8 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
 			goto addIt;
 		// . if the same check who has the most recent added time
 		// . if we are not the most recent, just do not add us
-		if ( sreq->m_addedTime <= oldReq->m_addedTime ) continue;
+		// . no, now i want the oldest so we can do gbssDiscoveryTime
 		if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;
 		// otherwise, erase over him
 		dst     = restorePoint;
 		lastKey = prevLastKey;
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -27280,6 +27280,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
 		jd.safePrintf("\"gbssCrawlRound\":%"INT32",\n",
 			      cr->m_spiderRoundNum);
 	if ( m_sreqValid ) {
 		// in Spider.cpp we try to set m_sreq's m_addedTime to the
 		// min of all the spider requests, and we try to ensure
 		// that in the case of deduping we preserve the one with
 		// the oldest time.
 		jd.safePrintf("\"gbssDiscoveredTime\":%"INT32",\n",
 			      m_sreq.m_addedTime);
 	}
 	if ( m_isDupValid && m_isDup )
 		jd.safePrintf("\"gbssDupOfDocId\":%"INT64",\n",
 			      m_docIdWeAreADupOf);