the new urls.csv format is ready.

added url discovered time to the gbss status docs so we know
when we first found a url. also added it to the new urls.csv.
fixed spiderdb list deduping so it no longer discards the
oldest spider request, keeping the discovered time intact.
Matt 2015-04-15 12:13:27 -06:00
parent f0f8f0a967
commit 3191980f49
4 changed files with 121 additions and 20 deletions
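
In short: every spider request already carries an m_addedTime; what this
commit changes is which duplicate survives deduping. A minimal sketch of
the new keep-the-oldest rule, using simplified, hypothetical types (the
real logic is in dedupSpiderdbList in Spider.cpp, below):

#include <cstdint>
#include <map>
#include <string>

struct Req {
    std::string url;
    int32_t     addedTime; // timestamp when the url was first seen
};

// keep one request per url. before this commit the newest addedTime
// survived deduping; now the oldest does, so the discovered time
// (reported as gbssDiscoveredTime) is never lost
void dedup ( std::map<std::string,Req> &seen , const Req &r ) {
    auto it = seen.find ( r.url );
    if ( it == seen.end() ) { seen[r.url] = r; return; }
    if ( r.addedTime < it->second.addedTime )
        it->second = r; // the older discovery wins over the newer dup
}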

View File

@@ -132,10 +132,10 @@ bool File::rename ( char *newFilename ) {
}
/*
static File *s_activeHead = NULL;
static File *s_activeTail = NULL;
/*
void rmFileFromLinkedList ( File *f ) {
// excise from linked list of active files
if ( s_activeHead == f )

View File

@@ -7926,15 +7926,18 @@ int csvPtrCmp ( const void *a, const void *b ) {
if ( strcmp(pa,"title") == 0 ) return -1;
if ( strcmp(pb,"title") == 0 ) return 1;
// this is now taken care of by the 'supps[]' array below
// by prepending two digits before each field name
// put url first for spider status docs
if ( strcmp(pa,"gbssUrl") == 0 ) return -1;
if ( strcmp(pb,"gbssUrl") == 0 ) return 1;
// if ( strcmp(pa,"gbssUrl") == 0 ) return -1;
// if ( strcmp(pb,"gbssUrl") == 0 ) return 1;
if ( strcmp(pa,"gbssStatusMsg") == 0 ) return -1;
if ( strcmp(pb,"gbssStatusMsg") == 0 ) return 1;
// if ( strcmp(pa,"gbssStatusMsg") == 0 ) return -1;
// if ( strcmp(pb,"gbssStatusMsg") == 0 ) return 1;
if ( strcmp(pa,"gbssStatusCode") == 0 ) return -1;
if ( strcmp(pb,"gbssStatusCode") == 0 ) return 1;
// if ( strcmp(pa,"gbssStatusCode") == 0 ) return -1;
// if ( strcmp(pb,"gbssStatusCode") == 0 ) return 1;
// otherwise string compare
@@ -8052,30 +8055,40 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
// if doing spider status docs not all will have dupofdocid field
char *supps [] = {
"gbssFinalRedirectUrl",
"00gbssUrl",
"01gbssDocId",
"02gbssDiscoveredTime",
"03gbssDownloadStartTime",
"04gbssDownloadEndTime",
"05gbssContentType",
"06gbssContentLen",
"07gbssDupOfDocId" ,
"08gbssNumRedirects",
"09gbssFinalRedirectUrl",
"10gbssPercentContentChanged",
"11gbssCrawlRound",
"12gbssHopCount",
"13gbssIp",
"14gbssSentToDiffbotThisTime",
"15gbssDiffbotReplyMsg",
"16gbssStatusMsg",
"gbssHttpStatus",
"gbssWasIndexed",
"gbssAgeInIndex",
"gbssDupOfDocId" ,
"gbssPrevTotalNumIndexAttempts",
"gbssPrevTotalNumIndexSuccesses",
"gbssPrevTotalNumIndexFailures",
"gbssDownloadStartTime",
"gbssDownloadEndTime",
"gbssDownloadStartTimeMS",
"gbssDownloadEndTimeMS",
"gbssDownloadDurationMS",
"gbssIp",
"gbssIpLookupTimeMS",
"gbssSiteNumInlinks",
"gbssSiteRank",
"gbssPercentContentChanged",
"gbssLanguage",
"gbssContentType",
"gbssContentLen",
"gbssCrawlDelayMS",
"gbssDiffbotReplyCode",
"gbssDiffbotReplyMsg",
"gbssDiffbotLen",
"gbssDiffbotReplyResponseTimeMS",
"gbssDiffbotReplyRetries",
@@ -8085,10 +8098,14 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
for ( int32_t i = 0 ; supps[i] ; i++ ) {
// don't add these column headers to non spider status docs
if ( ct != CT_STATUS ) break;
int64_t h64 = hash64n ( supps[i] );
char *skip = supps[i];
// skip over the two order digits
if ( is_digit(skip[0]) ) skip += 2;
// don't include the order digits in the hash
int64_t h64 = hash64n ( skip );
if ( nameTable.isInTable ( &h64 ) ) continue;
// only show diffbot column headers for custom (diffbot) crawls
if ( strncmp(supps[i],"gbssDiffbot",11) == 0 &&
if ( strncmp(skip,"gbssDiffbot",11) == 0 &&
( ! cr || ! cr->m_isCustomCrawl ) )
break;
// record offset of the name for our hash table
@@ -8124,7 +8141,65 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
// now print them out as the header row
for ( int32_t i = 0 ; i < numPtrs ; i++ ) {
if ( i > 0 && ! sb->pushChar(',') ) return false;
if ( ! sb->safeStrcpy ( ptrs[i] ) ) return false;
char *hdr = ptrs[i];
// skip the two order digits
if ( ct == CT_STATUS && is_digit(hdr[0]) ) hdr += 2;
// now transform the hdr from gbss* into the old way
if ( ! cr->m_isCustomCrawl )
goto skipTransform;
if ( ! strcmp(hdr,"gbssUrl") )
hdr = "Url";
if ( ! strcmp(hdr,"gbssDocId") )
hdr = "Doc ID";
// when url was first discovered
if ( ! strcmp(hdr,"gbssDiscoveredTime") ) // need this!
hdr = "Url Discovered";
// when it was crawled this time
if ( ! strcmp(hdr,"gbssDownloadStartTime") )
hdr = "Crawled";
if ( ! strcmp(hdr,"gbssContentLen") )
hdr = "Page Length";
if ( ! strcmp(hdr,"gbssDupOfDocId") )
hdr = "Duplicate Of";
if ( ! strcmp(hdr,"gbssNumRedirects") )
hdr = "Redirects";
if ( ! strcmp(hdr,"gbssFinalRedirectUrl") )
hdr = "Redirected To";
if ( ! strcmp(hdr,"gbssCrawlRound") )
hdr = "Crawl Round";
if ( ! strcmp(hdr,"gbssHopCount") )
hdr = "Hop Count";
if ( ! strcmp(hdr,"gbssIp") )
hdr = "IP";
if ( ! strcmp(hdr,"gbssSentToDiffbotThisTime") )
hdr = "Process Attempted";
if ( ! strcmp(hdr,"gbssDiffbotReplyMsg") )
hdr = "Process Response";
if ( ! strcmp(hdr,"gbssStatusMsg") )
hdr = "Status";
//if ( ! strcmp(hdr,"gbssMatchingUrlFilter") )
// hdr = "Matching Expression";
// value is 'url ignored', 'will spider next round', 'error' or
// a numeric priority
// if ( ! strcmp(hdr,"gbssSpiderPriority") )
// hdr = "Matching Action";
// new columns
// if ( ! strcmp(hdr,"gbssAgeInIndex") )
// hdr = "Age in Index";
// if not transformed, then do not print it out
if ( ! strncmp(hdr,"gbss",4) )
continue;
skipTransform:
if ( ! sb->safeStrcpy ( hdr ) ) return false;
// record the hash of each one for printing out further json
// objects in the same order so columns are aligned!
int64_t h64 = hash64n ( ptrs[i] );
@@ -8145,6 +8220,8 @@ bool printCSVHeaderRow ( SafeBuf *sb , State0 *st , int32_t ct ) {
// returns false and sets g_errno on error
bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
CollectionRec *cr = g_collectiondb.getRec ( st->m_collnum );
int32_t niceness = 0;
// parse the json
@@ -8203,6 +8280,9 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
int32_t slot = columnTable->getSlot ( &h64 ) ;
// MUST be in there
if ( slot < 0 ) {
// we do not transform all gbss fields any more for
// diffbot to avoid overpopulating the csv
if ( cr && cr->m_isCustomCrawl ) continue;
// do not core on this anymore...
log("serps: json column not in table : %s",ji->m_name);
continue;
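
A note on the supps[] change above: the two digits prepended to each field
name ("00gbssUrl" through "16gbssStatusMsg") exist only to force the CSV
column order, replacing the hardcoded comparisons csvPtrCmp used to do; the
digits are skipped again before the name is hashed or printed. A rough,
hypothetical sketch of the pattern (the real code uses hash64n and SafeBuf):

#include <cctype>
#include <cstdio>

static const char *supps[] = {
    "00gbssUrl", "01gbssDocId", "02gbssDiscoveredTime", NULL
};

int main ( ) {
    for ( int i = 0 ; supps[i] ; i++ ) {
        const char *name = supps[i];
        // the two leading digits only fix the column order;
        // strip them before hashing or printing the header
        if ( isdigit((unsigned char)name[0]) ) name += 2;
        printf ( "%s%s" , i ? "," : "" , name );
    }
    printf ( "\n" ); // prints: gbssUrl,gbssDocId,gbssDiscoveredTime
    return 0;
}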

View File

@@ -4540,8 +4540,17 @@ bool SpiderColl::scanListForWinners ( ) {
wsreq->m_hopCount = sreq->m_hopCount;
if ( wsreq->m_hopCount < sreq->m_hopCount )
sreq->m_hopCount = wsreq->m_hopCount;
// and the min added time as well!
// get the oldest timestamp so
// gbssDiscoveredTime will be accurate.
if ( sreq->m_addedTime < wsreq->m_addedTime )
wsreq->m_addedTime = sreq->m_addedTime;
if ( wsreq->m_addedTime < sreq->m_addedTime )
sreq->m_addedTime = wsreq->m_addedTime;
}
// are we lower priority? (or equal)
// smaller keys are HIGHER priority.
if(KEYCMP((char *)&wk,(char *)oldwk,
@@ -12950,6 +12959,8 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
// url to a different url priority!
if ( oldReq->m_siteHash32 != sreq->m_siteHash32 ||
oldReq->m_isNewOutlink != sreq->m_isNewOutlink ||
// use hopcount now too!
oldReq->m_hopCount != sreq->m_hopCount ||
// makes a difference as far as m_minPubDate goes, because
// we want to make sure not to delete that request that
// has m_parentPrevSpiderTime
@@ -12966,7 +12977,8 @@ void dedupSpiderdbList ( RdbList *list , int32_t niceness , bool removeNegRecs )
goto addIt;
// . if the same check who has the most recent added time
// . if we are not the most recent, just do not add us
if ( sreq->m_addedTime <= oldReq->m_addedTime ) continue;
// . no, now i want the oldest so we can do gbssDiscoveredTime
if ( sreq->m_addedTime >= oldReq->m_addedTime ) continue;
// otherwise, erase over him
dst = restorePoint;
lastKey = prevLastKey;
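
The scanListForWinners hunk above writes m_addedTime in both directions:
either request may be the one that survives, so both must end up carrying
the earliest timestamp. The two ifs are just a two-way minimum; a
self-contained sketch with a stand-in struct (only the relevant field):

#include <cstdint>

struct SpiderRequest { int32_t m_addedTime; };

// equivalent to the pair of ifs in scanListForWinners: propagate the
// minimum addedTime into both requests so whichever one survives
// later deduping still reports the true discovery time
void propagateMinAddedTime ( SpiderRequest *sreq , SpiderRequest *wsreq ) {
    int32_t minAdded = ( sreq->m_addedTime < wsreq->m_addedTime )
                     ? sreq->m_addedTime : wsreq->m_addedTime;
    sreq->m_addedTime  = minAdded;
    wsreq->m_addedTime = minAdded;
}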

View File

@@ -27280,6 +27280,15 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\"gbssCrawlRound\":%"INT32",\n",
cr->m_spiderRoundNum);
if ( m_sreqValid ) {
// in Spider.cpp we try to set m_sreq's m_addedTime to the
// min of all the spider requests, and we try to ensure
// that in the case of deduping we preserve the one with
// the oldest time.
jd.safePrintf("\"gbssDiscoveredTime\":%"INT32",\n",
m_sreq.m_addedTime);
}
if ( m_isDupValid && m_isDup )
jd.safePrintf("\"gbssDupOfDocId\":%"INT64",\n",
m_docIdWeAreADupOf);
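
Putting the pieces together: for a custom (diffbot) crawl, the seventeen
numbered supps[] entries, after the header transform and with the unmapped
gbss* names dropped, should come out as roughly this urls.csv header row
(other fields present in the status docs can add further columns):

Url,Doc ID,Url Discovered,Crawled,Page Length,Duplicate Of,Redirects,Redirected To,Crawl Round,Hop Count,IP,Process Attempted,Process Response,Status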