Merge branch 'diffbot-dan' into diffbot-testing

Matt Wells 2014-03-27 12:19:50 -07:00
commit c1671015c8
4 changed files with 40 additions and 7 deletions


@@ -1018,6 +1018,18 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
//collnum_t oldCollnum = cr->m_collnum;
//collnum_t newCollnum = m_numRecs;
+ // in case of a bulk job, be sure to save the list of spots:
+ // copy the existing list to /tmp, from where it will later be transferred back to the new collection folder
+ char oldbulkurlsname[1036];
+ snprintf(oldbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)oldCollnum);
+ char newbulkurlsname[1036];
+ snprintf(newbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)newCollnum);
+ char tmpbulkurlsname[1036];
+ snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%li.bulkurls.txt",cr->m_coll,(long)oldCollnum);
+ if (cr->m_isCustomCrawl == 2)
+ rename( oldbulkurlsname , tmpbulkurlsname );
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
if ( sc ) {
@@ -1127,6 +1139,9 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// save coll.conf to new directory
cr->save();
+ // be sure to copy back the bulk urls for bulk jobs
+ if (cr->m_isCustomCrawl == 2)
+ rename( tmpbulkurlsname, newbulkurlsname );
// and clear the robots.txt cache in case we recently spidered a
// robots.txt, we don't want to use it, we want to use the one we
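To make the round trip above concrete, here is a small self-contained sketch that expands the three snprintf formats with made-up values (directory /var/gigablast/, collection name myjob, old collnum 5, new collnum 6). The real values come from g_hostdb.m_dir, cr->m_coll and the collnum_t arguments; this snippet is illustrative only and is not part of the commit.

#include <stdio.h>

int main ( void ) {
	// hypothetical stand-ins for g_hostdb.m_dir, cr->m_coll and the collnum_t values
	const char *dir  = "/var/gigablast/";
	const char *coll = "myjob";
	long oldCollnum  = 5;
	long newCollnum  = 6;
	char oldname[1036], tmpname[1036], newname[1036];
	snprintf(oldname, 1036, "%scoll.%s.%li/bulkurls.txt", dir, coll, oldCollnum);
	snprintf(tmpname, 1036, "/tmp/coll.%s.%li.bulkurls.txt", coll, oldCollnum);
	snprintf(newname, 1036, "%scoll.%s.%li/bulkurls.txt", dir, coll, newCollnum);
	// before the reset: stash the bulk url list outside the collection dir
	printf("rename %s -> %s\n", oldname, tmpname);
	// after coll.conf is saved into the new dir: move the list back in place
	printf("rename %s -> %s\n", tmpname, newname);
	return 0;
}

It prints the stash step (old collection dir to /tmp) followed by the restore step (/tmp to the new collection dir), matching the order of the two rename() calls in the hunks above.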


@@ -167,7 +167,7 @@ case EFAKEFIRSTIP: return "Fake firstIp";
case EBADHOSTSCONF: return "A hosts.conf is out of sync";
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Crawl type mismatch";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );


@@ -875,6 +875,19 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
);
// but default to csv
else {
+ if (cr && cr->m_isCustomCrawl == 1 && sreq && !sreq->m_isAddUrl && !sreq->m_isInjecting) {
+ if (cr->m_diffbotUrlCrawlPattern.m_length == 0
+ && cr->m_diffbotUrlProcessPattern.m_length == 0) {
+ // If this is a crawl and there are no urlCrawlPattern or urlProcessPattern values, only return URLs from the seed domain
+ if (sreq && !sreq->m_sameDom)
+ continue;
+ } else {
+ // TODO: if we get here, we have a crawl with a custom urlCrawlPattern and/or custom
+ // urlProcessPattern. We have to check if the current url matches the pattern
+ }
+ }
sb->safePrintf("\"%s\",\"%s\","
, sreq->m_url
, as
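The TODO above leaves the custom-pattern case open. As a hedged sketch of the missing check, the snippet below filters a url with a plain substring test; the helper name, the url, and the patterns are all hypothetical, and the codebase's real urlCrawlPattern/urlProcessPattern handling may be richer (for example, several alternatives in one pattern), so this only illustrates the shape of the check.

#include <stdio.h>
#include <string.h>

// illustrative stand-in: 1 if url matches the pattern, 0 otherwise;
// an empty pattern is treated as "match everything"
static int urlMatchesPattern ( const char *url , const char *pattern ) {
	if ( ! pattern || ! pattern[0] ) return 1;
	return strstr ( url , pattern ) != NULL;
}

int main ( void ) {
	const char *url = "http://www.example.com/products/item123";
	printf("%d\n", urlMatchesPattern(url, "/products/")); // 1: url would be returned
	printf("%d\n", urlMatchesPattern(url, "/blog/"));     // 0: url would be skipped
	return 0;
}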
@@ -2108,7 +2121,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
char bulkurlsfile[1024];
snprintf(bulkurlsfile, 1024, "%scoll.%s.%li/bulkurls.txt", g_hostdb.m_dir , coll , (long)st->m_collnum );
- if ( spots ) {
+ if ( spots && cr && cr->m_isCustomCrawl == 2 ) {
log("crawlbot: got spots (len=%li) to add coll=%s (%li)",
(long)gbstrlen(spots),coll,(long)st->m_collnum);
FILE *f = fopen(bulkurlsfile, "w");
@@ -2120,7 +2133,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
}
// if restart flag is on and the file with bulk urls exists, get spots from there
- if ( !spots && restartColl ) {
+ if ( !spots && restartColl && cr && cr->m_isCustomCrawl ) {
FILE *f = fopen(bulkurlsfile, "r");
if (f != NULL) {
fseek(f, 0, SEEK_END);
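The restart branch above opens bulkurls.txt and seeks to the end to size it before reading; the hunk is cut off at that point, so here is a hedged, self-contained sketch of the usual shape of such a read-back. The helper name, the path, and the error handling are illustrative assumptions, not the committed code.

#include <stdio.h>
#include <stdlib.h>

// slurp a whole file into a NUL-terminated heap buffer; returns NULL on error
static char *readWholeFile ( const char *path , long *lenOut ) {
	FILE *f = fopen ( path , "r" );
	if ( ! f ) return NULL;
	fseek ( f , 0 , SEEK_END );        // size the file...
	long size = ftell ( f );
	fseek ( f , 0 , SEEK_SET );        // ...then rewind and read it all
	char *buf = (char *)malloc ( size + 1 );
	if ( ! buf ) { fclose ( f ); return NULL; }
	long n = (long)fread ( buf , 1 , size , f );
	fclose ( f );
	buf[n] = '\0';
	if ( lenOut ) *lenOut = n;
	return buf;
}

int main ( void ) {
	long len = 0;
	// hypothetical path; the real code builds it with snprintf as shown above
	char *spots = readWholeFile ( "/tmp/bulkurls.txt" , &len );
	if ( spots ) { printf ( "read %li bytes of spots\n" , len ); free ( spots ); }
	return 0;
}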
@@ -3946,7 +3959,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// . do not add dups into m_diffbotSeeds safebuf
// . return 0 if not in table, 1 if in table. -1 on error adding to table.
- long isInSeedBuf ( CollectionRec *cr , Url *url ) {
+ long isInSeedBuf ( CollectionRec *cr , char *url, int len ) {
HashTableX *ht = &cr->m_seedHashTable;
@@ -3973,7 +3986,7 @@ long isInSeedBuf ( CollectionRec *cr , Url *url ) {
}
// is this url in the hash table?
- long long u64 = hash64 ( url->getUrl() , url->getUrlLen() );
+ long long u64 = hash64 ( url, len );
if ( ht->isInTable ( &u64 ) ) return 1;
@@ -4072,7 +4085,7 @@ bool getSpiderRequestMetaList ( char *doc ,
if ( ! cr ) continue;
// do not add dups into m_diffbotSeeds safebuf
long status = isInSeedBuf ( cr , &url );
long status = isInSeedBuf ( cr , saved , end - saved );
// error?
if ( status == -1 ) {
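The isInSeedBuf() change above swaps the Url object for a raw pointer/length pair so callers such as getSpiderRequestMetaList() can pass a buffer slice (saved, end - saved) directly. Below is a hedged, self-contained analogue of that hash-based seed dedup, using std::unordered_set and std::hash as stand-ins for the codebase's HashTableX and hash64(); the names and behavior are illustrative only, not the committed implementation.

#include <cstdio>
#include <cstring>
#include <string>
#include <unordered_set>

// return false if this url was seen before; true (and remember it) if it is new
static bool addSeedIfNew ( std::unordered_set<unsigned long long> &seen ,
                           const char *url , size_t len ) {
	unsigned long long h = std::hash<std::string>()( std::string ( url , len ) );
	if ( seen.count ( h ) ) return false;   // duplicate seed, skip it
	seen.insert ( h );
	return true;                            // new seed, caller may append it
}

int main ( void ) {
	std::unordered_set<unsigned long long> seen;
	const char *u = "http://www.example.com/";
	printf ( "%d\n" , (int)addSeedIfNew ( seen , u , strlen ( u ) ) ); // 1: new
	printf ( "%d\n" , (int)addSeedIfNew ( seen , u , strlen ( u ) ) ); // 0: dup
	return 0;
}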


@@ -2148,8 +2148,13 @@ bool printResult ( State0 *st, long ix ) {
// so fix that shit here...
//float f = mr->m_lastSpidered;
//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
sb->safePrintf(",\"lastCrawlTimeUTC\":%li}\n",
sb->safePrintf(",\"lastCrawlTimeUTC\":%li\n",
mr->m_lastSpidered);
+ // also include a timestamp field with an RFC 1123 formatted date
+ char timestamp[50];
+ struct tm *ptm = gmtime ( &mr->m_lastSpidered );
+ strftime(timestamp, 50, "%a, %d %b %Y %X %Z", ptm);
+ sb->safePrintf(",\"timestamp\":\"%s\"}\n", timestamp);
}
//mr->size_content );
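For reference, here is a small self-contained test of the timestamp formatting added above. The epoch value is a made-up stand-in for mr->m_lastSpidered (it happens to be this commit's own date), and note that %X and %Z are locale and platform dependent; with glibc's C locale the result looks like an RFC 1123 date ending in GMT.

#include <stdio.h>
#include <time.h>

int main ( void ) {
	// hypothetical stand-in for mr->m_lastSpidered: 2014-03-27 19:19:50 UTC
	time_t lastSpidered = 1395947990;
	char timestamp[50];
	struct tm *ptm = gmtime ( &lastSpidered );
	strftime ( timestamp , 50 , "%a, %d %b %Y %X %Z" , ptm );
	// with glibc's C locale this prints something like
	//   ,"timestamp":"Thu, 27 Mar 2014 19:19:50 GMT"
	printf ( ",\"timestamp\":\"%s\"\n" , timestamp );
	return 0;
}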