Merge branch 'diffbot-dan' into diffbot-testing

Matt Wells 2014-03-27 12:19:50 -07:00
commit c1671015c8
4 changed files with 40 additions and 7 deletions


@@ -1018,6 +1018,18 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
//collnum_t oldCollnum = cr->m_collnum;
//collnum_t newCollnum = m_numRecs;
+ // in case of a bulk job, be sure to save the list of spots:
+ // copy the existing list to /tmp, from where it will later be transferred back to the new collection folder
+ char oldbulkurlsname[1036];
+ snprintf(oldbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)oldCollnum);
+ char newbulkurlsname[1036];
+ snprintf(newbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)newCollnum);
+ char tmpbulkurlsname[1036];
+ snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%li.bulkurls.txt",cr->m_coll,(long)oldCollnum);
+ if (cr->m_isCustomCrawl == 2)
+ rename( oldbulkurlsname , tmpbulkurlsname );
// reset spider info
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
if ( sc ) {
@@ -1127,6 +1139,9 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
// save coll.conf to new directory
cr->save();
+ // be sure to copy back the bulk urls for bulk jobs
+ if (cr->m_isCustomCrawl == 2)
+ rename( tmpbulkurlsname, newbulkurlsname );
// and clear the robots.txt cache in case we recently spidered a
// robots.txt, we don't want to use it, we want to use the one we
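To make the round trip above concrete, here is a small self-contained sketch that expands the three snprintf formats with made-up values (directory /var/gigablast/, collection name myjob, old collnum 5, new collnum 6). The real values come from g_hostdb.m_dir, cr->m_coll and the collnum_t arguments; this snippet is illustrative only and is not part of the commit.

#include <stdio.h>

int main ( void ) {
	// hypothetical stand-ins for g_hostdb.m_dir, cr->m_coll and the collnum_t values
	const char *dir  = "/var/gigablast/";
	const char *coll = "myjob";
	long oldCollnum  = 5;
	long newCollnum  = 6;
	char oldname[1036], tmpname[1036], newname[1036];
	snprintf(oldname, 1036, "%scoll.%s.%li/bulkurls.txt", dir, coll, oldCollnum);
	snprintf(tmpname, 1036, "/tmp/coll.%s.%li.bulkurls.txt", coll, oldCollnum);
	snprintf(newname, 1036, "%scoll.%s.%li/bulkurls.txt", dir, coll, newCollnum);
	// before the reset: stash the bulk url list outside the collection dir
	printf("rename %s -> %s\n", oldname, tmpname);
	// after coll.conf is saved into the new dir: move the list back in place
	printf("rename %s -> %s\n", tmpname, newname);
	return 0;
}

It prints the stash step (old collection dir to /tmp) followed by the restore step (/tmp to the new collection dir), matching the order of the two rename() calls in the hunks above.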


@@ -167,7 +167,7 @@ case EFAKEFIRSTIP: return "Fake firstIp";
case EBADHOSTSCONF: return "A hosts.conf is out of sync";
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Crawl type mismatch";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );


@@ -875,6 +875,19 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
);
// but default to csv
else {
+ if (cr && cr->m_isCustomCrawl == 1 && sreq && !sreq->m_isAddUrl && !sreq->m_isInjecting) {
+ if (cr->m_diffbotUrlCrawlPattern.m_length == 0
+ && cr->m_diffbotUrlProcessPattern.m_length == 0) {
+ // If this is a crawl and there are no urlCrawlPattern or urlProcessPattern values, only return URLs from the seed domain
+ if (sreq && !sreq->m_sameDom)
+ continue;
+ } else {
+ // TODO: if we get here, we have a crawl with a custom urlCrawlPattern and/or custom
+ // urlProcessPattern. We have to check if the current url matches the pattern
+ }
+ }
sb->safePrintf("\"%s\",\"%s\","
, sreq->m_url
, as
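The TODO above leaves the custom-pattern case open. As a hedged sketch of the missing check, the snippet below filters a url with a plain substring test; the helper name, the url, and the patterns are all hypothetical, and the codebase's real urlCrawlPattern/urlProcessPattern handling may be richer (for example, several alternatives in one pattern), so this only illustrates the shape of the check.

#include <stdio.h>
#include <string.h>

// illustrative stand-in: 1 if url matches the pattern, 0 otherwise;
// an empty pattern is treated as "match everything"
static int urlMatchesPattern ( const char *url , const char *pattern ) {
	if ( ! pattern || ! pattern[0] ) return 1;
	return strstr ( url , pattern ) != NULL;
}

int main ( void ) {
	const char *url = "http://www.example.com/products/item123";
	printf("%d\n", urlMatchesPattern(url, "/products/")); // 1: url would be returned
	printf("%d\n", urlMatchesPattern(url, "/blog/"));     // 0: url would be skipped
	return 0;
}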
@@ -2108,7 +2121,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
char bulkurlsfile[1024];
snprintf(bulkurlsfile, 1024, "%scoll.%s.%li/bulkurls.txt", g_hostdb.m_dir , coll , (long)st->m_collnum );
- if ( spots ) {
+ if ( spots && cr && cr->m_isCustomCrawl == 2 ) {
log("crawlbot: got spots (len=%li) to add coll=%s (%li)",
(long)gbstrlen(spots),coll,(long)st->m_collnum);
FILE *f = fopen(bulkurlsfile, "w");
@@ -2120,7 +2133,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
}
// if restart flag is on and the file with bulk urls exists, get spots from there
- if ( !spots && restartColl ) {
+ if ( !spots && restartColl && cr && cr->m_isCustomCrawl ) {
FILE *f = fopen(bulkurlsfile, "r");
if (f != NULL) {
fseek(f, 0, SEEK_END);
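The restart branch above opens bulkurls.txt and seeks to the end to size it before reading; the hunk is cut off at that point, so here is a hedged, self-contained sketch of the usual shape of such a read-back. The helper name, the path, and the error handling are illustrative assumptions, not the committed code.

#include <stdio.h>
#include <stdlib.h>

// slurp a whole file into a NUL-terminated heap buffer; returns NULL on error
static char *readWholeFile ( const char *path , long *lenOut ) {
	FILE *f = fopen ( path , "r" );
	if ( ! f ) return NULL;
	fseek ( f , 0 , SEEK_END );        // size the file...
	long size = ftell ( f );
	fseek ( f , 0 , SEEK_SET );        // ...then rewind and read it all
	char *buf = (char *)malloc ( size + 1 );
	if ( ! buf ) { fclose ( f ); return NULL; }
	long n = (long)fread ( buf , 1 , size , f );
	fclose ( f );
	buf[n] = '\0';
	if ( lenOut ) *lenOut = n;
	return buf;
}

int main ( void ) {
	long len = 0;
	// hypothetical path; the real code builds it with snprintf as shown above
	char *spots = readWholeFile ( "/tmp/bulkurls.txt" , &len );
	if ( spots ) { printf ( "read %li bytes of spots\n" , len ); free ( spots ); }
	return 0;
}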
@@ -3946,7 +3959,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
// . do not add dups into m_diffbotSeeds safebuf
// . return 0 if not in table, 1 if in table. -1 on error adding to table.
- long isInSeedBuf ( CollectionRec *cr , Url *url ) {
+ long isInSeedBuf ( CollectionRec *cr , char *url, int len ) {
HashTableX *ht = &cr->m_seedHashTable;
@@ -3973,7 +3986,7 @@ long isInSeedBuf ( CollectionRec *cr , Url *url ) {
}
// is this url in the hash table?
- long long u64 = hash64 ( url->getUrl() , url->getUrlLen() );
+ long long u64 = hash64 ( url, len );
if ( ht->isInTable ( &u64 ) ) return 1;
@@ -4072,7 +4085,7 @@ bool getSpiderRequestMetaList ( char *doc ,
if ( ! cr ) continue;
// do not add dups into m_diffbotSeeds safebuf
long status = isInSeedBuf ( cr , &url );
long status = isInSeedBuf ( cr , saved , end - saved );
// error?
if ( status == -1 ) {
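The isInSeedBuf() change above swaps the Url object for a raw pointer/length pair so callers such as getSpiderRequestMetaList() can pass a buffer slice (saved, end - saved) directly. Below is a hedged, self-contained analogue of that hash-based seed dedup, using std::unordered_set and std::hash as stand-ins for the codebase's HashTableX and hash64(); the names and behavior are illustrative only, not the committed implementation.

#include <cstdio>
#include <cstring>
#include <string>
#include <unordered_set>

// return false if this url was seen before; true (and remember it) if it is new
static bool addSeedIfNew ( std::unordered_set<unsigned long long> &seen ,
                           const char *url , size_t len ) {
	unsigned long long h = std::hash<std::string>()( std::string ( url , len ) );
	if ( seen.count ( h ) ) return false;   // duplicate seed, skip it
	seen.insert ( h );
	return true;                            // new seed, caller may append it
}

int main ( void ) {
	std::unordered_set<unsigned long long> seen;
	const char *u = "http://www.example.com/";
	printf ( "%d\n" , (int)addSeedIfNew ( seen , u , strlen ( u ) ) ); // 1: new
	printf ( "%d\n" , (int)addSeedIfNew ( seen , u , strlen ( u ) ) ); // 0: dup
	return 0;
}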


@@ -2148,8 +2148,13 @@ bool printResult ( State0 *st, long ix ) {
// so fix that shit here...
//float f = mr->m_lastSpidered;
//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
sb->safePrintf(",\"lastCrawlTimeUTC\":%li}\n",
sb->safePrintf(",\"lastCrawlTimeUTC\":%li\n",
mr->m_lastSpidered);
+ // also include a timestamp field with an RFC 1123 formatted date
+ char timestamp[50];
+ struct tm *ptm = gmtime ( &mr->m_lastSpidered );
+ strftime(timestamp, 50, "%a, %d %b %Y %X %Z", ptm);
+ sb->safePrintf(",\"timestamp\":\"%s\"}\n", timestamp);
}
//mr->size_content );
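For reference, here is a small self-contained test of the timestamp formatting added above. The epoch value is a made-up stand-in for mr->m_lastSpidered (it happens to be this commit's own date), and note that %X and %Z are locale and platform dependent; with glibc's C locale the result looks like an RFC 1123 date ending in GMT.

#include <stdio.h>
#include <time.h>

int main ( void ) {
	// hypothetical stand-in for mr->m_lastSpidered: 2014-03-27 19:19:50 UTC
	time_t lastSpidered = 1395947990;
	char timestamp[50];
	struct tm *ptm = gmtime ( &lastSpidered );
	strftime ( timestamp , 50 , "%a, %d %b %Y %X %Z" , ptm );
	// with glibc's C locale this prints something like
	//   ,"timestamp":"Thu, 27 Mar 2014 19:19:50 GMT"
	printf ( ",\"timestamp\":\"%s\"\n" , timestamp );
	return 0;
}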