mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
Merge branch 'diffbot-dan' into diffbot-testing
This commit is contained in:
commit
c1671015c8
@ -1018,6 +1018,18 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
|
|||||||
//collnum_t oldCollnum = cr->m_collnum;
|
//collnum_t oldCollnum = cr->m_collnum;
|
||||||
//collnum_t newCollnum = m_numRecs;
|
//collnum_t newCollnum = m_numRecs;
|
||||||
|
|
||||||
|
// in case of bulk job, be sure to save list of spots
|
||||||
|
// copy existing list to a /tmp, where they will later be transferred back to the new folder
|
||||||
|
char oldbulkurlsname[1036];
|
||||||
|
snprintf(oldbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)oldCollnum);
|
||||||
|
char newbulkurlsname[1036];
|
||||||
|
snprintf(newbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)newCollnum);
|
||||||
|
char tmpbulkurlsname[1036];
|
||||||
|
snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%li.bulkurls.txt",cr->m_coll,(long)oldCollnum);
|
||||||
|
|
||||||
|
if (cr->m_isCustomCrawl == 2)
|
||||||
|
rename( oldbulkurlsname , tmpbulkurlsname );
|
||||||
|
|
||||||
// reset spider info
|
// reset spider info
|
||||||
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
|
SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
|
||||||
if ( sc ) {
|
if ( sc ) {
|
||||||
@ -1127,6 +1139,9 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
|
|||||||
// save coll.conf to new directory
|
// save coll.conf to new directory
|
||||||
cr->save();
|
cr->save();
|
||||||
|
|
||||||
|
// be sure to copy back the bulk urls for bulk jobs
|
||||||
|
if (cr->m_isCustomCrawl == 2)
|
||||||
|
rename( tmpbulkurlsname, newbulkurlsname );
|
||||||
|
|
||||||
// and clear the robots.txt cache in case we recently spidered a
|
// and clear the robots.txt cache in case we recently spidered a
|
||||||
// robots.txt, we don't want to use it, we want to use the one we
|
// robots.txt, we don't want to use it, we want to use the one we
|
||||||
|
@ -167,7 +167,7 @@ case EFAKEFIRSTIP: return "Fake firstIp";
|
|||||||
case EBADHOSTSCONF: return "A hosts.conf is out of sync";
|
case EBADHOSTSCONF: return "A hosts.conf is out of sync";
|
||||||
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
|
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
|
||||||
case EDOCNONCANONICAL: return "Url was dup of canonical page";
|
case EDOCNONCANONICAL: return "Url was dup of canonical page";
|
||||||
case ECUSTOMCRAWLMISMATCH: return "Crawl type mismatch";
|
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
|
||||||
}
|
}
|
||||||
// if the remote error bit is clear it must be a regular errno
|
// if the remote error bit is clear it must be a regular errno
|
||||||
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
|
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
|
||||||
|
@ -875,6 +875,19 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
|
|||||||
);
|
);
|
||||||
// but default to csv
|
// but default to csv
|
||||||
else {
|
else {
|
||||||
|
if (cr && cr->m_isCustomCrawl == 1 && sreq && !sreq->m_isAddUrl && !sreq->m_isInjecting) {
|
||||||
|
if (cr->m_diffbotUrlCrawlPattern.m_length == 0
|
||||||
|
&& cr->m_diffbotUrlProcessPattern.m_length == 0) {
|
||||||
|
// If this is a crawl with neither a urlCrawlPattern nor a urlProcessPattern set, only return URLs from the seed domain
|
||||||
|
if (sreq && !sreq->m_sameDom)
|
||||||
|
continue;
|
||||||
|
} else {
|
||||||
|
// TODO: if we get here, we have a crawl with a custom urlCrawlPattern and/or custom
|
||||||
|
// urlProcessPattern. We have to check if the current url matches the pattern
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
sb->safePrintf("\"%s\",\"%s\","
|
sb->safePrintf("\"%s\",\"%s\","
|
||||||
, sreq->m_url
|
, sreq->m_url
|
||||||
, as
|
, as
|
||||||
@ -2108,7 +2121,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
|||||||
|
|
||||||
char bulkurlsfile[1024];
|
char bulkurlsfile[1024];
|
||||||
snprintf(bulkurlsfile, 1024, "%scoll.%s.%li/bulkurls.txt", g_hostdb.m_dir , coll , (long)st->m_collnum );
|
snprintf(bulkurlsfile, 1024, "%scoll.%s.%li/bulkurls.txt", g_hostdb.m_dir , coll , (long)st->m_collnum );
|
||||||
if ( spots ) {
|
if ( spots && cr && cr->m_isCustomCrawl == 2 ) {
|
||||||
log("crawlbot: got spots (len=%li) to add coll=%s (%li)",
|
log("crawlbot: got spots (len=%li) to add coll=%s (%li)",
|
||||||
(long)gbstrlen(spots),coll,(long)st->m_collnum);
|
(long)gbstrlen(spots),coll,(long)st->m_collnum);
|
||||||
FILE *f = fopen(bulkurlsfile, "w");
|
FILE *f = fopen(bulkurlsfile, "w");
|
||||||
@ -2120,7 +2133,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// if restart flag is on and the file with bulk urls exists, get spots from there
|
// if restart flag is on and the file with bulk urls exists, get spots from there
|
||||||
if ( !spots && restartColl ) {
|
if ( !spots && restartColl && cr && cr->m_isCustomCrawl ) {
|
||||||
FILE *f = fopen(bulkurlsfile, "r");
|
FILE *f = fopen(bulkurlsfile, "r");
|
||||||
if (f != NULL) {
|
if (f != NULL) {
|
||||||
fseek(f, 0, SEEK_END);
|
fseek(f, 0, SEEK_END);
|
||||||
@ -3946,7 +3959,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
|
|||||||
|
|
||||||
// . do not add dups into m_diffbotSeeds safebuf
|
// . do not add dups into m_diffbotSeeds safebuf
|
||||||
// . return 0 if not in table, 1 if in table. -1 on error adding to table.
|
// . return 0 if not in table, 1 if in table. -1 on error adding to table.
|
||||||
long isInSeedBuf ( CollectionRec *cr , Url *url ) {
|
long isInSeedBuf ( CollectionRec *cr , char *url, int len ) {
|
||||||
|
|
||||||
HashTableX *ht = &cr->m_seedHashTable;
|
HashTableX *ht = &cr->m_seedHashTable;
|
||||||
|
|
||||||
@ -3973,7 +3986,7 @@ long isInSeedBuf ( CollectionRec *cr , Url *url ) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// is this url in the hash table?
|
// is this url in the hash table?
|
||||||
long long u64 = hash64 ( url->getUrl() , url->getUrlLen() );
|
long long u64 = hash64 ( url, len );
|
||||||
|
|
||||||
if ( ht->isInTable ( &u64 ) ) return 1;
|
if ( ht->isInTable ( &u64 ) ) return 1;
|
||||||
|
|
||||||
@ -4072,7 +4085,7 @@ bool getSpiderRequestMetaList ( char *doc ,
|
|||||||
if ( ! cr ) continue;
|
if ( ! cr ) continue;
|
||||||
|
|
||||||
// do not add dups into m_diffbotSeeds safebuf
|
// do not add dups into m_diffbotSeeds safebuf
|
||||||
long status = isInSeedBuf ( cr , &url );
|
long status = isInSeedBuf ( cr , saved , end - saved );
|
||||||
|
|
||||||
// error?
|
// error?
|
||||||
if ( status == -1 ) {
|
if ( status == -1 ) {
|
||||||
|
@ -2148,8 +2148,13 @@ bool printResult ( State0 *st, long ix ) {
|
|||||||
// so fix that shit here...
|
// so fix that shit here...
|
||||||
//float f = mr->m_lastSpidered;
|
//float f = mr->m_lastSpidered;
|
||||||
//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
|
//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
|
||||||
sb->safePrintf(",\"lastCrawlTimeUTC\":%li}\n",
|
sb->safePrintf(",\"lastCrawlTimeUTC\":%li\n",
|
||||||
mr->m_lastSpidered);
|
mr->m_lastSpidered);
|
||||||
|
// also include a timestamp field with an RFC 1123 formatted date
|
||||||
|
char timestamp[50];
|
||||||
|
struct tm *ptm = gmtime ( &mr->m_lastSpidered );
|
||||||
|
strftime(timestamp, 50, "%a, %d %b %Y %X %Z", ptm);
|
||||||
|
sb->safePrintf(",\"timestamp\":\"%s\"}\n", timestamp);
|
||||||
}
|
}
|
||||||
|
|
||||||
//mr->size_content );
|
//mr->size_content );
|
||||||
|
Loading…
Reference in New Issue
Block a user