Mirror of https://github.com/gigablast/open-source-search-engine.git (synced 2024-10-04 12:17:35 +03:00)
Merge branch 'diffbot-dan' into diffbot-testing

commit c1671015c8
@@ -1018,6 +1018,18 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
 	//collnum_t oldCollnum = cr->m_collnum;
 	//collnum_t newCollnum = m_numRecs;
 
+	// in case of a bulk job, be sure to save the list of spots:
+	// copy the existing list to /tmp, from where it will later be transferred back to the new folder
+	char oldbulkurlsname[1036];
+	snprintf(oldbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)oldCollnum);
+	char newbulkurlsname[1036];
+	snprintf(newbulkurlsname, 1036, "%scoll.%s.%li/bulkurls.txt",g_hostdb.m_dir,cr->m_coll,(long)newCollnum);
+	char tmpbulkurlsname[1036];
+	snprintf(tmpbulkurlsname, 1036, "/tmp/coll.%s.%li.bulkurls.txt",cr->m_coll,(long)oldCollnum);
+
+	if (cr->m_isCustomCrawl == 2)
+		rename( oldbulkurlsname , tmpbulkurlsname );
+
 	// reset spider info
 	SpiderColl *sc = g_spiderCache.getSpiderCollIffNonNull(oldCollnum);
 	if ( sc ) {
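One caveat with the rename() above: rename(2) cannot move a file across filesystems, and /tmp is frequently a separate filesystem (often tmpfs), in which case the call fails with EXDEV and the spot list is silently lost. A minimal sketch of a checked move with a copy-and-unlink fallback; moveFile and the 8KB buffer size are illustrative, not part of this commit:

    #include <errno.h>
    #include <stdio.h>
    #include <unistd.h>

    // Move src to dst. Falls back to copy + unlink when the two paths
    // live on different filesystems (rename() sets errno to EXDEV).
    static int moveFile ( char *src , char *dst ) {
    	// fast path: same filesystem
    	if ( rename ( src , dst ) == 0 ) return 0;
    	if ( errno != EXDEV ) return -1;
    	// cross-filesystem: copy the bytes, then unlink the original
    	FILE *in  = fopen ( src , "rb" );
    	if ( ! in ) return -1;
    	FILE *out = fopen ( dst , "wb" );
    	if ( ! out ) { fclose ( in ); return -1; }
    	char buf[8192];
    	size_t n;
    	int ok = 1;
    	while ( (n = fread ( buf, 1, sizeof(buf), in )) > 0 )
    		if ( fwrite ( buf, 1, n, out ) != n ) { ok = 0; break; }
    	if ( ferror ( in ) ) ok = 0;
    	fclose ( in );
    	if ( fclose ( out ) != 0 ) ok = 0;
    	if ( ! ok ) { unlink ( dst ); return -1; }
    	return unlink ( src );
    }

With a helper like this, both the save here and the copy-back in the next hunk could report a failure instead of dropping the bulk url list.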
@@ -1127,6 +1139,9 @@ bool Collectiondb::resetColl2( collnum_t oldCollnum,
 	// save coll.conf to new directory
 	cr->save();
 
+	// be sure to copy back the bulk urls for bulk jobs
+	if (cr->m_isCustomCrawl == 2)
+		rename( tmpbulkurlsname, newbulkurlsname );
 
 	// and clear the robots.txt cache in case we recently spidered a
 	// robots.txt, we don't want to use it, we want to use the one we
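Note that both halves of the round trip discard rename()'s return value, so a failed save or a failed restore goes unnoticed. Logging the errno on failure, in the style of the crawlbot log() calls further down, would make a lost bulkurls.txt visible.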
@@ -167,7 +167,7 @@ case EFAKEFIRSTIP: return "Fake firstIp";
 case EBADHOSTSCONF: return "A hosts.conf is out of sync";
 case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
 case EDOCNONCANONICAL: return "Url was dup of canonical page";
-case ECUSTOMCRAWLMISMATCH: return "Crawl type mismatch";
+case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
 }
 // if the remote error bit is clear it must be a regular errno
 //if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
@@ -875,6 +875,19 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
 				);
 		// but default to csv
 		else {
+			if (cr && cr->m_isCustomCrawl == 1 && sreq && !sreq->m_isAddUrl && !sreq->m_isInjecting) {
+				if (cr->m_diffbotUrlCrawlPattern.m_length == 0
+				    && cr->m_diffbotUrlProcessPattern.m_length == 0) {
+					// if this is a crawl with no urlCrawlPattern or urlCrawlRegEx values,
+					// only return urls from the seed domain
+					if (sreq && !sreq->m_sameDom)
+						continue;
+				} else {
+					// TODO: if we get here, we have a crawl with a custom urlCrawlPattern and/or custom
+					// urlProcessPattern. we have to check if the current url matches the pattern
+
+				}
+			}
+
 			sb->safePrintf("\"%s\",\"%s\","
 				       , sreq->m_url
 				       , as
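A sketch of one way the TODO above could be filled in, assuming (hypothetically) that m_diffbotUrlCrawlPattern / m_diffbotUrlProcessPattern hold whitespace-separated substring patterns; the actual pattern syntax used for Diffbot crawls may differ, and urlMatchesPatternList is not an existing function:

    #include <string.h>

    // Hypothetical helper for the TODO above: return true if url matches
    // any whitespace-separated substring pattern in list. Each pattern is
    // NUL-terminated in place for the strstr() call, then restored.
    static bool urlMatchesPatternList ( char *url , char *list ) {
    	char *p = list;
    	while ( *p ) {
    		// skip separators
    		while ( *p == ' ' || *p == '\t' || *p == '\n' ) p++;
    		char *end = p;
    		while ( *end && *end != ' ' && *end != '\t' && *end != '\n' )
    			end++;
    		if ( end > p ) {
    			char saved = *end;
    			*end = '\0';
    			bool hit = ( strstr ( url , p ) != NULL );
    			*end = saved;
    			if ( hit ) return true;
    		}
    		p = end;
    	}
    	return false;
    }

The else branch would then continue past urls that match neither pattern, mirroring the m_sameDom check in the branch above it.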
@@ -2108,7 +2121,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
 
 	char bulkurlsfile[1024];
 	snprintf(bulkurlsfile, 1024, "%scoll.%s.%li/bulkurls.txt", g_hostdb.m_dir , coll , (long)st->m_collnum );
-	if ( spots ) {
+	if ( spots && cr && cr->m_isCustomCrawl == 2 ) {
 		log("crawlbot: got spots (len=%li) to add coll=%s (%li)",
 		    (long)gbstrlen(spots),coll,(long)st->m_collnum);
 		FILE *f = fopen(bulkurlsfile, "w");
@@ -2120,7 +2133,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
 	}
 
 	// if restart flag is on and the file with bulk urls exists, get spots from there
-	if ( !spots && restartColl ) {
+	if ( !spots && restartColl && cr && cr->m_isCustomCrawl ) {
 		FILE *f = fopen(bulkurlsfile, "r");
 		if (f != NULL) {
 			fseek(f, 0, SEEK_END);
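The read side above sizes bulkurls.txt with fseek(..., SEEK_END) before reading it back. A self-contained sketch of that sizing-then-reading pattern, with the error checks the truncated hunk does not show; readWholeFile is an illustrative name, not part of this commit:

    #include <stdio.h>
    #include <stdlib.h>

    // Read an entire file into a NUL-terminated heap buffer.
    // Returns NULL on any error; the caller frees the result.
    static char *readWholeFile ( const char *path ) {
    	FILE *f = fopen ( path , "rb" );
    	if ( ! f ) return NULL;
    	// seek to the end to learn the file size
    	if ( fseek ( f , 0 , SEEK_END ) != 0 ) { fclose(f); return NULL; }
    	long size = ftell ( f );
    	if ( size < 0 ) { fclose(f); return NULL; }
    	rewind ( f );
    	char *buf = (char *)malloc ( size + 1 );
    	if ( ! buf ) { fclose(f); return NULL; }
    	if ( fread ( buf , 1 , size , f ) != (size_t)size ) {
    		free ( buf ); fclose ( f ); return NULL;
    	}
    	buf[size] = '\0';
    	fclose ( f );
    	return buf;
    }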
@@ -3946,7 +3959,7 @@ bool printCrawlBotPage2 ( TcpSocket *socket ,
 
 // . do not add dups into m_diffbotSeeds safebuf
 // . return 0 if not in table, 1 if in table. -1 on error adding to table.
-long isInSeedBuf ( CollectionRec *cr , Url *url ) {
+long isInSeedBuf ( CollectionRec *cr , char *url, int len ) {
 
 	HashTableX *ht = &cr->m_seedHashTable;
 
@@ -3973,7 +3986,7 @@ long isInSeedBuf ( CollectionRec *cr , Url *url ) {
 	}
 
 	// is this url in the hash table?
-	long long u64 = hash64 ( url->getUrl() , url->getUrlLen() );
+	long long u64 = hash64 ( url, len );
 
 	if ( ht->isInTable ( &u64 ) ) return 1;
 
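The signature change lets callers pass a raw pointer/length pair instead of constructing a Url first; the dedup test itself is just a 64-bit hash membership check. A standalone sketch of the same pattern; HashTableX and hash64() are gigablast internals, so std::unordered_set and std::hash stand in for them here, and the 0/1 return convention follows the comment on isInSeedBuf:

    #include <functional>
    #include <string>
    #include <unordered_set>

    // Sketch of the dedup test: 0 = first sighting (now recorded),
    // 1 = already in the table. std::hash approximates hash64().
    static long isDupSeed ( std::unordered_set<unsigned long long> &seen ,
                            const char *url , int len ) {
    	unsigned long long u64 =
    		std::hash<std::string>()( std::string ( url , len ) );
    	if ( seen.count ( u64 ) ) return 1;  // dup
    	seen.insert ( u64 );                 // remember this seed
    	return 0;
    }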
@@ -4072,7 +4085,7 @@ bool getSpiderRequestMetaList ( char *doc ,
 		if ( ! cr ) continue;
 
 		// do not add dups into m_diffbotSeeds safebuf
-		long status = isInSeedBuf ( cr , &url );
+		long status = isInSeedBuf ( cr , saved , end - saved );
 
 		// error?
 		if ( status == -1 ) {
@@ -2148,8 +2148,13 @@ bool printResult ( State0 *st, long ix ) {
 		// so fix that shit here...
 		//float f = mr->m_lastSpidered;
 		//sb->safePrintf(",\"lastCrawlTimeUTC\":%.0f}",f);
-		sb->safePrintf(",\"lastCrawlTimeUTC\":%li}\n",
+		sb->safePrintf(",\"lastCrawlTimeUTC\":%li\n",
 			       mr->m_lastSpidered);
+		// also include a timestamp field with an RFC 1123 formatted date
+		char timestamp[50];
+		struct tm *ptm = gmtime ( &mr->m_lastSpidered );
+		strftime(timestamp, 50, "%a, %d %b %Y %X %Z", ptm);
+		sb->safePrintf(",\"timestamp\":\"%s\"}\n", timestamp);
 	}
 
 	//mr->size_content );
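Two notes on the timestamp code above: gmtime() returns a pointer to static storage, so it is not thread-safe, and "%a, %d %b %Y %X %Z" only yields an RFC 1123 date in the C locale, since %a, %b, %X and %Z are locale- and platform-dependent while RFC 1123 requires English names and a literal "GMT". A locale-independent sketch using the POSIX gmtime_r(); the helper name is illustrative:

    #include <stdio.h>
    #include <time.h>

    // Format a unix timestamp as an RFC 1123 date, e.g.
    // "Sun, 06 Nov 1994 08:49:37 GMT", independent of the current
    // locale and safe to call from multiple threads.
    static void formatRfc1123 ( time_t t , char *buf , size_t bufLen ) {
    	static const char *days[] = { "Sun","Mon","Tue","Wed",
    	                              "Thu","Fri","Sat" };
    	static const char *months[] = { "Jan","Feb","Mar","Apr","May","Jun",
    	                                "Jul","Aug","Sep","Oct","Nov","Dec" };
    	struct tm tmv;
    	gmtime_r ( &t , &tmv );  // thread-safe: fills caller's struct tm
    	snprintf ( buf , bufLen , "%s, %02d %s %04d %02d:%02d:%02d GMT" ,
    	           days[tmv.tm_wday] , tmv.tm_mday , months[tmv.tm_mon] ,
    	           tmv.tm_year + 1900 , tmv.tm_hour , tmv.tm_min ,
    	           tmv.tm_sec );
    }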