mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot
Conflicts: Process.cpp
This commit is contained in:
commit
adf9d807ea
@ -799,7 +799,7 @@ bool sendPageAutoban ( TcpSocket *s , HttpRequest *r ) {
|
||||
}
|
||||
|
||||
bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
|
||||
SafeBuf sb(512 * 512);
|
||||
SafeBuf sb(512 * 512,"autobbuf");
|
||||
//read in all of the possible cgi parms off the bat:
|
||||
//long user = g_pages.getUserType( s , r );
|
||||
char *username = g_users.getUsername(r);
|
||||
|
92
BigFile.cpp
92
BigFile.cpp
@ -11,6 +11,10 @@
|
||||
#include "Statsdb.h"
|
||||
#include "DiskPageCache.h"
|
||||
|
||||
#ifdef ASYNCIO
|
||||
#include <aio.h>
|
||||
#endif
|
||||
|
||||
// main.cpp will wait for this to be zero before exiting so all unlink/renames
|
||||
// can complete
|
||||
long g_unlinkRenameThreads = 0;
|
||||
@ -530,6 +534,11 @@ bool BigFile::readwrite ( void *buf ,
|
||||
// . if we're blocking then do it now
|
||||
// . this should return false and set g_errno on error, true otherwise
|
||||
if ( ! isNonBlocking ) goto skipThread;
|
||||
|
||||
#ifdef ASYNCIO
|
||||
goto skipThread;
|
||||
#endif
|
||||
|
||||
// . otherwise, spawn a thread to do this i/o
|
||||
// . this returns false and sets g_errno on error, true on success
|
||||
// . we should return false cuz we blocked
|
||||
@ -597,7 +606,90 @@ bool BigFile::readwrite ( void *buf ,
|
||||
log("disk: read buf alloc failed for %li "
|
||||
"bytes.",need);
|
||||
}
|
||||
|
||||
//
|
||||
// pthread_create() is abhorently slow. use asyncio if possible.
|
||||
//
|
||||
|
||||
#ifdef ASYNCIO
|
||||
|
||||
// we only have two in the array... most likely though we only
|
||||
// need one here...
|
||||
aiocb *a0 = &fstate->m_aiocb[0];
|
||||
aiocb *a1 = &fstate->m_aiocb[1];
|
||||
// init them for the read
|
||||
a0->aio_fildes = fstate->m_fd1;
|
||||
a1->aio_fildes = fstate->m_fd2;
|
||||
// the offset of each file
|
||||
long long off1 = fs->m_offset;
|
||||
// always read at start of 2nd file
|
||||
long long off2 = 0;
|
||||
// how many bytes to read from each file?
|
||||
long long readSize1 = size;
|
||||
long long readSize2 = 0;
|
||||
if ( off1 + readSize1 > MAX_PART_SIZE ) {
|
||||
readSize1 = ((long long)MAX_PART_SIZE) - off1;
|
||||
readSize2 = size - readSize1;
|
||||
}
|
||||
a0->aio_offset = off1;
|
||||
a1->aio_offset = off2;
|
||||
a0->aio_nbytes = readSize1;
|
||||
a1->aio_nbytes = readSize2;
|
||||
a0->aio_buf = fstate->m_buf;
|
||||
a1->aio_buf = fstate->m_buf + readSize1;
|
||||
a0->aio_reqprio = 0;
|
||||
a1->aio_reqprio = 0;
|
||||
a0->aio_sigevent = SIGEV_SIGNAL;
|
||||
a1->aio_sigevent = SIGEV_SIGNAL;
|
||||
|
||||
// translate offset to a filenum and offset
|
||||
long filenum = offset / MAX_PART_SIZE;
|
||||
long localOffset = offset % MAX_PART_SIZE;
|
||||
|
||||
|
||||
// read or write?
|
||||
if ( doWrite ) a0->aio_lio_opcode = LIO_WRITE;
|
||||
else a0->aio_lio_opcode = LIO_READ;
|
||||
|
||||
// different fds implies two different files we gotta read from.
|
||||
long numFilesToReadFrom = 1;
|
||||
if ( fstate->m_fd1 != fstate->m_fd2 ) numFilesToReadFrom = 2;
|
||||
// set it up
|
||||
//aioList->m_signal = ESIG;
|
||||
|
||||
retry77:
|
||||
|
||||
//
|
||||
// don't use this on kernels below 3.12 because it can block
|
||||
// when reading ext4 files.
|
||||
//
|
||||
io_submit();
|
||||
|
||||
|
||||
// this will send the signal when read/write is completed
|
||||
//long status = lio_listio ( LIO_NOWAIT ,
|
||||
// a0 ,
|
||||
// numFilesToReadFrom ,
|
||||
// &fstate->m_sigEvent );
|
||||
|
||||
// if status is 0, there was no error
|
||||
if ( status == 0 ) {
|
||||
g_errno = 0;
|
||||
// assume we will get the signal later
|
||||
return false;
|
||||
}
|
||||
// got interrupted by a signal? try again.
|
||||
if ( errno == EINTR )
|
||||
goto retry77;
|
||||
// tell caller about the error
|
||||
g_errno = errno;
|
||||
log("aio: %s", mstrerror(g_errno));
|
||||
// we did not block or anything
|
||||
return true;
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
// . this returns false and sets errno on error
|
||||
// . set g_errno to the errno
|
||||
if ( ! readwrite_r ( fstate , NULL ) ) g_errno = errno;
|
||||
|
@ -95,6 +95,11 @@ public:
|
||||
// m_allocOff is offset into m_allocBuf where we start reading into
|
||||
// from the file
|
||||
long m_allocOff;
|
||||
// do not call pthread_create() for every read we do. use async io
|
||||
// because it should be much much faster
|
||||
#ifdef ASYNCIO
|
||||
struct aiocb m_aiocb[2];
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
|
@ -72,6 +72,10 @@ CollectionRec::CollectionRec() {
|
||||
|
||||
m_lastResetCount = 0;
|
||||
|
||||
// regex_t types
|
||||
m_hasucr = false;
|
||||
m_hasupr = false;
|
||||
|
||||
// for diffbot caching the global spider stats
|
||||
reset();
|
||||
|
||||
@ -91,6 +95,11 @@ void CollectionRec::setToDefaults ( ) {
|
||||
}
|
||||
|
||||
void CollectionRec::reset() {
|
||||
|
||||
// regex_t types
|
||||
if ( m_hasucr ) regfree ( &m_ucr );
|
||||
if ( m_hasupr ) regfree ( &m_upr );
|
||||
|
||||
// make sure we do not leave spiders "hanging" waiting for their
|
||||
// callback to be called... and it never gets called
|
||||
//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
|
||||
@ -140,8 +149,34 @@ bool CollectionRec::load ( char *coll , long i ) {
|
||||
// . accepts OBJ_COLLECTIONREC or OBJ_CONF
|
||||
g_parms.setFromFile ( this , tmp2 , tmp1 );
|
||||
|
||||
// add default reg ex
|
||||
setUrlFiltersToDefaults();
|
||||
// add default reg ex IFF there are no url filters there now
|
||||
if ( m_numRegExs == 0 ) setUrlFiltersToDefaults();
|
||||
|
||||
// compile regexs here
|
||||
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
|
||||
if ( rx && ! rx[0] ) rx = NULL;
|
||||
if ( rx ) m_hasucr = true;
|
||||
if ( rx && regcomp ( &m_ucr , rx ,
|
||||
REG_EXTENDED|REG_ICASE|
|
||||
REG_NEWLINE|REG_NOSUB) ) {
|
||||
// error!
|
||||
return log("xmldoc: regcomp %s failed: %s. "
|
||||
"Ignoring.",
|
||||
rx,mstrerror(errno));
|
||||
}
|
||||
|
||||
rx = m_diffbotUrlProcessRegEx.getBufStart();
|
||||
if ( rx && ! rx[0] ) rx = NULL;
|
||||
if ( rx ) m_hasupr = true;
|
||||
if ( rx && regcomp ( &m_upr , rx ,
|
||||
REG_EXTENDED|REG_ICASE|
|
||||
REG_NEWLINE|REG_NOSUB) ) {
|
||||
// error!
|
||||
return log("xmldoc: regcomp %s failed: %s. "
|
||||
"Ignoring.",
|
||||
rx,mstrerror(errno));
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// LOAD the crawlinfo class in the collectionrec for diffbot
|
||||
@ -392,7 +427,7 @@ bool CollectionRec::save ( ) {
|
||||
g_hostdb.m_dir , m_coll , (long)m_collnum );
|
||||
if ( ! g_parms.saveToXml ( (char *)this , tmp ) ) return false;
|
||||
// log msg
|
||||
log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
|
||||
//log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
|
||||
|
||||
//
|
||||
// save the crawlinfo class in the collectionrec for diffbot
|
||||
@ -400,7 +435,7 @@ bool CollectionRec::save ( ) {
|
||||
// SAVE LOCAL
|
||||
sprintf ( tmp , "%scoll.%s.%li/localcrawlinfo.dat",
|
||||
g_hostdb.m_dir , m_coll , (long)m_collnum );
|
||||
log("coll: saving %s",tmp);
|
||||
//log("coll: saving %s",tmp);
|
||||
SafeBuf sb;
|
||||
//m_localCrawlInfo.print ( &sb );
|
||||
// binary now
|
||||
@ -413,7 +448,7 @@ bool CollectionRec::save ( ) {
|
||||
// SAVE GLOBAL
|
||||
sprintf ( tmp , "%scoll.%s.%li/globalcrawlinfo.dat",
|
||||
g_hostdb.m_dir , m_coll , (long)m_collnum );
|
||||
log("coll: saving %s",tmp);
|
||||
//log("coll: saving %s",tmp);
|
||||
sb.reset();
|
||||
//m_globalCrawlInfo.print ( &sb );
|
||||
// binary now
|
||||
|
@ -56,7 +56,7 @@
|
||||
//#define MAX_SITE_EXPRESSION_LEN 128
|
||||
//#define MAX_SITE_EXPRESSIONS 256
|
||||
|
||||
//#include "regex.h"
|
||||
#include "regex.h"
|
||||
|
||||
#include "Url.h" // MAX_COLL_LEN
|
||||
//#include "Sync.h"
|
||||
@ -108,6 +108,9 @@ class CrawlInfo {
|
||||
// currently in the ready queue (doledb) to spider?
|
||||
char m_sentCrawlDoneAlert;
|
||||
|
||||
//long m_numUrlsLaunched;
|
||||
long m_dummy1;
|
||||
|
||||
void reset() { memset ( this , 0 , sizeof(CrawlInfo) ); };
|
||||
//bool print (class SafeBuf *sb ) ;
|
||||
//bool setFromSafeBuf (class SafeBuf *sb ) ;
|
||||
@ -432,8 +435,27 @@ class CollectionRec {
|
||||
//SafeBuf m_diffbotApiList;//QueryString;
|
||||
//SafeBuf m_diffbotUrlCrawlPattern;
|
||||
//SafeBuf m_diffbotUrlProcessPattern;
|
||||
|
||||
// use for all now...
|
||||
SafeBuf m_diffbotApiUrl;
|
||||
|
||||
// only process pages whose content matches this pattern
|
||||
SafeBuf m_diffbotPageProcessPattern;
|
||||
// only process urls that match this pattern
|
||||
SafeBuf m_diffbotUrlProcessPattern;
|
||||
// only CRAWL urls that match this pattern
|
||||
SafeBuf m_diffbotUrlCrawlPattern;
|
||||
|
||||
// regex support
|
||||
SafeBuf m_diffbotUrlCrawlRegEx;
|
||||
SafeBuf m_diffbotUrlProcessRegEx;
|
||||
regex_t m_ucr;
|
||||
regex_t m_upr;
|
||||
long m_hasucr:1;
|
||||
long m_hasupr:1;
|
||||
|
||||
char m_diffbotOnlyProcessIfNew;
|
||||
|
||||
//SafeBuf m_diffbotClassify;
|
||||
//char m_diffbotClassify;
|
||||
//char m_useDiffbot;
|
||||
@ -515,6 +537,9 @@ class CollectionRec {
|
||||
long m_numRegExs11;
|
||||
SafeBuf m_spiderDiffbotApiUrl [ MAX_FILTERS ];
|
||||
|
||||
long m_numRegExs8;
|
||||
char m_harvestLinks [ MAX_FILTERS ];
|
||||
|
||||
// dummy?
|
||||
long m_numRegExs9;
|
||||
|
||||
@ -671,12 +696,6 @@ class CollectionRec {
|
||||
|
||||
class SpiderColl *m_spiderColl;
|
||||
|
||||
// each Rdb has a tree, so keep the pos/neg key count here so
|
||||
// that RdbTree does not have to have its own array limited by
|
||||
// MAX_COLLS which we did away with because we made this dynamic.
|
||||
long m_numPosKeysInTree[RDB_END];
|
||||
long m_numNegKeysInTree[RDB_END];
|
||||
|
||||
long m_overflow;
|
||||
long m_overflow2;
|
||||
|
||||
@ -1018,6 +1037,12 @@ class CollectionRec {
|
||||
// used by Parms.cpp
|
||||
char m_hackFlag;
|
||||
|
||||
// each Rdb has a tree, so keep the pos/neg key count here so
|
||||
// that RdbTree does not have to have its own array limited by
|
||||
// MAX_COLLS which we did away with because we made this dynamic.
|
||||
long m_numPosKeysInTree[RDB_END];
|
||||
long m_numNegKeysInTree[RDB_END];
|
||||
|
||||
//long m_numEventsOnHost;
|
||||
|
||||
// do we have the doc:quality var in any url filter?
|
||||
|
@ -247,7 +247,12 @@ bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew ,
|
||||
// MDW: ensure not created on disk since time of last load
|
||||
char dname[512];
|
||||
sprintf(dname, "%scoll.%s.%li/",g_hostdb.m_dir,coll,i);
|
||||
if ( isNew && opendir ( dname ) ) {
|
||||
DIR *dir = NULL;
|
||||
if ( isNew )
|
||||
dir = opendir ( dname );
|
||||
if ( dir )
|
||||
closedir ( dir );
|
||||
if ( isNew && dir ) {
|
||||
g_errno = EEXIST;
|
||||
return log("admin: Trying to create collection %s but "
|
||||
"directory %s already exists on disk.",coll,dname);
|
||||
@ -524,11 +529,12 @@ bool Collectiondb::isAdmin ( HttpRequest *r , TcpSocket *s ) {
|
||||
void savingCheckWrapper1 ( int fd , void *state ) {
|
||||
WaitEntry *we = (WaitEntry *)state;
|
||||
// no state?
|
||||
if ( ! we ) return;
|
||||
// if it blocked again i guess tree is still saving
|
||||
if ( ! g_collectiondb.resetColl ( we->m_coll , we ) ) return;
|
||||
if ( ! we ) { log("colldb: we1 is null"); return; }
|
||||
// unregister too
|
||||
g_loop.unregisterSleepCallback ( state,savingCheckWrapper1 );
|
||||
// if it blocked again i guess tree is still saving
|
||||
if ( ! g_collectiondb.resetColl ( we->m_coll , we , we->m_purgeSeeds))
|
||||
return;
|
||||
// all done
|
||||
we->m_callback ( we->m_state );
|
||||
}
|
||||
@ -536,11 +542,11 @@ void savingCheckWrapper1 ( int fd , void *state ) {
|
||||
void savingCheckWrapper2 ( int fd , void *state ) {
|
||||
WaitEntry *we = (WaitEntry *)state;
|
||||
// no state?
|
||||
if ( ! we ) return;
|
||||
// if it blocked again i guess tree is still saving
|
||||
if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
|
||||
if ( ! we ) { log("colldb: we2 is null"); return; }
|
||||
// unregister too
|
||||
g_loop.unregisterSleepCallback ( state,savingCheckWrapper2 );
|
||||
// if it blocked again i guess tree is still saving
|
||||
if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
|
||||
// all done
|
||||
we->m_callback ( we->m_state );
|
||||
}
|
||||
@ -599,7 +605,7 @@ bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
|
||||
g_errno = ENOTFOUND;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
||||
if ( g_process.isAnyTreeSaving() ) {
|
||||
// note it
|
||||
log("admin: tree is saving. waiting2.");
|
||||
@ -700,7 +706,11 @@ bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
|
||||
|
||||
// . reset a collection
|
||||
// . returns false if blocked and will call callback
|
||||
bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
|
||||
bool Collectiondb::resetColl ( char *coll , WaitEntry *we , bool purgeSeeds) {
|
||||
|
||||
// save parms in case we block
|
||||
we->m_purgeSeeds = purgeSeeds;
|
||||
|
||||
// ensure it's not NULL
|
||||
if ( ! coll ) {
|
||||
log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
|
||||
@ -849,11 +859,13 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
|
||||
//cr->m_spiderStatusMsg = NULL;
|
||||
|
||||
// reset seed buf
|
||||
cr->m_diffbotSeeds.purge();
|
||||
|
||||
// reset seed dedup table
|
||||
HashTableX *ht = &cr->m_seedHashTable;
|
||||
ht->reset();
|
||||
if ( purgeSeeds ) {
|
||||
// free the buffer of seed urls
|
||||
cr->m_diffbotSeeds.purge();
|
||||
// reset seed dedup table
|
||||
HashTableX *ht = &cr->m_seedHashTable;
|
||||
ht->reset();
|
||||
}
|
||||
|
||||
// so XmlDoc.cpp can detect if the collection was reset since it
|
||||
// launched its spider:
|
||||
@ -866,6 +878,14 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
|
||||
// right now we #define collnum_t short
|
||||
if ( m_numRecs > 0x7fff ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// make a new collnum so records in transit will not be added
|
||||
// to any rdb...
|
||||
cr->m_collnum = newCollnum;
|
||||
|
||||
// Rdb::resetColl() needs to know the new cr so it can move
|
||||
// the RdbBase into cr->m_bases[rdbId] array. recycling.
|
||||
m_recs[newCollnum] = cr;
|
||||
|
||||
// . unlink all the *.dat and *.map files for this coll in its subdir
|
||||
// . remove all recs from this collnum from m_tree/m_buckets
|
||||
// . updates RdbBase::m_collnum
|
||||
@ -879,16 +899,10 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
|
||||
g_clusterdb.getRdb()->resetColl ( oldCollnum , newCollnum );
|
||||
g_linkdb.getRdb()->resetColl ( oldCollnum , newCollnum );
|
||||
|
||||
// make a new collnum so records in transit will not be added
|
||||
// to any rdb...
|
||||
cr->m_collnum = newCollnum;
|
||||
|
||||
// reset crawl status too!
|
||||
cr->m_spiderStatus = SP_INITIALIZING;
|
||||
|
||||
m_recs[oldCollnum] = NULL;
|
||||
m_recs[newCollnum] = cr;
|
||||
|
||||
|
||||
// readd it to the hashtable that maps name to collnum too
|
||||
long long h64 = hash64n(cr->m_coll);
|
||||
@ -902,7 +916,10 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
|
||||
g_hostdb.m_dir,
|
||||
cr->m_coll,
|
||||
(long)newCollnum);
|
||||
if ( opendir ( dname ) ) {
|
||||
DIR *dir = opendir ( dname );
|
||||
if ( dir )
|
||||
closedir ( dir );
|
||||
if ( dir ) {
|
||||
//g_errno = EEXIST;
|
||||
log("admin: Trying to create collection %s but "
|
||||
"directory %s already exists on disk.",coll,dname);
|
||||
|
@ -18,6 +18,7 @@ public:
|
||||
void (* m_callback) (void *state);
|
||||
void *m_state;
|
||||
char *m_coll;
|
||||
bool m_purgeSeeds;
|
||||
};
|
||||
|
||||
class Collectiondb {
|
||||
@ -94,7 +95,7 @@ class Collectiondb {
|
||||
bool deleteRecs ( class HttpRequest *r ) ;
|
||||
|
||||
// returns false if blocked, true otherwise.
|
||||
bool resetColl ( char *coll , WaitEntry *we );
|
||||
bool resetColl ( char *coll , WaitEntry *we , bool purgeSeeds );
|
||||
|
||||
// . keep up to 128 of them, these reference into m_list
|
||||
// . COllectionRec now includes m_needsSave and m_lastUpdateTime
|
||||
|
8
Conf.cpp
8
Conf.cpp
@ -189,6 +189,7 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
|
||||
//g_conf.m_testSearchEnabled = false;
|
||||
|
||||
|
||||
/*
|
||||
//
|
||||
// are we running in Matt Wells's data center?
|
||||
// if so, we want to be able to use the seo tools that are not part
|
||||
@ -207,11 +208,16 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
|
||||
if ( hh && strcmp(hh,"galileo") == 0) priv = true;
|
||||
if ( hh && strcmp(hh,"sputnik") == 0) priv = true;
|
||||
if ( hh && strcmp(hh,"titan") == 0) priv = true;
|
||||
if ( hh[0]=='g' && hh[1]=='k' && is_digit(hh[2]) ) priv = true;
|
||||
if ( hh && hh[0]=='g' && hh[1]=='k' && is_digit(hh[2]) ) priv = true;
|
||||
//if(hh[0]=='s' && hh[1]=='p' && is_digit(hh[2])) ) priv = true;
|
||||
if ( priv ) g_conf.m_isMattWells = true;
|
||||
else g_conf.m_isMattWells = false;
|
||||
*/
|
||||
g_conf.m_isMattWells = false;
|
||||
|
||||
#ifdef MATTWELLS
|
||||
g_conf.m_isMattWells = true;
|
||||
#endif
|
||||
|
||||
// this is not possible
|
||||
/*
|
||||
|
5
Dir.cpp
5
Dir.cpp
@ -5,6 +5,7 @@
|
||||
Dir::Dir ( ) {
|
||||
m_dirname = NULL;
|
||||
m_dir = NULL;
|
||||
m_needsClose = false;
|
||||
}
|
||||
|
||||
|
||||
@ -40,7 +41,8 @@ bool Dir::set ( char *dirname ) {
|
||||
}
|
||||
|
||||
bool Dir::close ( ) {
|
||||
if ( m_dir ) closedir ( m_dir );
|
||||
if ( m_dir && m_needsClose ) closedir ( m_dir );
|
||||
m_needsClose = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -56,6 +58,7 @@ bool Dir::open ( ) {
|
||||
if ( ! m_dir )
|
||||
return log("disk: opendir(%s) : %s",
|
||||
m_dirname,strerror( g_errno ) );
|
||||
m_needsClose = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
|
1
Dir.h
1
Dir.h
@ -49,6 +49,7 @@ class Dir {
|
||||
|
||||
char *m_dirname;
|
||||
DIR *m_dir;
|
||||
bool m_needsClose;
|
||||
};
|
||||
|
||||
#endif
|
||||
|
@ -161,7 +161,8 @@ case EDIFFBOTMIMEERROR: return "Diffbot mime error";
|
||||
case EDIFFBOTBADHTTPSTATUS: return "Diffbot reply bad http status";
|
||||
case EHITCRAWLLIMIT: return "Hit the page download limit";
|
||||
case EHITPROCESSLIMIT: return "Hit the page process limit";
|
||||
case EINTERNALERROR: return "Internal error";
|
||||
case EINTERNALERROR: return "Internal error";
|
||||
case EBADJSONPARSER: return "Bad JSON parser";
|
||||
}
|
||||
// if the remote error bit is clear it must be a regulare errno
|
||||
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
|
||||
|
3
Errno.h
3
Errno.h
@ -165,6 +165,7 @@ enum {
|
||||
EDIFFBOTBADHTTPSTATUS,
|
||||
EHITCRAWLLIMIT,
|
||||
EHITPROCESSLIMIT,
|
||||
EINTERNALERROR
|
||||
EINTERNALERROR,
|
||||
EBADJSONPARSER
|
||||
};
|
||||
#endif
|
||||
|
@ -238,7 +238,7 @@ class FBRec {
|
||||
#endif
|
||||
|
||||
// facebook id for matt wells
|
||||
#define MATTWELLS 100003532411011LL
|
||||
#define FB_MATTWELLS 100003532411011LL
|
||||
|
||||
|
||||
//#define APPNAME "Event Widget"
|
||||
|
@ -341,6 +341,7 @@ bool HashTableX::setTableSize ( long oldn , char *buf , long bufSize ) {
|
||||
m_bufSize = need;
|
||||
m_doFree = true;
|
||||
if ( ! m_buf ) return false;
|
||||
QUICKPOLL(m_niceness);
|
||||
}
|
||||
|
||||
// save the old junk
|
||||
|
@ -99,10 +99,10 @@ long Highlight::set ( SafeBuf *sb,
|
||||
long version = TITLEREC_CURRENT_VERSION;
|
||||
|
||||
Bits bits;
|
||||
if ( ! bits.set (&words,version,niceness) ) return 0;
|
||||
if ( ! bits.set (&words,version,niceness) ) return -1;
|
||||
|
||||
Phrases phrases;
|
||||
if ( !phrases.set(&words,&bits,true,false,version,niceness))return 0;
|
||||
if ( !phrases.set(&words,&bits,true,false,version,niceness))return -1;
|
||||
|
||||
//SafeBuf langBuf;
|
||||
//if ( !setLangVec ( &words , &langBuf , niceness )) return 0;
|
||||
@ -115,7 +115,7 @@ long Highlight::set ( SafeBuf *sb,
|
||||
Matches matches;
|
||||
matches.setQuery ( q );
|
||||
|
||||
if ( ! matches.addMatches ( &words , &phrases ) ) return 0;
|
||||
if ( ! matches.addMatches ( &words , &phrases ) ) return -1;
|
||||
|
||||
// store
|
||||
m_numMatches = matches.getNumMatches();
|
||||
@ -172,7 +172,7 @@ long Highlight::set ( SafeBuf *sb ,
|
||||
// save room for terminating \0
|
||||
//m_bufEnd = m_buf + m_bufLen - 1;
|
||||
|
||||
if ( ! highlightWords ( words, matches, q ) ) return 0;
|
||||
if ( ! highlightWords ( words, matches, q ) ) return -1;
|
||||
|
||||
// null terminate
|
||||
//*m_bufPtr = '\0';
|
||||
|
@ -72,7 +72,9 @@ bool HttpRequest::copy ( class HttpRequest *r ) {
|
||||
// . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not)
|
||||
bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
char *userAgent , char *proto , bool doPost ,
|
||||
char *cookie , char *additionalHeader ) {
|
||||
char *cookie , char *additionalHeader ,
|
||||
// if posting something, how many bytes is it?
|
||||
long postContentLen ) {
|
||||
|
||||
m_reqBufValid = false;
|
||||
|
||||
@ -279,6 +281,8 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
if ( doPost ) {
|
||||
long contentLen = 0;
|
||||
if ( postData ) contentLen = strlen(postData);
|
||||
// this overrides if provided. -1 is default
|
||||
if ( postContentLen >= 0 ) contentLen = postContentLen;
|
||||
m_reqBuf.safePrintf ("Content-Length: %li\r\n", contentLen );
|
||||
m_reqBuf.safePrintf("\r\n");
|
||||
if ( postData ) m_reqBuf.safePrintf("%s",postData);
|
||||
@ -633,6 +637,13 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
|
||||
// matt comcast
|
||||
if ( sock && strncmp(iptoa(sock->m_ip),"75.160.49.8",11) == 0)
|
||||
m_isLocal = true;
|
||||
// matt comcast #2
|
||||
if ( sock && strncmp(iptoa(sock->m_ip),"69.181.136.143",14) == 0)
|
||||
m_isLocal = true;
|
||||
// titan
|
||||
if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0)
|
||||
m_isLocal = true;
|
||||
|
||||
|
||||
// roadrunner ip
|
||||
// if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0)
|
||||
|
@ -41,7 +41,8 @@ class HttpRequest {
|
||||
char *proto = "HTTP/1.0" ,
|
||||
bool doPost = false ,
|
||||
char *cookie = NULL ,
|
||||
char *additionalHeader = NULL ); // does not incl \r\n
|
||||
char *additionalHeader = NULL , // does not incl \r\n
|
||||
long postContentLen = -1 ); // for content-length of POST
|
||||
|
||||
// use this
|
||||
SafeBuf m_reqBuf;
|
||||
|
@ -130,7 +130,8 @@ bool HttpServer::getDoc ( char *url ,
|
||||
bool doPost ,
|
||||
char *cookie ,
|
||||
char *additionalHeader ,
|
||||
char *fullRequest ) {
|
||||
char *fullRequest ,
|
||||
char *postContent ) {
|
||||
// sanity
|
||||
if ( ip == -1 )
|
||||
log("http: you probably didn't mean to set ip=-1 did you? "
|
||||
@ -154,6 +155,9 @@ bool HttpServer::getDoc ( char *url ,
|
||||
defPort = 443;
|
||||
}
|
||||
|
||||
long pcLen = 0;
|
||||
if ( postContent ) pcLen = gbstrlen(postContent);
|
||||
|
||||
char *req = NULL;
|
||||
long reqSize;
|
||||
|
||||
@ -161,9 +165,15 @@ bool HttpServer::getDoc ( char *url ,
|
||||
if ( ! fullRequest ) {
|
||||
if ( ! r.set ( url , offset , size , ifModifiedSince ,
|
||||
userAgent , proto , doPost , cookie ,
|
||||
additionalHeader ) ) return true;
|
||||
additionalHeader , pcLen ) ) return true;
|
||||
reqSize = r.getRequestLen();
|
||||
req = (char *) mdup ( r.getRequest() , reqSize,"HttpServer");
|
||||
req = (char *) mmalloc( reqSize + pcLen ,"HttpServer");
|
||||
if ( req )
|
||||
memcpy ( req , r.getRequest() , reqSize );
|
||||
if ( req && pcLen ) {
|
||||
memcpy ( req + reqSize, postContent , pcLen );
|
||||
reqSize += pcLen;
|
||||
}
|
||||
}
|
||||
else {
|
||||
// does not contain \0 i guess
|
||||
@ -911,7 +921,8 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
|
||||
// "GET /crawlbot/downloadobjects"
|
||||
// "GET /crawlbot/downloadpages"
|
||||
if ( strncmp ( path , "/crawlbot/download/" ,19 ) == 0 ||
|
||||
strncmp ( path , "/v2/crawl/download/" ,19 ) == 0 )
|
||||
strncmp ( path , "/v2/crawl/download/" ,19 ) == 0 ||
|
||||
strncmp ( path , "/v2/bulk/download/" ,18 ) == 0 )
|
||||
return sendBackDump ( s , r );
|
||||
|
||||
// . is it a diffbot api request, like "GET /api/*"
|
||||
@ -1542,7 +1553,9 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
|
||||
*/
|
||||
}
|
||||
bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
|
||||
char *errmsg, long rawFormat,
|
||||
char *errmsg,
|
||||
//long rawFormat,
|
||||
char format ,
|
||||
int errnum, char *content) {
|
||||
// clear g_errno so the send goes through
|
||||
g_errno = 0;
|
||||
@ -1559,7 +1572,7 @@ bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
|
||||
// sanity check
|
||||
if ( strncasecmp(errmsg,"Success",7)==0 ) {char*xx=NULL;*xx=0;}
|
||||
|
||||
if (!rawFormat){
|
||||
if ( format == FORMAT_HTML ) {
|
||||
// Page content
|
||||
char cbuf[1024];
|
||||
sprintf (cbuf,
|
||||
@ -1946,7 +1959,11 @@ long getMsgSize ( char *buf, long bufSize, TcpSocket *s ) {
|
||||
totalReplySize,max);
|
||||
}
|
||||
// truncate the reply if we have to
|
||||
if ( totalReplySize > max ) totalReplySize = max;
|
||||
if ( totalReplySize > max ) {
|
||||
log("http: truncating reply of %li to %li bytes",
|
||||
totalReplySize,max);
|
||||
totalReplySize = max;
|
||||
}
|
||||
// truncate if we need to
|
||||
return totalReplySize;
|
||||
}
|
||||
|
@ -98,7 +98,8 @@ class HttpServer {
|
||||
char *cookie = NULL ,
|
||||
char *additionalHeader = NULL , // does not include \r\n
|
||||
// specify your own mime and post data here...
|
||||
char *fullRequest = NULL );
|
||||
char *fullRequest = NULL ,
|
||||
char *postContent = NULL );
|
||||
|
||||
bool getDoc ( long ip,
|
||||
long port,
|
||||
@ -134,7 +135,8 @@ class HttpServer {
|
||||
long *bytesSent = NULL );
|
||||
// send a "prettier" error reply, formatted in XML if necessary
|
||||
bool sendQueryErrorReply ( TcpSocket *s , long error , char *errmsg,
|
||||
long rawFormat, int errnum,
|
||||
// FORMAT_HTML=0,FORMAT_XML,FORMAT_JSON
|
||||
char format, int errnum,
|
||||
char *content=NULL);
|
||||
|
||||
|
||||
|
99
Json.cpp
99
Json.cpp
@ -4,8 +4,16 @@
|
||||
class JsonItem *Json::addNewItem () {
|
||||
|
||||
JsonItem *ji = (JsonItem *)m_sb.getBuf();
|
||||
|
||||
if ( m_sb.m_length + (long)sizeof(JsonItem) > m_sb.m_capacity ) {
|
||||
log("json: preventing buffer breach");
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// otherwise we got room
|
||||
m_sb.incrementLength(sizeof(JsonItem));
|
||||
|
||||
|
||||
if ( m_prev ) m_prev->m_next = ji;
|
||||
ji->m_prev = m_prev;
|
||||
ji->m_next = NULL;
|
||||
@ -53,7 +61,7 @@ JsonItem *Json::getItem ( char *name ) {
|
||||
|
||||
#include "Mem.h" // gbstrlen()
|
||||
|
||||
JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
|
||||
JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , long niceness ) {
|
||||
|
||||
m_prev = NULL;
|
||||
|
||||
@ -67,9 +75,15 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
|
||||
bool inQuote = false;
|
||||
long need = 0;
|
||||
for ( ; *p ; p++ ) {
|
||||
if ( *p == '\"' && (p==json || p[-1]!='\\') )
|
||||
// ignore any escaped char. also \x1234
|
||||
if ( *p == '\\' ) {
|
||||
if ( p[1] ) p++;
|
||||
continue;
|
||||
}
|
||||
if ( *p == '\"' )
|
||||
inQuote = ! inQuote;
|
||||
if ( inQuote ) continue;
|
||||
if ( inQuote )
|
||||
continue;
|
||||
if ( *p == '{' ||
|
||||
*p == ',' ||
|
||||
*p == '[' ||
|
||||
@ -172,8 +186,15 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
|
||||
if ( *p == '\"' ) {
|
||||
// find end of quote
|
||||
char *end = p + 1;
|
||||
for ( ; *end ; end++ )
|
||||
if ( *end == '\"' && end[-1] != '\\' ) break;
|
||||
for ( ; *end ; end++ ) {
|
||||
// skip two chars if escaped
|
||||
if ( *end == '\\' && end[1] ) {
|
||||
end++;
|
||||
continue;
|
||||
}
|
||||
// this quote is unescaped then
|
||||
if ( *end == '\"' ) break;
|
||||
}
|
||||
// field?
|
||||
char *x = end + 1;
|
||||
// skip spaces
|
||||
@ -207,7 +228,8 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
|
||||
// get length decoded
|
||||
long curr = m_sb.length();
|
||||
// store decoded string right after jsonitem
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,0))
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 (str,slen,
|
||||
niceness ))
|
||||
return NULL;
|
||||
// store length decoded json
|
||||
ji->m_valueLen = m_sb.length() - curr;
|
||||
@ -240,7 +262,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
|
||||
ji->m_valueDouble = 0;
|
||||
}
|
||||
// store decoded string right after jsonitem
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,0))
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,niceness))
|
||||
return NULL;
|
||||
// store length decoded json
|
||||
ji->m_valueLen = m_sb.length() - curr;
|
||||
@ -283,7 +305,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
|
||||
// copy the number as a string as well
|
||||
long curr = m_sb.length();
|
||||
// store decoded string right after jsonitem
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,0))
|
||||
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,niceness))
|
||||
return NULL;
|
||||
// store length decoded json
|
||||
ji->m_valueLen = m_sb.length() - curr;
|
||||
@ -323,11 +345,68 @@ void Json::test ( ) {
|
||||
"in 2010\",\"18083009\":\"Apple personal digital assistants\",\"23475157\":\"Touchscreen portable media players\",\"30107877\":\"IPad\",\"9301031\":\"Apple Inc. hardware\",\"27765345\":\"IOS (Apple)\",\"26588084\":\"Tablet computers\"},\"type\":1,\"senseRank\":1,\"variety\":0.49056603773584906,\"depth\":0.5882352941176471},{\"id\":18839,\"positions\":[[1945,1950],[2204,2209]],\"name\":\"Music\",\"score\":0.7,\"contentMatch\":1,\"categories\":{\"991222\":\"Performing arts\",\"693016\":\"Entertainment\",\"691484\":\"Music\"},\"type\":1,\"senseRank\":1,\"variety\":0.22264150943396221,\"depth\":0.7058823529411764}],\"media\":[{\"pixelHeight\":350,\"link\":\"http://www.onlinemba.com/wp-content/uploads/2013/02/apple-innovates-invert-350x350.png\",\"primary\":\"true\",\"pixelWidth\":350,\"type\":\"image\"}]}";
|
||||
|
||||
|
||||
JsonItem *ji = parseJsonStringIntoJsonItems ( json );
|
||||
long niceness = 0;
|
||||
JsonItem *ji = parseJsonStringIntoJsonItems ( json , niceness );
|
||||
|
||||
// print them out?
|
||||
log("json: type0=%li",(long)ji->m_type);
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) {
|
||||
|
||||
// reset, but don't free mem etc. just set m_length to 0
|
||||
nameBuf.reset();
|
||||
// get its full compound name like "meta.twitter.title"
|
||||
JsonItem *p = this;//ji;
|
||||
char *lastName = NULL;
|
||||
char *nameArray[20];
|
||||
long numNames = 0;
|
||||
for ( ; p ; p = p->m_parent ) {
|
||||
// empty name?
|
||||
if ( ! p->m_name ) continue;
|
||||
if ( ! p->m_name[0] ) continue;
|
||||
// dup? can happen with arrays. parent of string
|
||||
// in object, has same name as his parent, the
|
||||
// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
|
||||
if ( p->m_name == lastName ) continue;
|
||||
// update
|
||||
lastName = p->m_name;
|
||||
// add it up
|
||||
nameArray[numNames++] = p->m_name;
|
||||
// breach?
|
||||
if ( numNames < 15 ) continue;
|
||||
log("build: too many names in json tag");
|
||||
break;
|
||||
}
|
||||
// assemble the names in reverse order which is correct order
|
||||
for ( long i = 1 ; i <= numNames ; i++ ) {
|
||||
// copy into our safebuf
|
||||
if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) )
|
||||
return false;
|
||||
// separate names with periods
|
||||
if ( ! nameBuf.pushChar('.') ) return false;
|
||||
}
|
||||
// remove last period
|
||||
nameBuf.removeLastChar('.');
|
||||
// and null terminate
|
||||
if ( ! nameBuf.nullTerm() ) return false;
|
||||
// change all :'s in names to .'s since : is reserved!
|
||||
char *px = nameBuf.getBufStart();
|
||||
for ( ; *px ; px++ ) if ( *px == ':' ) *px = '.';
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// is this json item in an array of json items?
|
||||
bool JsonItem::isInArray ( ) {
|
||||
JsonItem *p = this;//ji;
|
||||
for ( ; p ; p = p->m_parent ) {
|
||||
// empty name? it's just a "value item" then, i guess.
|
||||
//if ( ! p->m_name ) continue;
|
||||
//if ( ! p->m_name[0] ) continue;
|
||||
if ( p->m_type == JT_ARRAY ) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
6
Json.h
6
Json.h
@ -51,6 +51,10 @@ class JsonItem {
|
||||
return (char *)this + sizeof(JsonItem);
|
||||
};
|
||||
|
||||
// like acme.product.offerPrice if "acme:{product:{offerprice:1.23}}"
|
||||
bool getCompoundName ( SafeBuf &nameBuf ) ;
|
||||
|
||||
bool isInArray ( );
|
||||
};
|
||||
|
||||
|
||||
@ -59,7 +63,7 @@ class Json {
|
||||
|
||||
void test();
|
||||
|
||||
JsonItem *parseJsonStringIntoJsonItems ( char *json );
|
||||
JsonItem *parseJsonStringIntoJsonItems ( char *json , long niceness );
|
||||
|
||||
JsonItem *getFirstItem ( ) ;
|
||||
|
||||
|
19
Make.depend
19
Make.depend
@ -354,7 +354,7 @@ Collectiondb.o: Collectiondb.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
IndexTable2.h Msg51.h Msg17.h IndexReadInfo.h Msg3a.h Stats.h \
|
||||
PostQueryRerank.h Sanity.h SiteGetter.h Title.h Address.h zlib.h zconf.h \
|
||||
HttpMime.h Users.h Pages.h HttpServer.h TcpServer.h openssl/err.h \
|
||||
PageCrawlBot.h Statsdb.h Process.h Msg28.h Cachedb.h Syncdb.h PageTurk.h
|
||||
PageCrawlBot.h Statsdb.h Process.h Msg28.h Cachedb.h Syncdb.h
|
||||
CollectionRec.o: CollectionRec.cpp gb-include.h types.h fctypes.h \
|
||||
Unicode.h UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h \
|
||||
hash.h Errno.h Log.h CollectionRec.h Url.h ip.h Parms.h Xml.h XmlNode.h \
|
||||
@ -374,7 +374,11 @@ CollectionRec.o: CollectionRec.cpp gb-include.h types.h fctypes.h \
|
||||
RdbBuckets.h RdbCache.h Msg5.h Msg3.h RdbMerge.h Dir.h PingServer.h \
|
||||
HttpServer.h TcpServer.h openssl/err.h MsgC.h UdpServer.h UdpSlot.h \
|
||||
UdpProtocol.h Dns.h DnsProtocol.h Multicast.h Threads.h HttpMime.h \
|
||||
Datedb.h Indexdb.h DiskPageCache.h Titledb.h Timedb.h
|
||||
Datedb.h Indexdb.h DiskPageCache.h Titledb.h Timedb.h Spider.h Msg4.h \
|
||||
Msg1.h Msg0.h Clusterdb.h Linkdb.h Msg2.h Query.h Msg20.h Summary.h \
|
||||
matches2.h Words.h StopWords.h Bits.h Pos.h Matches.h HashTableT.h \
|
||||
Domains.h CountryCode.h Tagdb.h Events.h Sections.h IndexList.h Dates.h \
|
||||
Msg22.h CatRec.h Categories.h Catdb.h
|
||||
Conf.o: Conf.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
|
||||
Log.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h iana_charset.h File.h \
|
||||
@ -668,6 +672,7 @@ Entities.o: Entities.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
Errno.o: Errno.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
|
||||
Log.h
|
||||
errnotest.o: errnotest.cpp
|
||||
Facebook.o: Facebook.cpp Facebook.h Conf.h Xml.h XmlNode.h gb-include.h \
|
||||
types.h fctypes.h Unicode.h UnicodeProperties.h UCPropTable.h iconv.h \
|
||||
UCNormalizer.h hash.h Errno.h Log.h Lang.h Iso8859.h iana_charset.h \
|
||||
@ -1349,10 +1354,10 @@ main.o: main.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
Msge0.h Msge1.h Msg8b.h SearchInput.h Msg40.h Msg39.h Msg37.h TopTree.h \
|
||||
IndexTable2.h Msg51.h Msg17.h Msg3a.h PostQueryRerank.h Sanity.h \
|
||||
SiteGetter.h Title.h Address.h DailyMerge.h Speller.h Language.h Wiki.h \
|
||||
Wiktionary.h Scraper.h Msg2a.h Msg9b.h Msg35.h Msg30.h Msg3e.h \
|
||||
PageNetTest.h AutoBan.h TuringTest.h Msg1f.h Profiler.h Blaster.h \
|
||||
Proxy.h linkspam.h sort.h Ads.h LanguagePages.h ValidPointer.h Placedb.h \
|
||||
Test.h seo.h Json.h
|
||||
Wiktionary.h Scraper.h Msg2a.h Msg9b.h Msg35.h Msg3e.h PageNetTest.h \
|
||||
AutoBan.h TuringTest.h Msg1f.h Profiler.h Blaster.h Proxy.h linkspam.h \
|
||||
sort.h Ads.h LanguagePages.h ValidPointer.h Placedb.h Test.h seo.h \
|
||||
Json.h
|
||||
matches2.o: matches2.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
|
||||
Log.h matches2.h Titledb.h Rdb.h RdbBase.h Conf.h Xml.h XmlNode.h Lang.h \
|
||||
@ -2694,7 +2699,7 @@ PageResults.o: PageResults.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
Highlight.h AutoBan.h TuringTest.h sort.h LanguageIdentifier.h \
|
||||
LanguagePages.h LangList.h XmlDoc.h Phrases.h Images.h Msg13.h Msge0.h \
|
||||
Msge1.h Msg8b.h SiteGetter.h Title.h Address.h Spider.h PageResults.h \
|
||||
Proxy.h
|
||||
Proxy.h Json.h
|
||||
PageRoot.o: PageRoot.cpp gb-include.h types.h fctypes.h Unicode.h \
|
||||
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
|
||||
Log.h Indexdb.h Rdb.h RdbBase.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h \
|
||||
|
9
Makefile
9
Makefile
@ -77,15 +77,20 @@ ifeq ("titan","$(HOST)")
|
||||
# in 2013. So it just uses clone() and does its own "threading". Unfortunately,
|
||||
# the way it works is not even possible on newer kernels because they no longer
|
||||
# allow you to override the _errno_location() function. -- matt
|
||||
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static
|
||||
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DMATTWELLS
|
||||
LIBS = ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a
|
||||
else
|
||||
# use -m32 to force 32-bit mode compilation.
|
||||
# you might have to do apt-get install gcc-multilib to ensure that -m32 works.
|
||||
# -m32 should use /usr/lib32/ as the library path.
|
||||
# i also provide 32-bit libraries for linking that are not so easy to get.
|
||||
#
|
||||
# mdw. 11/17/2013. i took out the -D_PTHREADS_ flag (and -lpthread).
|
||||
# trying to use good ole' clone() again because it seems the errno location
|
||||
# thing is fixed by just ignoring it.
|
||||
#
|
||||
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DPTHREADS -Wno-unused-but-set-variable
|
||||
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
|
||||
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
|
||||
endif
|
||||
|
||||
# if you have seo.cpp link that in. This is not part of the open source
|
||||
|
4
Mem.cpp
4
Mem.cpp
@ -462,6 +462,10 @@ bool Mem::init ( long long maxMem ) {
|
||||
// this is called by C++ classes' constructors to register mem
|
||||
void Mem::addMem ( void *mem , long size , const char *note , char isnew ) {
|
||||
|
||||
// enforce safebuf::setLabel being called
|
||||
//if ( size>=100000 && note && strcmp(note,"SafeBuf")==0 ) {
|
||||
// char *xx=NULL;*xx=0; }
|
||||
|
||||
//validate();
|
||||
|
||||
// sanity check
|
||||
|
193
Msg13.cpp
193
Msg13.cpp
@ -15,6 +15,9 @@ long filterRobotsTxt ( char *reply , long replySize , HttpMime *mime ,
|
||||
bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts );
|
||||
void gotIframeExpandedContent ( void *state ) ;
|
||||
|
||||
void scanHammerQueue ( int fd , void *state );
|
||||
void downloadTheDocForReals ( Msg13Request *r ) ;
|
||||
|
||||
// utility functions
|
||||
bool getTestSpideredDate ( Url *u , long *origSpiderDate , char *testDir ) ;
|
||||
bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) ;
|
||||
@ -111,6 +114,11 @@ bool Msg13::registerHandler ( ) {
|
||||
if ( ! s_rt.set ( 8 , 4 , 0 , NULL , 0 , true,0,"wait13tbl") )
|
||||
return false;
|
||||
|
||||
if ( ! g_loop.registerSleepCallback(10,NULL,scanHammerQueue) )
|
||||
return log("build: Failed to register timer callback for "
|
||||
"hammer queue.");
|
||||
|
||||
|
||||
// success
|
||||
return true;
|
||||
}
|
||||
@ -419,6 +427,8 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
|
||||
|
||||
RdbCache s_hammerCache;
|
||||
static bool s_flag = false;
|
||||
Msg13Request *s_hammerQueueHead = NULL;
|
||||
Msg13Request *s_hammerQueueTail = NULL;
|
||||
|
||||
// . only return false if you want slot to be nuked w/o replying
|
||||
// . MUST always call g_udpServer::sendReply() or sendErrorReply()
|
||||
@ -486,15 +496,6 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
||||
// temporary hack
|
||||
if ( r->m_parent ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// use the default agent unless scraping
|
||||
// force to event guru bot for now
|
||||
//char *agent = "Mozilla/5.0 (compatible; ProCogSEOBot/1.0; +http://www.procog.com/ )";
|
||||
//char *agent = "Mozilla/5.0 (compatible; GigaBot/1.0; +http://www.gigablast.com/ )";
|
||||
char *agent = g_conf.m_spiderUserAgent;
|
||||
if ( r->m_isScraping )
|
||||
agent = "Mozilla/4.0 "
|
||||
"(compatible; MSIE 6.0; Windows 98; "
|
||||
"Win 9x 4.90)" ;
|
||||
// assume we do not add it!
|
||||
r->m_addToTestCache = false;
|
||||
|
||||
@ -515,18 +516,53 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
||||
// we skip it if its a frame page, robots.txt, root doc or some other
|
||||
// page that is a "child" page of the main page we are spidering
|
||||
if ( ! r->m_skipHammerCheck ) {
|
||||
// make sure we are not hammering an ip
|
||||
// . make sure we are not hammering an ip
|
||||
// . returns 0 if currently downloading a url from that ip
|
||||
// . returns -1 if not found
|
||||
long long last=s_hammerCache.getLongLong(0,r->m_firstIp,
|
||||
30,true);
|
||||
// get time now
|
||||
long long nowms = gettimeofdayInMilliseconds();
|
||||
// how long has it been since last download START time?
|
||||
long long waited = nowms - last;
|
||||
|
||||
bool queueIt = false;
|
||||
if ( last > 0 && waited < r->m_crawlDelayMS ) queueIt = true;
|
||||
// a "last" of 0 means currently downloading
|
||||
if ( r->m_crawlDelayMS > 0 && last == 0LL ) queueIt = true;
|
||||
// a last of -1 means not found. so first time i guess.
|
||||
if ( last == -1 ) queueIt = false;
|
||||
|
||||
// . queue it up if we haven't waited long enough
|
||||
// . then the functionr, checkQueue(), will re-eval all
|
||||
// the download requests in this hammer queue every 10ms.
|
||||
// . it will just lookup the lastdownload time in the cache,
|
||||
// which will store maybe a -1 if currently downloading...
|
||||
if ( queueIt ) {
|
||||
// debug
|
||||
//log("spider: adding %s to crawldelayqueue",r->m_url);
|
||||
// save this
|
||||
r->m_udpSlot = slot;
|
||||
r->m_nextLink = NULL;
|
||||
// add it to queue
|
||||
if ( ! s_hammerQueueHead ) {
|
||||
s_hammerQueueHead = r;
|
||||
s_hammerQueueTail = r;
|
||||
}
|
||||
else {
|
||||
s_hammerQueueTail->m_nextLink = r;
|
||||
s_hammerQueueTail = r;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
||||
// if we had it in cache check the wait time
|
||||
if ( last > 0 && waited < 400 ) {
|
||||
if ( last > 0 && waited < r->m_crawlDelayMS ) {
|
||||
log("spider: hammering firstIp=%s url=%s "
|
||||
"only waited %lli ms",
|
||||
iptoa(r->m_firstIp),r->m_url,waited);
|
||||
"only waited %lli ms of %li ms",
|
||||
iptoa(r->m_firstIp),r->m_url,waited,
|
||||
r->m_crawlDelayMS);
|
||||
// this guy has too many redirects and it fails us...
|
||||
// BUT do not core if running live, only if for test
|
||||
// collection
|
||||
@ -536,14 +572,14 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
||||
// char*xx = NULL; *xx = 0; }
|
||||
}
|
||||
// store time now
|
||||
s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
|
||||
//s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
|
||||
// note it
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: adding download end time of %llu for "
|
||||
"firstIp=%s "
|
||||
"url=%s "
|
||||
"to msg13::hammerCache",
|
||||
nowms,iptoa(r->m_firstIp),r->m_url);
|
||||
//if ( g_conf.m_logDebugSpider )
|
||||
// log("spider: adding download end time of %llu for "
|
||||
// "firstIp=%s "
|
||||
// "url=%s "
|
||||
// "to msg13::hammerCache",
|
||||
// nowms,iptoa(r->m_firstIp),r->m_url);
|
||||
// clear error from that if any, not important really
|
||||
g_errno = 0;
|
||||
}
|
||||
@ -616,26 +652,71 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
|
||||
}
|
||||
|
||||
|
||||
// do not get .google.com/ crap
|
||||
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
downloadTheDocForReals ( r );
|
||||
}
|
||||
|
||||
void downloadTheDocForReals ( Msg13Request *r ) {
|
||||
|
||||
// are we the first?
|
||||
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
|
||||
// wait in line cuz someone else downloading it now
|
||||
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
|
||||
g_udpServer.sendErrorReply(slot,g_errno);
|
||||
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
|
||||
return;
|
||||
}
|
||||
|
||||
// this means our callback will be called
|
||||
if ( ! firstInLine ) return;
|
||||
if ( ! firstInLine ) {
|
||||
//log("spider: inlining %s",r->m_url);
|
||||
return;
|
||||
}
|
||||
|
||||
// . store time now
|
||||
// . no, now we store 0 to indicate in progress, then we
|
||||
// will overwrite it with a timestamp when the download completes
|
||||
// . but if measuring crawldelay from beginning of the download then
|
||||
// store the current time
|
||||
// . do NOT do this when downloading robots.txt etc. type files
|
||||
// which should have skipHammerCheck set to true
|
||||
if ( r->m_crawlDelayFromEnd && ! r->m_skipHammerCheck ) {
|
||||
s_hammerCache.addLongLong(0,r->m_firstIp, 0LL);//nowms);
|
||||
}
|
||||
else if ( ! r->m_skipHammerCheck ) {
|
||||
// get time now
|
||||
long long nowms = gettimeofdayInMilliseconds();
|
||||
s_hammerCache.addLongLong(0,r->m_firstIp, nowms);
|
||||
}
|
||||
|
||||
// note it
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: adding special \"in-progress\" time of %lli for "
|
||||
"firstIp=%s "
|
||||
"url=%s "
|
||||
"to msg13::hammerCache",
|
||||
-1LL,iptoa(r->m_firstIp),r->m_url);
|
||||
|
||||
// do not get .google.com/ crap
|
||||
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// flag this
|
||||
r->m_addToTestCache = true;
|
||||
// note it here
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: downloading %s (%s)",
|
||||
r->m_url,iptoa(r->m_urlIp) );
|
||||
log("spider: downloading %s (%s) (skiphammercheck=%li)",
|
||||
r->m_url,iptoa(r->m_urlIp) ,
|
||||
(long)r->m_skipHammerCheck);
|
||||
|
||||
// use the default agent unless scraping
|
||||
// force to event guru bot for now
|
||||
//char *agent = "Mozilla/5.0 (compatible; ProCogSEOBot/1.0; +http://www.procog.com/ )";
|
||||
//char *agent = "Mozilla/5.0 (compatible; GigaBot/1.0; +http://www.gigablast.com/ )";
|
||||
char *agent = g_conf.m_spiderUserAgent;
|
||||
if ( r->m_isScraping )
|
||||
agent = "Mozilla/4.0 "
|
||||
"(compatible; MSIE 6.0; Windows 98; "
|
||||
"Win 9x 4.90)" ;
|
||||
|
||||
// download it
|
||||
if ( ! g_httpServer.getDoc ( r->m_url ,
|
||||
r->m_urlIp ,
|
||||
@ -702,6 +783,21 @@ void gotHttpReply2 ( void *state ,
|
||||
"for %s at ip %s",
|
||||
mstrerror(g_errno),r->m_url,iptoa(r->m_urlIp));
|
||||
|
||||
// get time now
|
||||
long long nowms = gettimeofdayInMilliseconds();
|
||||
// . now store the current time in the cache
|
||||
// . do NOT do this for robots.txt etc. where we skip hammer check
|
||||
if ( r->m_crawlDelayFromEnd && ! r->m_skipHammerCheck )
|
||||
s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
|
||||
// note it
|
||||
if ( g_conf.m_logDebugSpider )
|
||||
log("spider: adding final download end time of %lli for "
|
||||
"firstIp=%s "
|
||||
"url=%s "
|
||||
"to msg13::hammerCache",
|
||||
nowms,iptoa(r->m_firstIp),r->m_url);
|
||||
|
||||
|
||||
// sanity. this was happening from iframe download
|
||||
//if ( g_errno == EDNSTIMEDOUT ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
@ -2086,5 +2182,48 @@ void gotIframeExpandedContent ( void *state ) {
|
||||
delete ( xd );
|
||||
}
|
||||
|
||||
// call this once every 10ms to launch queued up download requests so that
|
||||
// we respect crawl delay for sure
|
||||
void scanHammerQueue ( int fd , void *state ) {
|
||||
|
||||
|
||||
Msg13Request *r = s_hammerQueueHead;
|
||||
if ( ! r ) return;
|
||||
|
||||
long long nowms = gettimeofdayInMilliseconds();
|
||||
|
||||
Msg13Request *prev = NULL;
|
||||
long long waited = -1LL;
|
||||
|
||||
// scan down the linked list of queued of msg13 requests
|
||||
for ( ; r ; prev = r , r = r->m_nextLink ) {
|
||||
long long last;
|
||||
last = s_hammerCache.getLongLong(0,r->m_firstIp,30,true);
|
||||
// is one from this ip outstanding?
|
||||
if ( last == 0LL && r->m_crawlDelayFromEnd ) continue;
|
||||
// download finished?
|
||||
if ( last > 0 ) {
|
||||
waited = nowms - last;
|
||||
// but skip if haven't waited long enough
|
||||
if ( waited < r->m_crawlDelayMS ) continue;
|
||||
}
|
||||
// debug
|
||||
//log("spider: downloading %s from crawldelay queue "
|
||||
// "waited=%llims crawldelay=%lims",
|
||||
// r->m_url,waited,r->m_crawlDelayMS);
|
||||
// good to go
|
||||
downloadTheDocForReals ( r );
|
||||
//
|
||||
// remove from future scans
|
||||
//
|
||||
if ( prev )
|
||||
prev->m_nextLink = r->m_nextLink;
|
||||
|
||||
if ( s_hammerQueueHead == r )
|
||||
s_hammerQueueHead = r->m_nextLink;
|
||||
|
||||
if ( s_hammerQueueTail == r )
|
||||
s_hammerQueueTail = prev;
|
||||
|
||||
// try to download some more i guess...
|
||||
}
|
||||
}
|
||||
|
8
Msg13.h
8
Msg13.h
@ -25,6 +25,10 @@ public:
|
||||
long m_maxCacheAge;
|
||||
long m_maxTextDocLen;
|
||||
long m_maxOtherDocLen;
|
||||
// in milliseconds. use -1 if none or unknown.
|
||||
long m_crawlDelayMS;
|
||||
// for linked list, this is the hammer queue
|
||||
class Msg13Request *m_nextLink;
|
||||
// if doing spider compression, compute contentHash32 of document
|
||||
// downloaded, and if it matches this then send back EDOCUNCHANGED
|
||||
long m_contentHash32;
|
||||
@ -50,7 +54,8 @@ public:
|
||||
long m_addToTestCache:1;
|
||||
long m_skipHammerCheck:1;
|
||||
long m_attemptedIframeExpansion:1;
|
||||
long m_forEvents;
|
||||
long m_crawlDelayFromEnd:1;
|
||||
long m_forEvents:1;
|
||||
//long m_testParserEnabled:1;
|
||||
//long m_testSpiderEnabled:1;
|
||||
//long m_isPageParser:1;
|
||||
@ -83,6 +88,7 @@ public:
|
||||
memset (this,0,(char *)m_url - (char *)this + 1);
|
||||
m_maxTextDocLen = -1; // no limit
|
||||
m_maxOtherDocLen = -1; // no limit
|
||||
m_crawlDelayMS = -1; // unknown or none
|
||||
};
|
||||
};
|
||||
|
||||
|
@ -224,6 +224,7 @@ bool Msg20::getSummary ( Msg20Request *req ) {
|
||||
hostdb )) {
|
||||
// sendto() sometimes returns "Network is down" so i guess
|
||||
// we just had an "error reply".
|
||||
log("msg20: error sending mcast %s",mstrerror(g_errno));
|
||||
m_gotReply = true;
|
||||
return true;
|
||||
}
|
||||
@ -428,6 +429,12 @@ Msg20Reply::Msg20Reply ( ) {
|
||||
// this is free in destructor, so clear it here
|
||||
//ptr_eventSummaryLines = NULL;
|
||||
m_tmp = 0;
|
||||
|
||||
// seems to be an issue... caused a core with bogus size_dbuf
|
||||
long *sizePtr = &size_tbuf;
|
||||
long *sizeEnd = &size_note;
|
||||
for ( ; sizePtr <= sizeEnd ; sizePtr++ )
|
||||
*sizePtr = 0;
|
||||
}
|
||||
|
||||
|
||||
|
69
Msg39.cpp
69
Msg39.cpp
@ -13,7 +13,8 @@ static void sendReply ( UdpSlot *slot ,
|
||||
Msg39 *msg39 ,
|
||||
char *reply ,
|
||||
long replySize ,
|
||||
long replyMaxSize );
|
||||
long replyMaxSize ,
|
||||
bool hadError );
|
||||
// called when Msg2 has got all the termlists
|
||||
static void gotListsWrapper ( void *state ) ;
|
||||
// thread wrappers
|
||||
@ -66,7 +67,7 @@ void handleRequest39 ( UdpSlot *slot , long netnice ) {
|
||||
catch ( ... ) {
|
||||
g_errno = ENOMEM;
|
||||
log("msg39: new(%i): %s", sizeof(Msg39),mstrerror(g_errno));
|
||||
sendReply ( slot , NULL , NULL , 0 , 0 );
|
||||
sendReply ( slot , NULL , NULL , 0 , 0 ,true);
|
||||
return;
|
||||
}
|
||||
mnew ( THIS , sizeof(Msg39) , "Msg39" );
|
||||
@ -79,12 +80,15 @@ void handleRequest39 ( UdpSlot *slot , long netnice ) {
|
||||
|
||||
// this must always be called sometime AFTER handleRequest() is called
|
||||
void sendReply ( UdpSlot *slot , Msg39 *msg39 , char *reply , long replyLen ,
|
||||
long replyMaxSize ) {
|
||||
long replyMaxSize , bool hadError ) {
|
||||
// debug msg
|
||||
if ( g_conf.m_logDebugQuery || (msg39&&msg39->m_debug) )
|
||||
logf(LOG_DEBUG,"query: msg39: [%lu] Sending reply len=%li.",
|
||||
(long)msg39,replyLen);
|
||||
|
||||
// sanity
|
||||
if ( hadError && ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
// no longer in use. msg39 will be NULL if ENOMEM or something
|
||||
if ( msg39 ) msg39->m_inUse = false;
|
||||
|
||||
@ -140,7 +144,7 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
|
||||
g_errno = EBADREQUESTSIZE;
|
||||
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
|
||||
mstrerror(g_errno) );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 , true );
|
||||
return ;
|
||||
}
|
||||
|
||||
@ -176,7 +180,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
|
||||
mstrerror(g_errno) );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 , true );
|
||||
return ;
|
||||
}
|
||||
|
||||
@ -185,7 +189,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
|
||||
mstrerror(g_errno) );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 , true );
|
||||
return ;
|
||||
}
|
||||
|
||||
@ -199,7 +203,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
|
||||
m_r->m_useQueryStopWords ) ) {
|
||||
log(LOG_LOGIC,"query: msg39: setQuery: %s." ,
|
||||
mstrerror(g_errno) );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 , true );
|
||||
return ;
|
||||
}
|
||||
|
||||
@ -217,7 +221,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
|
||||
,m_tmpq.m_orig
|
||||
,(long)m_r->m_language
|
||||
);
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 , true );
|
||||
return ;
|
||||
}
|
||||
// debug
|
||||
@ -286,7 +290,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
|
||||
if ( g_errno ) {
|
||||
log(LOG_LOGIC,"query: msg39: doDocIdSplitLoop: %s." ,
|
||||
mstrerror(g_errno) );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 , true );
|
||||
return ;
|
||||
}
|
||||
// it might not have blocked! if all lists in tree and used no thread
|
||||
@ -327,11 +331,13 @@ bool Msg39::doDocIdSplitLoop ( ) {
|
||||
if ( d0 >= d1 ) break;
|
||||
// use this
|
||||
//m_debug = true;
|
||||
//log("call1");
|
||||
// . get the lists
|
||||
// . i think this always should block!
|
||||
// . it will also intersect the termlists to get the search
|
||||
// results and accumulate the winners into the "tree"
|
||||
if ( ! getLists() ) return false;
|
||||
//log("call2 g_errno=%li",(long)g_errno);
|
||||
// if there was an error, stop!
|
||||
if ( g_errno ) break;
|
||||
}
|
||||
@ -339,7 +345,7 @@ bool Msg39::doDocIdSplitLoop ( ) {
|
||||
// return error reply if we had an error
|
||||
if ( g_errno ) {
|
||||
log("msg39: Had error3: %s.", mstrerror(g_errno));
|
||||
sendReply (m_slot,this,NULL,0,0);
|
||||
sendReply (m_slot,this,NULL,0,0 , true);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -507,6 +513,7 @@ bool Msg39::getLists () {
|
||||
"sign=%c "
|
||||
"numPlusses=%hhu "
|
||||
"required=%li "
|
||||
"fielcode=%li "
|
||||
|
||||
"ebit=0x%0llx "
|
||||
"impBits=0x%0llx "
|
||||
@ -534,6 +541,7 @@ bool Msg39::getLists () {
|
||||
sign , //c ,
|
||||
0 ,
|
||||
(long)qt->m_isRequired,
|
||||
(long)qt->m_fieldCode,
|
||||
|
||||
(long long)qt->m_explicitBit ,
|
||||
(long long)qt->m_implicitBits ,
|
||||
@ -623,6 +631,16 @@ bool Msg39::getLists () {
|
||||
m_blocked = true;
|
||||
return false;
|
||||
}
|
||||
|
||||
// error?
|
||||
if ( g_errno ) {
|
||||
log("msg39: Had error getting termlists2: %s.",
|
||||
mstrerror(g_errno));
|
||||
// don't bail out here because we are in docIdSplitLoop()
|
||||
//sendReply (m_slot,this,NULL,0,0,true);
|
||||
return true;
|
||||
}
|
||||
|
||||
return gotLists ( true );
|
||||
}
|
||||
|
||||
@ -630,7 +648,16 @@ void gotListsWrapper ( void *state ) {
|
||||
Msg39 *THIS = (Msg39 *) state;
|
||||
// . hash the lists into our index table
|
||||
// . this will send back a reply or recycle and read more list data
|
||||
THIS->gotLists ( true );
|
||||
if ( ! THIS->gotLists ( true ) ) return;
|
||||
|
||||
// . if he did not block and there was an errno we send reply
|
||||
// otherwise if there was NO error he will have sent the reply
|
||||
// . if gotLists() was called in the ABOVE function and it returns
|
||||
// true then the docIdLoop() function will send back the reply.
|
||||
if ( g_errno ) {
|
||||
log("msg39: sending back error reply = %s",mstrerror(g_errno));
|
||||
sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
|
||||
}
|
||||
}
|
||||
|
||||
// . now come here when we got the necessary index lists
|
||||
@ -641,7 +668,8 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
if ( g_errno ) {
|
||||
log("msg39: Had error getting termlists: %s.",
|
||||
mstrerror(g_errno));
|
||||
sendReply (m_slot,this,NULL,0,0);
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
//sendReply (m_slot,this,NULL,0,0,true);
|
||||
return true;
|
||||
}
|
||||
// timestamp log
|
||||
@ -681,7 +709,8 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
// . actually we were using it before for rat=0/bool queries but
|
||||
// i got rid of NO_RAT_SLOTS
|
||||
if ( ! m_allocedTree && ! m_posdbTable.allocTopTree() ) {
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 );
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
//sendReply ( m_slot , this , NULL , 0 , 0 , true);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -690,7 +719,8 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
if ( ! m_posdbTable.allocWhiteListTable() ) {
|
||||
log("msg39: Had error allocating white list table: %s.",
|
||||
mstrerror(g_errno));
|
||||
sendReply (m_slot,this,NULL,0,0);
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
//sendReply (m_slot,this,NULL,0,0,true);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -703,7 +733,6 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
|
||||
// . we have to re-set the QueryTermInfos with each docid range split
|
||||
// since it will set the list ptrs from the msg2 lists
|
||||
if ( m_r->m_useNewAlgo && ! m_posdbTable.setQueryTermInfo () ) {
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 );
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -856,7 +885,7 @@ bool Msg39::addedLists ( ) {
|
||||
m_posdbTable.freeMem();
|
||||
g_errno = m_posdbTable.m_errno;
|
||||
log("query: posdbtable had error = %s",mstrerror(g_errno));
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 ,true);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -899,7 +928,7 @@ bool Msg39::setClusterRecs ( ) {
|
||||
// on error, return true, g_errno should be set
|
||||
if ( ! m_buf ) {
|
||||
log("query: msg39: Failed to alloc buf for clustering.");
|
||||
sendReply(m_slot,this,NULL,0,0);
|
||||
sendReply(m_slot,this,NULL,0,0,true);
|
||||
return true;
|
||||
}
|
||||
|
||||
@ -981,7 +1010,7 @@ void Msg39::gotClusterRecs ( ) {
|
||||
m_clusterLevels )) {
|
||||
m_errno = g_errno;
|
||||
// send back an error reply
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 );
|
||||
sendReply ( m_slot , this , NULL , 0 , 0 ,true);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -1146,7 +1175,7 @@ void Msg39::estimateHits ( ) {
|
||||
if ( ! reply ) {
|
||||
log("query: Could not allocated memory "
|
||||
"to hold reply of docids to send back.");
|
||||
sendReply(m_slot,this,NULL,0,0);
|
||||
sendReply(m_slot,this,NULL,0,0,true);
|
||||
return ;
|
||||
}
|
||||
topDocIds = (long long *) mr.ptr_docIds;
|
||||
@ -1233,6 +1262,6 @@ void Msg39::estimateHits ( ) {
|
||||
}
|
||||
|
||||
// now send back the reply
|
||||
sendReply(m_slot,this,reply,replySize,replySize);
|
||||
sendReply(m_slot,this,reply,replySize,replySize,false);
|
||||
return;
|
||||
}
|
||||
|
14
Msg40.cpp
14
Msg40.cpp
@ -1107,7 +1107,7 @@ bool Msg40::launchMsg20s ( bool recalled ) {
|
||||
req.m_bigSampleMaxLen = bigSampleMaxLen;
|
||||
req.m_titleMaxLen = 256;
|
||||
req.m_titleMaxLen = cr->m_titleMaxLen;
|
||||
if(m_si->m_isAdmin && m_si->m_xml == 0)
|
||||
if(m_si->m_isAdmin && m_si->m_format == FORMAT_HTML )
|
||||
req.m_getGigabitVector = true;
|
||||
else req.m_getGigabitVector = false;
|
||||
req.m_flags = 0;
|
||||
@ -1222,6 +1222,7 @@ bool Msg40::gotSummary ( ) {
|
||||
if ( m_numReplies < m_numRequests )
|
||||
return false;
|
||||
|
||||
doAgain:
|
||||
|
||||
// do we need to launch another batch of summary requests?
|
||||
if ( m_numRequests < m_msg3a.m_numDocIds ) {
|
||||
@ -1235,7 +1236,12 @@ bool Msg40::gotSummary ( ) {
|
||||
// it returned true, so m_numRequests == m_numReplies and
|
||||
// we don't need to launch any more! but that does NOT
|
||||
// make sense because m_numContiguous < m_msg3a.m_numDocIds
|
||||
char *xx=NULL; *xx=0;
|
||||
// . i guess the launch can fail because of oom... and
|
||||
// end up returning true here... seen it happen, and
|
||||
// we had full requests/replies for m_msg3a.m_numDocIds
|
||||
log("msg40: got all replies i guess");
|
||||
goto doAgain;
|
||||
//char *xx=NULL; *xx=0;
|
||||
}
|
||||
|
||||
|
||||
@ -1895,9 +1901,10 @@ bool Msg40::gotSummary ( ) {
|
||||
}
|
||||
|
||||
|
||||
// take this out for now...
|
||||
#ifdef GB_PQR
|
||||
// run post query reranks for this query
|
||||
long wanted = m_si->m_docsWanted + m_si->m_firstResultNum + 1;
|
||||
|
||||
if ( m_postQueryRerank.isEnabled() &&
|
||||
m_postQueryRerank.set2(wanted)){
|
||||
if ( ! m_postQueryRerank.preRerank () ) {
|
||||
@ -1916,6 +1923,7 @@ bool Msg40::gotSummary ( ) {
|
||||
m_postQueryRerank.rerankFailed();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
// set m_moreToCome, if true, we print a "Next 10" link
|
||||
m_moreToCome = (visible > //m_visibleContiguous >
|
||||
|
1188
PageCrawlBot.cpp
1188
PageCrawlBot.cpp
File diff suppressed because it is too large
Load Diff
20
PageGet.cpp
20
PageGet.cpp
@ -33,7 +33,7 @@ public:
|
||||
//TagRec m_tagRec;
|
||||
TcpSocket *m_socket;
|
||||
HttpRequest m_r;
|
||||
char m_coll[50];
|
||||
char m_coll[MAX_COLL_LEN+2];
|
||||
//CollectionRec *m_cr;
|
||||
bool m_isAdmin;
|
||||
bool m_isLocal;
|
||||
@ -136,7 +136,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
uint8_t langId = getLangIdFromAbbr ( langAbbr );
|
||||
st->m_langId = langId;
|
||||
}
|
||||
strncpy ( st->m_coll , coll , 40 );
|
||||
strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
|
||||
// store query for query highlighting
|
||||
st->m_netTestResults = r->getLong ("rnettest", false );
|
||||
if( st->m_netTestResults ) {
|
||||
@ -179,14 +179,22 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
sreq.reset();
|
||||
strcpy(sreq.m_url, url );
|
||||
sreq.setDataSize();
|
||||
xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness );
|
||||
// this returns false if "coll" is invalid
|
||||
if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) )
|
||||
goto hadSetError;
|
||||
}
|
||||
// . when getTitleRec() is called it will load the old one
|
||||
// since XmlDoc::m_setFromTitleRec will be true
|
||||
// . niceness is 0
|
||||
else {
|
||||
// use st->m_coll since XmlDoc just points to it!
|
||||
xd->set3 ( docId , st->m_coll , 0 );
|
||||
// . use st->m_coll since XmlDoc just points to it!
|
||||
// . this returns false if "coll" is invalid
|
||||
else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
|
||||
hadSetError:
|
||||
mdelete ( st , sizeof(State2) , "PageGet1" );
|
||||
delete ( st );
|
||||
g_errno = ENOMEM;
|
||||
log("PageGet: set3: %s", mstrerror(g_errno));
|
||||
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
|
||||
}
|
||||
// if it blocks while it loads title rec, it will re-call this routine
|
||||
xd->setCallback ( st , processLoopWrapper );
|
||||
|
@ -23,6 +23,8 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
|
||||
// don't allow pages bigger than 128k in cache
|
||||
char buf [ 64*1024 ];
|
||||
SafeBuf p(buf, 64*1024);
|
||||
p.setLabel ( "perfgrph" );
|
||||
|
||||
// print standard header
|
||||
g_pages.printAdminTop ( &p , s , r );
|
||||
|
||||
|
603
PageResults.cpp
603
PageResults.cpp
File diff suppressed because it is too large
Load Diff
100
PageRoot.cpp
100
PageRoot.cpp
@ -43,15 +43,36 @@ bool sendPageRoot ( TcpSocket *s, HttpRequest *r ){
|
||||
}
|
||||
|
||||
bool printNav ( SafeBuf &sb , HttpRequest *r ) {
|
||||
|
||||
char *root = "";
|
||||
char *rootSecure = "";
|
||||
if ( g_conf.m_isMattWells ) {
|
||||
root = "http://www.gigablast.com";
|
||||
rootSecure = "https://www.gigablast.com";
|
||||
}
|
||||
|
||||
sb.safePrintf("<center><b><p class=nav>"
|
||||
"<a href=\"/about.html\">About</a>"
|
||||
" <a href=\"/contact.html\">Contact</a>"
|
||||
" <a href=\"/help.html\">Help</a>"
|
||||
" <a href=/privacy.html>Privacy Policy</a>"
|
||||
" <a href=\"/searchfeed.html\">"
|
||||
"Search API</a>"
|
||||
" <a href=/seoapi.html>SEO API</a>"
|
||||
" <a href=/account>My Account</a> "
|
||||
"<a href=%s/about.html>About</a>"
|
||||
" "
|
||||
"<a href=%s/contact.html>Contact</a>"
|
||||
" "
|
||||
"<a href=%s/help.html>Help</a>"
|
||||
" "
|
||||
"<a href=%s/privacy.html>Privacy Policy</a>"
|
||||
" "
|
||||
"<a href=%s/searchfeed.html>Search API</a>"
|
||||
" "
|
||||
"<a href=%s/seoapi.html>SEO API</a>"
|
||||
" "
|
||||
"<a href=%s/account>My Account</a> "
|
||||
, root
|
||||
, root
|
||||
, root
|
||||
, root
|
||||
, root
|
||||
, root
|
||||
, rootSecure
|
||||
|
||||
//" <a href=/logout>Logout</a>"
|
||||
);
|
||||
if ( r->isLocal() )
|
||||
@ -115,7 +136,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
|
||||
sb.safePrintf("<br><br>\n");
|
||||
sb.safePrintf("<br><br><br>\n");
|
||||
sb.safePrintf("<b>web</b> <a href=/seo>seo</a> <a href=\"/Top\">directory</a> \n");
|
||||
sb.safePrintf("<b>web</b> "
|
||||
"<a href=http://www.gigablast.com/seo>seo</a> "
|
||||
" "
|
||||
"<a href=\"/Top\">directory</a> "
|
||||
" \n");
|
||||
sb.safePrintf("<a href=/adv.html>advanced search</a>");
|
||||
sb.safePrintf(" ");
|
||||
sb.safePrintf("<a href=/addurl title=\"Instantly add your url to "
|
||||
@ -135,7 +160,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
|
||||
|
||||
sb.safePrintf("<tr valign=top>\n");
|
||||
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:red;></td>\n");
|
||||
sb.safePrintf("<td align=center><div style=width:50px;height:50px;display:inline-block;background-color:red;></div></td>\n");
|
||||
sb.safePrintf("<td><font size=+1><b>Open Source!</b>"
|
||||
"</font><br>\n");
|
||||
sb.brify2("Gigablast is now available as an <a href=https://github.com/gigablast/open-source-search-engine>open source search engine</a> on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. <a href=/admin.html#features>Features.</a> Limited support available for free."
|
||||
@ -144,19 +169,37 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
sb.safePrintf("</td></tr>\n");
|
||||
|
||||
|
||||
char *root = "";
|
||||
if ( g_conf.m_isMattWells )
|
||||
root = "http://www.gigablast.com";
|
||||
|
||||
sb.safePrintf("<tr valign=top>\n");
|
||||
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:green;></td>\n");
|
||||
sb.safePrintf("<td><font size=+1><b>The Green Search Engine</b></font><br>\n");
|
||||
sb.brify2("Gigablast is the only clean-powered web search engine. 90% of its power usage comes from wind energy. Astoundingly, Gigablast is one of ONLY four search engines in the United States indexing over a billion pages.",80);
|
||||
// 204x143
|
||||
sb.safePrintf("<td><img height=52px width=75px "
|
||||
"src=%s/eventguru.png></td>\n"
|
||||
, root );
|
||||
sb.safePrintf("<td><font size=+1><b>Event Guru Returns</b></font><br>\n");
|
||||
sb.brify2("<a href=http://www.eventguru.com/>Event Guru</a> datamines events from the web. It identifies events on a web page, or even plain text, using the same rules of deduction used by the human mind. It also has Facebook integration and lots of other cool things.",80);
|
||||
sb.safePrintf("<br><br></td></tr>\n");
|
||||
sb.safePrintf("\n");
|
||||
sb.safePrintf("\n");
|
||||
|
||||
|
||||
/*
|
||||
sb.safePrintf("<tr valign=top>\n");
|
||||
sb.safePrintf("<td align=center><div style=width:50px;height:50px;display:inline-block;background-color:green;></div></td>\n");
|
||||
sb.safePrintf("<td><font size=+1><b>The Green Search Engine</b></font><br>\n");
|
||||
sb.brify2("Gigablast is the only clean-powered web search engine. 90% of its power usage comes from wind energy. Astoundingly, Gigablast is one of ONLY four search engines in the United States indexing over a billion pages.",80);
|
||||
sb.safePrintf("<br><br></td></tr>\n");
|
||||
sb.safePrintf("\n");
|
||||
sb.safePrintf("\n");
|
||||
*/
|
||||
|
||||
|
||||
sb.safePrintf("<tr valign=top>\n");
|
||||
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:0040fe;></td>\n");
|
||||
sb.safePrintf("<td align=center><img src=%s/gears.png "
|
||||
"height=50 width=50></div></td>\n"
|
||||
, root );
|
||||
sb.safePrintf("<td><font size=+1><b>The Transparent Search Engine</b></font><br>\n");
|
||||
sb.brify2("Gigablast is the first truly transparent search engine. It tells you exactly why the search results are ranked the way they are. There is nothing left to the imagination.",85);
|
||||
sb.safePrintf("<br><br>");
|
||||
@ -165,9 +208,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
sb.safePrintf("\n");
|
||||
|
||||
sb.safePrintf("<tr valign=top>\n");
|
||||
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:f2b629;></td>\n");
|
||||
sb.safePrintf("<td align=center><center><img src=%s/dollargear.png "
|
||||
"height=50 width=50></center></div></center></td>\n"
|
||||
, root );
|
||||
sb.safePrintf("<td><font size=+1><b>The SEO Search Engine</b></font><br>\n");
|
||||
sb.brify2("When it comes to search-engine based SEO, Gigablast is the place to be. With a frothy set of unique and effective <a href=/seo>SEO tools</a>, you will find all you need to execute a simple yet effective SEO strategy. Stop the guesswork, and let a search engine tell you how to SEO it.",85);
|
||||
sb.brify2("When it comes to search-engine based SEO, Gigablast is the place to be. With a frothy set of unique and effective <a href=http://www.gigablast.com/seo>SEO tools</a>, you will find all you need to execute a simple yet effective SEO strategy. Stop the guesswork, and let a search engine tell you how to SEO it.",85);
|
||||
sb.safePrintf("</td></tr>\n");
|
||||
|
||||
|
||||
@ -325,7 +370,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
|
||||
|
||||
sb.safePrintf("<br><br>\n");
|
||||
sb.safePrintf("<br><br><br>\n");
|
||||
sb.safePrintf("<a href=/>web</a> <a href=/seo>seo</a> <a href=\"/Top\">directory</a> \n");
|
||||
sb.safePrintf("<a href=/>web</a> <a href=http://www.gigablast.com/seo>seo</a> <a href=\"/Top\">directory</a> \n");
|
||||
sb.safePrintf("<a href=/adv.html>advanced search</a>");
|
||||
sb.safePrintf(" ");
|
||||
sb.safePrintf("<b title=\"Instantly add your url to Gigablast's "
|
||||
@ -368,19 +413,22 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
|
||||
// . when loaded with the main page for the first time it will
|
||||
// immediately replace its content...
|
||||
if ( url ) {
|
||||
char *root = "";
|
||||
if ( g_conf.m_isMattWells )
|
||||
root = "http://www.gigablast.com";
|
||||
sb.safePrintf("<br>"
|
||||
"<br>"
|
||||
"<div id=msgbox>"
|
||||
//"<b>Injecting your url. Please wait...</b>"
|
||||
"<center>"
|
||||
"<img src=/gears.gif width=50 height=50>"
|
||||
"<img src=%s/gears.gif width=50 height=50>"
|
||||
"</center>"
|
||||
"<script type=text/javascript>"
|
||||
//"alert('shit');"
|
||||
"var client = new XMLHttpRequest();\n"
|
||||
"client.onreadystatechange = handler;\n"
|
||||
"var url='/addurl?u="
|
||||
);
|
||||
, root );
|
||||
sb.urlEncode ( url );
|
||||
// propagate "admin" if set
|
||||
//long admin = hr->getLong("admin",-1);
|
||||
@ -463,11 +511,17 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
|
||||
|
||||
sb.safePrintf("<br><br>\n");
|
||||
sb.safePrintf("<br><br><br>\n");
|
||||
sb.safePrintf("<a href=/>web</a> <a href=/seo>seo</a> <b>directory</b> \n");
|
||||
sb.safePrintf("<a href=/>web</a> <a href=http://www.gigablast.com/seo>seo</a> <b>directory</b> \n");
|
||||
sb.safePrintf("<a href=http://www.gigablast.com/events>events</a>"
|
||||
" \n");
|
||||
sb.safePrintf("<a href=/adv.html>advanced search</a>");
|
||||
sb.safePrintf(" ");
|
||||
sb.safePrintf("<a href=/addurl title=\"Instantly add your url to "
|
||||
"Gigablast's index\">add url</a>");
|
||||
char *root = "";
|
||||
if ( g_conf.m_isMattWells )
|
||||
root = "http://www.gigablast.com";
|
||||
sb.safePrintf("<a href=%s/addurl title=\"Instantly add your url to "
|
||||
"Gigablast's index\">add url</a>"
|
||||
, root );
|
||||
sb.safePrintf("\n");
|
||||
sb.safePrintf("<br><br>\n");
|
||||
// submit to HTTPS now
|
||||
@ -1591,7 +1645,7 @@ void doneInjectingWrapper3 ( void *st ) {
|
||||
rand32);
|
||||
sb.urlEncode(url);
|
||||
sb.safePrintf(">Check it</a> or "
|
||||
"<a href=/seo?u=");
|
||||
"<a href=http://www.gigablast.com/seo?u=");
|
||||
sb.urlEncode(url);
|
||||
sb.safePrintf(">SEO it</a>"
|
||||
".</b>");
|
||||
|
@ -97,7 +97,7 @@ bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
|
||||
st->m_dateCustom = (bool)r->getLong( "custom", 0 );
|
||||
// default to 10 hours, i would do 1 day except that there are
|
||||
// some bugs that mess up the display a lot when i do that
|
||||
st->m_datePeriod = r->getLong( "date_period" , 36000 );
|
||||
st->m_datePeriod = r->getLong( "date_period" , 300 );//36000 );
|
||||
st->m_dateUnits = r->getLong( "date_units" , 1 );//SECS_PER_MIN
|
||||
st->m_now = (bool)r->getLong( "date_now" , 1 );
|
||||
st->m_autoUpdate = (bool)r->getLong( "auto_update" , 0 );
|
||||
@ -152,8 +152,8 @@ void sendReply ( void *state ) {
|
||||
|
||||
TcpSocket *s = st->m_socket;
|
||||
|
||||
SafeBuf buf( 1024*32 );
|
||||
SafeBuf tmpBuf( 1024 );
|
||||
SafeBuf buf( 1024*32 , "tmpbuf0" );
|
||||
SafeBuf tmpBuf( 1024 , "tmpbuf1" );
|
||||
|
||||
//
|
||||
// take these out until we need them!
|
||||
|
@ -361,6 +361,8 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) {
|
||||
path = "admin/inject"; pathLen = gbstrlen(path); }
|
||||
if ( pathLen == 9 && strncmp ( path , "index.php" , 9 ) == 0 ) {
|
||||
path = "search"; pathLen = gbstrlen(path); }
|
||||
if ( pathLen == 10 && strncmp ( path , "search.csv" , 10 ) == 0 ) {
|
||||
path = "search"; pathLen = gbstrlen(path); }
|
||||
|
||||
// if it is like /GA/Atlanta then call sendPageResults
|
||||
// and that should be smart enough to set the m_where in
|
||||
|
106
Parms.cpp
106
Parms.cpp
@ -2127,15 +2127,15 @@ bool Parms::printParm ( SafeBuf* sb,
|
||||
// . if printing on crawlbot page hide these
|
||||
// . we repeat this logic below when printing parm titles
|
||||
// for the column headers in the table
|
||||
char *vt = "";
|
||||
if ( isCrawlbot &&
|
||||
m->m_page == PAGE_FILTERS &&
|
||||
(strcmp(m->m_xml,"spidersEnabled") == 0 ||
|
||||
//strcmp(m->m_xml,"maxSpidersPerRule")==0||
|
||||
//strcmp(m->m_xml,"maxSpidersPerIp") == 0||
|
||||
strcmp(m->m_xml,"spiderIpWait") == 0
|
||||
) )
|
||||
vt = " style=display:none;";
|
||||
//char *vt = "";
|
||||
//if ( isCrawlbot &&
|
||||
// m->m_page == PAGE_FILTERS &&
|
||||
// (strcmp(m->m_xml,"spidersEnabled") == 0 ||
|
||||
// //strcmp(m->m_xml,"maxSpidersPerRule")==0||
|
||||
// //strcmp(m->m_xml,"maxSpidersPerIp") == 0||
|
||||
// strcmp(m->m_xml,"spiderIpWait") == 0
|
||||
// ) )
|
||||
// vt = " style=display:none;";
|
||||
|
||||
// what type of parameter?
|
||||
char t = m->m_type;
|
||||
@ -2210,15 +2210,16 @@ bool Parms::printParm ( SafeBuf* sb,
|
||||
if ( isJSON ) continue;
|
||||
// . hide table column headers that are too advanced
|
||||
// . we repeat this logic above for the actual parms
|
||||
char *vt = "";
|
||||
if ( isCrawlbot &&
|
||||
m->m_page == PAGE_FILTERS &&
|
||||
(strcmp(mk->m_xml,"spidersEnabled") == 0 ||
|
||||
//strcmp(mk->m_xml,"maxSpidersPerRule")==0||
|
||||
//strcmp(mk->m_xml,"maxSpidersPerIp") == 0||
|
||||
strcmp(mk->m_xml,"spiderIpWait") == 0 ) )
|
||||
vt = " style=display:none;display:none;";
|
||||
sb->safePrintf ( "<td%s>" , vt );
|
||||
//char *vt = "";
|
||||
//if ( isCrawlbot &&
|
||||
// m->m_page == PAGE_FILTERS &&
|
||||
// (strcmp(mk->m_xml,"spidersEnabled") == 0 ||
|
||||
// //strcmp(mk->m_xml,"maxSpidersPerRule")==0||
|
||||
// //strcmp(mk->m_xml,"maxSpidersPerIp") == 0||
|
||||
// strcmp(mk->m_xml,"spiderIpWait") == 0 ) )
|
||||
// vt = " style=display:none;display:none;";
|
||||
//sb->safePrintf ( "<td%s>" , vt );
|
||||
sb->safePrintf ( "<td>" );
|
||||
// if its of type checkbox in a table make it
|
||||
// toggle them all on/off
|
||||
if ( mk->m_type == TYPE_CHECKBOX &&
|
||||
@ -2310,7 +2311,8 @@ bool Parms::printParm ( SafeBuf* sb,
|
||||
else if ( firstInRow )
|
||||
sb->safePrintf ( "<tr><td>" );
|
||||
else
|
||||
sb->safePrintf ( "<td%s>" , vt);
|
||||
//sb->safePrintf ( "<td%s>" , vt);
|
||||
sb->safePrintf ( "<td>" );
|
||||
}
|
||||
|
||||
long cast = m->m_cast;
|
||||
@ -4008,7 +4010,7 @@ char *Parms::getParmHtmlEncoded ( char *p , char *pend , Parm *m , char *s ) {
|
||||
// time is stored as long
|
||||
long ct = *(long *)s;
|
||||
// get the time struct
|
||||
struct tm *tp = gmtime ( (time_t *)&ct ) ;
|
||||
struct tm *tp = localtime ( (time_t *)&ct ) ;
|
||||
// set the "selected" month for the drop down
|
||||
strftime ( p , 100 , "%d %b %Y %H:%M UTC" , tp );
|
||||
}
|
||||
@ -8499,6 +8501,30 @@ void Parms::init ( ) {
|
||||
m->m_units = "seconds";
|
||||
m++;
|
||||
|
||||
m->m_cgi = "dbapi";
|
||||
m->m_xml = "diffbotApiUrl";
|
||||
m->m_off = (char *)&cr.m_diffbotApiUrl - x;
|
||||
m->m_type = TYPE_SAFEBUF;
|
||||
m->m_page = PAGE_NONE;
|
||||
m->m_def = "";
|
||||
m++;
|
||||
|
||||
m->m_cgi = "dbucp";
|
||||
m->m_xml = "diffbotUrlCrawlPattern";
|
||||
m->m_off = (char *)&cr.m_diffbotUrlCrawlPattern - x;
|
||||
m->m_type = TYPE_SAFEBUF;
|
||||
m->m_page = PAGE_NONE;
|
||||
m->m_def = "";
|
||||
m++;
|
||||
|
||||
m->m_cgi = "dbupp";
|
||||
m->m_xml = "diffbotUrlProcessPattern";
|
||||
m->m_off = (char *)&cr.m_diffbotUrlProcessPattern - x;
|
||||
m->m_type = TYPE_SAFEBUF;
|
||||
m->m_page = PAGE_NONE;
|
||||
m->m_def = "";
|
||||
m++;
|
||||
|
||||
m->m_cgi = "dbppp";
|
||||
m->m_xml = "diffbotPageProcessPattern";
|
||||
m->m_off = (char *)&cr.m_diffbotPageProcessPattern - x;
|
||||
@ -8507,6 +8533,22 @@ void Parms::init ( ) {
|
||||
m->m_def = "";
|
||||
m++;
|
||||
|
||||
m->m_cgi = "dbucre";
|
||||
m->m_xml = "diffbotUrlCrawlRegEx";
|
||||
m->m_off = (char *)&cr.m_diffbotUrlCrawlRegEx - x;
|
||||
m->m_type = TYPE_SAFEBUF;
|
||||
m->m_page = PAGE_NONE;
|
||||
m->m_def = "";
|
||||
m++;
|
||||
|
||||
m->m_cgi = "dbupre";
|
||||
m->m_xml = "diffbotUrlProcessRegEx";
|
||||
m->m_off = (char *)&cr.m_diffbotUrlProcessRegEx - x;
|
||||
m->m_type = TYPE_SAFEBUF;
|
||||
m->m_page = PAGE_NONE;
|
||||
m->m_def = "";
|
||||
m++;
|
||||
|
||||
m->m_cgi = "dbopn";
|
||||
m->m_xml = "diffbotOnlyProcessIfNew";
|
||||
m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNew - x;
|
||||
@ -13027,6 +13069,17 @@ void Parms::init ( ) {
|
||||
m->m_def = "";
|
||||
m++;
|
||||
|
||||
m->m_title = "harvest links";
|
||||
m->m_cgi = "hspl";
|
||||
m->m_xml = "harvestLinks";
|
||||
m->m_max = MAX_FILTERS;
|
||||
m->m_off = (char *)cr.m_harvestLinks - x;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "1";
|
||||
m->m_page = PAGE_FILTERS;
|
||||
m->m_rowid = 1;
|
||||
m++;
|
||||
|
||||
m->m_title = "spidering enabled";
|
||||
m->m_cgi = "cspe";
|
||||
m->m_xml = "spidersEnabled";
|
||||
@ -15116,18 +15169,19 @@ void Parms::init ( ) {
|
||||
m->m_sprpp = 0;
|
||||
m++;
|
||||
|
||||
/*
|
||||
m->m_title = "format of the returned search results";
|
||||
m->m_desc = "X is 0 to get back results in regular html, and 8 to "
|
||||
"get back results in XML.";
|
||||
m->m_desc = "X is 0 to get back results in regular html, 1 to "
|
||||
"get back results in XML, 2 for JSON.";
|
||||
m->m_def = "0";
|
||||
m->m_soff = (char *)&si.m_xml - y;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_soff = (char *)&si.m_formatStr - y;
|
||||
m->m_type = TYPE_STRING;//CHAR;
|
||||
m->m_sparm = 1;
|
||||
m->m_scgi = "xml";
|
||||
m->m_scgi = "format";
|
||||
m->m_smin = 0;
|
||||
m->m_smax = 12;
|
||||
m++;
|
||||
|
||||
*/
|
||||
|
||||
m->m_title = "highlight query terms in summaries.";
|
||||
m->m_desc = "Use to disable or enable "
|
||||
|
@ -3043,6 +3043,8 @@ void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
|
||||
ei->m_finalCallback ( ei->m_finalState );
|
||||
}
|
||||
|
||||
bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) ;
|
||||
|
||||
// . return false if would block, true otherwise
|
||||
// . used to send email and get a url when a crawl hits a maxToCrawl
|
||||
// or maxToProcess limitation.
|
||||
@ -3103,15 +3105,38 @@ bool sendNotification ( EmailInfo *ei ) {
|
||||
if ( url && url[0] ) {
|
||||
log("build: sending url notification to %s for coll \"%s\"",
|
||||
url,crawl);
|
||||
|
||||
Url uu; uu.set ( url );
|
||||
|
||||
SafeBuf fullReq;
|
||||
fullReq.safePrintf("POST %s HTTP/1.0\r\n"
|
||||
"User-Agent: Crawlbot/2.0\r\n"
|
||||
"Accept: */*\r\n"
|
||||
"Host: "
|
||||
, uu.getPath()
|
||||
);
|
||||
fullReq.safeMemcpy ( uu.getHost() , uu.getHostLen() );
|
||||
// make custom headers
|
||||
SafeBuf custom;
|
||||
custom.safePrintf ( "X-Crawl-Name: %s\r\n"
|
||||
fullReq.safePrintf ("X-Crawl-Name: %s\r\n"
|
||||
// last \r\n is added in HttpRequest.cpp
|
||||
"X-Crawl-Status: %s"// \r\n" // hdrs
|
||||
|
||||
"X-Crawl-Status: %s\r\n" // hdrs
|
||||
, cr->m_diffbotCrawlName.getBufStart()
|
||||
, ei->m_spiderStatusMsg.getBufStart()
|
||||
);
|
||||
// also in post body
|
||||
SafeBuf postContent;
|
||||
// the collection details
|
||||
printCrawlDetailsInJson ( postContent , cr );
|
||||
// content-length of it
|
||||
fullReq.safePrintf("Content-Length: %li\r\n",
|
||||
postContent.length());
|
||||
// type is json
|
||||
fullReq.safePrintf("Content-Type: application/json\r\n");
|
||||
fullReq.safePrintf("\r\n");
|
||||
// then the post content
|
||||
fullReq.safeMemcpy ( &postContent );
|
||||
fullReq.nullTerm();
|
||||
|
||||
// GET request
|
||||
if ( ! g_httpServer.getDoc ( url ,
|
||||
0 , // ip
|
||||
@ -3129,8 +3154,9 @@ bool sendNotification ( EmailInfo *ei ) {
|
||||
"HTTP/1.0", // proto
|
||||
true , // doPost
|
||||
NULL, // cookie
|
||||
custom.getBufStart(),
|
||||
NULL ) ) // fullRequest
|
||||
NULL , // custom hdrs
|
||||
fullReq.getBufStart() ,
|
||||
NULL ) )
|
||||
ei->m_notifyBlocked++;
|
||||
}
|
||||
|
||||
|
93
Posdb.cpp
93
Posdb.cpp
@ -712,6 +712,9 @@ bool PosdbTable::allocTopTree ( ) {
|
||||
// return false;
|
||||
|
||||
if ( m_r->m_getDocIdScoringInfo ) {
|
||||
|
||||
m_scoreInfoBuf.setLabel ("scinfobuf" );
|
||||
|
||||
// . for holding the scoring info
|
||||
// . add 1 for the \0 safeMemcpy() likes to put at the end so
|
||||
// it will not realloc on us
|
||||
@ -731,6 +734,10 @@ bool PosdbTable::allocTopTree ( ) {
|
||||
// compute. so this could easily get into the megabytes, most
|
||||
// of the time we will not need nearly that much however.
|
||||
numPairs *= xx;
|
||||
|
||||
m_pairScoreBuf.setLabel ( "pairbuf" );
|
||||
m_singleScoreBuf.setLabel ("snglbuf" );
|
||||
|
||||
// but alloc it just in case
|
||||
if ( ! m_pairScoreBuf.reserve (numPairs * sizeof(PairScore) ) )
|
||||
return false;
|
||||
@ -786,7 +793,7 @@ bool PosdbTable::allocTopTree ( ) {
|
||||
slots = 20000000;
|
||||
}
|
||||
// each site hash is 4 bytes
|
||||
if ( ! m_siteHashList.reserve ( slots ) )
|
||||
if ( ! m_siteHashList.reserve ( slots ,"shshbuf" ) )
|
||||
return false;
|
||||
// quad # of sites to have space in between
|
||||
if ( ! m_dt.set(4,0,slots,NULL,0,false,0,"pdtdt"))
|
||||
@ -1005,7 +1012,7 @@ void PosdbTable::evalSlidingWindow ( char **ptrs ,
|
||||
for ( long i = 0 ; i < maxi ; i++ ) {
|
||||
|
||||
// skip if to the left of a pipe operator
|
||||
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
|
||||
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
|
||||
|
||||
//if ( ptrs[i] ) wpi = ptrs[i];
|
||||
// if term does not occur in body, sub-in the best term
|
||||
@ -1027,7 +1034,7 @@ void PosdbTable::evalSlidingWindow ( char **ptrs ,
|
||||
for ( ; j < maxj ; j++ ) {
|
||||
|
||||
// skip if to the left of a pipe operator
|
||||
if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE) ) continue;
|
||||
if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
|
||||
|
||||
// TODO: use a cache using wpi/wpj as the key.
|
||||
//if ( ptrs[j] ) wpj = ptrs[j];
|
||||
@ -4097,6 +4104,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
|
||||
long nrg = 0;
|
||||
|
||||
// assume not sorting by a numeric termlist
|
||||
m_sortByTermNum = -1;
|
||||
|
||||
//for ( long i = 0 ; i < m_msg2->getNumLists() ; i++ ) {
|
||||
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
|
||||
QueryTerm *qt = &m_q->m_qterms[i];
|
||||
@ -4111,6 +4121,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
qti->m_qpos = wordNum;
|
||||
qti->m_wikiPhraseId = qw->m_wikiPhraseId;
|
||||
qti->m_quotedStartId = qw->m_quoteStart;
|
||||
// is it gbsortby:?
|
||||
if ( qt->m_fieldCode == FIELD_GBSORTBY ||
|
||||
qt->m_fieldCode == FIELD_GBREVSORTBY )
|
||||
m_sortByTermNum = i;
|
||||
// count
|
||||
long nn = 0;
|
||||
// also add in bigram lists
|
||||
@ -4226,6 +4240,18 @@ bool PosdbTable::setQueryTermInfo ( ) {
|
||||
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
|
||||
// is it a negative term?
|
||||
if ( qt->m_termSign=='-')qti->m_bigramFlags[nn]|=BF_NEGATIVE;
|
||||
|
||||
// numeric posdb termlist flags. instead of word position
|
||||
// they have a float stored there for sorting etc.
|
||||
if (qt->m_fieldCode == FIELD_GBSORTBY )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
if (qt->m_fieldCode == FIELD_GBREVSORTBY )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
if (qt->m_fieldCode == FIELD_GBNUMBERMIN )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
if (qt->m_fieldCode == FIELD_GBNUMBERMAX )
|
||||
qti->m_bigramFlags[nn]|=BF_NUMBER;
|
||||
|
||||
// only really add if useful
|
||||
// no, because when inserting NEW (related) terms that are
|
||||
// not currently in the document, this list may initially
|
||||
@ -4912,15 +4938,27 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
// sites right now. this hash table must have been pre-allocated
|
||||
// in Posdb::allocTopTree() above since we might be in a thread.
|
||||
//
|
||||
RdbList *whiteLists = m_msg2->m_whiteLists;
|
||||
long nw = m_msg2->m_w;
|
||||
RdbList *whiteLists = NULL;
|
||||
long nw = 0;
|
||||
if ( m_msg2 ) {
|
||||
whiteLists = m_msg2->m_whiteLists;
|
||||
nw = m_msg2->m_w;
|
||||
}
|
||||
for ( long i = 0 ; ! m_addedSites && i < nw ; i++ ) {
|
||||
RdbList *list = &whiteLists[i];
|
||||
if ( list->isEmpty() ) continue;
|
||||
// sanity test
|
||||
long long d1 = g_posdb.getDocId(list->getList());
|
||||
if ( d1 > m_msg2->m_docIdEnd ) { char *xx=NULL;*xx=0; }
|
||||
if ( d1 < m_msg2->m_docIdStart ) { char *xx=NULL;*xx=0; }
|
||||
if ( d1 > m_msg2->m_docIdEnd ) {
|
||||
log("posdb: d1=%lli > %lli",
|
||||
d1,m_msg2->m_docIdEnd);
|
||||
//char *xx=NULL;*xx=0;
|
||||
}
|
||||
if ( d1 < m_msg2->m_docIdStart ) {
|
||||
log("posdb: d1=%lli < %lli",
|
||||
d1,m_msg2->m_docIdStart);
|
||||
//char *xx=NULL;*xx=0;
|
||||
}
|
||||
// first key is always 18 bytes cuz it has the termid
|
||||
// scan recs in the list
|
||||
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
|
||||
@ -5049,6 +5087,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
QueryTermInfo *qti = &qip[i];
|
||||
// skip if negative query term
|
||||
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
|
||||
// skip if numeric field like gbsortby:price gbmin.price:1.23
|
||||
if ( qti->m_bigramFlags[0] & BF_NUMBER ) continue;
|
||||
// set it
|
||||
if ( qti->m_wikiPhraseId == 1 ) continue;
|
||||
// stop
|
||||
@ -5298,6 +5338,9 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
long nnn = m_numQueryTermInfos;
|
||||
if ( ! m_r->m_doMaxScoreAlgo ) nnn = 0;
|
||||
|
||||
// do not do it if we got a gbsortby: field
|
||||
if ( m_sortByTermNum >= 0 ) nnn = 0;
|
||||
|
||||
/*
|
||||
// skip all this if getting score of just one docid on special
|
||||
// posdb termlists that are 6-byte only keys
|
||||
@ -5584,6 +5627,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
|
||||
pass0++;
|
||||
|
||||
if ( m_sortByTermNum >= 0 ) goto skipScoringFilter;
|
||||
|
||||
// test why we are slow
|
||||
//if ( (s_sss++ % 8) != 0 ) { docIdPtr += 6; fail0++; goto docIdLoop;}
|
||||
|
||||
@ -5743,6 +5788,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
}
|
||||
}
|
||||
|
||||
skipScoringFilter:
|
||||
|
||||
pass++;
|
||||
|
||||
skipPreAdvance:
|
||||
@ -5770,7 +5817,12 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
// mini merge buf:
|
||||
mptr = mbuf;
|
||||
|
||||
// merge each set of sublists
|
||||
// . merge each set of sublists
|
||||
// . like we merge a term's list with its two associated bigram
|
||||
// lists, if there, the left bigram and right bigram list.
|
||||
// . and merge all the synonym lists for that term together as well.
|
||||
// so if the term is 'run' we merge it with the lists for
|
||||
// 'running' 'ran' etc.
|
||||
for ( long j = 0 ; j < m_numQueryTermInfos ; j++ ) {
|
||||
// get the query term info
|
||||
QueryTermInfo *qti = &qip[j];
|
||||
@ -6045,12 +6097,12 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
|
||||
// skip if not part of score
|
||||
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
|
||||
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
|
||||
|
||||
// and pair it with each other possible query term
|
||||
for ( long j = i+1 ; j < m_numQueryTermInfos ; j++ ) {
|
||||
// skip if not part of score
|
||||
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE) ) continue;
|
||||
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
|
||||
// but if they are in the same wikipedia phrase
|
||||
// then try to keep their positions as in the query.
|
||||
// so for 'time enough for love' ideally we want
|
||||
@ -6126,7 +6178,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
float sts;
|
||||
// skip if to the left of a pipe operator
|
||||
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
|
||||
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
|
||||
// sometimes there is no wordpos subtermlist for this docid
|
||||
// because it just has the bigram, like "streetlight" and not
|
||||
// the word "light" by itself for the query 'street light'
|
||||
@ -6218,7 +6270,7 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
//
|
||||
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
// skip if to the left of a pipe operator
|
||||
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
|
||||
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
|
||||
// skip wordposition until it in the body
|
||||
while ( xpos[i] &&!s_inBody[g_posdb.getHashGroup(xpos[i])]) {
|
||||
// advance
|
||||
@ -6269,7 +6321,9 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
minx = -1;
|
||||
for ( long x = 0 ; x < m_numQueryTermInfos ; x++ ) {
|
||||
// skip if to the left of a pipe operator
|
||||
if ( bflags[x] & (BF_PIPED|BF_NEGATIVE) ) continue;
|
||||
// and numeric posdb termlists do not have word positions,
|
||||
// they store a float there.
|
||||
if ( bflags[x] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
|
||||
if ( ! xpos[x] ) continue;
|
||||
if ( xpos[x] && minx == -1 ) {
|
||||
minx = x;
|
||||
@ -6298,7 +6352,8 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
long k;
|
||||
for ( k = 0 ; k < m_numQueryTermInfos ; k++ ) {
|
||||
// skip if to the left of a pipe operator
|
||||
if ( bflags[k] & (BF_PIPED|BF_NEGATIVE) ) continue;
|
||||
if ( bflags[k] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) )
|
||||
continue;
|
||||
if ( xpos[k] ) break;
|
||||
}
|
||||
// all lists are now exhausted
|
||||
@ -6337,12 +6392,12 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
|
||||
|
||||
// skip if to the left of a pipe operator
|
||||
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
|
||||
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
|
||||
|
||||
for ( long j = i+1 ; j < m_numQueryTermInfos ; j++ ) {
|
||||
|
||||
// skip if to the left of a pipe operator
|
||||
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE) ) continue;
|
||||
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
|
||||
|
||||
//
|
||||
// get score for term pair from non-body occuring terms
|
||||
@ -6404,6 +6459,12 @@ void PosdbTable::intersectLists10_r ( ) {
|
||||
m_r->m_language == docLang)
|
||||
score *= SAMELANGMULT;
|
||||
|
||||
//
|
||||
// if we have a gbsortby:price term then score exclusively on that
|
||||
//
|
||||
if ( m_sortByTermNum >= 0 )
|
||||
score = g_posdb.getFloat ( miniMergedList[m_sortByTermNum] );
|
||||
|
||||
// . seoDebug hack so we can set "dcs"
|
||||
// . we only come here if we actually made it into m_topTree
|
||||
if ( secondPass || m_r->m_seoDebug ) {
|
||||
|
21
Posdb.h
21
Posdb.h
@ -99,6 +99,7 @@ float getTermFreqWeight ( long long termFreq , long long numDocsInColl );
|
||||
#define BF_SYNONYM 0x04
|
||||
#define BF_NEGATIVE 0x08 // query word has a negative sign before it
|
||||
#define BF_BIGRAM 0x10 // query word has a negative sign before it
|
||||
#define BF_NUMBER 0x20 // is it like gbsortby:price? numeric?
|
||||
|
||||
void printTermList ( long i, char *list, long listSize ) ;
|
||||
|
||||
@ -197,6 +198,23 @@ class Posdb {
|
||||
if ( langId & 0x20 ) kp->n0 |= 0x08;
|
||||
}
|
||||
|
||||
// set the word position bits et al to this float
|
||||
void setFloat ( void *vkp , float f ) {
|
||||
*(float *)(((char *)vkp) + 2) = f; };
|
||||
|
||||
// and read the float as well
|
||||
float getFloat ( void *vkp ) {
|
||||
return *(float *)(((char *)vkp) + 2); };
|
||||
|
||||
void setAlignmentBit ( void *vkp , char val ) {
|
||||
char *p = (char *)vkp;
|
||||
if ( val ) p[1] = p[1] | 0x02;
|
||||
else p[1] = p[1] & 0xfd;
|
||||
};
|
||||
|
||||
bool isAlignmentBitClear ( void *vkp ) {
|
||||
return ( ( ((char *)vkp)[1] & 0x02 ) == 0x00 );
|
||||
};
|
||||
|
||||
void makeStartKey ( void *kp, long long termId ,
|
||||
long long docId=0LL){
|
||||
@ -427,7 +445,7 @@ class PosdbList : public RdbList {
|
||||
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
|
||||
|
||||
// max # search results that can be viewed without using TopTree
|
||||
#define MAX_RESULTS 1000
|
||||
//#define MAX_RESULTS 1000
|
||||
|
||||
class PosdbTable {
|
||||
|
||||
@ -575,6 +593,7 @@ class PosdbTable {
|
||||
|
||||
class Msg39Request *m_r;
|
||||
|
||||
long m_sortByTermNum;
|
||||
|
||||
// the new intersection/scoring algo
|
||||
void intersectLists10_r ( );
|
||||
|
@ -162,6 +162,11 @@ bool PostQueryRerank::set2 ( long resultsWanted ) {
|
||||
m_pageUrl = (Url *)mcalloc( sizeof(Url)*m_maxResultsToRerank,
|
||||
"pqrpageUrls" );
|
||||
|
||||
if ( ! m_pageUrl ) {
|
||||
log("pqr: had out of memory error");
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
|
10
Process.cpp
10
Process.cpp
@ -500,6 +500,11 @@ bool Process::isAnyTreeSaving ( ) {
|
||||
Rdb *rdb = m_rdbs[i];
|
||||
if ( rdb->m_isCollectionLess ) continue;
|
||||
if ( rdb->isSavingTree() ) return true;
|
||||
// we also just disable writing below in Process.cpp
|
||||
// while saving other files. so hafta check that as well
|
||||
// since we use isAnyTreeSaving() to determine if we can
|
||||
// write to the tree or not.
|
||||
if ( ! rdb->isWritable() ) return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
@ -1064,7 +1069,10 @@ void processSleepWrapper ( int fd , void *state ) {
|
||||
//if ( ! isClockInSync() && ! g_hostdb.m_myHost->m_isProxy ) return;
|
||||
|
||||
// get time the day started
|
||||
long now = getTimeLocal();//GlobalNoCore();
|
||||
long now;
|
||||
if ( g_hostdb.m_myHost->m_isProxy ) now = getTimeLocal();
|
||||
else now = getTimeGlobal();
|
||||
|
||||
// set this for the first time
|
||||
if ( g_process.m_lastSaveTime == 0 )
|
||||
g_process.m_lastSaveTime = now;
|
||||
|
64
Proxy.cpp
64
Proxy.cpp
@ -60,6 +60,7 @@ struct StateControl{
|
||||
HttpRequest m_hr;
|
||||
Host *m_forwardHost;
|
||||
float m_pending;
|
||||
bool m_isEventGuru;
|
||||
};
|
||||
|
||||
#define UIF_ADMIN 0x01
|
||||
@ -370,6 +371,8 @@ bool Proxy::handleRequest (TcpSocket *s){
|
||||
char *host = hr.getHost();
|
||||
char *hdom = host;
|
||||
if ( strncasecmp(hdom,"www.",4) == 0 ) hdom += 4;
|
||||
if ( strncasecmp(hdom,"www2.",5) == 0 ) hdom += 5;
|
||||
if ( strncasecmp(hdom,"www1.",5) == 0 ) hdom += 5;
|
||||
// auto redirect eventguru.com to www.eventguru.com so cookies
|
||||
// are consistent
|
||||
if ( ! redir &&
|
||||
@ -387,9 +390,19 @@ bool Proxy::handleRequest (TcpSocket *s){
|
||||
redirLen = gbstrlen(redir);
|
||||
}
|
||||
|
||||
bool isEventGuru = false;
|
||||
if ( strcasecmp(hdom,"eventguru.com") == 0 )
|
||||
isEventGuru = true;
|
||||
|
||||
#ifdef MATTWELLS
|
||||
#define HTTPS_REDIR 1
|
||||
#endif
|
||||
|
||||
|
||||
if ( redirLen > 0 && redir ) {
|
||||
//redirect:
|
||||
#ifdef HTTPS_REDIR
|
||||
redirect:
|
||||
#endif
|
||||
HttpMime m;
|
||||
m.makeRedirMime (redir,redirLen);
|
||||
// . move the reply to a send buffer
|
||||
@ -431,6 +444,10 @@ bool Proxy::handleRequest (TcpSocket *s){
|
||||
char *path = hr.getPath();
|
||||
//long pathLen = hr.getPathLen();
|
||||
|
||||
// serve events on the gigablast.com domain:
|
||||
if ( path && strncmp(path,"/events",7) == 0 )
|
||||
isEventGuru = true;
|
||||
|
||||
/*
|
||||
bool badPage = false;
|
||||
if ( n < 0 ) badPage = true;
|
||||
@ -502,6 +519,32 @@ bool Proxy::handleRequest (TcpSocket *s){
|
||||
if ( ! strncmp(path,"/?id=" ,5 ) ) handleIt = false;
|
||||
|
||||
|
||||
// log the request iff filename does not end in .gif .jpg .
|
||||
char *f = NULL;
|
||||
long flen = 0;
|
||||
if ( isEventGuru ) {
|
||||
f = hr.getFilename();
|
||||
flen = hr.getFilenameLen();
|
||||
}
|
||||
|
||||
// proxy will handle eventguru images i guess
|
||||
bool isGif = ( f && flen >= 4 && strncmp(&f[flen-4],".gif",4) == 0 );
|
||||
bool isJpg = ( f && flen >= 4 && strncmp(&f[flen-4],".jpg",4) == 0 );
|
||||
bool isBmp = ( f && flen >= 4 && strncmp(&f[flen-4],".bmp",4) == 0 );
|
||||
bool isPng = ( f && flen >= 4 && strncmp(&f[flen-4],".png",4) == 0 );
|
||||
bool isIco = ( f && flen >= 4 && strncmp(&f[flen-4],".ico",4) == 0 );
|
||||
bool isPic = (isGif | isJpg | isBmp | isPng || isIco);
|
||||
|
||||
// use event guru favicon?
|
||||
//if ( isEventGuru && isIco && strcmp(f,"favicon.ico") == 0 ) {
|
||||
// f = "eventguru_favicon.ico";
|
||||
// flen = gbstrlen(f);
|
||||
//}
|
||||
|
||||
// eventguru.com host: in mime?
|
||||
if ( isEventGuru && ! isPic )
|
||||
handleIt = false;
|
||||
|
||||
// only proxy holds the accounting info
|
||||
if ( ! strncmp ( path ,"/account", 8 ) ) {
|
||||
printRequest(s, &hr);
|
||||
@ -515,12 +558,14 @@ bool Proxy::handleRequest (TcpSocket *s){
|
||||
if ( tcp == &g_httpServer.m_ssltcp ) max = g_conf.m_httpsMaxSockets;
|
||||
else max = g_conf.m_httpMaxSockets;
|
||||
|
||||
#ifdef _HTTPS_REDIR_
|
||||
#ifdef HTTPS_REDIR
|
||||
// if hitting root page then tell them to go to https
|
||||
// if not autobanned... but if it is an autobanned request on root
|
||||
// page it should have go the turing test above!
|
||||
if ( n == PAGE_ROOT &&
|
||||
! g_isYippy &&
|
||||
// not event guru homepage
|
||||
! isEventGuru &&
|
||||
// if not already on https
|
||||
tcp != &g_httpServer.m_ssltcp &&
|
||||
// do not redirect http://www.gigablast.com/?c=dmoz3 (directory)!
|
||||
@ -1265,6 +1310,8 @@ bool Proxy::forwardRequest ( StateControl *stC ) {
|
||||
p[5] = '9';
|
||||
break;
|
||||
}
|
||||
// code is invalid if is not for an old client
|
||||
//if ( userId32b == 0 ) code = NULL;
|
||||
}
|
||||
|
||||
|
||||
@ -1665,7 +1712,7 @@ void Proxy::gotReplyPage ( void *state, UdpSlot *slot ) {
|
||||
|
||||
// do not print login bars in the xml!! do not print for ixquick
|
||||
// which gets results in html...
|
||||
if ( ! stC->m_raw && ! stC->m_ch )
|
||||
if ( ! stC->m_raw && ! stC->m_ch && ! stC->m_isEventGuru )
|
||||
newReply = storeLoginBar ( reply ,
|
||||
size , // transmit size
|
||||
size , // allocsize
|
||||
@ -5153,11 +5200,16 @@ void Proxy::printUsers ( SafeBuf *sb ) {
|
||||
// but if admin we should still have set our cookie
|
||||
// adminsessid to our current session id so we know we are
|
||||
// also the admin!
|
||||
sb->safePrintf("<td><a href=/account?login=%s&password=%s>"
|
||||
"%s</td>"
|
||||
sb->safePrintf("<td><nobr>%li. "
|
||||
"<a href=/account?login=%s&password=%s>"
|
||||
"%s</a></nobr></td>"
|
||||
,i
|
||||
,ui->m_login
|
||||
,ui->m_password
|
||||
,ui->m_login);
|
||||
,ui->m_login
|
||||
//,ui->m_userId32
|
||||
);
|
||||
}
|
||||
sb->safePrintf("</tr>\n");
|
||||
sb->safePrintf("</table>\n");
|
||||
}
|
||||
|
19
Query.cpp
19
Query.cpp
@ -2199,6 +2199,10 @@ bool Query::setQWords ( char boolFlag ,
|
||||
fieldCode == FIELD_IP ||
|
||||
fieldCode == FIELD_ISCLEAN ||
|
||||
fieldCode == FIELD_QUOTA ||
|
||||
fieldCode == FIELD_GBSORTBY ||
|
||||
fieldCode == FIELD_GBREVSORTBY ||
|
||||
fieldCode == FIELD_GBNUMBERMIN ||
|
||||
fieldCode == FIELD_GBNUMBERMAX ||
|
||||
fieldCode == FIELD_GBAD ) {
|
||||
// find first space -- that terminates the field value
|
||||
char *end =
|
||||
@ -2210,6 +2214,15 @@ bool Query::setQWords ( char boolFlag ,
|
||||
ignoreTilSpace = true;
|
||||
// the hash
|
||||
unsigned long long wid = hash64 ( w , wlen, 0LL );
|
||||
|
||||
// i've decided not to make
|
||||
// gbsortby:products.offerPrice case sensitive
|
||||
if ( fieldCode == FIELD_GBSORTBY ||
|
||||
fieldCode == FIELD_GBREVSORTBY ||
|
||||
fieldCode == FIELD_GBNUMBERMIN ||
|
||||
fieldCode == FIELD_GBNUMBERMAX )
|
||||
wid = hash64Lower_utf8 ( w , wlen , 0LL );
|
||||
|
||||
// should we have normalized before hashing?
|
||||
if ( fieldCode == FIELD_URL ||
|
||||
fieldCode == FIELD_LINK ||
|
||||
@ -3032,6 +3045,12 @@ struct QueryField g_fields[] = {
|
||||
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
|
||||
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""},
|
||||
{"gbcontenthash", FIELD_GBCONTENTHASH, false,""},
|
||||
{"gbsortby", FIELD_GBSORTBY, false,""},
|
||||
{"gbrevsortby", FIELD_GBREVSORTBY, false,""},
|
||||
|
||||
{"gbnumbermin", FIELD_GBNUMBERMIN, false,""},
|
||||
{"gbnumbermax", FIELD_GBNUMBERMAX, false,""},
|
||||
|
||||
{"gbcountry",FIELD_GBCOUNTRY,false,""},
|
||||
{"gbad",FIELD_GBAD,false,""},
|
||||
|
||||
|
6
Query.h
6
Query.h
@ -103,7 +103,11 @@ typedef unsigned long long qvec_t;
|
||||
#define FIELD_GBCSENUM 50
|
||||
#define FIELD_GBSECTIONHASH 51
|
||||
#define FIELD_GBDOCID 52
|
||||
#define FIELD_GBCONTENTHASH 53
|
||||
#define FIELD_GBCONTENTHASH 53 // for deduping at spider time
|
||||
#define FIELD_GBSORTBY 54 // i.e. sortby:price -> numeric termlist
|
||||
#define FIELD_GBREVSORTBY 55 // i.e. sortby:price -> low to high
|
||||
#define FIELD_GBNUMBERMIN 56
|
||||
#define FIELD_GBNUMBERMAX 57
|
||||
|
||||
#define FIELD_GBOTHER 92
|
||||
|
||||
|
@ -1,7 +1,7 @@
|
||||
open-source-search-engine
|
||||
=========================
|
||||
|
||||
An open source web and enterprise search engine. As can be seen http://www.gigablast.com/
|
||||
An open source web and enterprise search engine. As can be seen on http://www.gigablast.com/ .
|
||||
|
||||
RUNNING GIGABLAST
|
||||
-----------------
|
||||
|
111
Rdb.cpp
111
Rdb.cpp
@ -132,7 +132,7 @@ bool Rdb::init ( char *dir ,
|
||||
// sanity
|
||||
if ( ! dir ) { char *xx=NULL;*xx=0; }
|
||||
// this is the working dir, all collection repositiories are subdirs
|
||||
m_dir.set ( dir );
|
||||
//m_dir.set ( dir );
|
||||
// catdb, statsdb, accessdb, facebookdb, syncdb
|
||||
m_isCollectionLess = isCollectionLess;
|
||||
// save the dbname NULL terminated into m_dbname/m_dbnameLen
|
||||
@ -466,6 +466,11 @@ bool Rdb::updateToRebuildFiles ( Rdb *rdb2 , char *coll ) {
|
||||
// . if this rdb is collectionless we set m_collectionlessBase in addBase()
|
||||
bool Rdb::addColl ( char *coll ) {
|
||||
collnum_t collnum = g_collectiondb.getCollnum ( coll );
|
||||
return addColl2 ( collnum );
|
||||
}
|
||||
|
||||
bool Rdb::addColl2 ( collnum_t collnum ) {
|
||||
|
||||
// catdb,statsbaccessdb,facebookdb,syncdb
|
||||
if ( m_isCollectionLess )
|
||||
collnum = (collnum_t)0;
|
||||
@ -477,6 +482,12 @@ bool Rdb::addColl ( char *coll ) {
|
||||
"breech maximum number of collections, %lli.",
|
||||
m_dbname,collnum,maxColls);
|
||||
}
|
||||
|
||||
|
||||
CollectionRec *cr = g_collectiondb.m_recs[collnum];
|
||||
char *coll = NULL;
|
||||
if ( cr ) coll = cr->m_coll;
|
||||
|
||||
// . ensure no previous one exists
|
||||
// . well it will be there but will be uninitialized, m_rdb will b NULL
|
||||
RdbBase *base = getBase ( collnum );
|
||||
@ -506,8 +517,9 @@ bool Rdb::addColl ( char *coll ) {
|
||||
if(m_useTree) tree = &m_tree;
|
||||
else buckets = &m_buckets;
|
||||
|
||||
// init it
|
||||
if ( ! base->init ( m_dir.getDir() ,
|
||||
// . init it
|
||||
// . g_hostdb.m_dir should end in /
|
||||
if ( ! base->init ( g_hostdb.m_dir, // m_dir.getDir() ,
|
||||
m_dbname ,
|
||||
m_dedup ,
|
||||
m_fixedDataSize ,
|
||||
@ -527,15 +539,16 @@ bool Rdb::addColl ( char *coll ) {
|
||||
m_biasDiskPageCache ) ) {
|
||||
logf(LOG_INFO,"db: %s: Failed to initialize db for "
|
||||
"collection \"%s\".", m_dbname,coll);
|
||||
exit(-1);
|
||||
//exit(-1);
|
||||
return false;
|
||||
}
|
||||
|
||||
// . set CollectionRec::m_numPos/NegKeysInTree[rdbId]
|
||||
// . these counts are now stored in the CollectionRec and not
|
||||
// in RdbTree since the # of collections can be huge!
|
||||
CollectionRec *cr = g_collectiondb.m_recs[collnum];
|
||||
m_tree.setNumKeys ( cr );
|
||||
if ( m_useTree ) {
|
||||
m_tree.setNumKeys ( cr );
|
||||
}
|
||||
|
||||
//if ( (long)collnum >= m_numBases ) m_numBases = (long)collnum + 1;
|
||||
// Success
|
||||
@ -544,7 +557,7 @@ bool Rdb::addColl ( char *coll ) {
|
||||
|
||||
bool Rdb::resetColl ( collnum_t collnum , collnum_t newCollnum ) {
|
||||
|
||||
char *coll = g_collectiondb.m_recs[collnum]->m_coll;
|
||||
//char *coll = g_collectiondb.m_recs[collnum]->m_coll;
|
||||
|
||||
// remove these collnums from tree
|
||||
if(m_useTree) m_tree.delColl ( collnum );
|
||||
@ -552,11 +565,48 @@ bool Rdb::resetColl ( collnum_t collnum , collnum_t newCollnum ) {
|
||||
|
||||
// . close all files, set m_numFiles to 0 in RdbBase
|
||||
// . TODO: what about outstanding merge or dump operations?
|
||||
RdbBase *base = getBase ( collnum );
|
||||
base->reset( );
|
||||
// . it seems like we can't really recycle this too easily
|
||||
// because reset it not resetting filenames or directory name?
|
||||
// just nuke it and rebuild using addColl2()...
|
||||
RdbBase *oldBase = getBase ( collnum );
|
||||
mdelete (oldBase, sizeof(RdbBase), "Rdb Coll");
|
||||
delete (oldBase);
|
||||
|
||||
// update this as well
|
||||
base->m_collnum = newCollnum;
|
||||
//base->reset( );
|
||||
|
||||
// NULL it out...
|
||||
CollectionRec *oldcr = g_collectiondb.getRec(collnum);
|
||||
oldcr->m_bases[(unsigned char)m_rdbId] = NULL;
|
||||
char *coll = oldcr->m_coll;
|
||||
|
||||
char *msg = "deleted";
|
||||
|
||||
// if just resetting recycle base
|
||||
if ( collnum != newCollnum ) {
|
||||
addColl2 ( newCollnum );
|
||||
// make a new base now
|
||||
//RdbBase *newBase = mnew
|
||||
// new cr
|
||||
//CollectionRec *newcr = g_collectiondb.getRec(newCollnum);
|
||||
// update this as well
|
||||
//base->m_collnum = newCollnum;
|
||||
// and the array
|
||||
//newcr->m_bases[(unsigned char)m_rdbId] = base;
|
||||
msg = "moved";
|
||||
}
|
||||
|
||||
|
||||
log("rdb: %s base from collrec "
|
||||
"rdb=%s rdbid=%li coll=%s collnum=%li newcollnum=%li",
|
||||
msg,m_dbname,(long)m_rdbId,coll,(long)collnum,
|
||||
(long)newCollnum);
|
||||
|
||||
|
||||
// new dir. otherwise RdbDump will try to dump out the recs to
|
||||
// the old dir and it will end up coring
|
||||
//char tmp[1024];
|
||||
//sprintf(tmp , "%scoll.%s.%li",g_hostdb.m_dir,coll,(long)newCollnum );
|
||||
//m_dir.set ( tmp );
|
||||
|
||||
// move the files into trash
|
||||
// nuke it on disk
|
||||
@ -597,19 +647,6 @@ bool Rdb::delColl ( char *coll ) {
|
||||
// move all files to trash and clear the tree/buckets
|
||||
resetColl ( collnum , collnum );
|
||||
|
||||
mdelete (base, sizeof(RdbBase), "Rdb Coll");
|
||||
delete (base);
|
||||
//m_bases[collnum] = NULL;
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec(collnum);
|
||||
|
||||
// NULL it out...
|
||||
cr->m_bases[(unsigned char)m_rdbId] = NULL;
|
||||
|
||||
log("rdb: deleted base from collrec "
|
||||
"rdb=%s rdbid=%li coll=%s collnum=%li base=0x%lx",
|
||||
m_dbname,(long)m_rdbId,coll,(long)collnum,(long)base);
|
||||
|
||||
// remove these collnums from tree
|
||||
//if(m_useTree) m_tree.delColl ( collnum );
|
||||
//else m_buckets.delColl ( collnum );
|
||||
@ -921,7 +958,8 @@ bool Rdb::saveMaps ( bool useThread ) {
|
||||
// shut it down
|
||||
RdbBase *base = getBase(i);
|
||||
//if ( m_bases[i] ) m_bases[i]->closeMaps ( m_urgent );
|
||||
if ( base ) base->closeMaps ( m_urgent );
|
||||
//if ( base ) base->closeMaps ( m_urgent );
|
||||
if ( base ) base->saveMaps ( useThread );
|
||||
}
|
||||
return true;
|
||||
}
|
||||
@ -1242,6 +1280,7 @@ bool Rdb::gotTokenForDump ( ) {
|
||||
m_dumpCollnum = (collnum_t)-1;
|
||||
// clear this for dumpCollLoop()
|
||||
g_errno = 0;
|
||||
m_dumpErrno = 0;
|
||||
m_fn = -1000;
|
||||
// this returns false if blocked, which means we're ok, so we ret true
|
||||
if ( ! dumpCollLoop ( ) ) return true;
|
||||
@ -1414,9 +1453,16 @@ bool Rdb::dumpCollLoop ( ) {
|
||||
|
||||
// error?
|
||||
if ( g_errno ) {
|
||||
log("rdb: error dumping = %s",mstrerror(g_errno));
|
||||
log("rdb: error dumping = %s . coll deleted from under us?",
|
||||
mstrerror(g_errno));
|
||||
// shit, what to do here? this is causing our RdbMem
|
||||
// to get corrupted!
|
||||
// because if we end up continuing it calls doneDumping()
|
||||
// and updates RdbMem! maybe set a permanent error then!
|
||||
// and if that is there do not clear RdbMem!
|
||||
m_dumpErrno = g_errno;
|
||||
// for now core out
|
||||
char *xx=NULL;*xx=0;
|
||||
//char *xx=NULL;*xx=0;
|
||||
}
|
||||
|
||||
// loop back up since we did not block
|
||||
@ -1437,11 +1483,12 @@ void Rdb::doneDumping ( ) {
|
||||
// msg
|
||||
//log(LOG_INFO,"db: Done dumping %s to %s (#%li): %s.",
|
||||
// m_dbname,m_files[n]->getFilename(),n,mstrerror(g_errno));
|
||||
log(LOG_INFO,"db: Done dumping %s: %s.",m_dbname,mstrerror(g_errno));
|
||||
log(LOG_INFO,"db: Done dumping %s: %s.",m_dbname,
|
||||
mstrerror(m_dumpErrno));
|
||||
// give the token back so someone else can dump or merge
|
||||
//g_msg35.releaseToken();
|
||||
// free mem in the primary buffer
|
||||
if ( ! g_errno ) m_mem.freeDumpedMem();
|
||||
if ( ! m_dumpErrno ) m_mem.freeDumpedMem();
|
||||
// . tell RdbDump it is done
|
||||
// . we have to set this here otherwise RdbMem's memory ring buffer
|
||||
// will think the dumping is no longer going on and use the primary
|
||||
@ -2839,6 +2886,12 @@ void Rdb::enableWrites () {
|
||||
else m_buckets.enableWrites();
|
||||
}
|
||||
|
||||
bool Rdb::isWritable ( ) {
|
||||
if(m_useTree) return m_tree.m_isWritable;
|
||||
return m_buckets.m_isWritable;
|
||||
}
|
||||
|
||||
|
||||
bool Rdb::needsSave() {
|
||||
if(m_useTree) return m_tree.m_needsSave;
|
||||
else return m_buckets.needsSave();
|
||||
|
12
Rdb.h
12
Rdb.h
@ -10,7 +10,7 @@
|
||||
#include "RdbMem.h"
|
||||
#include "RdbCache.h"
|
||||
#include "RdbDump.h"
|
||||
#include "Dir.h"
|
||||
//#include "Dir.h"
|
||||
#include "RdbBuckets.h"
|
||||
|
||||
// . each Rdb instance has an ID
|
||||
@ -86,6 +86,7 @@ class Rdb {
|
||||
~Rdb ( );
|
||||
|
||||
bool addColl ( char *coll );
|
||||
bool addColl2 ( collnum_t collnum );
|
||||
bool delColl ( char *coll );
|
||||
|
||||
bool resetColl ( collnum_t collnum , collnum_t newCollnum ) ;
|
||||
@ -164,7 +165,8 @@ class Rdb {
|
||||
bool deleteRecord ( collnum_t collnum , char *key );
|
||||
|
||||
// get the directory name where this rdb stores it's files
|
||||
char *getDir ( ) { return m_dir.getDirname(); };
|
||||
//char *getDir ( ) { return m_dir.getDirname(); };
|
||||
char *getDir ( ) { return g_hostdb.m_dir; };
|
||||
char *getStripeDir ( ) { return g_conf.m_stripeDir; };
|
||||
|
||||
long getFixedDataSize ( ) { return m_fixedDataSize; };
|
||||
@ -185,7 +187,7 @@ class Rdb {
|
||||
|
||||
void disableWrites ();
|
||||
void enableWrites ();
|
||||
|
||||
bool isWritable ( ) ;
|
||||
|
||||
RdbBase *getBase ( collnum_t collnum ) ;
|
||||
long getNumBases ( ) { return g_collectiondb.m_numRecs; };
|
||||
@ -352,7 +354,7 @@ class Rdb {
|
||||
bool m_dedup;
|
||||
long m_fixedDataSize;
|
||||
|
||||
Dir m_dir;
|
||||
//Dir m_dir;
|
||||
char m_dbname [32];
|
||||
long m_dbnameLen;
|
||||
|
||||
@ -394,6 +396,8 @@ class Rdb {
|
||||
long m_numFilesToMerge ;
|
||||
long m_mergeStartFileNum ;
|
||||
|
||||
long m_dumpErrno;
|
||||
|
||||
// a dummy data string for deleting records when m_fixedDataSize > 0
|
||||
char *m_dummy;
|
||||
long m_dummySize ; // size of that dummy data
|
||||
|
22
RdbBase.cpp
22
RdbBase.cpp
@ -127,8 +127,15 @@ bool RdbBase::init ( char *dir ,
|
||||
// set all our contained classes
|
||||
//m_dir.set ( dir );
|
||||
// set all our contained classes
|
||||
// . "tmp" is bogus
|
||||
// . /home/mwells/github/coll.john-test1113.654coll.john-test1113.655
|
||||
char tmp[1024];
|
||||
sprintf ( tmp , "%scoll.%s.%li" , dir , coll , (long)collnum );
|
||||
|
||||
// debug
|
||||
log("base: adding new base for dir=%s coll=%s collnum=%li db=%s",
|
||||
dir,coll,(long)collnum,dbname);
|
||||
|
||||
// catdb is collection independent
|
||||
|
||||
// make a special subdir to store the map and data files in if
|
||||
@ -261,7 +268,8 @@ bool RdbBase::init ( char *dir ,
|
||||
// we can't merge more than MAX_RDB_FILES files at a time
|
||||
if ( minToMergeArg > MAX_RDB_FILES ) minToMergeArg = MAX_RDB_FILES;
|
||||
m_minToMergeArg = minToMergeArg;
|
||||
// set our m_files array
|
||||
// . set our m_files array
|
||||
// . m_dir is bogus causing this to fail
|
||||
if ( ! setFiles () ) return false;
|
||||
//long dataMem;
|
||||
// if we're in read only mode, don't bother with *ANY* trees
|
||||
@ -491,9 +499,11 @@ bool RdbBase::removeRebuildFromFilename ( BigFile *f ) {
|
||||
bool RdbBase::setFiles ( ) {
|
||||
// set our directory class
|
||||
if ( ! m_dir.open ( ) )
|
||||
// we are getting this from a bogus m_dir
|
||||
return log("db: Had error opening directory %s", getDir());
|
||||
// note it
|
||||
logf(LOG_INFO,"db: Loading files for %s.",m_dbname );
|
||||
logf(LOG_INFO,"db: Loading files for %s coll=%s (%li).",
|
||||
m_dbname,m_coll,(long)m_collnum );
|
||||
// . set our m_files array
|
||||
// . addFile() will return -1 and set g_errno on error
|
||||
// . the lower the fileId the older the data
|
||||
@ -600,6 +610,8 @@ bool RdbBase::setFiles ( ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
m_dir.close();
|
||||
|
||||
if ( ! converting ) return true;
|
||||
|
||||
// now if we are converting old titledb names to new...
|
||||
@ -723,7 +735,6 @@ long RdbBase::addFile ( long id , bool isNew , long mergeNum , long id2 ,
|
||||
sprintf ( name , "%s%04li.map", m_dbname, id );
|
||||
m->set ( getDir() , name , m_fixedDataSize , m_useHalfKeys , m_ks ,
|
||||
m_pageSize );
|
||||
if ( ! isNew ) logf(LOG_INFO,"db: Adding %s.", name );
|
||||
if ( ! isNew && ! m->readMap ( f ) ) {
|
||||
// if out of memory, do not try to regen for that
|
||||
if ( g_errno == ENOMEM ) return -1;
|
||||
@ -759,6 +770,8 @@ long RdbBase::addFile ( long id , bool isNew , long mergeNum , long id2 ,
|
||||
g_statsdb.m_disabled = false;
|
||||
if ( ! status ) return log("db: Save failed.");
|
||||
}
|
||||
if ( ! isNew ) logf(LOG_INFO,"db: Added %s for collnum=%li pages=%li",
|
||||
name ,(long)m_collnum,m->getNumPages());
|
||||
// open this big data file for reading only
|
||||
if ( ! isNew ) {
|
||||
if ( mergeNum < 0 )
|
||||
@ -1603,7 +1616,8 @@ void RdbBase::gotTokenForMerge ( ) {
|
||||
return;
|
||||
}
|
||||
// make a log note
|
||||
log(LOG_INFO,"merge: Resuming killed merge for %s.",m_dbname);
|
||||
log(LOG_INFO,"merge: Resuming killed merge for %s coll=%s.",
|
||||
m_dbname,m_coll);
|
||||
// compute the total size of merged file
|
||||
mint = 0;
|
||||
long mm = 0;
|
||||
|
@ -416,6 +416,9 @@ bool RdbCache::getRecord ( collnum_t collnum ,
|
||||
if ( maxAge == 0 ) return false;
|
||||
// bail if no cache
|
||||
if ( m_numPtrsMax <= 0 ) return false;
|
||||
// if init() called failed because of oom...
|
||||
if ( ! m_ptrs )
|
||||
return log("cache: getRecord: failed because oom");
|
||||
// time it -- debug
|
||||
long long t = 0LL ;
|
||||
if ( g_conf.m_logTimingDb ) t = gettimeofdayInMillisecondsLocal();
|
||||
|
64
RdbTree.cpp
64
RdbTree.cpp
@ -32,6 +32,10 @@ RdbTree::RdbTree () {
|
||||
m_useProtection = false;
|
||||
m_pickRight = false;
|
||||
m_gettingList = 0;
|
||||
|
||||
// before resetting... we have to set this so clear() won't breach buffers
|
||||
m_rdbId = -1;
|
||||
|
||||
reset();
|
||||
}
|
||||
|
||||
@ -125,10 +129,6 @@ bool RdbTree::set ( long fixedDataSize ,
|
||||
// sanity
|
||||
if ( rdbId < -1 ) { char *xx=NULL;*xx=0; }
|
||||
if ( rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
|
||||
// is it a valid one
|
||||
m_isRealTree = true;
|
||||
if ( m_rdbId <= RDB_NONE ) m_isRealTree = false;
|
||||
if ( m_rdbId >= RDB_END ) m_isRealTree = false;
|
||||
// if its doledb, set it
|
||||
//if ( dbname && strcmp(dbname,"doledb") == 0 ) m_rdbId = RDB_DOLEDB;
|
||||
// adjust m_maxMem to virtual infinity if it was -1
|
||||
@ -151,7 +151,7 @@ bool RdbTree::set ( long fixedDataSize ,
|
||||
// initiate protection
|
||||
if ( m_useProtection ) protect();
|
||||
// allocate the nodes
|
||||
return growTree ( maxNumNodes );
|
||||
return growTree ( maxNumNodes , 0 );
|
||||
}
|
||||
|
||||
void RdbTree::reset ( ) {
|
||||
@ -273,11 +273,12 @@ long RdbTree::clear ( ) {
|
||||
// clear tree counts for all collections!
|
||||
long nc = g_collectiondb.m_numRecs;
|
||||
// BUT only if we are an Rdb::m_tree!!!
|
||||
if ( ! m_isRealTree ) nc = 0;
|
||||
if ( m_rdbId == -1 ) nc = 0;
|
||||
// otherwise, we overwrite stuff in CollectionRec we shouldn't
|
||||
for ( long i = 0 ; i < nc ; i++ ) {
|
||||
CollectionRec *cr = g_collectiondb.getRec(i);
|
||||
if ( ! cr ) continue;
|
||||
//if ( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
cr->m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
|
||||
cr->m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
|
||||
}
|
||||
@ -547,7 +548,8 @@ long RdbTree::addNode ( collnum_t collnum ,
|
||||
// collections using the same Rdb::m_tree!
|
||||
// crap, when fixing a tree this will segfault because
|
||||
// m_recs[collnum] is NULL.
|
||||
if ( m_isRealTree && g_collectiondb.m_recs[collnum] ) {
|
||||
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
|
||||
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
g_collectiondb.m_recs[collnum]->
|
||||
m_numNegKeysInTree[(unsigned char)m_rdbId] =0;
|
||||
g_collectiondb.m_recs[collnum]->
|
||||
@ -629,7 +631,8 @@ long RdbTree::addNode ( collnum_t collnum ,
|
||||
// collections using the same Rdb::m_tree!
|
||||
// crap, when fixing a tree this will segfault because
|
||||
// m_recs[collnum] is NULL.
|
||||
if ( m_isRealTree && g_collectiondb.m_recs[collnum] ) {
|
||||
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
|
||||
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
g_collectiondb.m_recs[collnum]->
|
||||
m_numNegKeysInTree[(unsigned char)m_rdbId]++;
|
||||
}
|
||||
@ -639,7 +642,8 @@ long RdbTree::addNode ( collnum_t collnum ,
|
||||
//m_numPosKeysPerColl[collnum]++;
|
||||
// crap, when fixing a tree this will segfault because
|
||||
// m_recs[collnum] is NULL.
|
||||
if ( m_isRealTree && g_collectiondb.m_recs[collnum] ) {
|
||||
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
|
||||
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
g_collectiondb.m_recs[collnum]->
|
||||
m_numPosKeysInTree[(unsigned char)m_rdbId]++;
|
||||
}
|
||||
@ -834,14 +838,14 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
|
||||
if ( KEYNEG(m_keys,i,m_ks) ) {
|
||||
m_numNegativeKeys--;
|
||||
//m_numNegKeysPerColl[m_collnums[i]]--;
|
||||
if ( m_isRealTree )
|
||||
if ( m_rdbId >= 0 )
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
|
||||
}
|
||||
else {
|
||||
m_numPositiveKeys--;
|
||||
//m_numPosKeysPerColl[m_collnums[i]]--;
|
||||
if ( m_isRealTree )
|
||||
if ( m_rdbId >= 0 )
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
|
||||
}
|
||||
@ -868,7 +872,8 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
|
||||
m_numPositiveKeys = 0;
|
||||
//m_numNegKeysPerColl[m_collnums[i]] = 0;
|
||||
//m_numPosKeysPerColl[m_collnums[i]] = 0;
|
||||
if ( m_isRealTree ) {
|
||||
if ( m_rdbId >= 0 ) {
|
||||
//if ( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
@ -937,16 +942,20 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
|
||||
if ( KEYNEG(m_keys,i,m_ks) ) {
|
||||
m_numNegativeKeys--;
|
||||
//m_numNegKeysPerColl[m_collnums[i]]--;
|
||||
if ( m_isRealTree )
|
||||
if ( m_rdbId >= 0 ) {
|
||||
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
m_numNegKeysInTree[(unsigned char)m_rdbId]--;
|
||||
}
|
||||
}
|
||||
else {
|
||||
m_numPositiveKeys--;
|
||||
//m_numPosKeysPerColl[m_collnums[i]]--;
|
||||
if ( m_isRealTree )
|
||||
if ( m_rdbId >= 0 ) {
|
||||
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
|
||||
g_collectiondb.m_recs[m_collnums[i]]->
|
||||
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
|
||||
}
|
||||
}
|
||||
// debug step -- check chain from iparent down making sure that
|
||||
// all kids don't have -2 for their parent... seems to be a rare bug
|
||||
@ -1310,7 +1319,7 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
|
||||
|
||||
// . grow tree to "n" nodes
|
||||
// . this will now actually grow from a current size to a new one
|
||||
bool RdbTree::growTree ( long nn ) {
|
||||
bool RdbTree::growTree ( long nn , long niceness ) {
|
||||
// if we're that size, bail
|
||||
if ( m_numNodes == nn ) return true;
|
||||
|
||||
@ -1337,27 +1346,35 @@ bool RdbTree::growTree ( long nn ) {
|
||||
long cs = sizeof(collnum_t);
|
||||
cp =(collnum_t *)mrealloc (m_collnums, on*cs,nn*cs,m_allocName);
|
||||
if ( ! cp ) goto error;
|
||||
QUICKPOLL(niceness);
|
||||
kp = (char *) mrealloc ( m_keys , on*k , nn*k , m_allocName );
|
||||
if ( ! kp ) goto error;
|
||||
QUICKPOLL(niceness);
|
||||
lp = (long *) mrealloc ( m_left , on*4 , nn*4 , m_allocName );
|
||||
if ( ! lp ) goto error;
|
||||
QUICKPOLL(niceness);
|
||||
rp = (long *) mrealloc ( m_right , on*4 , nn*4 , m_allocName );
|
||||
if ( ! rp ) goto error;
|
||||
QUICKPOLL(niceness);
|
||||
pp = (long *) mrealloc ( m_parents , on*4 , nn*4 , m_allocName );
|
||||
if ( ! pp ) goto error;
|
||||
QUICKPOLL(niceness);
|
||||
|
||||
// deal with data, sizes and depth arrays on a basis of need
|
||||
if ( m_fixedDataSize != 0 ) {
|
||||
dp =(char **)mrealloc (m_data , on*d,nn*d,m_allocName);
|
||||
if ( ! dp ) goto error;
|
||||
QUICKPOLL(niceness);
|
||||
}
|
||||
if ( m_fixedDataSize == -1 ) {
|
||||
sp =(long *)mrealloc (m_sizes , on*4,nn*4,m_allocName);
|
||||
if ( ! sp ) goto error;
|
||||
QUICKPOLL(niceness);
|
||||
}
|
||||
if ( m_doBalancing ) {
|
||||
tp =(char *)mrealloc (m_depth , on ,nn ,m_allocName);
|
||||
if ( ! tp ) goto error;
|
||||
QUICKPOLL(niceness);
|
||||
}
|
||||
|
||||
// re-assign
|
||||
@ -1385,6 +1402,7 @@ bool RdbTree::growTree ( long nn ) {
|
||||
|
||||
// protect it from writes
|
||||
if ( m_useProtection ) protect ( );
|
||||
QUICKPOLL(niceness);
|
||||
return true;
|
||||
|
||||
error:
|
||||
@ -1399,41 +1417,49 @@ bool RdbTree::growTree ( long nn ) {
|
||||
ss = (collnum_t *)mrealloc ( cp , nn*cs , on*cs , m_allocName);
|
||||
if ( ! ss ) { char *xx = NULL; *xx = 0; }
|
||||
m_collnums = ss;
|
||||
QUICKPOLL(niceness);
|
||||
}
|
||||
if ( kp ) {
|
||||
kk = (char *)mrealloc ( kp, nn*k, on*k, m_allocName );
|
||||
if ( ! kk ) { char *xx = NULL; *xx = 0; }
|
||||
m_keys = kk;
|
||||
QUICKPOLL(niceness);
|
||||
}
|
||||
if ( lp ) {
|
||||
x = (long *)mrealloc ( lp , nn*4 , on*4 , m_allocName );
|
||||
if ( ! x ) { char *xx = NULL; *xx = 0; }
|
||||
m_left = x;
|
||||
QUICKPOLL(niceness);
|
||||
}
|
||||
if ( rp ) {
|
||||
x = (long *)mrealloc ( rp , nn*4 , on*4 , m_allocName );
|
||||
if ( ! x ) { char *xx = NULL; *xx = 0; }
|
||||
m_right = x;
|
||||
QUICKPOLL(niceness);
|
||||
}
|
||||
if ( pp ) {
|
||||
x = (long *)mrealloc ( pp , nn*4 , on*4 , m_allocName );
|
||||
if ( ! x ) { char *xx = NULL; *xx = 0; }
|
||||
m_parents = x;
|
||||
QUICKPOLL(niceness);
|
||||
}
|
||||
if ( dp && m_fixedDataSize != 0 ) {
|
||||
p = (char **)mrealloc ( dp , nn*d , on*d , m_allocName );
|
||||
if ( ! p ) { char *xx = NULL; *xx = 0; }
|
||||
m_data = p;
|
||||
QUICKPOLL(niceness);
|
||||
}
|
||||
if ( sp && m_fixedDataSize == -1 ) {
|
||||
x = (long *)mrealloc ( sp , nn*4 , on*4 , m_allocName );
|
||||
if ( ! x ) { char *xx = NULL; *xx = 0; }
|
||||
m_sizes = x;
|
||||
QUICKPOLL(niceness);
|
||||
}
|
||||
if ( tp && m_doBalancing ) {
|
||||
s = (char *)mrealloc ( tp , nn , on , m_allocName );
|
||||
if ( ! s ) { char *xx = NULL; *xx = 0; }
|
||||
m_depth = s;
|
||||
QUICKPOLL(niceness);
|
||||
}
|
||||
|
||||
return log("db: Failed to grow tree for %s from %li to %li bytes: %s.",
|
||||
@ -2612,7 +2638,7 @@ bool RdbTree::fastLoad ( BigFile *f , RdbMem *stack ) {
|
||||
if ( m_numNodes < minUnusedNode ) {
|
||||
log(LOG_INIT,
|
||||
"db: Growing tree to make room for %s",f->getFilename());
|
||||
if ( ! growTree ( minUnusedNode ) ) {
|
||||
if ( ! growTree ( minUnusedNode , 0 ) ) {
|
||||
f->close();
|
||||
m_isLoading = false;
|
||||
return log("db: Failed to grow tree: %s.",
|
||||
@ -3050,14 +3076,14 @@ void RdbTree::cleanTree ( ) { // char **bases ) {
|
||||
}
|
||||
|
||||
long RdbTree::getNumNegativeKeys ( collnum_t collnum ) {
|
||||
if ( ! m_isRealTree ) { char *xx=NULL;*xx=0; }
|
||||
if ( m_rdbId < 0 ) { char *xx=NULL;*xx=0; }
|
||||
CollectionRec *cr = g_collectiondb.m_recs[collnum];
|
||||
if ( ! cr ) return 0;
|
||||
return cr->m_numNegKeysInTree[(unsigned char)m_rdbId];
|
||||
}
|
||||
|
||||
long RdbTree::getNumPositiveKeys ( collnum_t collnum ) {
|
||||
if ( ! m_isRealTree ) { char *xx=NULL;*xx=0; }
|
||||
if ( m_rdbId < 0 ) { char *xx=NULL;*xx=0; }
|
||||
CollectionRec *cr = g_collectiondb.m_recs[collnum];
|
||||
if ( ! cr ) return 0;
|
||||
return cr->m_numPosKeysInTree[(unsigned char)m_rdbId];
|
||||
@ -3067,6 +3093,8 @@ void RdbTree::setNumKeys ( CollectionRec *cr ) {
|
||||
|
||||
if ( ! cr ) return;
|
||||
|
||||
if ( ((unsigned char)m_rdbId) >= RDB_END ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
collnum_t collnum = cr->m_collnum;
|
||||
cr->m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
|
||||
cr->m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
|
||||
|
@ -360,7 +360,7 @@ class RdbTree {
|
||||
// need to pass this file to the fastSave() thread
|
||||
//BigFile *m_saveFile;
|
||||
char m_rdbId;
|
||||
char m_isRealTree;
|
||||
//char m_isRealTree;
|
||||
char m_dir[128];
|
||||
char m_dbname[32];
|
||||
char m_memTag[16];
|
||||
@ -401,7 +401,7 @@ class RdbTree {
|
||||
// . returns true if tree doesn't need to grow/shrink
|
||||
// . re-allocs the m_keys,m_data,m_sizes,m_leftNodes,m_rightNodes
|
||||
// . used for growing AND shrinking the table
|
||||
bool growTree ( long newNumNodes );
|
||||
bool growTree ( long newNumNodes , long niceness );
|
||||
|
||||
// are we responsible for freeing nodes' data
|
||||
bool m_ownData;
|
||||
|
97
SafeBuf.cpp
97
SafeBuf.cpp
@ -22,11 +22,12 @@
|
||||
// 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
|
||||
// };
|
||||
|
||||
SafeBuf::SafeBuf(long initSize) {
|
||||
SafeBuf::SafeBuf(long initSize, char *label ) {
|
||||
if(initSize <= 0) initSize = 1;
|
||||
m_capacity = initSize;
|
||||
m_length = 0;
|
||||
m_buf = (char*)mrealloc(NULL, 0, m_capacity, "SafeBuf");
|
||||
m_label = label;
|
||||
m_buf = (char*)mrealloc(NULL, 0, m_capacity, m_label );
|
||||
if(!m_buf) m_capacity = 0;
|
||||
m_usingStack = false;
|
||||
m_encoding = csUTF8;
|
||||
@ -39,6 +40,11 @@ SafeBuf::SafeBuf() {
|
||||
m_buf = NULL;
|
||||
m_usingStack = false;
|
||||
m_encoding = csUTF8;
|
||||
m_label = NULL;
|
||||
}
|
||||
|
||||
void SafeBuf::setLabel ( char *label ) {
|
||||
m_label = label;
|
||||
}
|
||||
|
||||
SafeBuf::SafeBuf(char* stackBuf, long cap) {
|
||||
@ -47,6 +53,7 @@ SafeBuf::SafeBuf(char* stackBuf, long cap) {
|
||||
m_buf = stackBuf;
|
||||
m_length = 0;
|
||||
m_encoding = csUTF8;
|
||||
m_label = NULL;
|
||||
}
|
||||
|
||||
SafeBuf::SafeBuf(char *heapBuf, long bufMax, long bytesInUse, bool ownData) {
|
||||
@ -292,8 +299,14 @@ bool SafeBuf::advance ( long i ) {
|
||||
return true;
|
||||
}
|
||||
|
||||
bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
|
||||
if ( ! label ) label = "SafeBuf";
|
||||
bool SafeBuf::reserve(long i , char *label, bool clearIt ) {
|
||||
|
||||
// if we don't already have a label and they provided one, use it
|
||||
if ( ! m_label ) {
|
||||
if ( label ) m_label = label;
|
||||
else m_label = "SafeBuf";
|
||||
}
|
||||
|
||||
if(m_length + i > m_capacity) {
|
||||
char *tmpBuf = m_buf;
|
||||
long tmpCap = m_capacity;
|
||||
@ -301,7 +314,7 @@ bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
|
||||
m_buf = NULL;
|
||||
m_capacity += i;
|
||||
//if(m_capacity < 8) m_capacity = 8;
|
||||
m_buf = (char*)mrealloc(m_buf, 0, m_capacity, label);
|
||||
m_buf = (char*)mrealloc(m_buf, 0, m_capacity,m_label);
|
||||
if(!m_buf) {
|
||||
m_buf = tmpBuf;
|
||||
m_capacity = tmpCap;
|
||||
@ -320,7 +333,7 @@ bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
|
||||
}
|
||||
m_capacity += i;
|
||||
//if(m_capacity < 8) m_capacity = 8;
|
||||
m_buf = (char*)mrealloc(m_buf, tmpCap, m_capacity,label);
|
||||
m_buf = (char*)mrealloc(m_buf, tmpCap, m_capacity,m_label);
|
||||
if(!m_buf) {
|
||||
m_buf = tmpBuf;
|
||||
m_capacity = tmpCap;
|
||||
@ -344,11 +357,11 @@ bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
|
||||
|
||||
//reserve this many bytes, if we need to alloc, we double the
|
||||
//buffer size.
|
||||
bool SafeBuf::reserve2x(long i) {
|
||||
bool SafeBuf::reserve2x(long i, char *label) {
|
||||
//watch out for overflow!
|
||||
if((m_capacity << 1) + i < 0) return false;
|
||||
if(i + m_length >= m_capacity)
|
||||
return reserve(m_capacity + i);
|
||||
return reserve(m_capacity + i,label);
|
||||
else return true;
|
||||
}
|
||||
|
||||
@ -369,8 +382,8 @@ long SafeBuf::dumpToFile(char *filename ) {
|
||||
filename);
|
||||
return -1;
|
||||
}
|
||||
logf(LOG_DEBUG, "test: safebuf %li bytes written to %s",m_length,
|
||||
filename);
|
||||
//logf(LOG_DEBUG, "test: safebuf %li bytes written to %s",m_length,
|
||||
// filename);
|
||||
retry23:
|
||||
long bytes = write(fd, (char*)m_buf, m_length) ;
|
||||
if ( bytes != m_length ) {
|
||||
@ -972,7 +985,8 @@ bool SafeBuf::htmlEncode(char *s, long len, bool encodePoundSign ,
|
||||
// . sanity check
|
||||
if ( m_encoding == csUTF16 ) { char *xx = NULL; *xx = 0; }
|
||||
// alloc some space if we need to. add a byte for NULL termination.
|
||||
if(m_length+len+1>=m_capacity && !reserve(m_capacity+len))return false;
|
||||
if(m_length+len+1>=m_capacity && !reserve(m_capacity+len+1))
|
||||
return false;
|
||||
// tmp vars
|
||||
char *t = m_buf + m_length;
|
||||
char *tend = m_buf + m_capacity;
|
||||
@ -2517,7 +2531,11 @@ bool SafeBuf::decodeJSON ( long niceness ) {
|
||||
// . this is used by xmldoc.cpp to PARTIALLY decode a json buf so we do not
|
||||
// index letters in escapes like \n \r \f \t \uxxxx \\ \/
|
||||
// . SO we do keep \"
|
||||
bool SafeBuf::safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness) {
|
||||
// . so when indexing a doc we set decodeAll to FALSE, but if you want to
|
||||
// decode quotation marks as well then set decodeAll to TRUE!
|
||||
bool SafeBuf::safeDecodeJSONToUtf8 ( char *json,
|
||||
long jsonLen,
|
||||
long niceness ) {
|
||||
|
||||
// how much space to reserve for the copy?
|
||||
long need = jsonLen;
|
||||
@ -2579,6 +2597,15 @@ bool SafeBuf::safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness) {
|
||||
src += 2;
|
||||
continue;
|
||||
}
|
||||
// we do not decode quotation marks when indexing
|
||||
// the doc so we can preserve json names/value pair
|
||||
// information for indexing purposes. however,
|
||||
// Title.cpp DOES want to decode quotations.
|
||||
if ( src[1] == '\"' ) { // && decodeAll ) {
|
||||
*dst++ = '\"';
|
||||
src += 2;
|
||||
continue;
|
||||
}
|
||||
// utf8? if not, just skip the slash
|
||||
if ( src[1] != 'u' ) {
|
||||
// no, keep the slash so if we have /"
|
||||
@ -3155,3 +3182,49 @@ bool SafeBuf::htmlDecode ( char *src,
|
||||
// good to go
|
||||
return true;
|
||||
}
|
||||
|
||||
void SafeBuf::replaceChar ( char src , char dst ) {
|
||||
char *px = m_buf;
|
||||
char *pxEnd = m_buf + m_length;
|
||||
for ( ; px < pxEnd ; px++ ) if ( *px == src ) *px = dst;
|
||||
}
|
||||
|
||||
|
||||
// encode a double quote char to two double quote chars
|
||||
bool SafeBuf::csvEncode ( char *s , long len , long niceness ) {
|
||||
|
||||
if ( ! s ) return true;
|
||||
|
||||
// assume all chars are double quotes and will have to be encoded
|
||||
long need = len * 2 + 1;
|
||||
if ( ! reserve ( need ) ) return false;
|
||||
|
||||
// tmp vars
|
||||
char *dst = m_buf + m_length;
|
||||
//char *dstEnd = m_buf + m_capacity;
|
||||
|
||||
// scan through all
|
||||
char *send = s + len;
|
||||
for ( ; s < send ; s++ ) {
|
||||
// breathe
|
||||
QUICKPOLL ( niceness );
|
||||
// convert it?
|
||||
if ( *s == '\"' ) {
|
||||
*dst++ = '\"';
|
||||
*dst++ = '\"';
|
||||
continue;
|
||||
}
|
||||
//if ( *s == '\\' ) {
|
||||
// *dst++ = '\\';
|
||||
// *dst++ = '\\';
|
||||
// continue;
|
||||
//}
|
||||
*dst++ = *s;
|
||||
}
|
||||
|
||||
m_length += dst - (m_buf + m_length);
|
||||
|
||||
nullTerm();
|
||||
|
||||
return true;
|
||||
}
|
||||
|
13
SafeBuf.h
13
SafeBuf.h
@ -8,12 +8,14 @@
|
||||
struct SafeBuf {
|
||||
//*TRUCTORS
|
||||
SafeBuf();
|
||||
SafeBuf(long initSize);
|
||||
SafeBuf(long initSize, char *label = NULL);
|
||||
//be careful with passing in a stackBuf! it could go out
|
||||
//of scope independently of the safebuf.
|
||||
SafeBuf(char* stackBuf, long cap);
|
||||
SafeBuf(char *heapBuf, long bufMax, long bytesInUse, bool ownData);
|
||||
~SafeBuf();
|
||||
|
||||
void setLabel ( char *label );
|
||||
|
||||
// CAUTION: BE CAREFUL WHEN USING THE FOLLOWING TWO FUNCTIONS!!
|
||||
// setBuf() allows you reset the contents of the SafeBuf to either
|
||||
@ -59,6 +61,7 @@ struct SafeBuf {
|
||||
bool convertJSONtoXML ( long niceness , long startConvertPos );
|
||||
|
||||
bool safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness);
|
||||
// bool decodeAll = false );
|
||||
|
||||
bool decodeJSONToUtf8 ( long niceness );
|
||||
bool decodeJSON ( long niceness );
|
||||
@ -96,6 +99,9 @@ struct SafeBuf {
|
||||
bool safeStrcpy ( char *s ) ;
|
||||
bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
|
||||
bool safeUtf8ToJSON ( char *utf8 ) ;
|
||||
|
||||
bool csvEncode ( char *s , long len , long niceness = 0 );
|
||||
|
||||
//bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
|
||||
bool cat(SafeBuf& c);
|
||||
// . only cat the sections/tag that start with "tagFilter"
|
||||
@ -106,10 +112,11 @@ struct SafeBuf {
|
||||
void reset() { m_length = 0; }
|
||||
void purge(); // Clear all data and free all allocated memory
|
||||
bool advance ( long i ) ;
|
||||
|
||||
// . if clearIt is true we init the new buffer space to zeroes
|
||||
// . used by Collectiondb.cpp
|
||||
bool reserve(long i, char *label=NULL , bool clearIt = false );
|
||||
bool reserve2x(long i);
|
||||
bool reserve2x(long i, char *label = NULL );
|
||||
|
||||
char *makeSpace ( long size ) {
|
||||
if ( ! reserve ( size ) ) return NULL;
|
||||
@ -143,6 +150,7 @@ struct SafeBuf {
|
||||
char *t , long tlen ,
|
||||
long niceness ,
|
||||
long startOff = 0 );
|
||||
void replaceChar ( char src , char dst );
|
||||
bool copyToken(char* s);;
|
||||
//output encoding
|
||||
bool setEncoding(short cs);
|
||||
@ -326,6 +334,7 @@ struct SafeBuf {
|
||||
long m_capacity;
|
||||
long m_length;
|
||||
char *m_buf;
|
||||
char *m_label;
|
||||
bool m_usingStack;
|
||||
short m_encoding; // output charset
|
||||
|
||||
|
@ -342,7 +342,7 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
|
||||
|
||||
// we need to get some cgi values in order to correct the defaults
|
||||
// based on if we're doing an xml feed, have a site: query, etc.
|
||||
long xml = r->getLong ( "xml" , 0 ); // was "raw"
|
||||
//long xml = r->getLong ( "xml" , 0 ); // was "raw"
|
||||
long siteLen = 0; r->getString ("site",&siteLen);
|
||||
long sitesLen = 0;
|
||||
char *sites = r->getString ("sites",&sitesLen,NULL);
|
||||
@ -353,8 +353,11 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
|
||||
! m_whiteListBuf.nullTerm() ) )
|
||||
return log("query: unable to strcpy whitelist");
|
||||
|
||||
|
||||
char format = getFormatFromRequest ( r );
|
||||
|
||||
// now override automatic defaults for special cases
|
||||
if ( xml > 0 ) {
|
||||
if ( format != FORMAT_HTML ) {
|
||||
m_familyFilter = 0;
|
||||
// this is causing me a headache when on when i dont know it
|
||||
m_restrictIndexdbForQuery = false;
|
||||
@ -365,6 +368,8 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
|
||||
m_spellCheck = 0;
|
||||
m_refs_numToGenerate = 0;
|
||||
m_refs_docsToScan = 0;
|
||||
// default scoring info to off
|
||||
m_getDocIdScoringInfo = false;
|
||||
}
|
||||
else if ( m_siteLen > 0 ) {
|
||||
m_restrictIndexdbForQuery = false;
|
||||
@ -654,18 +659,19 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
|
||||
// use "&dg=1" to debug gigabits
|
||||
m_debugGigabits = r->getLong("dg",0);
|
||||
|
||||
// override
|
||||
m_format = format;
|
||||
|
||||
// . omit scoring info from the xml feed for now
|
||||
// . we have to roll this out to gk144 net i think
|
||||
if ( xml > 0 )
|
||||
m_getDocIdScoringInfo = 0;
|
||||
//if ( m_format != FORMAT_HTML )
|
||||
// m_getDocIdScoringInfo = 0;
|
||||
|
||||
// turn off by default!
|
||||
if ( ! r->getLong("gigabits",0) ) {
|
||||
m_numTopicGroups = 0;
|
||||
}
|
||||
|
||||
|
||||
//////////////////////////////////////
|
||||
//
|
||||
// transform input into classes
|
||||
@ -709,7 +715,8 @@ m if (! cr->hasSearchPermission ( sock, encapIp ) ) {
|
||||
|
||||
// . returns false and sets g_errno on error
|
||||
// . sets m_qbuf1 and m_qbuf2
|
||||
if ( ! setQueryBuffers ( r ) ) return false;
|
||||
if ( ! setQueryBuffers (r) )
|
||||
return log("query: setQueryBuffers: %s",mstrerror(g_errno));
|
||||
|
||||
/* --- Virtual host language detection --- */
|
||||
if(r->getHost()) {
|
||||
@ -1089,10 +1096,11 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
|
||||
// if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;}
|
||||
// if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;}
|
||||
//}
|
||||
|
||||
|
||||
// append plus terms
|
||||
if ( m_plusLen > 0 ) {
|
||||
char *s = m_plus, *send = m_plus + m_plusLen;
|
||||
char *s = m_plus;
|
||||
char *send = m_plus + m_plusLen;
|
||||
//if ( p > pstart && p < pend ) *p++ = ' ';
|
||||
//if ( p2 > pstart2 && p2 < pend2) *p2++ = ' ';
|
||||
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
|
||||
@ -1108,7 +1116,7 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
|
||||
} else {
|
||||
while (!isspace(*s2) && s2 < send) s2++;
|
||||
}
|
||||
if (s < send) break;
|
||||
if (s2 < send) break;
|
||||
//if (p < pend) *p++ = '+';
|
||||
//if (p2 < pend2) *p2++ = '+';
|
||||
m_sbuf1.pushChar('+');
|
||||
@ -1142,7 +1150,8 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
|
||||
}
|
||||
// append minus terms
|
||||
if ( m_minusLen > 0 ) {
|
||||
char *s = m_minus, *send = m_minus + m_minusLen;
|
||||
char *s = m_minus;
|
||||
char *send = m_minus + m_minusLen;
|
||||
//if ( p > pstart && p < pend ) *p++ = ' ';
|
||||
//if ( p2 > pstart2 && p2 < pend2) *p2++ = ' ';
|
||||
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
|
||||
@ -1158,7 +1167,7 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
|
||||
} else {
|
||||
while (!isspace(*s2) && s2 < send) s2++;
|
||||
}
|
||||
if (s < send) break;
|
||||
if (s2 < send) break;
|
||||
//if (p < pend) *p++ = '-';
|
||||
//if (p2 < pend2) *p2++ = '-';
|
||||
m_sbuf1.pushChar('-');
|
||||
@ -1202,9 +1211,9 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
|
||||
}
|
||||
|
||||
// null terms
|
||||
m_sbuf1.pushChar('\0');
|
||||
m_sbuf2.pushChar('\0');
|
||||
m_sbuf3.pushChar('\0');
|
||||
if ( ! m_sbuf1.pushChar('\0') ) return false;
|
||||
if ( ! m_sbuf2.pushChar('\0') ) return false;
|
||||
if ( ! m_sbuf3.pushChar('\0') ) return false;
|
||||
|
||||
// the natural query
|
||||
m_displayQuery = m_sbuf2.getBufStart();// + displayQueryOffset;
|
||||
@ -1239,6 +1248,7 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
|
||||
long dcatId = -1;
|
||||
// get the final query
|
||||
char *q =m_sbuf1.getBufStart();
|
||||
|
||||
if ( q ) sscanf(q,"gbpcatid:%li",&pcatId);
|
||||
if ( q ) sscanf(q,"gbcatid:%li",&dcatId);
|
||||
// pick the one that is valid
|
||||
@ -1301,3 +1311,33 @@ uint8_t SearchInput::detectQueryLanguage(void) {
|
||||
|
||||
return(lang);
|
||||
}
|
||||
|
||||
|
||||
char getFormatFromRequest ( HttpRequest *r ) {
|
||||
char format = FORMAT_HTML;
|
||||
|
||||
// what format should search results be in? default is html
|
||||
char *formatStr = r->getString("format", NULL );
|
||||
|
||||
if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
|
||||
if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
|
||||
if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
|
||||
if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
|
||||
|
||||
|
||||
// support old api &xml=1 to mean &format=1
|
||||
if ( r->getLong("xml",0) ) {
|
||||
format = FORMAT_XML;
|
||||
}
|
||||
|
||||
// also support &json=1
|
||||
if ( r->getLong("json",0) ) {
|
||||
format = FORMAT_JSON;
|
||||
}
|
||||
|
||||
if ( r->getLong("csv",0) ) {
|
||||
format = FORMAT_CSV;
|
||||
}
|
||||
|
||||
return format;
|
||||
}
|
||||
|
@ -22,6 +22,8 @@
|
||||
|
||||
#define MAX_TOPIC_GROUPS 10
|
||||
|
||||
char getFormatFromRequest ( class HttpRequest *r ) ;
|
||||
|
||||
// . parameters used to generate a set of related topics (gigabits)
|
||||
// . you can have Msg24 generate multiple sets of related topics in one call
|
||||
class TopicGroup {
|
||||
@ -43,6 +45,11 @@ class TopicGroup {
|
||||
long m_topicMaxPunctLen;
|
||||
};
|
||||
|
||||
#define FORMAT_HTML 0
|
||||
#define FORMAT_XML 1
|
||||
#define FORMAT_JSON 2
|
||||
#define FORMAT_CSV 3
|
||||
|
||||
class SearchInput {
|
||||
|
||||
public:
|
||||
@ -211,7 +218,13 @@ class SearchInput {
|
||||
|
||||
// tier sizes can change with different "raw" values, therefore,
|
||||
// so can search results
|
||||
long m_xml; // msg40
|
||||
//long m_xml; // msg40
|
||||
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON
|
||||
//long m_formatStrLen;
|
||||
//char *m_formatStr;
|
||||
|
||||
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON, 3=csv
|
||||
char m_format;
|
||||
|
||||
// this should be part of the key because it will affect the results!
|
||||
char m_queryExpansion;
|
||||
|
@ -252,6 +252,8 @@ bool Sections::set ( Words *w ,
|
||||
// breathe
|
||||
QUICKPOLL(m_niceness);
|
||||
|
||||
m_sectionPtrBuf.setLabel("psectbuf");
|
||||
|
||||
// separate buf now for section ptr for each word
|
||||
if ( ! m_sectionPtrBuf.reserve ( nw *4 ) ) return true;
|
||||
m_sectionPtrs = (Section **)m_sectionPtrBuf.getBufStart();
|
||||
@ -260,6 +262,8 @@ bool Sections::set ( Words *w ,
|
||||
// allocate m_sectionBuf
|
||||
m_sections = NULL;
|
||||
|
||||
m_sectionBuf.setLabel ( "sectbuf" );
|
||||
|
||||
if ( ! m_sectionBuf.reserve ( need ) )
|
||||
return true;
|
||||
|
||||
@ -15160,6 +15164,9 @@ bool Sections::print2 ( SafeBuf *sbuf ,
|
||||
|
||||
// save ptrs
|
||||
m_sbuf = sbuf;
|
||||
|
||||
m_sbuf->setLabel ("sectprnt");
|
||||
|
||||
//m_pt = pt;
|
||||
//m_et = et;
|
||||
//m_at = at;
|
||||
|
@ -1000,6 +1000,8 @@ bool Speller::loadUnifiedDict() {
|
||||
|
||||
bool needRebuild = false;
|
||||
|
||||
m_unifiedBuf.setLabel("unibuf");
|
||||
|
||||
// this MUST be there
|
||||
if ( m_unifiedBuf.fillFromFile(g_hostdb.m_dir,
|
||||
"unifiedDict-buf.txt" ) == 0 )
|
||||
|
610
Spider.cpp
610
Spider.cpp
File diff suppressed because it is too large
Load Diff
21
Spider.h
21
Spider.h
@ -45,6 +45,9 @@
|
||||
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
|
||||
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
|
||||
|
||||
bool testPatterns ( ) ;
|
||||
bool doesStringContainPattern ( char *content , char *pattern ) ;
|
||||
|
||||
bool getSpiderStatusMsg ( class CollectionRec *cx ,
|
||||
class SafeBuf *msg ,
|
||||
long *status ) ;
|
||||
@ -603,6 +606,8 @@ class SpiderRequest {
|
||||
long m_hasContactInfoValid :1;
|
||||
long m_isContactyValid :1;
|
||||
long m_hasAddressValid :1;
|
||||
//long m_matchesUrlCrawlPattern :1;
|
||||
//long m_matchesUrlProcessPattern:1;
|
||||
long m_hasTODValid :1;
|
||||
long m_hasSiteVenueValid :1;
|
||||
long m_siteNumInlinksValid :1;
|
||||
@ -832,8 +837,8 @@ class SpiderReply {
|
||||
// was the request an injection request
|
||||
long m_fromInjectionRequest :1;
|
||||
// did we TRY to send it to the diffbot backend filter? might be err?
|
||||
long m_sentToDiffbot:1;
|
||||
long m_reserved2 :1;
|
||||
long m_sentToDiffbot :1;
|
||||
long m_hadDiffbotError :1;
|
||||
long m_reserved3 :1;
|
||||
long m_reserved4 :1;
|
||||
|
||||
@ -1111,6 +1116,7 @@ class SpiderColl {
|
||||
key_t m_waitingTreeKey;
|
||||
bool m_waitingTreeKeyValid;
|
||||
long m_scanningIp;
|
||||
bool m_gotNewRequestsForScanningIp;
|
||||
|
||||
// start key for reading doledb
|
||||
key_t m_msg5StartKey;
|
||||
@ -1125,7 +1131,7 @@ class SpiderColl {
|
||||
|
||||
// for reading lists from spiderdb
|
||||
Msg5 m_msg5;
|
||||
bool m_gettingList;
|
||||
bool m_gettingList1;
|
||||
|
||||
// how many outstanding spiders a priority has
|
||||
long m_outstandingSpiders[MAX_SPIDER_PRIORITIES];
|
||||
@ -1276,7 +1282,7 @@ class SpiderLoop {
|
||||
|
||||
bool printLockTable ( );
|
||||
|
||||
long getNumSpidersOutPerIp ( long firstIp ) ;
|
||||
long getNumSpidersOutPerIp ( long firstIp , collnum_t collnum ) ;
|
||||
|
||||
// free all XmlDocs and m_list
|
||||
void reset();
|
||||
@ -1301,7 +1307,7 @@ class SpiderLoop {
|
||||
// . returns true and sets g_errno on error
|
||||
bool spiderUrl9 ( class SpiderRequest *sreq ,
|
||||
key_t *doledbKey ,
|
||||
char *coll ,
|
||||
collnum_t collnum,//char *coll ,
|
||||
long sameIpWaitTime , // in milliseconds
|
||||
long maxSpidersOutPerIp );
|
||||
|
||||
@ -1312,7 +1318,8 @@ class SpiderLoop {
|
||||
// state memory for calling SpiderUrl2() (maybe also getLocks()!)
|
||||
SpiderRequest *m_sreq;
|
||||
|
||||
char *m_coll;
|
||||
//char *m_coll;
|
||||
collnum_t m_collnum;
|
||||
char *m_content;
|
||||
long m_contentLen;
|
||||
char m_contentHasMime;
|
||||
@ -1354,7 +1361,7 @@ class SpiderLoop {
|
||||
class SpiderColl *m_sc;
|
||||
|
||||
// used to avoid calling getRec() twice!
|
||||
bool m_gettingList;
|
||||
//bool m_gettingList0;
|
||||
|
||||
long m_outstanding1;
|
||||
bool m_gettingDoledbList;
|
||||
|
@ -499,7 +499,7 @@ void drawLine2 ( SafeBuf &sb ,
|
||||
sb.safePrintf("<div style=\"position:absolute;"
|
||||
"left:%li;"
|
||||
"top:%li;"
|
||||
"background-color:#%lx;"
|
||||
"background-color:#%06lx;"
|
||||
"z-index:-5;"
|
||||
"min-height:%lipx;"
|
||||
"min-width:%lipx;\"></div>\n"
|
||||
|
2
Stats.h
2
Stats.h
@ -25,7 +25,7 @@ class StatPoint {
|
||||
|
||||
#define MAX_POINTS 6000
|
||||
#define MAX_WIDTH 6
|
||||
#define DY 600 // pixels vertical
|
||||
#define DY 1000 // pixels vertical
|
||||
#define DX 1000 // pixels across
|
||||
#define DT (20*1000) // time window, 20 seconds
|
||||
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line
|
||||
|
54
Statsdb.cpp
54
Statsdb.cpp
@ -526,16 +526,16 @@ bool Statsdb::makeGIF ( long t1Arg ,
|
||||
|
||||
#define MAX_POINTS 6000
|
||||
#define MAX_WIDTH 6
|
||||
#define DY 600 // pixels vertical
|
||||
#define DX 1000 // pixels across
|
||||
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line
|
||||
#define DY2 600 // pixels vertical
|
||||
#define DX2 1000 // pixels across
|
||||
#define MAX_LINES2 (DY2 / (MAX_WIDTH+1)) // leave free pixel above each line
|
||||
|
||||
long Statsdb::getImgHeight() {
|
||||
return (long)DY + m_by * 2;
|
||||
return (long)DY2 + m_by * 2;
|
||||
}
|
||||
|
||||
long Statsdb::getImgWidth() {
|
||||
return (long)DX + m_bx * 2;
|
||||
return (long)DX2 + m_bx * 2;
|
||||
}
|
||||
|
||||
// these are used for storing the "events"
|
||||
@ -599,7 +599,7 @@ bool Statsdb::gifLoop ( ) {
|
||||
// gif size
|
||||
//char tmp[64];
|
||||
// dimensions of the gif
|
||||
//sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 );
|
||||
//sprintf ( tmp , "%lix%li", (long)DX2+m_bx*2 , (long)DY2+m_by*2 );
|
||||
//GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp );
|
||||
// create one
|
||||
//GIFPlotter plotter ( NULL , m_fd , NULL );
|
||||
@ -607,7 +607,7 @@ bool Statsdb::gifLoop ( ) {
|
||||
//plotter.openpl ( );
|
||||
|
||||
// define the space with boundaries 100 unit wide boundaries
|
||||
//plotter.space ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 );
|
||||
//plotter.space ( 0 , 0 , DX2 + m_bx * 2 , DY2 + m_by * 2 );
|
||||
|
||||
// line thickness in user coordinates (pixels for us)
|
||||
//plotter.linewidth ( 1 );
|
||||
@ -628,7 +628,7 @@ bool Statsdb::gifLoop ( ) {
|
||||
"z-index:-10;"
|
||||
// the tick marks we print below are based on it
|
||||
// being a window of the last 20 seconds... and using
|
||||
// DX pixels
|
||||
// DX2 pixels
|
||||
"min-width:%lipx;"
|
||||
"min-height:%lipx;"
|
||||
//"width:100%%;"
|
||||
@ -637,15 +637,15 @@ bool Statsdb::gifLoop ( ) {
|
||||
"margin-bottom:10px;"
|
||||
"margin-right:10px;"
|
||||
"margin-left:10px;\">"
|
||||
,(long)DX + 2 *m_bx
|
||||
,(long)DY + 2*m_by);
|
||||
,(long)DX2 + 2 *m_bx
|
||||
,(long)DY2 + 2*m_by);
|
||||
|
||||
|
||||
// draw the x-axis
|
||||
//plotter.line ( m_bx , m_by , DX + m_bx , m_by );
|
||||
//plotter.line ( m_bx , m_by , DX2 + m_bx , m_by );
|
||||
|
||||
// 10 x-axis tick marks
|
||||
for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
|
||||
for ( int x = DX2/20 ; x <= DX2 ; x += DX2/20 ) {
|
||||
// tick mark
|
||||
//plotter.line ( x , -20 , x , 20 );
|
||||
m_gw.safePrintf("<div style=\"position:absolute;"
|
||||
@ -657,7 +657,7 @@ bool Statsdb::gifLoop ( ) {
|
||||
"min-width:3px;\"></div>\n"
|
||||
, m_bx + (long)x-1
|
||||
);
|
||||
long xv = (long)(dt * (long long)x/(long long)DX)-(long)dt;
|
||||
long xv = (long)(dt * (long long)x/(long long)DX2)-(long)dt;
|
||||
// LABEL
|
||||
m_gw.safePrintf("<div style=\"position:absolute;"
|
||||
"left:%li;"
|
||||
@ -780,8 +780,8 @@ bool Statsdb::gifLoop ( ) {
|
||||
// ensure at least 3 units wide for visibility
|
||||
if ( x2 < x1 + 10 ) x2 = x1 + 10;
|
||||
// . flip the y so we don't have to scroll the browser down
|
||||
// . DY does not include the axis and tick marks
|
||||
//long fy1 = DY - y1 + m_by ;
|
||||
// . DY2 does not include the axis and tick marks
|
||||
//long fy1 = DY2 - y1 + m_by ;
|
||||
// plot it
|
||||
//plotter.line ( x1 , fy1 , x2 , fy1 );
|
||||
drawLine3 ( m_gw , x1 , x2 , y1 , c1 , pp->m_thickness );
|
||||
@ -918,7 +918,7 @@ char *Statsdb::plotGraph ( char *pstart ,
|
||||
|
||||
// . the minimum difference between ymax and ymin is minDiff.
|
||||
// . this prevents us from zooming in too close!
|
||||
float minDiff = (float)DY * label->m_minRes ;
|
||||
float minDiff = (float)DY2 * label->m_minRes ;
|
||||
// we are already scaled!
|
||||
float ourDiff = (ymax - ymin) ;
|
||||
|
||||
@ -976,14 +976,14 @@ char *Statsdb::plotGraph ( char *pstart ,
|
||||
float y1 = lasty;
|
||||
|
||||
// normalize y into pixel space
|
||||
y2 = ((float)DY * (y2 - ymin)) / (ymax-ymin);
|
||||
y2 = ((float)DY2 * (y2 - ymin)) / (ymax-ymin);
|
||||
|
||||
// set lasts for next iteration of this loop
|
||||
lastx = x2;
|
||||
lasty = y2;
|
||||
|
||||
// . flip the y so we don't have to scroll the browser down
|
||||
// . DY does not include the axis and tick marks
|
||||
// . DY2 does not include the axis and tick marks
|
||||
// . do not flip y any more for statsdb graphs
|
||||
long fy1 = (long)(y1+.5);// + m_by ;
|
||||
long fy2 = (long)(y2+.5);// + m_by ;
|
||||
@ -1011,7 +1011,7 @@ char *Statsdb::plotGraph ( char *pstart ,
|
||||
|
||||
// plot it
|
||||
// BUT only iff not more than 5 seconds difference
|
||||
//float secondsPerPixel = (m_t2-m_t1)/(float)DX;
|
||||
//float secondsPerPixel = (m_t2-m_t1)/(float)DX2;
|
||||
|
||||
// avoid this for now. mdw oct 14 2013.
|
||||
//float dt = (x2 - x1) * secondsPerPixel;
|
||||
@ -1068,7 +1068,7 @@ void Statsdb::drawHR ( float z ,
|
||||
long color ) {
|
||||
|
||||
// convert into yspace
|
||||
float z2 = ((float)DY * (float)(z - ymin)) /(float)(ymax-ymin);
|
||||
float z2 = ((float)DY2 * (float)(z - ymin)) /(float)(ymax-ymin);
|
||||
// avoid collisions with other graphs
|
||||
z2 += zoff;
|
||||
// border
|
||||
@ -1076,7 +1076,7 @@ void Statsdb::drawHR ( float z ,
|
||||
// round off error
|
||||
z2 += 0.5;
|
||||
// for adjusatmnet
|
||||
float ptsPerPixel = (ymax-ymin)/ (float)DY;
|
||||
float ptsPerPixel = (ymax-ymin)/ (float)DY2;
|
||||
// make an adjustment to the label then! -- Commented out because it's currently not used.
|
||||
float zadj = zoff * ptsPerPixel;
|
||||
|
||||
@ -1088,9 +1088,9 @@ void Statsdb::drawHR ( float z ,
|
||||
// ((color >> 0) & 0xff) << 8 );
|
||||
|
||||
// horizontal line
|
||||
//plotter->line ( m_bx, (long)z2 , DX + m_bx, (long)z2 );
|
||||
//plotter->line ( m_bx, (long)z2 , DX2 + m_bx, (long)z2 );
|
||||
long width = 1;
|
||||
drawLine3 ( m_gw, 0, DX , (long)z2,color, width);
|
||||
drawLine3 ( m_gw, 0, DX2 , (long)z2,color, width);
|
||||
|
||||
|
||||
// make label
|
||||
@ -1364,7 +1364,7 @@ bool Statsdb::addPoint ( long x ,
|
||||
class StatState *ss ) {
|
||||
|
||||
// convert x into pixel position
|
||||
float xf = (float)DX * (float)(x - m_t1) / (float)(m_t2 - m_t1);
|
||||
float xf = (float)DX2 * (float)(x - m_t1) / (float)(m_t2 - m_t1);
|
||||
// round it to nearest pixel
|
||||
long x2 = (long)(xf + .5) ;//+ m_bx;
|
||||
// make this our y pos
|
||||
@ -1446,12 +1446,12 @@ bool Statsdb::addEventPoint ( long t1 ,
|
||||
long thickness ) {
|
||||
|
||||
// convert t1 into pixel position
|
||||
float af = (float)DX * (float)(t1 - m_t1) / (float)(m_t2 - m_t1);
|
||||
float af = (float)DX2 * (float)(t1 - m_t1) / (float)(m_t2 - m_t1);
|
||||
// round it to nearest pixel
|
||||
long a = (long)(af + .5) ;//+ m_bx;
|
||||
|
||||
// convert t2 into pixel position
|
||||
//float bf = (float)DX * (float)(t2 - m_t1) / (float)(m_t2 - m_t1);
|
||||
//float bf = (float)DX2 * (float)(t2 - m_t1) / (float)(m_t2 - m_t1);
|
||||
// round it to nearest pixel
|
||||
//long b = (long)(bf + .5) + m_bx;
|
||||
//if ( a > b ) { char *xx=NULL;*xx=0; }
|
||||
@ -1468,7 +1468,7 @@ bool Statsdb::addEventPoint ( long t1 ,
|
||||
}
|
||||
|
||||
// go down each line of points
|
||||
for ( long i = 0 ; i < MAX_LINES ; i++ ) {
|
||||
for ( long i = 0 ; i < MAX_LINES2 ; i++ ) {
|
||||
// breathe
|
||||
QUICKPOLL ( m_niceness );
|
||||
// . is there room for us in this line?
|
||||
|
@ -429,6 +429,8 @@ char *getSourceString ( char source ) {
|
||||
if ( source == SOURCE_BIGRAM ) return "bigram";
|
||||
if ( source == SOURCE_TRIGRAM ) return "trigram";
|
||||
if ( source == SOURCE_WIKTIONARY_EN ) return "wiktionary-en";
|
||||
// the thing we are hashing is a "number"
|
||||
if ( source == SOURCE_NUMBER ) return "number";
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
|
@ -15,6 +15,7 @@
|
||||
#define SOURCE_GENERATED 4
|
||||
#define SOURCE_BIGRAM 5
|
||||
#define SOURCE_TRIGRAM 6
|
||||
#define SOURCE_NUMBER 7
|
||||
|
||||
// per word!
|
||||
#define MAX_SYNS 64
|
||||
|
@ -2513,6 +2513,13 @@ bool Msg8a::getTagRec ( Url *url ,
|
||||
TagRec *tagRec ,
|
||||
bool doInheritance ,
|
||||
char rdbId ) {
|
||||
|
||||
|
||||
CollectionRec *cr = g_collectiondb.getRec ( coll );
|
||||
if ( ! cr ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
return true;
|
||||
}
|
||||
|
||||
// reset tag rec
|
||||
tagRec->reset();//m_numListPtrs = 0;
|
||||
|
@ -780,7 +780,10 @@ TcpSocket *TcpServer::getNewSocket ( ) {
|
||||
log("tcp: using statically linked libc that only supports "
|
||||
"an fd of up to %li, but got an fd = %li. fd_set is "
|
||||
"only geared for 1024 bits of file descriptors for "
|
||||
"doing poll() in Loop.cpp",
|
||||
"doing poll() in Loop.cpp. Ensure 'ulimit -a' limits "
|
||||
"open files to 1024. "
|
||||
"Check open fds using ls /proc/<gb-pid>/fds/ and ensure "
|
||||
"they are all BELOW 1024.",
|
||||
(long)MAX_NUM_FDS,(long)sd);
|
||||
char *xx=NULL;*xx=0;
|
||||
}
|
||||
@ -1092,7 +1095,7 @@ bool TcpServer::closeLeastUsed ( long maxIdleTime ) {
|
||||
// . g_errno will be set by Loop if there was a kinda socket reset error
|
||||
void readSocketWrapper ( int sd , void *state ) {
|
||||
// debug msg
|
||||
// log("........... TcpServer::readSocketWrapper\n");
|
||||
//log("........... TcpServer::readSocketWrapper\n");
|
||||
// extract our this ptr
|
||||
TcpServer *THIS = (TcpServer *)state;
|
||||
// get a TcpSocket from sd
|
||||
@ -1239,8 +1242,13 @@ long TcpServer::readSocket ( TcpSocket *s ) {
|
||||
|
||||
// do the read
|
||||
int n;
|
||||
if (m_useSSL)
|
||||
n = SSL_read ( s->m_ssl, s->m_readBuf + s->m_readOffset, avail );
|
||||
if (m_useSSL) {
|
||||
//long long now1 = gettimeofdayInMilliseconds();
|
||||
n = SSL_read(s->m_ssl, s->m_readBuf + s->m_readOffset, avail );
|
||||
//long long now2 = gettimeofdayInMilliseconds();
|
||||
//long long took = now2 - now1 ;
|
||||
//if ( took >= 2 ) log("tcp: ssl_read took %llims", took);
|
||||
}
|
||||
else
|
||||
n = ::read ( s->m_sd, s->m_readBuf + s->m_readOffset, avail );
|
||||
|
||||
@ -1483,8 +1491,13 @@ long TcpServer::writeSocket ( TcpSocket *s ) {
|
||||
// send this piece
|
||||
int n;
|
||||
retry10:
|
||||
if (m_useSSL)
|
||||
if (m_useSSL) {
|
||||
//long long now1 = gettimeofdayInMilliseconds();
|
||||
n = SSL_write ( s->m_ssl, msg + s->m_sendOffset, toSend );
|
||||
//long long now2 = gettimeofdayInMilliseconds();
|
||||
//long long took = now2 - now1 ;
|
||||
//if ( took >= 2 ) log("tcp: ssl_write took %llims", took);
|
||||
}
|
||||
else
|
||||
n = ::send ( s->m_sd , msg + s->m_sendOffset , toSend , 0 );
|
||||
// cancel harmless errors, return -1 on severe ones
|
||||
@ -1626,8 +1639,12 @@ connected:
|
||||
int r;
|
||||
s->m_ssl = SSL_new(m_ctx);
|
||||
SSL_set_fd(s->m_ssl, s->m_sd);
|
||||
//long long now1 = gettimeofdayInMilliseconds();
|
||||
SSL_set_connect_state(s->m_ssl);
|
||||
r = SSL_connect(s->m_ssl);
|
||||
//long long now2 = gettimeofdayInMilliseconds();
|
||||
//long long took = now2 - now1 ;
|
||||
//if ( took >= 2 ) log("tcp: ssl_connect took %llims", took);
|
||||
if (!s->m_ssl) {
|
||||
log("ssl: SSL is NULL after connect.");
|
||||
char *xx = NULL; *xx = 0;
|
||||
@ -2092,9 +2109,19 @@ bool TcpServer::sslAccept ( TcpSocket *s ) {
|
||||
}
|
||||
|
||||
//log("ssl: SSL_accept %li",newsd);
|
||||
long long now1 = gettimeofdayInMilliseconds();
|
||||
retry19:
|
||||
// javier put this in here, but it was not non-blocking!!!
|
||||
// . javier put this in here, but it was not non-blocking!!!
|
||||
// . it is non-blocking now, however, when it does block and
|
||||
// complete the accept it takes 10ms on sp1, a server from ~2009
|
||||
// using a custom build of the lastest libssl.a from about 2013.
|
||||
// . this accept needs to be put in a thread then, maybe multiple
|
||||
// threads
|
||||
int r = SSL_accept(s->m_ssl);
|
||||
long long now2 = gettimeofdayInMilliseconds();
|
||||
long long took = now2 - now1 ;
|
||||
if ( took >= 2 )
|
||||
log("tcp: ssl_accept %li took %llims", (long)newsd, took);
|
||||
// did it block?
|
||||
if ( r < 0 && errno == EINTR ) goto retry19;
|
||||
// copy errno to g_errno
|
||||
@ -2103,7 +2130,7 @@ bool TcpServer::sslAccept ( TcpSocket *s ) {
|
||||
if ( g_errno == SSL_ERROR_WANT_READ ||
|
||||
g_errno == SSL_ERROR_WANT_WRITE ||
|
||||
g_errno == EAGAIN ) {
|
||||
//log("ssl: SSL_accept blocked %li",newsd);
|
||||
//log("ssl: SSL_accept would block %li",newsd);
|
||||
return true;
|
||||
}
|
||||
// any other?
|
||||
@ -2117,8 +2144,9 @@ bool TcpServer::sslAccept ( TcpSocket *s ) {
|
||||
}
|
||||
|
||||
// log this so we can monitor if we get too many of these per second
|
||||
// because they take like 10ms each on sp1!!! mdw
|
||||
log("ssl: SSL_accept (~10ms) completed %li",newsd);
|
||||
// because they take like 10ms each on sp1!!! (even with non-blocking
|
||||
// sockets, they'll block for 10ms) - mdw 2013
|
||||
//log("ssl: SSL_accept (~10ms) completed %li",newsd);
|
||||
// ok, we got it
|
||||
s->m_sockState = ST_READING;
|
||||
return true;
|
||||
|
2
Test.cpp
2
Test.cpp
@ -126,7 +126,7 @@ void Test::removeFiles ( ) {
|
||||
// . kinda like Collectiondb::deleteRec() i guess but we need to
|
||||
// preserve the parms!!
|
||||
// . deletetagdb = false
|
||||
g_collectiondb.resetColl ( "test" , NULL ); // false );
|
||||
g_collectiondb.resetColl ( "test" , NULL , true );
|
||||
|
||||
// reset event count
|
||||
//g_collectiondb.countEvents();
|
||||
|
100
Threads.cpp
100
Threads.cpp
@ -303,6 +303,10 @@ bool Threads::init ( ) {
|
||||
// generic multipurpose
|
||||
if ( ! g_threads.registerType (GENERIC_THREAD,100/*maxThreads*/,100) )
|
||||
return log("thread: Failed to register thread type." );
|
||||
// for call SSL_accept() which blocks for 10ms even when socket
|
||||
// is non-blocking...
|
||||
//if (!g_threads.registerType (SSLACCEPT_THREAD,20/*maxThreads*/,100))
|
||||
// return log("thread: Failed to register thread type." );
|
||||
|
||||
#ifndef PTHREADS
|
||||
|
||||
@ -884,20 +888,28 @@ bool ThreadQueue::timedCleanUp ( long maxNiceness ) {
|
||||
|
||||
#ifdef PTHREADS
|
||||
|
||||
// . join up with that thread
|
||||
// . damn, sometimes he can block forever on his
|
||||
// call to sigqueue(),
|
||||
long status = pthread_join ( t->m_joinTid , NULL );
|
||||
if ( status != 0 ) {
|
||||
log("threads: pthread_join %li = %s (%li)",
|
||||
(long)t->m_joinTid,mstrerror(status),status);
|
||||
}
|
||||
// debug msg
|
||||
if ( g_conf.m_logDebugThread )
|
||||
log(LOG_DEBUG,"thread: joined1 with t=0x%lx "
|
||||
"jointid=0x%lx.",
|
||||
(long)t,(long)t->m_joinTid);
|
||||
|
||||
// if pthread_create() failed it returns the errno and we
|
||||
// needsJoin is false, so do not try to join
|
||||
// to a thread if we did not create it, lest pthread_join()
|
||||
// cores
|
||||
if ( t->m_needsJoin ) {
|
||||
// . join up with that thread
|
||||
// . damn, sometimes he can block forever on his
|
||||
// call to sigqueue(),
|
||||
long status = pthread_join ( t->m_joinTid , NULL );
|
||||
if ( status != 0 ) {
|
||||
log("threads: pthread_join %li = %s (%li)",
|
||||
(long)t->m_joinTid,mstrerror(status),
|
||||
status);
|
||||
}
|
||||
// debug msg
|
||||
if ( g_conf.m_logDebugThread )
|
||||
log(LOG_DEBUG,"thread: joined1 with t=0x%lx "
|
||||
"jointid=0x%lx.",
|
||||
(long)t,(long)t->m_joinTid);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
again:
|
||||
@ -1211,20 +1223,22 @@ bool ThreadQueue::cleanUp ( ThreadEntry *tt , long maxNiceness ) {
|
||||
|
||||
#ifdef PTHREADS
|
||||
|
||||
// . join up with that thread
|
||||
// . damn, sometimes he can block forever on his
|
||||
// call to sigqueue(),
|
||||
long status = pthread_join ( t->m_joinTid , NULL );
|
||||
if ( status != 0 ) {
|
||||
log("threads: pthread_join2 %li = %s (%li)",
|
||||
(long)t->m_joinTid,mstrerror(status),status);
|
||||
if ( t->m_needsJoin ) {
|
||||
// . join up with that thread
|
||||
// . damn, sometimes he can block forever on his
|
||||
// call to sigqueue(),
|
||||
long status = pthread_join ( t->m_joinTid , NULL );
|
||||
if ( status != 0 ) {
|
||||
log("threads: pthread_join2 %li = %s (%li)",
|
||||
(long)t->m_joinTid,mstrerror(status),
|
||||
status);
|
||||
}
|
||||
// debug msg
|
||||
if ( g_conf.m_logDebugThread )
|
||||
log(LOG_DEBUG,"thread: joined2 with t=0x%lx "
|
||||
"jointid=0x%lx.",
|
||||
(long)t,(long)t->m_joinTid);
|
||||
}
|
||||
// debug msg
|
||||
if ( g_conf.m_logDebugThread )
|
||||
log(LOG_DEBUG,"thread: joined2 with t=0x%lx "
|
||||
"jointid=0x%lx.",
|
||||
(long)t,(long)t->m_joinTid);
|
||||
|
||||
#else
|
||||
|
||||
again:
|
||||
@ -1591,7 +1605,7 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
|
||||
// return if the max is already launched
|
||||
if ( active >= m_maxLaunched ) return false;
|
||||
|
||||
// do not launch a low priority merge, addlists or filter thread if we
|
||||
// do not launch a low priority merge, intersect or filter thread if we
|
||||
// have high priority cpu threads already going on. this way a
|
||||
// low priority spider thread will not launch if a high priority
|
||||
// cpu-based thread of any kind (right now just MERGE or INTERSECT)
|
||||
@ -1642,7 +1656,7 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
|
||||
// i dunno what the point of this was... so i commented it out
|
||||
//long max2 = g_conf.m_queryMaxDiskThreads ;
|
||||
//if ( max2 <= 0 ) max2 = 1;
|
||||
// only do this check if we're a addlists thread queue
|
||||
// only do this check if we're a addlists/instersect thread queue
|
||||
//if (m_threadType == INTERSECT_THREAD&& hiActive >= max2)return false;
|
||||
|
||||
// loop through candidates
|
||||
@ -2008,7 +2022,26 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
|
||||
//
|
||||
#else
|
||||
|
||||
pthread_create ( &t->m_joinTid , &s_attr, startUp2 , t) ;
|
||||
// assume it does not go through
|
||||
t->m_needsJoin = false;
|
||||
|
||||
// pthread inherits our sigmask, so don't let it handle sigalrm
|
||||
// signals in Loop.cpp, it'll screw things up. that handler
|
||||
// is only meant to be called by the main process. if we end up
|
||||
// double calling it, this thread may think g_callback is non-null
|
||||
// then it gets set to NULL, then the thread cores! seen it...
|
||||
sigset_t sigs;
|
||||
sigemptyset ( &sigs );
|
||||
sigaddset ( &sigs , SIGALRM );
|
||||
if ( sigprocmask ( SIG_BLOCK , &sigs , NULL ) < 0 )
|
||||
log("threads: failed to block sig");
|
||||
|
||||
// this returns 0 on success, or the errno otherwise
|
||||
g_errno = pthread_create ( &t->m_joinTid , &s_attr, startUp2 , t) ;
|
||||
|
||||
if ( sigprocmask ( SIG_UNBLOCK , &sigs , NULL ) < 0 )
|
||||
log("threads: failed to unblock sig");
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
@ -2020,6 +2053,8 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
|
||||
|
||||
// return true on successful creation of the thread
|
||||
if ( g_errno == 0 ) {
|
||||
// good stuff, the thread needs a join now
|
||||
t->m_needsJoin = true;
|
||||
if ( count > 0 )
|
||||
log("thread: Call to clone looped %li times.",count);
|
||||
return true;
|
||||
@ -2047,6 +2082,11 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
|
||||
#ifndef PTHREADS
|
||||
hadError:
|
||||
#endif
|
||||
|
||||
if ( g_errno )
|
||||
log("thread: pthread_create had error = %s",
|
||||
mstrerror(g_errno));
|
||||
|
||||
// it didn't launch, did it? dec the count.
|
||||
m_launched--;
|
||||
// priority-based LOCAL & GLOBAL launch counts
|
||||
@ -2326,7 +2366,7 @@ const char *ThreadQueue::getThreadType ( ) {
|
||||
const char *s = "unknown";
|
||||
if ( m_threadType == DISK_THREAD ) s = "disk";
|
||||
if ( m_threadType == MERGE_THREAD ) s = "merge";
|
||||
if ( m_threadType == INTERSECT_THREAD ) s = "addlists";
|
||||
if ( m_threadType == INTERSECT_THREAD ) s = "intersectlists";
|
||||
if ( m_threadType == FILTER_THREAD ) s = "filter";
|
||||
if ( m_threadType == SAVETREE_THREAD ) s = "savetree";
|
||||
if ( m_threadType == UNLINK_THREAD ) s = "unlink";
|
||||
|
@ -21,6 +21,7 @@ pid_t getpidtid();
|
||||
#define SAVETREE_THREAD 4
|
||||
#define UNLINK_THREAD 5
|
||||
#define GENERIC_THREAD 6
|
||||
//#define SSLACCEPT_THREAD 7
|
||||
#define GB_SIGRTMIN (SIGRTMIN+4)
|
||||
#define MAX_NICENESS 2
|
||||
// . a ThreadQueue has a list of thread entries
|
||||
@ -54,6 +55,7 @@ class ThreadEntry {
|
||||
long m_stackSize ;
|
||||
long m_si ; // s_stackPtrs[i] = m_stack
|
||||
|
||||
bool m_needsJoin;
|
||||
pthread_t m_joinTid;
|
||||
};
|
||||
|
||||
|
13
Title.cpp
13
Title.cpp
@ -113,8 +113,17 @@ bool Title::setTitle ( XmlDoc *xd ,
|
||||
char *val = NULL;
|
||||
long vlen;
|
||||
// look for the "title:" field in json then use that
|
||||
if ( xd->m_contentType == CT_JSON )
|
||||
val = getJSONFieldValue ( xd->ptr_utf8Content,"title",&vlen);
|
||||
SafeBuf jsonTitle;
|
||||
if ( xd->m_contentType == CT_JSON ) {
|
||||
char *jt;
|
||||
jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
|
||||
if ( jt && vlen > 0 ) {
|
||||
jsonTitle.safeDecodeJSONToUtf8 (jt, vlen, m_niceness);
|
||||
//true ); // decodeAll?
|
||||
jsonTitle.nullTerm();
|
||||
val = jsonTitle.getBufStart();
|
||||
}
|
||||
}
|
||||
// if we had a title: field in the json...
|
||||
if ( val ) {
|
||||
char *dst = NULL;
|
||||
|
@ -22,6 +22,8 @@ Wiktionary::Wiktionary () {
|
||||
// . now m_langTable just maps to langId, no POS bits...
|
||||
//m_langTable.set ( 6 , 1,0,NULL,0,false,0 ,"wkt-lang");
|
||||
m_synTable.set ( 6 , 4,0,NULL,0,true,0 ,"wkt-synt");
|
||||
|
||||
m_synBuf.setLabel("synbuf");
|
||||
}
|
||||
|
||||
void Wiktionary::reset() {
|
||||
@ -47,6 +49,11 @@ Wiktionary::~Wiktionary () {
|
||||
|
||||
|
||||
bool Wiktionary::test ( ) {
|
||||
|
||||
// test words parsing here
|
||||
//Words w;
|
||||
//w.set9 ("get $4,500.00 now",0);
|
||||
|
||||
// test it out!
|
||||
char *str = "love";//pie"; //forsake";
|
||||
//long long wid = hash64Lower_utf8(str);
|
||||
|
31
Words.cpp
31
Words.cpp
@ -416,7 +416,38 @@ bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
|
||||
}
|
||||
// . c#, j#, ...
|
||||
if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;
|
||||
|
||||
// comma is ok if like ,ddd!d
|
||||
if ( s[i]==',' &&
|
||||
i-j <= 3 &&
|
||||
is_digit(s[i-1]) ) {
|
||||
// if word so far is 2 or 3 chars, make sure digits
|
||||
if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
|
||||
if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
|
||||
// scan forward
|
||||
subloop:
|
||||
if ( s[i] == ',' &&
|
||||
is_digit(s[i+1]) &&
|
||||
is_digit(s[i+2]) &&
|
||||
is_digit(s[i+3]) &&
|
||||
! is_digit(s[i+4]) ) {
|
||||
i += 4;
|
||||
goto subloop;
|
||||
}
|
||||
}
|
||||
|
||||
// decimal point?
|
||||
if ( s[i] == '.' &&
|
||||
is_digit(s[i-1]) &&
|
||||
is_digit(s[i+1]) ) {
|
||||
// allow the decimal point
|
||||
i++;
|
||||
// skip over string of digits
|
||||
while ( is_digit(s[i]) ) i++;
|
||||
}
|
||||
|
||||
nogo:
|
||||
|
||||
// allow for words like we're dave's and i'm
|
||||
if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
|
||||
i++;
|
||||
|
902
XmlDoc.cpp
902
XmlDoc.cpp
File diff suppressed because it is too large
Load Diff
20
XmlDoc.h
20
XmlDoc.h
@ -630,6 +630,8 @@ class XmlDoc {
|
||||
long *getIp ( ) ;
|
||||
long *gotIp ( bool save ) ;
|
||||
bool *getIsAllowed ( ) ;
|
||||
long *getFinalCrawlDelay();
|
||||
long m_finalCrawlDelay;
|
||||
//long getTryAgainTimeDelta() {
|
||||
// if ( ! m_tryAgainTimeDeltaValid ) { char *xx=NULL;*xx=0;}
|
||||
// return m_tryAgainTimeDelta;
|
||||
@ -752,6 +754,7 @@ class XmlDoc {
|
||||
bool hashDMOZCategories ( class HashTableX *table ) ;
|
||||
bool hashLinks ( class HashTableX *table ) ;
|
||||
bool hashUrl ( class HashTableX *table ) ;
|
||||
bool hashDateNumbers ( class HashTableX *tt ) ;
|
||||
bool hashSections ( class HashTableX *table ) ;
|
||||
bool hashIncomingLinkText ( class HashTableX *table ,
|
||||
bool hashAnomalies ,
|
||||
@ -849,6 +852,15 @@ class XmlDoc {
|
||||
long niceness );
|
||||
|
||||
|
||||
bool hashNumber ( char *beginBuf ,
|
||||
char *buf ,
|
||||
long bufLen ,
|
||||
class HashInfo *hi ) ;
|
||||
|
||||
bool hashNumber2 ( float f ,
|
||||
class HashInfo *hi ,
|
||||
char *gbsortByStr ) ;
|
||||
|
||||
// print out for PageTitledb.cpp and PageParser.cpp
|
||||
bool printDoc ( class SafeBuf *pbuf );
|
||||
bool printMenu ( class SafeBuf *pbuf );
|
||||
@ -1159,6 +1171,7 @@ class XmlDoc {
|
||||
*/
|
||||
bool m_httpStatusValid;
|
||||
bool m_crawlDelayValid;
|
||||
bool m_finalCrawlDelayValid;
|
||||
bool m_titleRecKeyValid;
|
||||
bool m_adVectorValid;
|
||||
bool m_wikiDocIdsValid;
|
||||
@ -1279,6 +1292,7 @@ class XmlDoc {
|
||||
bool m_replyValid;
|
||||
bool m_recycleDiffbotReplyValid;
|
||||
bool m_diffbotReplyValid;
|
||||
bool m_tokenizedDiffbotReplyValid;
|
||||
//bool m_diffbotUrlCrawlPatternMatchValid;
|
||||
//bool m_diffbotUrlProcessPatternMatchValid;
|
||||
//bool m_diffbotPageProcessPatternMatchValid;
|
||||
@ -1480,6 +1494,7 @@ class XmlDoc {
|
||||
char m_isWWWDup;
|
||||
char m_calledMsg0b;
|
||||
Url m_tmpUrl;
|
||||
|
||||
SafeBuf m_tmpsb1;
|
||||
SafeBuf m_tmpsb2;
|
||||
SafeBuf m_turkBuf;
|
||||
@ -1548,9 +1563,9 @@ class XmlDoc {
|
||||
//
|
||||
XmlDoc *m_dx;
|
||||
char *m_diffbotObj;
|
||||
char *m_diffbotObjEnd;
|
||||
char m_diffbotSavedChar;
|
||||
SafeBuf m_diffbotReply;
|
||||
SafeBuf *m_tokenizedDiffbotReplyPtr;
|
||||
SafeBuf m_tokenizedDiffbotReply;
|
||||
long m_diffbotReplyError;
|
||||
bool m_recycleDiffbotReply;
|
||||
//bool m_diffbotUrlCrawlPatternMatch;
|
||||
@ -1562,6 +1577,7 @@ class XmlDoc {
|
||||
SafeBuf m_diffbotApiUrl;
|
||||
|
||||
bool *getRecycleDiffbotReply ( ) ;
|
||||
SafeBuf *getTokenizedDiffbotReply ( ) ;
|
||||
SafeBuf *getDiffbotReply ( ) ;
|
||||
//bool doesUrlMatchDiffbotCrawlPattern() ;
|
||||
//bool doesUrlMatchDiffbotProcessPattern() ;
|
||||
|
@ -382,9 +382,9 @@ void gotDocWrapper ( void *state , TcpSocket *s ) {
|
||||
// parse status message out of response
|
||||
|
||||
// HTTP/1.0
|
||||
while ( p < pend && !is_space(*p) ) p++;
|
||||
while ( p < pend && !isspace(*p) ) p++;
|
||||
// skip space
|
||||
while ( p < pend && is_space(*p) ) p++;
|
||||
while ( p < pend && isspace(*p) ) p++;
|
||||
// copy to end of line
|
||||
while (p < pend && mlen < 255 && *p != '\r' && *p != '\n'){
|
||||
message[mlen++] = *p;
|
||||
|
@ -4,7 +4,12 @@
|
||||
<notifyUrl><![CDATA[ccc]]></>
|
||||
<collectiveRespiderFrequency>0.000000</>
|
||||
<collectiveCrawlDelay>0.250000</>
|
||||
<diffbotApiUrl><![CDATA[]]></>
|
||||
<diffbotUrlCrawlPattern><![CDATA[]]></>
|
||||
<diffbotUrlProcessPattern><![CDATA[]]></>
|
||||
<diffbotPageProcessPattern><![CDATA[]]></>
|
||||
<diffbotUrlCrawlRegEx><![CDATA[]]></>
|
||||
<diffbotUrlProcessRegEx><![CDATA[]]></>
|
||||
<diffbotOnlyProcessIfNew>1</>
|
||||
<diffbotSeeds><![CDATA[]]></>
|
||||
<isCustomCrawl>0</>
|
||||
@ -79,6 +84,9 @@
|
||||
# The spider round number.
|
||||
<spiderRoundNum>0</>
|
||||
|
||||
# The spider status number.
|
||||
<spiderStatus>0</>
|
||||
|
||||
# Do searches for queries in this hosts part of the query log.
|
||||
<scrapingEnabledProcog>0</>
|
||||
|
||||
@ -326,12 +334,12 @@
|
||||
<maxRobotstxtCacheAge>86400</>
|
||||
|
||||
# Only spider URLs scheduled to be spidered at this time or after. In UTC.
|
||||
<spiderStartTime>17 Jan 1970 20:00 UTC</>
|
||||
<spiderStartTime>24 Jan 1970 20:00 UTC</>
|
||||
|
||||
# Only spider URLs scheduled to be spidered at this time or before. If "use
|
||||
# current time" is true then the current local time is used for this value
|
||||
# instead. in UTC.
|
||||
<spiderEndTime>01 Jan 1970 08:00 UTC</>
|
||||
<spiderEndTime>08 Jan 1970 08:00 UTC</>
|
||||
|
||||
# Use the current time as the spider end time?
|
||||
<useCurrentTime>1</>
|
||||
@ -812,7 +820,7 @@
|
||||
<highlightQueryTermsInRelatedPagesSummary>0</>
|
||||
|
||||
# Truncates a related page title after this many charaters and adds ...
|
||||
<numberOfCharactersToDisplayInTitleBeforeTruncating>50</>
|
||||
<numberOfCharactersToDisplayInTitleBeforeTruncating>0</>
|
||||
|
||||
# Use the search results' links in order to generate related pages.
|
||||
<useResultsPagesAsReferences>0</>
|
||||
@ -1017,173 +1025,23 @@
|
||||
<filterExpression><![CDATA[hopcount>=3]]></>
|
||||
<filterExpression><![CDATA[isnew]]></>
|
||||
<filterExpression><![CDATA[default]]></>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>0</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>0</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>0</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>0</>
|
||||
<spidersEnabled>1</>
|
||||
<spidersEnabled>0</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>1.000000</>
|
||||
<filterFrequency>1.000000</>
|
||||
<filterFrequency>1.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>7.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>10.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>20.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>0.000000</>
|
||||
<filterFrequency>30.000000</>
|
||||
<filterFrequency>30.000000</>
|
||||
|
||||
# Use <harvestLinks> tag.
|
||||
|
||||
# Use <spidersEnabled> tag.
|
||||
|
||||
# Use <filterFrequency> tag.
|
||||
|
||||
# Do not allow more than this many outstanding spiders for all urls in this
|
||||
# priority.
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>1</>
|
||||
<maxSpidersPerRule>1</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>4</>
|
||||
<maxSpidersPerRule>2</>
|
||||
<maxSpidersPerRule>1</>
|
||||
<maxSpidersPerRule>2</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>1</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>1</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>1</>
|
||||
<maxSpidersPerRule>99</>
|
||||
<maxSpidersPerRule>99</>
|
||||
# Use <maxSpidersPerRule> tag.
|
||||
|
||||
# Allow this many spiders per IP.
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
<maxSpidersPerIp>1</>
|
||||
# Use <maxSpidersPerIp> tag.
|
||||
|
||||
# Wait at least this long before downloading urls from the same IP address.
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<spiderIpWait>1000</>
|
||||
<filterPriority>80</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>0</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>-3</>
|
||||
<filterPriority>3</>
|
||||
<filterPriority>45</>
|
||||
<filterPriority>85</>
|
||||
<filterPriority>50</>
|
||||
<filterPriority>48</>
|
||||
<filterPriority>49</>
|
||||
<filterPriority>47</>
|
||||
<filterPriority>40</>
|
||||
<filterPriority>39</>
|
||||
<filterPriority>30</>
|
||||
<filterPriority>29</>
|
||||
<filterPriority>20</>
|
||||
<filterPriority>19</>
|
||||
<filterPriority>1</>
|
||||
<filterPriority>0</>
|
||||
<diffbotAPI><![CDATA[]]></>
|
||||
# Use <spiderIpWait> tag.
|
||||
|
||||
# Use <filterPriority> tag.
|
||||
|
||||
# Use <diffbotAPI> tag.
|
||||
|
66
errnotest.cpp
Normal file
66
errnotest.cpp
Normal file
@ -0,0 +1,66 @@
|
||||
#include <stdio.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/wait.h>
|
||||
#include <errno.h>
|
||||
#include <sched.h>
|
||||
#include <unistd.h>
|
||||
#include <assert.h>
|
||||
|
||||
static int s_called = 0;
|
||||
|
||||
#define MAX_PID 32767
|
||||
static int s_errno ;
|
||||
static int s_errnos [ MAX_PID + 1 ];
|
||||
|
||||
static long s_bad = 0;
|
||||
static long s_badPid = -1;
|
||||
|
||||
// WARNING: you MUST compile with -DREENTRANT for this to work
|
||||
int *__errno_location (void) {
|
||||
long pid = (long) getpid();
|
||||
s_called++;
|
||||
if ( pid <= (long)MAX_PID ) return &s_errnos[pid];
|
||||
s_bad++;
|
||||
s_badPid = pid;
|
||||
return &s_errno;
|
||||
}
|
||||
|
||||
//extern __thread int errno;
|
||||
|
||||
int g_errno = 0;
|
||||
|
||||
int startup ( void *state ) {
|
||||
char buf[5];
|
||||
// this sets errno, but does not seem to call our __errno_location
|
||||
// override, BUT does seem to not affect "errno" in main() either!
|
||||
// maybe this is the TLS support?
|
||||
int bytes = read(-9,buf,5);
|
||||
//errno = 7; // E2BIG;
|
||||
//assert ( errno && bytes == -1 );
|
||||
g_errno = errno;
|
||||
}
|
||||
|
||||
|
||||
int main() {
|
||||
errno = 10; // EINVAL;
|
||||
g_errno = 10;
|
||||
char stack[10000];
|
||||
pid_t pid = clone( startup ,
|
||||
stack + 10000 ,
|
||||
//CLONE_SETTLS |
|
||||
CLONE_VM | SIGCHLD,
|
||||
NULL );
|
||||
int status;
|
||||
waitpid ( pid , &status, 0 );
|
||||
|
||||
if ( s_called ) fprintf(stderr,"__errno_location() was called %i "
|
||||
"times\n",s_called);
|
||||
|
||||
if ( errno != 10 ) fprintf(stderr,"errno=%i (failed)\n",errno);
|
||||
else fprintf(stderr,"errno=%i (success)\n",errno);
|
||||
|
||||
if ( g_errno == 10 || g_errno == 0 )
|
||||
fprintf(stderr,"gerrno=%i (failed)\n",g_errno);
|
||||
else
|
||||
fprintf(stderr,"gerrno=%i (success)\n",g_errno);
|
||||
}
|
23
fctypes.cpp
23
fctypes.cpp
@ -999,14 +999,27 @@ long long atoll2 ( const char *s, long len ) {
|
||||
double atof2 ( const char *s, long len ) {
|
||||
// skip over spaces
|
||||
const char *end = s + len;
|
||||
while ( s < end && is_wspace_a ( *s ) ) s++;
|
||||
while ( s < end && is_wspace_a ( *s ) ) { s++; len--; }
|
||||
// return 0 if all spaces
|
||||
if ( s == end ) return 0;
|
||||
char buf[128];
|
||||
char tmpBuf[128];
|
||||
if ( len >= 128 ) len = 127;
|
||||
strncpy ( buf , s , len );
|
||||
buf[len] = '\0';
|
||||
return atof ( buf );
|
||||
//strncpy ( dst , s , len );
|
||||
|
||||
const char *p = s;
|
||||
const char *srcEnd = s + len;
|
||||
char *dst = tmpBuf;
|
||||
// remove commas
|
||||
for ( ; p < srcEnd ; p++ ) {
|
||||
// skip commas
|
||||
if ( *p == ',' ) continue;
|
||||
// otherwise store it
|
||||
*dst++ = *p;
|
||||
}
|
||||
// null term
|
||||
*dst = '\0';
|
||||
//buf[len] = '\0';
|
||||
return atof ( tmpBuf );
|
||||
}
|
||||
|
||||
double atod2 ( char *s, long len ) {
|
||||
|
8
gb.conf
8
gb.conf
@ -57,7 +57,7 @@
|
||||
<doNarrowSearch>0</>
|
||||
|
||||
# Overrides all spidering for all collections on just this host.
|
||||
<localSpideringEnabled>1</>
|
||||
<localSpideringEnabled>0</>
|
||||
|
||||
# Overrides all add urls for all collections on just this host.
|
||||
<localAddUrlEnabled>1</>
|
||||
@ -73,10 +73,10 @@
|
||||
<qaSearchTestEnabled>1</>
|
||||
|
||||
# Enable spidering on all hosts
|
||||
<allSpidersOn>1</>
|
||||
<allSpidersOn>0</>
|
||||
|
||||
# Disable spidering on all hosts
|
||||
<allSpidersOff>1</>
|
||||
<allSpidersOff>0</>
|
||||
|
||||
# Serves ads unless pure=1 is in cgi parms.
|
||||
<adFeedEnabled>0</>
|
||||
@ -385,7 +385,7 @@
|
||||
|
||||
# Maximum number of threads to use per Gigablast process for intersecting
|
||||
# docid lists. Generally, set this to the number of CPUs on the machine.
|
||||
<maxCpuThreads>1</>
|
||||
<maxCpuThreads>10</>
|
||||
|
||||
# Maximum number of pages to index or delete from index per second for all
|
||||
# hosts combined.
|
||||
|
@ -78,21 +78,19 @@ You will need the following packages installed<br>
|
||||
2. Edit hosts.conf so the working directory is not /home/mwells/github/ but
|
||||
rather your current working directory, where the 'gb' binary resides.
|
||||
<br><br>
|
||||
3. Run './gb 0' to start a single gigablast node.
|
||||
3. Run './gb 0' to start a single gigablast node which listens on port 8000.
|
||||
<br><br>
|
||||
4. Access the server with your browser on port 8000 (default port). You can change this default port in the gb.conf file.
|
||||
4. The first time you run it you will have to wait for it to build some binary data files from the txt files it uses that are based on wiktionary and wikipedia that it uses to do synonyms and phrasing.
|
||||
<br><br>
|
||||
5. The first time you run it you will have to wait for it to build some binary data files from the txt files it uses that are based on wiktionary and wikipedia that it uses to do synonyms and phrasing.
|
||||
5. Re-run it after it builds those binaries.
|
||||
<br><br>
|
||||
6. Re-run it after it builds those binaries.
|
||||
6. Check out the <a href=http://127.0.0.1:8000/master>Master Controls</a>. You need to connect to port 8000 from a local IP address or from an IP address on the same C-Class as part of Gigablast's security. Consider using an ssh tunnel if your browser's IP is not on the same C-Class as the server's. i.e. From your browser machine, ssh to the machine running the gb server: <i>ssh someservername.com -L 8000:127.0.0.1:8000</i> . Then on your browser go to the <a href=http://127.0.0.1:8000/master>Master Controls</a>.
|
||||
<br><br>
|
||||
7. Check out the <a href=http://127.0.0.1:8000/master>Master Controls</a>. You need to connect to port 8000 from a local IP address or from an IP address on the same C-Class as part of Gigablast's security. Consider using an ssh tunnel if your browser's IP is not on the same C-Class. i.e. From your browser machine, ssh to the machine running the gb server: <i>ssh someservername.com -L 8000:127.0.0.1:8000</i> . Then on your browser go to the <a href=http://127.0.0.1:8000/master>Master Controls</a>.
|
||||
7. Click on the <a href=http://127.0.0.1:8000/admin/inject?c=main>inject</a> menu and inject a URL into the index. It might be slow because it uses Google's public DNSes as specified in the Master Controls as 8.8.8.8 and 8.8.4.4. You should change those to your own local bind9 server for speed.
|
||||
<br><br>
|
||||
8. Click on the <a href=http://127.0.0.1:8000/admin/inject?c=main>inject</a> menu and inject a URL into the index. It might be slow because it uses Google's public DNSes as specified in the Master Controls as 8.8.8.8 and 8.8.4.4. You should change those to your own local bind9 server for speed.
|
||||
8. When the injection completes, try a <a href=http://127.0.0.1:8000/>search</a> for the document you injected.
|
||||
<br><br>
|
||||
9. When the injection completes, try a <a href=http://127.0.0.1:8000/>search</a> for the document you injected.
|
||||
<br><br>
|
||||
10. Turn on spiders on the <a href=http://127.0.0.1:8000>Master Controls</a> page so that it will begin spidering the outlinks of the page you injected.
|
||||
9. <a href=http://127.0.0.1:8000/master?se=1>Turn on spiders</a> on the <a href=http://127.0.0.1:8000/master>Master Controls</a> page so that it will begin spidering the outlinks of the page you injected.
|
||||
|
||||
<br>
|
||||
|
||||
|
115
html/adv.html
Normal file
115
html/adv.html
Normal file
@ -0,0 +1,115 @@
|
||||
<?xml version="1.0" encoding="iso-8859-1"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head>
|
||||
<title>Gigablast Advanced Search</title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
|
||||
<meta name="MSSmartTagsPreventParsing" content="true" />
|
||||
<meta http-equiv="imagetoolbar" content="no" />
|
||||
<link href="stylesmain.css" rel="stylesheet" type="text/css" />
|
||||
<script type="text/javascript">
|
||||
<!--
|
||||
function x(){document.f.q.focus();}
|
||||
// -->
|
||||
</script>
|
||||
</head>
|
||||
|
||||
<body onload="x()">
|
||||
<a href="/" target="_top"><img src="logo-small.png" alt="Gigablast Logo" title="Return to Basic Search" border="0" style="margin-bottom:15px;" /></a>
|
||||
<h2>Advanced Search</h2>
|
||||
<form method="get" action="/search">
|
||||
<table width="605" border="0" align="center" cellpadding="5" cellspacing="3">
|
||||
<tbody>
|
||||
<tr align="left" valign="middle">
|
||||
<th colspan="3">Search for...</th>
|
||||
</tr>
|
||||
<tr align="left" valign="middle">
|
||||
<td><strong>all</strong> of these words</td>
|
||||
<td><input type="text" name="plus" size="40" /></td>
|
||||
<td><input type="submit" value="Search" /></td>
|
||||
</tr>
|
||||
<tr align="left" valign="middle">
|
||||
<td>this <strong>exact phrase</strong></td>
|
||||
<td colspan="2"><input type="text" name="quote1" size="40" /></td>
|
||||
</tr>
|
||||
<tr align="left" valign="middle">
|
||||
<td>and this <strong>exact phrase</strong></td>
|
||||
<td colspan="2"><input type="text" name="quote2" size="40" /></td>
|
||||
</tr>
|
||||
<tr align="left" valign="middle">
|
||||
<td><strong>any</strong> of these words</td>
|
||||
<td colspan="2"><input type="text" name="q" size="40" /></td>
|
||||
</tr>
|
||||
<tr align="left" valign="middle">
|
||||
<td><strong>none</strong> of these words</td>
|
||||
<td colspan="2"><input type="text" name="minus" size="40" /></td>
|
||||
</tr>
|
||||
|
||||
<tr align="left" valign="middle">
|
||||
<td>In this language:
|
||||
</td>
|
||||
<td colspan="2">
|
||||
<select name=gblang>
|
||||
<option value=0>Any</option>
|
||||
<option value=1>English</option>
|
||||
<option value=2>French</option>
|
||||
<option value=3>Spanish</option>
|
||||
<option value=4>Russian</option>
|
||||
<option value=5>Turkish</option>
|
||||
<option value=6>Japanese</option>
|
||||
<option value=7>ChineseTrad</option>
|
||||
<option value=8>ChineseSimp</option>
|
||||
<option value=9>Korean</option>
|
||||
<option value=10>German</option>
|
||||
<option value=11>Dutch</option>
|
||||
<option value=12>Italian</option>
|
||||
<option value=13>Finnish</option>
|
||||
<option value=14>Swedish</option>
|
||||
<option value=15>Norwegian</option>
|
||||
<option value=16>Portuguese</option>
|
||||
<option value=17>Vietnamese</option>
|
||||
<option value=18>Arabic</option>
|
||||
<option value=19>Hebrew</option>
|
||||
<option value=20>Indonesian</option>
|
||||
<option value=21>Greek</option>
|
||||
<option value=22>Thai</option>
|
||||
<option value=23>Hindi</option>
|
||||
<option value=24>Bengali</option>
|
||||
<option value=25>Polish</option>
|
||||
<option value=26>Tagalog</option>
|
||||
</select>
|
||||
</td>
|
||||
</tr>
|
||||
|
||||
|
||||
<tr align="left" valign="middle">
|
||||
<td>Restrict to this URL</td>
|
||||
<td colspan="2"><input type="text" name="url" size="40" /></td>
|
||||
</tr>
|
||||
<tr align="left" valign="middle">
|
||||
<td>Pages that link to this URL</td>
|
||||
<td colspan="2"><input type="text" name="link" size="40" /></td>
|
||||
</tr>
|
||||
<tr align="left" valign="middle">
|
||||
<td>Site Clustering</td>
|
||||
<td colspan="2"><input type="radio" name="sc" value="1" checked="checked" />yes <input type="radio" name="sc" value="0" />no</td>
|
||||
</tr>
|
||||
<tr align="left" valign="middle">
|
||||
<td>Number of summary excerpts</td>
|
||||
<td colspan="2"><input type="radio" name="ns" value="0" />0 <input type="radio" name="ns" value="1" />1 <input type="radio" name="ns" value="2" />2 <input type="radio" name="ns" value="3" checked="checked" />3 <input type="radio" name="ns" value="4" />4 <input type="radio" name="ns" value="5" />5</td>
|
||||
</tr>
|
||||
<tr align="left" valign="middle">
|
||||
<td>Results per Page</td>
|
||||
<td colspan="2"><input type="radio" name="n" value="10" checked="checked" />10 <input type="radio" name="n" value="20" />20 <input type="radio" name="n" value="30" />30 <input type="radio" name="n" value="40" />40 <input type="radio" name="n" value="50" />50 <input type="radio" name="n" value="100" />100</td>
|
||||
</tr>
|
||||
<tr align="left" valign="middle">
|
||||
<td>Restrict to these Sites</td>
|
||||
<td colspan="2"><textarea rows="10" cols="40" name="sites"></textarea></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</form>
|
||||
<div id="footer">Copyright © 2010-2020 <a href="http://www.gigablast.com" target="_top">Gigablast,
|
||||
Inc.</a> All rights reserved.</div>
|
||||
</body>
|
||||
</html>
|
BIN
html/dollargear.png
Normal file
BIN
html/dollargear.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 9.9 KiB |
BIN
html/eventguru.png
Normal file
BIN
html/eventguru.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 39 KiB |
BIN
html/gears.png
Normal file
BIN
html/gears.png
Normal file
Binary file not shown.
After Width: | Height: | Size: 9.6 KiB |
@ -34,7 +34,7 @@ Gigablast - The Private Search Engine</font>
|
||||
Gigablast does not give your IP address to any third parties, nor allow any third party to deduce what queries might be coming from your IP address. Read the text below to understand what we mean by <i>deduce</i>.
|
||||
<br><br>
|
||||
|
||||
In the summer of 2013 <a href=https://en.wikipedia.org/wiki/Edward_Snowden>Edward Snowden</a>, an ex-NSA contractor, described a secret NSA project known as <a href=https://en.wikipedia.org/wiki/PRISM_%28surveillance_program%29>project PRISM</a>. This project by the NSA wire taps not just live data traversing the internet, but also has automated access to large data repositories controlled by major internet companies. The data repositories consist of anything from search engine query logs and private emails to chat histories, among others. With today's fairly accurate audio-to-text transcription software, even services like <a href=https://en.wikipedia.org/wiki/Skype>Skype</a> audio and video calls are being tapped.
|
||||
In the summer of 2013 <a href=https://en.wikipedia.org/wiki/Edward_Snowden>Edward Snowden</a>, an ex-NSA contractor, described a secret NSA project known as <a href=https://en.wikipedia.org/wiki/PRISM_%28surveillance_program%29>project PRISM</a>, and more recently <a href="http://www.washingtonpost.com/world/national-security/nsa-infiltrates-links-to-yahoo-google-data-centers-worldwide-snowden-documents-say/2013/10/30/e51d661e-4166-11e3-8b74-d89d714ca4dd_story.html">Muscular</a>. These NSA projects wire tap not just live data traversing the internet, but also have automated access to large data repositories controlled by major internet companies. The data repositories consist of anything from search engine query logs and private emails to chat histories, among others. With today's fairly accurate audio-to-text transcription software, even services like <a href=https://en.wikipedia.org/wiki/Skype>Skype</a> audio and video calls are being tapped.
|
||||
<br><br>
|
||||
Such data access makes it very easy for government agencies like the NSA to set up large search engines that index these data streams and execute a list of queries on such search engines in order to profile and flag individuals for further examination.
|
||||
<br><br>
|
||||
|
@ -1,18 +1,18 @@
|
||||
User-Agent: googlebot
|
||||
Disallow: /search
|
||||
Disallow: /search?
|
||||
|
||||
User-Agent: bingbot
|
||||
Disallow: /search
|
||||
Disallow: /search?
|
||||
|
||||
User-Agent: msnbot
|
||||
Disallow: /search
|
||||
Disallow: /search?
|
||||
|
||||
User-Agent: slurp
|
||||
Disallow: /search
|
||||
Disallow: /search?
|
||||
|
||||
User-Agent: gigabot
|
||||
Disallow: /search
|
||||
Disallow: /search?
|
||||
|
||||
User-Agent: *
|
||||
Disallow: /search
|
||||
Disallow: /search?
|
||||
|
||||
|
@ -41,8 +41,9 @@ counts as a single query.
|
||||
<!--<li>Gigablast has many powerful <a href="/features.html">features</a>.
|
||||
<br><br>-->
|
||||
<li><a href=https://www.gigablast.com/account.html>Sign up now</a> to start accessing the feed.
|
||||
</ul>
|
||||
<br><br>
|
||||
<li>You can use the search results however you want. You can rearrange them, embed ads, etc.
|
||||
</ul>
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
18
main.cpp
18
main.cpp
@ -2454,9 +2454,21 @@ int main ( int argc , char *argv[] ) {
|
||||
if ( setrlimit(RLIMIT_CORE,&lim) )
|
||||
log("db: setrlimit: %s.", mstrerror(errno) );
|
||||
// limit fds
|
||||
//lim.rlim_cur = lim.rlim_max = 511;
|
||||
//if ( setrlimit(RLIMIT_NOFILE,&lim))
|
||||
// log("db: setrlimit2: %s.", mstrerror(errno) );
|
||||
// try to prevent core from systems where it is above 1024
|
||||
// because our FD_ISSET() libc function will core! (it's older)
|
||||
long NOFILE = 1024;
|
||||
lim.rlim_cur = lim.rlim_max = NOFILE;
|
||||
if ( setrlimit(RLIMIT_NOFILE,&lim))
|
||||
log("db: setrlimit RLIMIT_NOFILE %li: %s.",
|
||||
NOFILE,mstrerror(errno) );
|
||||
struct rlimit rlim;
|
||||
getrlimit ( RLIMIT_NOFILE,&rlim);
|
||||
if ( (long)rlim.rlim_max > NOFILE || (long)rlim.rlim_cur > NOFILE ) {
|
||||
log("db: setrlimit RLIMIT_NOFILE failed!");
|
||||
char *xx=NULL;*xx=0;
|
||||
}
|
||||
log("db: RLIMIT_NOFILE = %li",(long)rlim.rlim_max);
|
||||
//exit(0);
|
||||
// . disable o/s's and hard drive's read ahead
|
||||
// . set multcount to 16 --> 1 interrupt for every 16 sectors read
|
||||
// . multcount of 16 reduces OS overhead by 30%-50% (more throughput)
|
||||
|
@ -154,6 +154,13 @@ int main ( int argc , char *argv[] ) {
|
||||
printf("%s\n", out );
|
||||
}
|
||||
|
||||
// encoded
|
||||
char dst[MAX_URL_LEN+200];
|
||||
urlEncode ( dst,MAX_URL_LEN+100,
|
||||
u.getUrl(), u.getUrlLen(),
|
||||
false ); // are we encoding a request path?
|
||||
printf("encoded: %s\n",dst);
|
||||
|
||||
// the probable docid
|
||||
long long pd = g_titledb.getProbableDocId(&u);
|
||||
printf("pdocid: %llu\n", pd );
|
||||
|
Loading…
Reference in New Issue
Block a user