Merge branch 'diffbot' of github.com:gigablast/open-source-search-engine into diffbot

Conflicts:
	Process.cpp
mwells 2013-12-06 12:31:36 -08:00
commit adf9d807ea
90 changed files with 4547 additions and 1199 deletions

@ -799,7 +799,7 @@ bool sendPageAutoban ( TcpSocket *s , HttpRequest *r ) {
}
bool AutoBan::printTable( TcpSocket *s , HttpRequest *r ) {
SafeBuf sb(512 * 512);
SafeBuf sb(512 * 512,"autobbuf");
//read in all of the possible cgi parms off the bat:
//long user = g_pages.getUserType( s , r );
char *username = g_users.getUsername(r);

@ -11,6 +11,10 @@
#include "Statsdb.h"
#include "DiskPageCache.h"
#ifdef ASYNCIO
#include <aio.h>
#endif
// main.cpp will wait for this to be zero before exiting so all unlink/renames
// can complete
long g_unlinkRenameThreads = 0;
@ -530,6 +534,11 @@ bool BigFile::readwrite ( void *buf ,
// . if we're blocking then do it now
// . this should return false and set g_errno on error, true otherwise
if ( ! isNonBlocking ) goto skipThread;
#ifdef ASYNCIO
goto skipThread;
#endif
// . otherwise, spawn a thread to do this i/o
// . this returns false and sets g_errno on error, true on success
// . we should return false cuz we blocked
@ -597,7 +606,90 @@ bool BigFile::readwrite ( void *buf ,
log("disk: read buf alloc failed for %li "
"bytes.",need);
}
//
// pthread_create() is abhorrently slow. use asyncio if possible.
//
#ifdef ASYNCIO
// we only have two in the array... most likely though we only
// need one here...
aiocb *a0 = &fstate->m_aiocb[0];
aiocb *a1 = &fstate->m_aiocb[1];
// init them for the read
a0->aio_fildes = fstate->m_fd1;
a1->aio_fildes = fstate->m_fd2;
// the offset of each file
long long off1 = fstate->m_offset;
// always read at start of 2nd file
long long off2 = 0;
// how many bytes to read from each file?
long long readSize1 = size;
long long readSize2 = 0;
if ( off1 + readSize1 > MAX_PART_SIZE ) {
readSize1 = ((long long)MAX_PART_SIZE) - off1;
readSize2 = size - readSize1;
}
a0->aio_offset = off1;
a1->aio_offset = off2;
a0->aio_nbytes = readSize1;
a1->aio_nbytes = readSize2;
a0->aio_buf = fstate->m_buf;
a1->aio_buf = fstate->m_buf + readSize1;
a0->aio_reqprio = 0;
a1->aio_reqprio = 0;
a0->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
a1->aio_sigevent.sigev_notify = SIGEV_SIGNAL;
// translate offset to a filenum and offset
long filenum = offset / MAX_PART_SIZE;
long localOffset = offset % MAX_PART_SIZE;
// read or write?
if ( doWrite ) a0->aio_lio_opcode = LIO_WRITE;
else a0->aio_lio_opcode = LIO_READ;
// different fds implies two different files we gotta read from.
long numFilesToReadFrom = 1;
if ( fstate->m_fd1 != fstate->m_fd2 ) numFilesToReadFrom = 2;
// set it up
//aioList->m_signal = ESIG;
retry77:
//
// kernel aio's io_submit() can block on kernels below 3.12 when
// reading ext4 files, so use POSIX lio_listio() here instead.
//
// this will send the signal when read/write is completed
aiocb *list[2] = { a0 , a1 };
long status = lio_listio ( LIO_NOWAIT ,
			   list ,
			   numFilesToReadFrom ,
			   &fstate->m_sigEvent );
// if status is 0, there was no error
if ( status == 0 ) {
g_errno = 0;
// assume we will get the signal later
return false;
}
// got interrupted by a signal? try again.
if ( errno == EINTR )
goto retry77;
// tell caller about the error
g_errno = errno;
log("aio: %s", mstrerror(g_errno));
// we did not block or anything
return true;
#endif
// . this returns false and sets errno on error
// . set g_errno to the errno
if ( ! readwrite_r ( fstate , NULL ) ) g_errno = errno;
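
For reference, a minimal self-contained sketch of the POSIX AIO pattern the #ifdef ASYNCIO block above is working toward: queue a read with lio_listio(LIO_NOWAIT) and get a signal when it completes. The file name, buffer size and signal number are illustrative assumptions, not from this commit; link with -lrt.

#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void readDone ( int sig , siginfo_t *si , void *ctx ) {
	// the aiocb we queued below rides along in si_value
	struct aiocb *a = (struct aiocb *)si->si_value.sival_ptr;
	// printf in a handler is not strictly signal-safe; demo only
	printf ( "read done: err=%d bytes=%ld\n" ,
		 aio_error(a) , (long)aio_return(a) );
}

int main ( ) {
	static char buf[4096];
	int fd = open ( "test.dat" , O_RDONLY ); // illustrative file
	if ( fd < 0 ) return 1;
	struct sigaction sa;
	memset ( &sa , 0 , sizeof(sa) );
	sa.sa_sigaction = readDone;
	sa.sa_flags     = SA_SIGINFO;
	sigaction ( SIGUSR1 , &sa , NULL );
	struct aiocb a;
	memset ( &a , 0 , sizeof(a) );
	a.aio_fildes     = fd;
	a.aio_offset     = 0;
	a.aio_buf        = buf;
	a.aio_nbytes     = sizeof(buf);
	a.aio_lio_opcode = LIO_READ; // required by lio_listio()
	// aio_sigevent is a struct sigevent, so set sigev_notify;
	// do not assign SIGEV_SIGNAL to the field itself
	a.aio_sigevent.sigev_notify          = SIGEV_SIGNAL;
	a.aio_sigevent.sigev_signo           = SIGUSR1;
	a.aio_sigevent.sigev_value.sival_ptr = &a;
	struct aiocb *list[1] = { &a };
	// 0 means queued ok; EINTR would mean retry, like retry77 above
	if ( lio_listio ( LIO_NOWAIT , list , 1 , NULL ) != 0 ) return 1;
	// poll until complete (sidesteps a pause() race in a demo)
	while ( aio_error ( &a ) == EINPROGRESS ) usleep ( 1000 );
	close ( fd );
	return 0;
}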

@ -95,6 +95,11 @@ public:
// m_allocOff is offset into m_allocBuf where we start reading into
// from the file
long m_allocOff;
// do not call pthread_create() for every read we do. use async io
// because it should be much much faster
#ifdef ASYNCIO
struct aiocb m_aiocb[2];
#endif
};

@ -72,6 +72,10 @@ CollectionRec::CollectionRec() {
m_lastResetCount = 0;
// regex_t types
m_hasucr = false;
m_hasupr = false;
// for diffbot caching the global spider stats
reset();
@ -91,6 +95,11 @@ void CollectionRec::setToDefaults ( ) {
}
void CollectionRec::reset() {
// regex_t types
if ( m_hasucr ) regfree ( &m_ucr );
if ( m_hasupr ) regfree ( &m_upr );
// make sure we do not leave spiders "hanging" waiting for their
// callback to be called... and it never gets called
//if ( m_callbackQueue.length() > 0 ) { char *xx=NULL;*xx=0; }
@ -140,8 +149,34 @@ bool CollectionRec::load ( char *coll , long i ) {
// . accepts OBJ_COLLECTIONREC or OBJ_CONF
g_parms.setFromFile ( this , tmp2 , tmp1 );
// add default reg ex
setUrlFiltersToDefaults();
// add default reg ex IFF there are no url filters there now
if ( m_numRegExs == 0 ) setUrlFiltersToDefaults();
// compile regexs here
char *rx = m_diffbotUrlCrawlRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx && regcomp ( &m_ucr , rx ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
// error!
return log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
}
// flag as compiled only on success so reset() never calls
// regfree() on an uncompiled regex
if ( rx ) m_hasucr = true;
rx = m_diffbotUrlProcessRegEx.getBufStart();
if ( rx && ! rx[0] ) rx = NULL;
if ( rx && regcomp ( &m_upr , rx ,
REG_EXTENDED|REG_ICASE|
REG_NEWLINE|REG_NOSUB) ) {
// error!
return log("xmldoc: regcomp %s failed: %s. "
"Ignoring.",
rx,mstrerror(errno));
}
if ( rx ) m_hasupr = true;
//
// LOAD the crawlinfo class in the collectionrec for diffbot
@ -392,7 +427,7 @@ bool CollectionRec::save ( ) {
g_hostdb.m_dir , m_coll , (long)m_collnum );
if ( ! g_parms.saveToXml ( (char *)this , tmp ) ) return false;
// log msg
log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
//log (LOG_INFO,"db: Saved %s.",tmp);//f.getFilename());
//
// save the crawlinfo class in the collectionrec for diffbot
@ -400,7 +435,7 @@ bool CollectionRec::save ( ) {
// SAVE LOCAL
sprintf ( tmp , "%scoll.%s.%li/localcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: saving %s",tmp);
//log("coll: saving %s",tmp);
SafeBuf sb;
//m_localCrawlInfo.print ( &sb );
// binary now
@ -413,7 +448,7 @@ bool CollectionRec::save ( ) {
// SAVE GLOBAL
sprintf ( tmp , "%scoll.%s.%li/globalcrawlinfo.dat",
g_hostdb.m_dir , m_coll , (long)m_collnum );
log("coll: saving %s",tmp);
//log("coll: saving %s",tmp);
sb.reset();
//m_globalCrawlInfo.print ( &sb );
// binary now
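
As a reference for the m_hasucr/m_hasupr bookkeeping added to load() and reset() above, here is the standard POSIX regex lifecycle in one standalone sketch; the pattern and test url are made up for illustration:

#include <regex.h>
#include <stdio.h>

int main ( ) {
	regex_t ucr;
	bool hasucr = false;
	// hypothetical pattern; flags match the ones used above
	const char *rx = "^https?://[^/]*\\.example\\.com/";
	if ( regcomp ( &ucr , rx ,
		       REG_EXTENDED|REG_ICASE|REG_NEWLINE|REG_NOSUB ) == 0 )
		hasucr = true;
	// compiled once, regexec() may run any number of times
	if ( hasucr &&
	     regexec ( &ucr , "http://www.example.com/page" ,
		       0 , NULL , 0 ) == 0 )
		printf ( "url matches crawl regex\n" );
	// free exactly once, and only if regcomp() succeeded; this is
	// what the m_hasucr/m_hasupr guards in reset() ensure
	if ( hasucr ) regfree ( &ucr );
	return 0;
}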

@ -56,7 +56,7 @@
//#define MAX_SITE_EXPRESSION_LEN 128
//#define MAX_SITE_EXPRESSIONS 256
//#include "regex.h"
#include "regex.h"
#include "Url.h" // MAX_COLL_LEN
//#include "Sync.h"
@ -108,6 +108,9 @@ class CrawlInfo {
// currently in the ready queue (doledb) to spider?
char m_sentCrawlDoneAlert;
//long m_numUrlsLaunched;
long m_dummy1;
void reset() { memset ( this , 0 , sizeof(CrawlInfo) ); };
//bool print (class SafeBuf *sb ) ;
//bool setFromSafeBuf (class SafeBuf *sb ) ;
@ -432,8 +435,27 @@ class CollectionRec {
//SafeBuf m_diffbotApiList;//QueryString;
//SafeBuf m_diffbotUrlCrawlPattern;
//SafeBuf m_diffbotUrlProcessPattern;
// use for all now...
SafeBuf m_diffbotApiUrl;
// only process pages whose content matches this pattern
SafeBuf m_diffbotPageProcessPattern;
// only process urls that match this pattern
SafeBuf m_diffbotUrlProcessPattern;
// only CRAWL urls that match this pattern
SafeBuf m_diffbotUrlCrawlPattern;
// regex support
SafeBuf m_diffbotUrlCrawlRegEx;
SafeBuf m_diffbotUrlProcessRegEx;
regex_t m_ucr;
regex_t m_upr;
long m_hasucr:1;
long m_hasupr:1;
char m_diffbotOnlyProcessIfNew;
//SafeBuf m_diffbotClassify;
//char m_diffbotClassify;
//char m_useDiffbot;
@ -515,6 +537,9 @@ class CollectionRec {
long m_numRegExs11;
SafeBuf m_spiderDiffbotApiUrl [ MAX_FILTERS ];
long m_numRegExs8;
char m_harvestLinks [ MAX_FILTERS ];
// dummy?
long m_numRegExs9;
@ -671,12 +696,6 @@ class CollectionRec {
class SpiderColl *m_spiderColl;
// each Rdb has a tree, so keep the pos/neg key count here so
// that RdbTree does not have to have its own array limited by
// MAX_COLLS which we did away with because we made this dynamic.
long m_numPosKeysInTree[RDB_END];
long m_numNegKeysInTree[RDB_END];
long m_overflow;
long m_overflow2;
@ -1018,6 +1037,12 @@ class CollectionRec {
// used by Parms.cpp
char m_hackFlag;
// each Rdb has a tree, so keep the pos/neg key count here so
// that RdbTree does not have to have its own array limited by
// MAX_COLLS which we did away with because we made this dynamic.
long m_numPosKeysInTree[RDB_END];
long m_numNegKeysInTree[RDB_END];
//long m_numEventsOnHost;
// do we have the doc:quality var in any url filter?

@ -247,7 +247,12 @@ bool Collectiondb::addRec ( char *coll , char *cpc , long cpclen , bool isNew ,
// MDW: ensure not created on disk since time of last load
char dname[512];
sprintf(dname, "%scoll.%s.%li/",g_hostdb.m_dir,coll,i);
if ( isNew && opendir ( dname ) ) {
DIR *dir = NULL;
if ( isNew )
dir = opendir ( dname );
if ( dir )
closedir ( dir );
if ( isNew && dir ) {
g_errno = EEXIST;
return log("admin: Trying to create collection %s but "
"directory %s already exists on disk.",coll,dname);
@ -524,11 +529,12 @@ bool Collectiondb::isAdmin ( HttpRequest *r , TcpSocket *s ) {
void savingCheckWrapper1 ( int fd , void *state ) {
WaitEntry *we = (WaitEntry *)state;
// no state?
if ( ! we ) return;
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.resetColl ( we->m_coll , we ) ) return;
if ( ! we ) { log("colldb: we1 is null"); return; }
// unregister too
g_loop.unregisterSleepCallback ( state,savingCheckWrapper1 );
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.resetColl ( we->m_coll , we , we->m_purgeSeeds))
return;
// all done
we->m_callback ( we->m_state );
}
@ -536,11 +542,11 @@ void savingCheckWrapper1 ( int fd , void *state ) {
void savingCheckWrapper2 ( int fd , void *state ) {
WaitEntry *we = (WaitEntry *)state;
// no state?
if ( ! we ) return;
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
if ( ! we ) { log("colldb: we2 is null"); return; }
// unregister too
g_loop.unregisterSleepCallback ( state,savingCheckWrapper2 );
// if it blocked again i guess tree is still saving
if ( ! g_collectiondb.deleteRec ( we->m_coll , we ) ) return;
// all done
we->m_callback ( we->m_state );
}
@ -599,7 +605,7 @@ bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
g_errno = ENOTFOUND;
return true;
}
if ( g_process.isAnyTreeSaving() ) {
// note it
log("admin: tree is saving. waiting2.");
@ -700,7 +706,11 @@ bool Collectiondb::deleteRec ( char *coll , WaitEntry *we ) {
// . reset a collection
// . returns false if blocked and will call callback
bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
bool Collectiondb::resetColl ( char *coll , WaitEntry *we , bool purgeSeeds) {
// save parms in case we block
we->m_purgeSeeds = purgeSeeds;
// ensure it's not NULL
if ( ! coll ) {
log(LOG_LOGIC,"admin: Collection name to delete is NULL.");
@ -849,11 +859,13 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
//cr->m_spiderStatusMsg = NULL;
// reset seed buf
cr->m_diffbotSeeds.purge();
// reset seed dedup table
HashTableX *ht = &cr->m_seedHashTable;
ht->reset();
if ( purgeSeeds ) {
// free the buffer of seed urls
cr->m_diffbotSeeds.purge();
// reset seed dedup table
HashTableX *ht = &cr->m_seedHashTable;
ht->reset();
}
// so XmlDoc.cpp can detect if the collection was reset since it
// launched its spider:
@ -866,6 +878,14 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
// right now we #define collnum_t short
if ( m_numRecs > 0x7fff ) { char *xx=NULL;*xx=0; }
// make a new collnum so records in transit will not be added
// to any rdb...
cr->m_collnum = newCollnum;
// Rdb::resetColl() needs to know the new cr so it can move
// the RdbBase into cr->m_bases[rdbId] array. recycling.
m_recs[newCollnum] = cr;
// . unlink all the *.dat and *.map files for this coll in its subdir
// . remove all recs from this collnum from m_tree/m_buckets
// . updates RdbBase::m_collnum
@ -879,16 +899,10 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
g_clusterdb.getRdb()->resetColl ( oldCollnum , newCollnum );
g_linkdb.getRdb()->resetColl ( oldCollnum , newCollnum );
// make a new collnum so records in transit will not be added
// to any rdb...
cr->m_collnum = newCollnum;
// reset crawl status too!
cr->m_spiderStatus = SP_INITIALIZING;
m_recs[oldCollnum] = NULL;
m_recs[newCollnum] = cr;
// readd it to the hashtable that maps name to collnum too
long long h64 = hash64n(cr->m_coll);
@ -902,7 +916,10 @@ bool Collectiondb::resetColl ( char *coll , WaitEntry *we ) {
g_hostdb.m_dir,
cr->m_coll,
(long)newCollnum);
if ( opendir ( dname ) ) {
DIR *dir = opendir ( dname );
if ( dir )
closedir ( dir );
if ( dir ) {
//g_errno = EEXIST;
log("admin: Trying to create collection %s but "
"directory %s already exists on disk.",coll,dname);

@ -18,6 +18,7 @@ public:
void (* m_callback) (void *state);
void *m_state;
char *m_coll;
bool m_purgeSeeds;
};
class Collectiondb {
@ -94,7 +95,7 @@ class Collectiondb {
bool deleteRecs ( class HttpRequest *r ) ;
// returns false if blocked, true otherwise.
bool resetColl ( char *coll , WaitEntry *we );
bool resetColl ( char *coll , WaitEntry *we , bool purgeSeeds );
// . keep up to 128 of them, these reference into m_list
// . COllectionRec now includes m_needsSave and m_lastUpdateTime

@ -189,6 +189,7 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
//g_conf.m_testSearchEnabled = false;
/*
//
// are we running in Matt Wells's data center?
// if so, we want to be able to use the seo tools that are not part
@ -207,11 +208,16 @@ bool Conf::init ( char *dir ) { // , long hostId ) {
if ( hh && strcmp(hh,"galileo") == 0) priv = true;
if ( hh && strcmp(hh,"sputnik") == 0) priv = true;
if ( hh && strcmp(hh,"titan") == 0) priv = true;
if ( hh[0]=='g' && hh[1]=='k' && is_digit(hh[2]) ) priv = true;
if ( hh && hh[0]=='g' && hh[1]=='k' && is_digit(hh[2]) ) priv = true;
//if(hh[0]=='s' && hh[1]=='p' && is_digit(hh[2])) ) priv = true;
if ( priv ) g_conf.m_isMattWells = true;
else g_conf.m_isMattWells = false;
*/
g_conf.m_isMattWells = false;
#ifdef MATTWELLS
g_conf.m_isMattWells = true;
#endif
// this is not possible
/*

@ -5,6 +5,7 @@
Dir::Dir ( ) {
m_dirname = NULL;
m_dir = NULL;
m_needsClose = false;
}
@ -40,7 +41,8 @@ bool Dir::set ( char *dirname ) {
}
bool Dir::close ( ) {
if ( m_dir ) closedir ( m_dir );
if ( m_dir && m_needsClose ) closedir ( m_dir );
m_needsClose = false;
return true;
}
@ -56,6 +58,7 @@ bool Dir::open ( ) {
if ( ! m_dir )
return log("disk: opendir(%s) : %s",
m_dirname,strerror( g_errno ) );
m_needsClose = true;
return true;
}
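
The m_needsClose flag ensures close() only calls closedir() on a handle this Dir actually opened, and only once. In isolation the same guarantee can be expressed as RAII; a hedged sketch (the wrapper class is ours, not part of the codebase):

#include <dirent.h>
#include <stdio.h>

class DirHandle {
public:
	DirHandle ( const char *name ) { m_dir = opendir ( name ); }
	~DirHandle ( ) { close(); }
	void close ( ) {
		if ( m_dir ) closedir ( m_dir );
		m_dir = NULL; // so a second close() is a no-op
	}
	bool ok ( ) { return m_dir != NULL; }
private:
	DIR *m_dir;
};

int main ( ) {
	DirHandle d ( "." );
	if ( ! d.ok() ) return 1;
	d.close(); // explicit close...
	d.close(); // ...and a second close is harmless
	return 0;
}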

Dir.h

@ -49,6 +49,7 @@ class Dir {
char *m_dirname;
DIR *m_dir;
bool m_needsClose;
};
#endif

@ -161,7 +161,8 @@ case EDIFFBOTMIMEERROR: return "Diffbot mime error";
case EDIFFBOTBADHTTPSTATUS: return "Diffbot reply bad http status";
case EHITCRAWLLIMIT: return "Hit the page download limit";
case EHITPROCESSLIMIT: return "Hit the page process limit";
case EINTERNALERROR: return "Internal error";
case EINTERNALERROR: return "Internal error";
case EBADJSONPARSER: return "Bad JSON parser";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

@ -165,6 +165,7 @@ enum {
EDIFFBOTBADHTTPSTATUS,
EHITCRAWLLIMIT,
EHITPROCESSLIMIT,
EINTERNALERROR
EINTERNALERROR,
EBADJSONPARSER
};
#endif

@ -238,7 +238,7 @@ class FBRec {
#endif
// facebook id for matt wells
#define MATTWELLS 100003532411011LL
#define FB_MATTWELLS 100003532411011LL
//#define APPNAME "Event Widget"

@ -341,6 +341,7 @@ bool HashTableX::setTableSize ( long oldn , char *buf , long bufSize ) {
m_bufSize = need;
m_doFree = true;
if ( ! m_buf ) return false;
QUICKPOLL(m_niceness);
}
// save the old junk

@ -99,10 +99,10 @@ long Highlight::set ( SafeBuf *sb,
long version = TITLEREC_CURRENT_VERSION;
Bits bits;
if ( ! bits.set (&words,version,niceness) ) return 0;
if ( ! bits.set (&words,version,niceness) ) return -1;
Phrases phrases;
if ( !phrases.set(&words,&bits,true,false,version,niceness))return 0;
if ( !phrases.set(&words,&bits,true,false,version,niceness))return -1;
//SafeBuf langBuf;
//if ( !setLangVec ( &words , &langBuf , niceness )) return 0;
@ -115,7 +115,7 @@ long Highlight::set ( SafeBuf *sb,
Matches matches;
matches.setQuery ( q );
if ( ! matches.addMatches ( &words , &phrases ) ) return 0;
if ( ! matches.addMatches ( &words , &phrases ) ) return -1;
// store
m_numMatches = matches.getNumMatches();
@ -172,7 +172,7 @@ long Highlight::set ( SafeBuf *sb ,
// save room for terminating \0
//m_bufEnd = m_buf + m_bufLen - 1;
if ( ! highlightWords ( words, matches, q ) ) return 0;
if ( ! highlightWords ( words, matches, q ) ) return -1;
// null terminate
//*m_bufPtr = '\0';

@ -72,7 +72,9 @@ bool HttpRequest::copy ( class HttpRequest *r ) {
// . NOTE: http 1.1 uses Keep-Alive by default (use Connection: close to not)
bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
char *userAgent , char *proto , bool doPost ,
char *cookie , char *additionalHeader ) {
char *cookie , char *additionalHeader ,
// if posting something, how many bytes is it?
long postContentLen ) {
m_reqBufValid = false;
@ -279,6 +281,8 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
if ( doPost ) {
long contentLen = 0;
if ( postData ) contentLen = strlen(postData);
// this overrides if provided. -1 is default
if ( postContentLen >= 0 ) contentLen = postContentLen;
m_reqBuf.safePrintf ("Content-Length: %li\r\n", contentLen );
m_reqBuf.safePrintf("\r\n");
if ( postData ) m_reqBuf.safePrintf("%s",postData);
@ -633,6 +637,13 @@ bool HttpRequest::set (char *url,long offset,long size,time_t ifModifiedSince,
// matt comcast
if ( sock && strncmp(iptoa(sock->m_ip),"75.160.49.8",11) == 0)
m_isLocal = true;
// matt comcast #2
if ( sock && strncmp(iptoa(sock->m_ip),"69.181.136.143",14) == 0)
m_isLocal = true;
// titan
if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0)
m_isLocal = true;
// roadrunner ip
// if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0)

@ -41,7 +41,8 @@ class HttpRequest {
char *proto = "HTTP/1.0" ,
bool doPost = false ,
char *cookie = NULL ,
char *additionalHeader = NULL ); // does not incl \r\n
char *additionalHeader = NULL , // does not incl \r\n
long postContentLen = -1 ); // for content-length of POST
// use this
SafeBuf m_reqBuf;

@ -130,7 +130,8 @@ bool HttpServer::getDoc ( char *url ,
bool doPost ,
char *cookie ,
char *additionalHeader ,
char *fullRequest ) {
char *fullRequest ,
char *postContent ) {
// sanity
if ( ip == -1 )
log("http: you probably didn't mean to set ip=-1 did you? "
@ -154,6 +155,9 @@ bool HttpServer::getDoc ( char *url ,
defPort = 443;
}
long pcLen = 0;
if ( postContent ) pcLen = gbstrlen(postContent);
char *req = NULL;
long reqSize;
@ -161,9 +165,15 @@ bool HttpServer::getDoc ( char *url ,
if ( ! fullRequest ) {
if ( ! r.set ( url , offset , size , ifModifiedSince ,
userAgent , proto , doPost , cookie ,
additionalHeader ) ) return true;
additionalHeader , pcLen ) ) return true;
reqSize = r.getRequestLen();
req = (char *) mdup ( r.getRequest() , reqSize,"HttpServer");
req = (char *) mmalloc( reqSize + pcLen ,"HttpServer");
if ( req )
memcpy ( req , r.getRequest() , reqSize );
if ( req && pcLen ) {
memcpy ( req + reqSize, postContent , pcLen );
reqSize += pcLen;
}
}
else {
// does not contain \0 i guess
@ -911,7 +921,8 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
// "GET /crawlbot/downloadobjects"
// "GET /crawlbot/downloadpages"
if ( strncmp ( path , "/crawlbot/download/" ,19 ) == 0 ||
strncmp ( path , "/v2/crawl/download/" ,19 ) == 0 )
strncmp ( path , "/v2/crawl/download/" ,19 ) == 0 ||
strncmp ( path , "/v2/bulk/download/" ,18 ) == 0 )
return sendBackDump ( s , r );
// . is it a diffbot api request, like "GET /api/*"
@ -1542,7 +1553,9 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
*/
}
bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
char *errmsg, long rawFormat,
char *errmsg,
//long rawFormat,
char format ,
int errnum, char *content) {
// clear g_errno so the send goes through
g_errno = 0;
@ -1559,7 +1572,7 @@ bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
// sanity check
if ( strncasecmp(errmsg,"Success",7)==0 ) {char*xx=NULL;*xx=0;}
if (!rawFormat){
if ( format == FORMAT_HTML ) {
// Page content
char cbuf[1024];
sprintf (cbuf,
@ -1946,7 +1959,11 @@ long getMsgSize ( char *buf, long bufSize, TcpSocket *s ) {
totalReplySize,max);
}
// truncate the reply if we have to
if ( totalReplySize > max ) totalReplySize = max;
if ( totalReplySize > max ) {
log("http: truncating reply of %li to %li bytes",
totalReplySize,max);
totalReplySize = max;
}
// truncate if we need to
return totalReplySize;
}

@ -98,7 +98,8 @@ class HttpServer {
char *cookie = NULL ,
char *additionalHeader = NULL , // does not include \r\n
// specify your own mime and post data here...
char *fullRequest = NULL );
char *fullRequest = NULL ,
char *postContent = NULL );
bool getDoc ( long ip,
long port,
@ -134,7 +135,8 @@ class HttpServer {
long *bytesSent = NULL );
// send a "prettier" error reply, formatted in XML if necessary
bool sendQueryErrorReply ( TcpSocket *s , long error , char *errmsg,
long rawFormat, int errnum,
// FORMAT_HTML=0,FORMAT_XML,FORMAT_JSON
char format, int errnum,
char *content=NULL);

@ -4,8 +4,16 @@
class JsonItem *Json::addNewItem () {
JsonItem *ji = (JsonItem *)m_sb.getBuf();
if ( m_sb.m_length + (long)sizeof(JsonItem) > m_sb.m_capacity ) {
log("json: preventing buffer breach");
return NULL;
}
// otherwise we got room
m_sb.incrementLength(sizeof(JsonItem));
if ( m_prev ) m_prev->m_next = ji;
ji->m_prev = m_prev;
ji->m_next = NULL;
@ -53,7 +61,7 @@ JsonItem *Json::getItem ( char *name ) {
#include "Mem.h" // gbstrlen()
JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
JsonItem *Json::parseJsonStringIntoJsonItems ( char *json , long niceness ) {
m_prev = NULL;
@ -67,9 +75,15 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
bool inQuote = false;
long need = 0;
for ( ; *p ; p++ ) {
if ( *p == '\"' && (p==json || p[-1]!='\\') )
// ignore any escaped char. also \x1234
if ( *p == '\\' ) {
if ( p[1] ) p++;
continue;
}
if ( *p == '\"' )
inQuote = ! inQuote;
if ( inQuote ) continue;
if ( inQuote )
continue;
if ( *p == '{' ||
*p == ',' ||
*p == '[' ||
@ -172,8 +186,15 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
if ( *p == '\"' ) {
// find end of quote
char *end = p + 1;
for ( ; *end ; end++ )
if ( *end == '\"' && end[-1] != '\\' ) break;
for ( ; *end ; end++ ) {
// skip two chars if escaped
if ( *end == '\\' && end[1] ) {
end++;
continue;
}
// this quote is unescaped then
if ( *end == '\"' ) break;
}
// field?
char *x = end + 1;
// skip spaces
@ -207,7 +228,8 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
// get length decoded
long curr = m_sb.length();
// store decoded string right after jsonitem
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,0))
if ( !m_sb.safeDecodeJSONToUtf8 (str,slen,
niceness ))
return NULL;
// store length decoded json
ji->m_valueLen = m_sb.length() - curr;
@ -240,7 +262,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
ji->m_valueDouble = 0;
}
// store decoded string right after jsonitem
if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,0))
if ( !m_sb.safeDecodeJSONToUtf8 (p,slen,niceness))
return NULL;
// store length decoded json
ji->m_valueLen = m_sb.length() - curr;
@ -283,7 +305,7 @@ JsonItem *Json::parseJsonStringIntoJsonItems ( char *json ) {
// copy the number as a string as well
long curr = m_sb.length();
// store decoded string right after jsonitem
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,0))
if ( !m_sb.safeDecodeJSONToUtf8 ( str, slen,niceness))
return NULL;
// store length decoded json
ji->m_valueLen = m_sb.length() - curr;
@ -323,11 +345,68 @@ void Json::test ( ) {
"in 2010\",\"18083009\":\"Apple personal digital assistants\",\"23475157\":\"Touchscreen portable media players\",\"30107877\":\"IPad\",\"9301031\":\"Apple Inc. hardware\",\"27765345\":\"IOS (Apple)\",\"26588084\":\"Tablet computers\"},\"type\":1,\"senseRank\":1,\"variety\":0.49056603773584906,\"depth\":0.5882352941176471},{\"id\":18839,\"positions\":[[1945,1950],[2204,2209]],\"name\":\"Music\",\"score\":0.7,\"contentMatch\":1,\"categories\":{\"991222\":\"Performing arts\",\"693016\":\"Entertainment\",\"691484\":\"Music\"},\"type\":1,\"senseRank\":1,\"variety\":0.22264150943396221,\"depth\":0.7058823529411764}],\"media\":[{\"pixelHeight\":350,\"link\":\"http://www.onlinemba.com/wp-content/uploads/2013/02/apple-innovates-invert-350x350.png\",\"primary\":\"true\",\"pixelWidth\":350,\"type\":\"image\"}]}";
JsonItem *ji = parseJsonStringIntoJsonItems ( json );
long niceness = 0;
JsonItem *ji = parseJsonStringIntoJsonItems ( json , niceness );
// print them out?
log("json: type0=%li",(long)ji->m_type);
return;
}
bool JsonItem::getCompoundName ( SafeBuf &nameBuf ) {
// reset, but don't free mem etc. just set m_length to 0
nameBuf.reset();
// get its full compound name like "meta.twitter.title"
JsonItem *p = this;//ji;
char *lastName = NULL;
char *nameArray[20];
long numNames = 0;
for ( ; p ; p = p->m_parent ) {
// empty name?
if ( ! p->m_name ) continue;
if ( ! p->m_name[0] ) continue;
// dup? can happen with arrays. parent of string
// in object, has same name as his parent, the
// name of the array. "dupname":[{"a":"b"},{"c":"d"}]
if ( p->m_name == lastName ) continue;
// update
lastName = p->m_name;
// add it up
nameArray[numNames++] = p->m_name;
// breach?
if ( numNames < 15 ) continue;
log("build: too many names in json tag");
break;
}
// assemble the names in reverse order which is correct order
for ( long i = 1 ; i <= numNames ; i++ ) {
// copy into our safebuf
if ( ! nameBuf.safeStrcpy ( nameArray[numNames-i]) )
return false;
// separate names with periods
if ( ! nameBuf.pushChar('.') ) return false;
}
// remove last period
nameBuf.removeLastChar('.');
// and null terminate
if ( ! nameBuf.nullTerm() ) return false;
// change all :'s in names to .'s since : is reserved!
char *px = nameBuf.getBufStart();
for ( ; *px ; px++ ) if ( *px == ':' ) *px = '.';
return true;
}
// is this json item in an array of json items?
bool JsonItem::isInArray ( ) {
JsonItem *p = this;//ji;
for ( ; p ; p = p->m_parent ) {
// empty name? it's just a "value item" then, i guess.
//if ( ! p->m_name ) continue;
//if ( ! p->m_name[0] ) continue;
if ( p->m_type == JT_ARRAY ) return true;
}
return false;
}
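
Both scanning loops above apply the same rule: a backslash consumes the character after it, and only a quote that survives that filter counts. The old end[-1] != '\\' test got this wrong when a string ends in an escaped backslash, since the closing quote then follows a backslash. Factored out as a standalone sketch (the helper name is ours):

#include <stdio.h>

// find the closing unescaped quote of a JSON string whose opening
// quote is at p[0]; returns NULL if the string never terminates
static char *findStringEnd ( char *p ) {
	for ( p++ ; *p ; p++ ) {
		// a backslash eats the next char ("\\\"", "\\\\", \x12..)
		if ( *p == '\\' && p[1] ) { p++; continue; }
		// any quote that gets here is unescaped
		if ( *p == '\"' ) return p;
	}
	return NULL;
}

int main ( ) {
	char json[] = "\"a \\\"quoted\\\" word\" , ...";
	char *end = findStringEnd ( json );
	if ( end ) printf ( "string is %ld bytes\n",
			    (long)(end - json + 1) );
	return 0;
}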

Json.h

@ -51,6 +51,10 @@ class JsonItem {
return (char *)this + sizeof(JsonItem);
};
// like acme.product.offerPrice if "acme:{product:{offerprice:1.23}}"
bool getCompoundName ( SafeBuf &nameBuf ) ;
bool isInArray ( );
};
@ -59,7 +63,7 @@ class Json {
void test();
JsonItem *parseJsonStringIntoJsonItems ( char *json );
JsonItem *parseJsonStringIntoJsonItems ( char *json , long niceness );
JsonItem *getFirstItem ( ) ;

@ -354,7 +354,7 @@ Collectiondb.o: Collectiondb.cpp gb-include.h types.h fctypes.h Unicode.h \
IndexTable2.h Msg51.h Msg17.h IndexReadInfo.h Msg3a.h Stats.h \
PostQueryRerank.h Sanity.h SiteGetter.h Title.h Address.h zlib.h zconf.h \
HttpMime.h Users.h Pages.h HttpServer.h TcpServer.h openssl/err.h \
PageCrawlBot.h Statsdb.h Process.h Msg28.h Cachedb.h Syncdb.h PageTurk.h
PageCrawlBot.h Statsdb.h Process.h Msg28.h Cachedb.h Syncdb.h
CollectionRec.o: CollectionRec.cpp gb-include.h types.h fctypes.h \
Unicode.h UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h \
hash.h Errno.h Log.h CollectionRec.h Url.h ip.h Parms.h Xml.h XmlNode.h \
@ -374,7 +374,11 @@ CollectionRec.o: CollectionRec.cpp gb-include.h types.h fctypes.h \
RdbBuckets.h RdbCache.h Msg5.h Msg3.h RdbMerge.h Dir.h PingServer.h \
HttpServer.h TcpServer.h openssl/err.h MsgC.h UdpServer.h UdpSlot.h \
UdpProtocol.h Dns.h DnsProtocol.h Multicast.h Threads.h HttpMime.h \
Datedb.h Indexdb.h DiskPageCache.h Titledb.h Timedb.h
Datedb.h Indexdb.h DiskPageCache.h Titledb.h Timedb.h Spider.h Msg4.h \
Msg1.h Msg0.h Clusterdb.h Linkdb.h Msg2.h Query.h Msg20.h Summary.h \
matches2.h Words.h StopWords.h Bits.h Pos.h Matches.h HashTableT.h \
Domains.h CountryCode.h Tagdb.h Events.h Sections.h IndexList.h Dates.h \
Msg22.h CatRec.h Categories.h Catdb.h
Conf.o: Conf.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
Log.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h iana_charset.h File.h \
@ -668,6 +672,7 @@ Entities.o: Entities.cpp gb-include.h types.h fctypes.h Unicode.h \
Errno.o: Errno.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
Log.h
errnotest.o: errnotest.cpp
Facebook.o: Facebook.cpp Facebook.h Conf.h Xml.h XmlNode.h gb-include.h \
types.h fctypes.h Unicode.h UnicodeProperties.h UCPropTable.h iconv.h \
UCNormalizer.h hash.h Errno.h Log.h Lang.h Iso8859.h iana_charset.h \
@ -1349,10 +1354,10 @@ main.o: main.cpp gb-include.h types.h fctypes.h Unicode.h \
Msge0.h Msge1.h Msg8b.h SearchInput.h Msg40.h Msg39.h Msg37.h TopTree.h \
IndexTable2.h Msg51.h Msg17.h Msg3a.h PostQueryRerank.h Sanity.h \
SiteGetter.h Title.h Address.h DailyMerge.h Speller.h Language.h Wiki.h \
Wiktionary.h Scraper.h Msg2a.h Msg9b.h Msg35.h Msg30.h Msg3e.h \
PageNetTest.h AutoBan.h TuringTest.h Msg1f.h Profiler.h Blaster.h \
Proxy.h linkspam.h sort.h Ads.h LanguagePages.h ValidPointer.h Placedb.h \
Test.h seo.h Json.h
Wiktionary.h Scraper.h Msg2a.h Msg9b.h Msg35.h Msg3e.h PageNetTest.h \
AutoBan.h TuringTest.h Msg1f.h Profiler.h Blaster.h Proxy.h linkspam.h \
sort.h Ads.h LanguagePages.h ValidPointer.h Placedb.h Test.h seo.h \
Json.h
matches2.o: matches2.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
Log.h matches2.h Titledb.h Rdb.h RdbBase.h Conf.h Xml.h XmlNode.h Lang.h \
@ -2694,7 +2699,7 @@ PageResults.o: PageResults.cpp gb-include.h types.h fctypes.h Unicode.h \
Highlight.h AutoBan.h TuringTest.h sort.h LanguageIdentifier.h \
LanguagePages.h LangList.h XmlDoc.h Phrases.h Images.h Msg13.h Msge0.h \
Msge1.h Msg8b.h SiteGetter.h Title.h Address.h Spider.h PageResults.h \
Proxy.h
Proxy.h Json.h
PageRoot.o: PageRoot.cpp gb-include.h types.h fctypes.h Unicode.h \
UnicodeProperties.h UCPropTable.h iconv.h UCNormalizer.h hash.h Errno.h \
Log.h Indexdb.h Rdb.h RdbBase.h Conf.h Xml.h XmlNode.h Lang.h Iso8859.h \

@ -77,15 +77,20 @@ ifeq ("titan","$(HOST)")
# in 2013. So it just uses clone() and does its own "threading". Unfortunately,
# the way it works is not even possible on newer kernels because they no longer
# allow you to override the _errno_location() function. -- matt
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DMATTWELLS
LIBS = ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a
else
# use -m32 to force 32-bit mode compilation.
# you might have to do apt-get install gcc-multilib to ensure that -m32 works.
# -m32 should use /usr/lib32/ as the library path.
# i also provide 32-bit libraries for linking that are not so easy to get.
#
# mdw. 11/17/2013. i took out the -D_PTHREADS_ flag (and -lpthread).
# trying to use good ole' clone() again because it seems the errno location
# thing is fixed by just ignoring it.
#
CPPFLAGS = -m32 -g -Wall -pipe -Wno-write-strings -Wstrict-aliasing=0 -Wno-uninitialized -static -DPTHREADS -Wno-unused-but-set-variable
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
endif
# if you have seo.cpp link that in. This is not part of the open source

@ -462,6 +462,10 @@ bool Mem::init ( long long maxMem ) {
// this is called by C++ classes' constructors to register mem
void Mem::addMem ( void *mem , long size , const char *note , char isnew ) {
// enforce safebuf::setLabel being called
//if ( size>=100000 && note && strcmp(note,"SafeBuf")==0 ) {
// char *xx=NULL;*xx=0; }
//validate();
// sanity check

Msg13.cpp

@ -15,6 +15,9 @@ long filterRobotsTxt ( char *reply , long replySize , HttpMime *mime ,
bool getIframeExpandedContent ( Msg13Request *r , TcpSocket *ts );
void gotIframeExpandedContent ( void *state ) ;
void scanHammerQueue ( int fd , void *state );
void downloadTheDocForReals ( Msg13Request *r ) ;
// utility functions
bool getTestSpideredDate ( Url *u , long *origSpiderDate , char *testDir ) ;
bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) ;
@ -111,6 +114,11 @@ bool Msg13::registerHandler ( ) {
if ( ! s_rt.set ( 8 , 4 , 0 , NULL , 0 , true,0,"wait13tbl") )
return false;
if ( ! g_loop.registerSleepCallback(10,NULL,scanHammerQueue) )
return log("build: Failed to register timer callback for "
"hammer queue.");
// success
return true;
}
@ -419,6 +427,8 @@ bool Msg13::gotFinalReply ( char *reply, long replySize, long replyAllocSize ){
RdbCache s_hammerCache;
static bool s_flag = false;
Msg13Request *s_hammerQueueHead = NULL;
Msg13Request *s_hammerQueueTail = NULL;
// . only return false if you want slot to be nuked w/o replying
// . MUST always call g_udpServer::sendReply() or sendErrorReply()
@ -486,15 +496,6 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// temporary hack
if ( r->m_parent ) { char *xx=NULL;*xx=0; }
// use the default agent unless scraping
// force to event guru bot for now
//char *agent = "Mozilla/5.0 (compatible; ProCogSEOBot/1.0; +http://www.procog.com/ )";
//char *agent = "Mozilla/5.0 (compatible; GigaBot/1.0; +http://www.gigablast.com/ )";
char *agent = g_conf.m_spiderUserAgent;
if ( r->m_isScraping )
agent = "Mozilla/4.0 "
"(compatible; MSIE 6.0; Windows 98; "
"Win 9x 4.90)" ;
// assume we do not add it!
r->m_addToTestCache = false;
@ -515,18 +516,53 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// we skip it if its a frame page, robots.txt, root doc or some other
// page that is a "child" page of the main page we are spidering
if ( ! r->m_skipHammerCheck ) {
// make sure we are not hammering an ip
// . make sure we are not hammering an ip
// . returns 0 if currently downloading a url from that ip
// . returns -1 if not found
long long last=s_hammerCache.getLongLong(0,r->m_firstIp,
30,true);
// get time now
long long nowms = gettimeofdayInMilliseconds();
// how long has it been since last download START time?
long long waited = nowms - last;
bool queueIt = false;
if ( last > 0 && waited < r->m_crawlDelayMS ) queueIt = true;
// a "last" of 0 means currently downloading
if ( r->m_crawlDelayMS > 0 && last == 0LL ) queueIt = true;
// a last of -1 means not found. so first time i guess.
if ( last == -1 ) queueIt = false;
// . queue it up if we haven't waited long enough
// . then the function, scanHammerQueue(), will re-eval all
// the download requests in this hammer queue every 10ms.
// . it will just lookup the lastdownload time in the cache,
// which will store maybe a -1 if currently downloading...
if ( queueIt ) {
// debug
//log("spider: adding %s to crawldelayqueue",r->m_url);
// save this
r->m_udpSlot = slot;
r->m_nextLink = NULL;
// add it to queue
if ( ! s_hammerQueueHead ) {
s_hammerQueueHead = r;
s_hammerQueueTail = r;
}
else {
s_hammerQueueTail->m_nextLink = r;
s_hammerQueueTail = r;
}
return;
}
// if we had it in cache check the wait time
if ( last > 0 && waited < 400 ) {
if ( last > 0 && waited < r->m_crawlDelayMS ) {
log("spider: hammering firstIp=%s url=%s "
"only waited %lli ms",
iptoa(r->m_firstIp),r->m_url,waited);
"only waited %lli ms of %li ms",
iptoa(r->m_firstIp),r->m_url,waited,
r->m_crawlDelayMS);
// this guy has too many redirects and it fails us...
// BUT do not core if running live, only if for test
// collection
@ -536,14 +572,14 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
// char*xx = NULL; *xx = 0; }
}
// store time now
s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
//s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
// note it
if ( g_conf.m_logDebugSpider )
log("spider: adding download end time of %llu for "
"firstIp=%s "
"url=%s "
"to msg13::hammerCache",
nowms,iptoa(r->m_firstIp),r->m_url);
//if ( g_conf.m_logDebugSpider )
// log("spider: adding download end time of %llu for "
// "firstIp=%s "
// "url=%s "
// "to msg13::hammerCache",
// nowms,iptoa(r->m_firstIp),r->m_url);
// clear error from that if any, not important really
g_errno = 0;
}
@ -616,26 +652,71 @@ void handleRequest13 ( UdpSlot *slot , long niceness ) {
}
// do not get .google.com/ crap
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
downloadTheDocForReals ( r );
}
void downloadTheDocForReals ( Msg13Request *r ) {
// are we the first?
bool firstInLine = s_rt.isEmpty ( &r->m_cacheKey );
// wait in line cuz someone else downloading it now
if ( ! s_rt.addKey ( &r->m_cacheKey , &r ) ) {
g_udpServer.sendErrorReply(slot,g_errno);
g_udpServer.sendErrorReply(r->m_udpSlot,g_errno);
return;
}
// this means our callback will be called
if ( ! firstInLine ) return;
if ( ! firstInLine ) {
//log("spider: inlining %s",r->m_url);
return;
}
// . store time now
// . no, now we store 0 to indicate in progress, then we
// will overwrite it with a timestamp when the download completes
// . but if measuring crawldelay from beginning of the download then
// store the current time
// . do NOT do this when downloading robots.txt etc. type files
// which should have skipHammerCheck set to true
if ( r->m_crawlDelayFromEnd && ! r->m_skipHammerCheck ) {
s_hammerCache.addLongLong(0,r->m_firstIp, 0LL);//nowms);
}
else if ( ! r->m_skipHammerCheck ) {
// get time now
long long nowms = gettimeofdayInMilliseconds();
s_hammerCache.addLongLong(0,r->m_firstIp, nowms);
}
// note it
if ( g_conf.m_logDebugSpider )
log("spider: adding special \"in-progress\" time of %lli for "
"firstIp=%s "
"url=%s "
"to msg13::hammerCache",
-1LL,iptoa(r->m_firstIp),r->m_url);
// do not get .google.com/ crap
//if ( strstr(r->m_url,".google.com/") ) { char *xx=NULL;*xx=0; }
// flag this
r->m_addToTestCache = true;
// note it here
if ( g_conf.m_logDebugSpider )
log("spider: downloading %s (%s)",
r->m_url,iptoa(r->m_urlIp) );
log("spider: downloading %s (%s) (skiphammercheck=%li)",
r->m_url,iptoa(r->m_urlIp) ,
(long)r->m_skipHammerCheck);
// use the default agent unless scraping
// force to event guru bot for now
//char *agent = "Mozilla/5.0 (compatible; ProCogSEOBot/1.0; +http://www.procog.com/ )";
//char *agent = "Mozilla/5.0 (compatible; GigaBot/1.0; +http://www.gigablast.com/ )";
char *agent = g_conf.m_spiderUserAgent;
if ( r->m_isScraping )
agent = "Mozilla/4.0 "
"(compatible; MSIE 6.0; Windows 98; "
"Win 9x 4.90)" ;
// download it
if ( ! g_httpServer.getDoc ( r->m_url ,
r->m_urlIp ,
@ -702,6 +783,21 @@ void gotHttpReply2 ( void *state ,
"for %s at ip %s",
mstrerror(g_errno),r->m_url,iptoa(r->m_urlIp));
// get time now
long long nowms = gettimeofdayInMilliseconds();
// . now store the current time in the cache
// . do NOT do this for robots.txt etc. where we skip hammer check
if ( r->m_crawlDelayFromEnd && ! r->m_skipHammerCheck )
s_hammerCache.addLongLong(0,r->m_firstIp,nowms);
// note it
if ( g_conf.m_logDebugSpider )
log("spider: adding final download end time of %lli for "
"firstIp=%s "
"url=%s "
"to msg13::hammerCache",
nowms,iptoa(r->m_firstIp),r->m_url);
// sanity. this was happening from iframe download
//if ( g_errno == EDNSTIMEDOUT ) { char *xx=NULL;*xx=0; }
@ -2086,5 +2182,48 @@ void gotIframeExpandedContent ( void *state ) {
delete ( xd );
}
// call this once every 10ms to launch queued up download requests so that
// we respect crawl delay for sure
void scanHammerQueue ( int fd , void *state ) {
Msg13Request *r = s_hammerQueueHead;
if ( ! r ) return;
long long nowms = gettimeofdayInMilliseconds();
Msg13Request *prev = NULL;
long long waited = -1LL;
// scan down the linked list of queued of msg13 requests
for ( ; r ; prev = r , r = r->m_nextLink ) {
long long last;
last = s_hammerCache.getLongLong(0,r->m_firstIp,30,true);
// is one from this ip outstanding?
if ( last == 0LL && r->m_crawlDelayFromEnd ) continue;
// download finished?
if ( last > 0 ) {
waited = nowms - last;
// but skip if haven't waited long enough
if ( waited < r->m_crawlDelayMS ) continue;
}
// debug
//log("spider: downloading %s from crawldelay queue "
// "waited=%llims crawldelay=%lims",
// r->m_url,waited,r->m_crawlDelayMS);
// good to go
downloadTheDocForReals ( r );
//
// remove from future scans
//
if ( prev )
prev->m_nextLink = r->m_nextLink;
if ( s_hammerQueueHead == r )
s_hammerQueueHead = r->m_nextLink;
if ( s_hammerQueueTail == r )
s_hammerQueueTail = prev;
// try to download some more i guess...
}
}
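
The crawl-delay bookkeeping above packs three states into one cached value per IP: -1 means never seen, 0 means a download is in flight, and a positive value is the start time of the last download in milliseconds. The enqueue decision then reduces to a small predicate; a sketch of that logic in isolation:

#include <stdio.h>

// last: cached value for this firstIp (-1 none, 0 in flight,
// >0 last download start in ms); crawlDelayMS: -1 if none
static bool mustQueue ( long long last , long crawlDelayMS ,
			long long nowms ) {
	if ( last == -1 )        return false; // first contact: go now
	if ( crawlDelayMS <= 0 ) return false; // no delay requested
	if ( last == 0 )         return true;  // one already in flight
	// otherwise queue only if we have not waited long enough
	return ( nowms - last ) < crawlDelayMS;
}

int main ( ) {
	// waited 100ms of a 250ms crawl delay: must queue (prints 1)
	printf ( "%d\n" , (int)mustQueue ( 1000 , 250 , 1100 ) );
	return 0;
}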

@ -25,6 +25,10 @@ public:
long m_maxCacheAge;
long m_maxTextDocLen;
long m_maxOtherDocLen;
// in milliseconds. use -1 if none or unknown.
long m_crawlDelayMS;
// for linked list, this is the hammer queue
class Msg13Request *m_nextLink;
// if doing spider compression, compute contentHash32 of document
// downloaded, and if it matches this then send back EDOCUNCHANGED
long m_contentHash32;
@ -50,7 +54,8 @@ public:
long m_addToTestCache:1;
long m_skipHammerCheck:1;
long m_attemptedIframeExpansion:1;
long m_forEvents;
long m_crawlDelayFromEnd:1;
long m_forEvents:1;
//long m_testParserEnabled:1;
//long m_testSpiderEnabled:1;
//long m_isPageParser:1;
@ -83,6 +88,7 @@ public:
memset (this,0,(char *)m_url - (char *)this + 1);
m_maxTextDocLen = -1; // no limit
m_maxOtherDocLen = -1; // no limit
m_crawlDelayMS = -1; // unknown or none
};
};

@ -224,6 +224,7 @@ bool Msg20::getSummary ( Msg20Request *req ) {
hostdb )) {
// sendto() sometimes returns "Network is down" so i guess
// we just had an "error reply".
log("msg20: error sending mcast %s",mstrerror(g_errno));
m_gotReply = true;
return true;
}
@ -428,6 +429,12 @@ Msg20Reply::Msg20Reply ( ) {
// this is free in destructor, so clear it here
//ptr_eventSummaryLines = NULL;
m_tmp = 0;
// seems to be an issue... caused a core with bogus size_dbuf
long *sizePtr = &size_tbuf;
long *sizeEnd = &size_note;
for ( ; sizePtr <= sizeEnd ; sizePtr++ )
*sizePtr = 0;
}

@ -13,7 +13,8 @@ static void sendReply ( UdpSlot *slot ,
Msg39 *msg39 ,
char *reply ,
long replySize ,
long replyMaxSize );
long replyMaxSize ,
bool hadError );
// called when Msg2 has got all the termlists
static void gotListsWrapper ( void *state ) ;
// thread wrappers
@ -66,7 +67,7 @@ void handleRequest39 ( UdpSlot *slot , long netnice ) {
catch ( ... ) {
g_errno = ENOMEM;
log("msg39: new(%i): %s", sizeof(Msg39),mstrerror(g_errno));
sendReply ( slot , NULL , NULL , 0 , 0 );
sendReply ( slot , NULL , NULL , 0 , 0 ,true);
return;
}
mnew ( THIS , sizeof(Msg39) , "Msg39" );
@ -79,12 +80,15 @@ void handleRequest39 ( UdpSlot *slot , long netnice ) {
// this must always be called sometime AFTER handleRequest() is called
void sendReply ( UdpSlot *slot , Msg39 *msg39 , char *reply , long replyLen ,
long replyMaxSize ) {
long replyMaxSize , bool hadError ) {
// debug msg
if ( g_conf.m_logDebugQuery || (msg39&&msg39->m_debug) )
logf(LOG_DEBUG,"query: msg39: [%lu] Sending reply len=%li.",
(long)msg39,replyLen);
// sanity
if ( hadError && ! g_errno ) { char *xx=NULL;*xx=0; }
// no longer in use. msg39 will be NULL if ENOMEM or something
if ( msg39 ) msg39->m_inUse = false;
@ -140,7 +144,7 @@ void Msg39::getDocIds ( UdpSlot *slot ) {
g_errno = EBADREQUESTSIZE;
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
@ -176,7 +180,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
g_errno = ENOCOLLREC;
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
@ -185,7 +189,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
g_errno = ENOCOLLREC;
log(LOG_LOGIC,"query: msg39: getDocIds: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
@ -199,7 +203,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
m_r->m_useQueryStopWords ) ) {
log(LOG_LOGIC,"query: msg39: setQuery: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
@ -217,7 +221,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
,m_tmpq.m_orig
,(long)m_r->m_language
);
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
// debug
@ -286,7 +290,7 @@ void Msg39::getDocIds2 ( Msg39Request *req ) {
if ( g_errno ) {
log(LOG_LOGIC,"query: msg39: doDocIdSplitLoop: %s." ,
mstrerror(g_errno) );
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 , true );
return ;
}
// it might not have blocked! if all lists in tree and used no thread
@ -327,11 +331,13 @@ bool Msg39::doDocIdSplitLoop ( ) {
if ( d0 >= d1 ) break;
// use this
//m_debug = true;
//log("call1");
// . get the lists
// . i think this always should block!
// . it will also intersect the termlists to get the search
// results and accumulate the winners into the "tree"
if ( ! getLists() ) return false;
//log("call2 g_errno=%li",(long)g_errno);
// if there was an error, stop!
if ( g_errno ) break;
}
@ -339,7 +345,7 @@ bool Msg39::doDocIdSplitLoop ( ) {
// return error reply if we had an error
if ( g_errno ) {
log("msg39: Had error3: %s.", mstrerror(g_errno));
sendReply (m_slot,this,NULL,0,0);
sendReply (m_slot,this,NULL,0,0 , true);
return true;
}
@ -507,6 +513,7 @@ bool Msg39::getLists () {
"sign=%c "
"numPlusses=%hhu "
"required=%li "
"fielcode=%li "
"ebit=0x%0llx "
"impBits=0x%0llx "
@ -534,6 +541,7 @@ bool Msg39::getLists () {
sign , //c ,
0 ,
(long)qt->m_isRequired,
(long)qt->m_fieldCode,
(long long)qt->m_explicitBit ,
(long long)qt->m_implicitBits ,
@ -623,6 +631,16 @@ bool Msg39::getLists () {
m_blocked = true;
return false;
}
// error?
if ( g_errno ) {
log("msg39: Had error getting termlists2: %s.",
mstrerror(g_errno));
// don't bail out here because we are in docIdSplitLoop()
//sendReply (m_slot,this,NULL,0,0,true);
return true;
}
return gotLists ( true );
}
@ -630,7 +648,16 @@ void gotListsWrapper ( void *state ) {
Msg39 *THIS = (Msg39 *) state;
// . hash the lists into our index table
// . this will send back a reply or recycle and read more list data
THIS->gotLists ( true );
if ( ! THIS->gotLists ( true ) ) return;
// . if he did not block and there was an errno we send reply
// otherwise if there was NO error he will have sent the reply
// . if gotLists() was called in the ABOVE function and it returns
// true then the docIdLoop() function will send back the reply.
if ( g_errno ) {
log("msg39: sending back error reply = %s",mstrerror(g_errno));
sendReply ( THIS->m_slot , THIS , NULL , 0 , 0 ,true);
}
}
// . now come here when we got the necessary index lists
@ -641,7 +668,8 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
if ( g_errno ) {
log("msg39: Had error getting termlists: %s.",
mstrerror(g_errno));
sendReply (m_slot,this,NULL,0,0);
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply (m_slot,this,NULL,0,0,true);
return true;
}
// timestamp log
@ -681,7 +709,8 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
// . actually we were using it before for rat=0/bool queries but
// i got rid of NO_RAT_SLOTS
if ( ! m_allocedTree && ! m_posdbTable.allocTopTree() ) {
sendReply ( m_slot , this , NULL , 0 , 0 );
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply ( m_slot , this , NULL , 0 , 0 , true);
return true;
}
@ -690,7 +719,8 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
if ( ! m_posdbTable.allocWhiteListTable() ) {
log("msg39: Had error allocating white list table: %s.",
mstrerror(g_errno));
sendReply (m_slot,this,NULL,0,0);
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
//sendReply (m_slot,this,NULL,0,0,true);
return true;
}
@ -703,7 +733,6 @@ bool Msg39::gotLists ( bool updateReadInfo ) {
// . we have to re-set the QueryTermInfos with each docid range split
// since it will set the list ptrs from the msg2 lists
if ( m_r->m_useNewAlgo && ! m_posdbTable.setQueryTermInfo () ) {
sendReply ( m_slot , this , NULL , 0 , 0 );
return true;
}
@ -856,7 +885,7 @@ bool Msg39::addedLists ( ) {
m_posdbTable.freeMem();
g_errno = m_posdbTable.m_errno;
log("query: posdbtable had error = %s",mstrerror(g_errno));
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 ,true);
return true;
}
@ -899,7 +928,7 @@ bool Msg39::setClusterRecs ( ) {
// on error, return true, g_errno should be set
if ( ! m_buf ) {
log("query: msg39: Failed to alloc buf for clustering.");
sendReply(m_slot,this,NULL,0,0);
sendReply(m_slot,this,NULL,0,0,true);
return true;
}
@ -981,7 +1010,7 @@ void Msg39::gotClusterRecs ( ) {
m_clusterLevels )) {
m_errno = g_errno;
// send back an error reply
sendReply ( m_slot , this , NULL , 0 , 0 );
sendReply ( m_slot , this , NULL , 0 , 0 ,true);
return;
}
@ -1146,7 +1175,7 @@ void Msg39::estimateHits ( ) {
if ( ! reply ) {
log("query: Could not allocated memory "
"to hold reply of docids to send back.");
sendReply(m_slot,this,NULL,0,0);
sendReply(m_slot,this,NULL,0,0,true);
return ;
}
topDocIds = (long long *) mr.ptr_docIds;
@ -1233,6 +1262,6 @@ void Msg39::estimateHits ( ) {
}
// now send back the reply
sendReply(m_slot,this,reply,replySize,replySize);
sendReply(m_slot,this,reply,replySize,replySize,false);
return;
}
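
The hadError flag threaded through sendReply() above enforces gb's usual async contract: a routine returns false when it blocked (its callback fires later) and true when done, with g_errno set exactly when something failed. A toy illustration of a wrapper under that contract (the stub functions are ours, not gb's):

#include <stdio.h>

static int g_errno = 0; // stand-in for gb's global error code

// pretend async op: false = blocked, callback will re-enter later;
// true = finished now, with g_errno nonzero iff it failed
static bool getListsStub ( ) { g_errno = 0; return true; }

static void gotListsWrapperStub ( ) {
	if ( ! getListsStub() ) return; // blocked; nothing to send yet
	if ( g_errno ) {
		printf ( "send error reply %d\n" , g_errno );
		return;
	}
	printf ( "send normal reply\n" );
}

int main ( ) { gotListsWrapperStub ( ); return 0; }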

@ -1107,7 +1107,7 @@ bool Msg40::launchMsg20s ( bool recalled ) {
req.m_bigSampleMaxLen = bigSampleMaxLen;
req.m_titleMaxLen = 256;
req.m_titleMaxLen = cr->m_titleMaxLen;
if(m_si->m_isAdmin && m_si->m_xml == 0)
if(m_si->m_isAdmin && m_si->m_format == FORMAT_HTML )
req.m_getGigabitVector = true;
else req.m_getGigabitVector = false;
req.m_flags = 0;
@ -1222,6 +1222,7 @@ bool Msg40::gotSummary ( ) {
if ( m_numReplies < m_numRequests )
return false;
doAgain:
// do we need to launch another batch of summary requests?
if ( m_numRequests < m_msg3a.m_numDocIds ) {
@ -1235,7 +1236,12 @@ bool Msg40::gotSummary ( ) {
// it returned true, so m_numRequests == m_numReplies and
// we don't need to launch any more! but that does NOT
// make sense because m_numContiguous < m_msg3a.m_numDocIds
char *xx=NULL; *xx=0;
// . i guess the launch can fail because of oom... and
// end up returning true here... seen it happen, and
// we had full requests/replies for m_msg3a.m_numDocIds
log("msg40: got all replies i guess");
goto doAgain;
//char *xx=NULL; *xx=0;
}
@ -1895,9 +1901,10 @@ bool Msg40::gotSummary ( ) {
}
// take this out for now...
#ifdef GB_PQR
// run post query reranks for this query
long wanted = m_si->m_docsWanted + m_si->m_firstResultNum + 1;
if ( m_postQueryRerank.isEnabled() &&
m_postQueryRerank.set2(wanted)){
if ( ! m_postQueryRerank.preRerank () ) {
@ -1916,6 +1923,7 @@ bool Msg40::gotSummary ( ) {
m_postQueryRerank.rerankFailed();
}
}
#endif
// set m_moreToCome, if true, we print a "Next 10" link
m_moreToCome = (visible > //m_visibleContiguous >

File diff suppressed because it is too large.

@ -33,7 +33,7 @@ public:
//TagRec m_tagRec;
TcpSocket *m_socket;
HttpRequest m_r;
char m_coll[50];
char m_coll[MAX_COLL_LEN+2];
//CollectionRec *m_cr;
bool m_isAdmin;
bool m_isLocal;
@ -136,7 +136,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
uint8_t langId = getLangIdFromAbbr ( langAbbr );
st->m_langId = langId;
}
strncpy ( st->m_coll , coll , 40 );
strncpy ( st->m_coll , coll , MAX_COLL_LEN+1 );
// store query for query highlighting
st->m_netTestResults = r->getLong ("rnettest", false );
if( st->m_netTestResults ) {
@ -179,14 +179,22 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
sreq.reset();
strcpy(sreq.m_url, url );
sreq.setDataSize();
xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness );
// this returns false if "coll" is invalid
if ( ! xd->set4 ( &sreq , NULL , coll , NULL , st->m_niceness ) )
goto hadSetError;
}
// . when getTitleRec() is called it will load the old one
// since XmlDoc::m_setFromTitleRec will be true
// . niceness is 0
else {
// use st->m_coll since XmlDoc just points to it!
xd->set3 ( docId , st->m_coll , 0 );
// . use st->m_coll since XmlDoc just points to it!
// . this returns false if "coll" is invalid
else if ( ! xd->set3 ( docId , st->m_coll , 0 ) ) {
hadSetError:
mdelete ( st , sizeof(State2) , "PageGet1" );
delete ( st );
g_errno = ENOMEM;
log("PageGet: set3: %s", mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
}
// if it blocks while it loads title rec, it will re-call this routine
xd->setCallback ( st , processLoopWrapper );

@ -23,6 +23,8 @@ bool sendPagePerf ( TcpSocket *s , HttpRequest *r ) {
// don't allow pages bigger than 128k in cache
char buf [ 64*1024 ];
SafeBuf p(buf, 64*1024);
p.setLabel ( "perfgrph" );
// print standard header
g_pages.printAdminTop ( &p , s , r );

File diff suppressed because it is too large.

@ -43,15 +43,36 @@ bool sendPageRoot ( TcpSocket *s, HttpRequest *r ){
}
bool printNav ( SafeBuf &sb , HttpRequest *r ) {
char *root = "";
char *rootSecure = "";
if ( g_conf.m_isMattWells ) {
root = "http://www.gigablast.com";
rootSecure = "https://www.gigablast.com";
}
sb.safePrintf("<center><b><p class=nav>"
"<a href=\"/about.html\">About</a>"
" &nbsp; &nbsp; <a href=\"/contact.html\">Contact</a>"
" &nbsp; &nbsp;<a href=\"/help.html\">Help</a>"
" &nbsp; &nbsp; <a href=/privacy.html>Privacy Policy</a>"
" &nbsp; &nbsp;<a href=\"/searchfeed.html\">"
"Search API</a>"
" &nbsp; &nbsp; <a href=/seoapi.html>SEO API</a>"
" &nbsp; &nbsp; <a href=/account>My Account</a> "
"<a href=%s/about.html>About</a>"
" &nbsp; &nbsp; "
"<a href=%s/contact.html>Contact</a>"
" &nbsp; &nbsp; "
"<a href=%s/help.html>Help</a>"
" &nbsp; &nbsp; "
"<a href=%s/privacy.html>Privacy Policy</a>"
" &nbsp; &nbsp; "
"<a href=%s/searchfeed.html>Search API</a>"
" &nbsp; &nbsp; "
"<a href=%s/seoapi.html>SEO API</a>"
" &nbsp; &nbsp; "
"<a href=%s/account>My Account</a> "
, root
, root
, root
, root
, root
, root
, rootSecure
//" &nbsp; &nbsp; <a href=/logout>Logout</a>"
);
if ( r->isLocal() )
@ -115,7 +136,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<b>web</b> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"/Top\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<b>web</b> &nbsp;&nbsp;&nbsp;&nbsp; "
"<a href=http://www.gigablast.com/seo>seo</a> "
"&nbsp;&nbsp;&nbsp;&nbsp; "
"<a href=\"/Top\">directory</a> "
"&nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/adv.html>advanced search</a>");
sb.safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; ");
sb.safePrintf("<a href=/addurl title=\"Instantly add your url to "
@ -135,7 +160,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("<tr valign=top>\n");
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:red;></td>\n");
sb.safePrintf("<td align=center><div style=width:50px;height:50px;display:inline-block;background-color:red;></div></td>\n");
sb.safePrintf("<td><font size=+1><b>Open Source!</b>"
"</font><br>\n");
sb.brify2("Gigablast is now available as an <a href=https://github.com/gigablast/open-source-search-engine>open source search engine</a> on github.com. Download it today. Finally a robust, scalable search solution in C/C++ that has been in development and used commercially since 2000. <a href=/admin.html#features>Features.</a> Limited support available for free."
@ -144,19 +169,37 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("</td></tr>\n");
char *root = "";
if ( g_conf.m_isMattWells )
root = "http://www.gigablast.com";
sb.safePrintf("<tr valign=top>\n");
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:green;></td>\n");
sb.safePrintf("<td><font size=+1><b>The Green Search Engine</b></font><br>\n");
sb.brify2("Gigablast is the only clean-powered web search engine. 90% of its power usage comes from wind energy. Astoundingly, Gigablast is one of ONLY four search engines in the United States indexing over a billion pages.",80);
// 204x143
sb.safePrintf("<td><img height=52px width=75px "
"src=%s/eventguru.png></td>\n"
, root );
sb.safePrintf("<td><font size=+1><b>Event Guru Returns</b></font><br>\n");
sb.brify2("<a href=http://www.eventguru.com/>Event Guru</a> datamines events from the web. It identifies events on a web page, or even plain text, using the same rules of deduction used by the human mind. It also has Facebook integration and lots of other cool things.",80);
sb.safePrintf("<br><br></td></tr>\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
/*
sb.safePrintf("<tr valign=top>\n");
sb.safePrintf("<td align=center><div style=width:50px;height:50px;display:inline-block;background-color:green;></div></td>\n");
sb.safePrintf("<td><font size=+1><b>The Green Search Engine</b></font><br>\n");
sb.brify2("Gigablast is the only clean-powered web search engine. 90% of its power usage comes from wind energy. Astoundingly, Gigablast is one of ONLY four search engines in the United States indexing over a billion pages.",80);
sb.safePrintf("<br><br></td></tr>\n");
sb.safePrintf("\n");
sb.safePrintf("\n");
*/
sb.safePrintf("<tr valign=top>\n");
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:0040fe;></td>\n");
sb.safePrintf("<td align=center><img src=%s/gears.png "
"height=50 width=50></div></td>\n"
, root );
sb.safePrintf("<td><font size=+1><b>The Transparent Search Engine</b></font><br>\n");
sb.brify2("Gigablast is the first truly transparent search engine. It tells you exactly why the search results are ranked the way they are. There is nothing left to the imagination.",85);
sb.safePrintf("<br><br>");
@ -165,9 +208,11 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("\n");
sb.safePrintf("<tr valign=top>\n");
sb.safePrintf("<td><div style=width:50px;height:50px;display:inline-block;background-color:f2b629;></td>\n");
sb.safePrintf("<td align=center><center><img src=%s/dollargear.png "
"height=50 width=50></center></div></center></td>\n"
, root );
sb.safePrintf("<td><font size=+1><b>The SEO Search Engine</b></font><br>\n");
sb.brify2("When it comes to search-engine based SEO, Gigablast is the place to be. With a frothy set of unique and effective <a href=/seo>SEO tools</a>, you will find all you need to execute a simple yet effective SEO strategy. Stop the guesswork, and let a search engine tell you how to SEO it.",85);
sb.brify2("When it comes to search-engine based SEO, Gigablast is the place to be. With a frothy set of unique and effective <a href=http://www.gigablast.com/seo>SEO tools</a>, you will find all you need to execute a simple yet effective SEO strategy. Stop the guesswork, and let a search engine tell you how to SEO it.",85);
sb.safePrintf("</td></tr>\n");
@ -325,7 +370,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"/Top\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=http://www.gigablast.com/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=\"/Top\">directory</a> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/adv.html>advanced search</a>");
sb.safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; ");
sb.safePrintf("<b title=\"Instantly add your url to Gigablast's "
@ -368,19 +413,22 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
// . when loaded with the main page for the first time it will
// immediately replace its content...
if ( url ) {
char *root = "";
if ( g_conf.m_isMattWells )
root = "http://www.gigablast.com";
sb.safePrintf("<br>"
"<br>"
"<div id=msgbox>"
//"<b>Injecting your url. Please wait...</b>"
"<center>"
"<img src=/gears.gif width=50 height=50>"
"<img src=%s/gears.gif width=50 height=50>"
"</center>"
"<script type=text/javascript>"
//"alert('shit');"
"var client = new XMLHttpRequest();\n"
"client.onreadystatechange = handler;\n"
"var url='/addurl?u="
);
, root );
sb.urlEncode ( url );
// propagate "admin" if set
//long admin = hr->getLong("admin",-1);
@ -463,11 +511,17 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("<br><br>\n");
sb.safePrintf("<br><br><br>\n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <b>directory</b> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/>web</a> &nbsp;&nbsp;&nbsp;&nbsp; <a href=http://www.gigablast.com/seo>seo</a> &nbsp;&nbsp;&nbsp;&nbsp; <b>directory</b> &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=http://www.gigablast.com/events>events</a>"
" &nbsp;&nbsp;&nbsp;&nbsp; \n");
sb.safePrintf("<a href=/adv.html>advanced search</a>");
sb.safePrintf(" &nbsp;&nbsp;&nbsp;&nbsp; ");
sb.safePrintf("<a href=/addurl title=\"Instantly add your url to "
"Gigablast's index\">add url</a>");
char *root = "";
if ( g_conf.m_isMattWells )
root = "http://www.gigablast.com";
sb.safePrintf("<a href=%s/addurl title=\"Instantly add your url to "
"Gigablast's index\">add url</a>"
, root );
sb.safePrintf("\n");
sb.safePrintf("<br><br>\n");
// submit to HTTPS now
@ -1591,7 +1645,7 @@ void doneInjectingWrapper3 ( void *st ) {
rand32);
sb.urlEncode(url);
sb.safePrintf(">Check it</a> or "
"<a href=/seo?u=");
"<a href=http://www.gigablast.com/seo?u=");
sb.urlEncode(url);
sb.safePrintf(">SEO it</a>"
".</b>");
View File
@ -97,7 +97,7 @@ bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
st->m_dateCustom = (bool)r->getLong( "custom", 0 );
// default to 10 hours, i would do 1 day except that there are
// some bugs that mess up the display a lot when i do that
st->m_datePeriod = r->getLong( "date_period" , 36000 );
st->m_datePeriod = r->getLong( "date_period" , 300 );//36000 );
st->m_dateUnits = r->getLong( "date_units" , 1 );//SECS_PER_MIN
st->m_now = (bool)r->getLong( "date_now" , 1 );
st->m_autoUpdate = (bool)r->getLong( "auto_update" , 0 );
@ -152,8 +152,8 @@ void sendReply ( void *state ) {
TcpSocket *s = st->m_socket;
SafeBuf buf( 1024*32 );
SafeBuf tmpBuf( 1024 );
SafeBuf buf( 1024*32 , "tmpbuf0" );
SafeBuf tmpBuf( 1024 , "tmpbuf1" );
//
// take these out until we need them!
View File
@ -361,6 +361,8 @@ long Pages::getDynamicPageNumber ( HttpRequest *r ) {
path = "admin/inject"; pathLen = gbstrlen(path); }
if ( pathLen == 9 && strncmp ( path , "index.php" , 9 ) == 0 ) {
path = "search"; pathLen = gbstrlen(path); }
if ( pathLen == 10 && strncmp ( path , "search.csv" , 10 ) == 0 ) {
path = "search"; pathLen = gbstrlen(path); }
// if it is like /GA/Atlanta then call sendPageResults
// and that should be smart enough to set the m_where in
Parms.cpp
View File
@ -2127,15 +2127,15 @@ bool Parms::printParm ( SafeBuf* sb,
// . if printing on crawlbot page hide these
// . we repeat this logic below when printing parm titles
// for the column headers in the table
char *vt = "";
if ( isCrawlbot &&
m->m_page == PAGE_FILTERS &&
(strcmp(m->m_xml,"spidersEnabled") == 0 ||
//strcmp(m->m_xml,"maxSpidersPerRule")==0||
//strcmp(m->m_xml,"maxSpidersPerIp") == 0||
strcmp(m->m_xml,"spiderIpWait") == 0
) )
vt = " style=display:none;";
//char *vt = "";
//if ( isCrawlbot &&
// m->m_page == PAGE_FILTERS &&
// (strcmp(m->m_xml,"spidersEnabled") == 0 ||
// //strcmp(m->m_xml,"maxSpidersPerRule")==0||
// //strcmp(m->m_xml,"maxSpidersPerIp") == 0||
// strcmp(m->m_xml,"spiderIpWait") == 0
// ) )
// vt = " style=display:none;";
// what type of parameter?
char t = m->m_type;
@ -2210,15 +2210,16 @@ bool Parms::printParm ( SafeBuf* sb,
if ( isJSON ) continue;
// . hide table column headers that are too advanced
// . we repeat this logic above for the actual parms
char *vt = "";
if ( isCrawlbot &&
m->m_page == PAGE_FILTERS &&
(strcmp(mk->m_xml,"spidersEnabled") == 0 ||
//strcmp(mk->m_xml,"maxSpidersPerRule")==0||
//strcmp(mk->m_xml,"maxSpidersPerIp") == 0||
strcmp(mk->m_xml,"spiderIpWait") == 0 ) )
vt = " style=display:none;display:none;";
sb->safePrintf ( "<td%s>" , vt );
//char *vt = "";
//if ( isCrawlbot &&
// m->m_page == PAGE_FILTERS &&
// (strcmp(mk->m_xml,"spidersEnabled") == 0 ||
// //strcmp(mk->m_xml,"maxSpidersPerRule")==0||
// //strcmp(mk->m_xml,"maxSpidersPerIp") == 0||
// strcmp(mk->m_xml,"spiderIpWait") == 0 ) )
// vt = " style=display:none;display:none;";
//sb->safePrintf ( "<td%s>" , vt );
sb->safePrintf ( "<td>" );
// if its of type checkbox in a table make it
// toggle them all on/off
if ( mk->m_type == TYPE_CHECKBOX &&
@ -2310,7 +2311,8 @@ bool Parms::printParm ( SafeBuf* sb,
else if ( firstInRow )
sb->safePrintf ( "<tr><td>" );
else
sb->safePrintf ( "<td%s>" , vt);
//sb->safePrintf ( "<td%s>" , vt);
sb->safePrintf ( "<td>" );
}
long cast = m->m_cast;
@ -4008,7 +4010,7 @@ char *Parms::getParmHtmlEncoded ( char *p , char *pend , Parm *m , char *s ) {
// time is stored as long
long ct = *(long *)s;
// get the time struct
struct tm *tp = gmtime ( (time_t *)&ct ) ;
struct tm *tp = localtime ( (time_t *)&ct ) ;
// set the "selected" month for the drop down
strftime ( p , 100 , "%d %b %Y %H:%M UTC" , tp );
}
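The gmtime()-to-localtime() swap above changes which time zone the parm page displays, but note the strftime() format still hard-codes the literal string "UTC", which is now only accurate on a box whose local zone is UTC. A small sketch of the difference:

#include <stdio.h>
#include <time.h>

int main ( ) {
	time_t now = time ( NULL );
	char buf[100];
	// gmtime() breaks the timestamp out in UTC...
	strftime ( buf , 100 , "%d %b %Y %H:%M UTC" , gmtime ( &now ) );
	printf ( "gmtime   : %s\n" , buf );
	// ...localtime() in the server's local zone. the literal "UTC"
	// in the format string does not change with it.
	strftime ( buf , 100 , "%d %b %Y %H:%M UTC" , localtime ( &now ) );
	printf ( "localtime: %s\n" , buf );
	return 0;
}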
@ -8499,6 +8501,30 @@ void Parms::init ( ) {
m->m_units = "seconds";
m++;
m->m_cgi = "dbapi";
m->m_xml = "diffbotApiUrl";
m->m_off = (char *)&cr.m_diffbotApiUrl - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_def = "";
m++;
m->m_cgi = "dbucp";
m->m_xml = "diffbotUrlCrawlPattern";
m->m_off = (char *)&cr.m_diffbotUrlCrawlPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_def = "";
m++;
m->m_cgi = "dbupp";
m->m_xml = "diffbotUrlProcessPattern";
m->m_off = (char *)&cr.m_diffbotUrlProcessPattern - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_def = "";
m++;
m->m_cgi = "dbppp";
m->m_xml = "diffbotPageProcessPattern";
m->m_off = (char *)&cr.m_diffbotPageProcessPattern - x;
@ -8507,6 +8533,22 @@ void Parms::init ( ) {
m->m_def = "";
m++;
m->m_cgi = "dbucre";
m->m_xml = "diffbotUrlCrawlRegEx";
m->m_off = (char *)&cr.m_diffbotUrlCrawlRegEx - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_def = "";
m++;
m->m_cgi = "dbupre";
m->m_xml = "diffbotUrlProcessRegEx";
m->m_off = (char *)&cr.m_diffbotUrlProcessRegEx - x;
m->m_type = TYPE_SAFEBUF;
m->m_page = PAGE_NONE;
m->m_def = "";
m++;
m->m_cgi = "dbopn";
m->m_xml = "diffbotOnlyProcessIfNew";
m->m_off = (char *)&cr.m_diffbotOnlyProcessIfNew - x;
@ -13027,6 +13069,17 @@ void Parms::init ( ) {
m->m_def = "";
m++;
m->m_title = "harvest links";
m->m_cgi = "hspl";
m->m_xml = "harvestLinks";
m->m_max = MAX_FILTERS;
m->m_off = (char *)cr.m_harvestLinks - x;
m->m_type = TYPE_CHECKBOX;
m->m_def = "1";
m->m_page = PAGE_FILTERS;
m->m_rowid = 1;
m++;
m->m_title = "spidering enabled";
m->m_cgi = "cspe";
m->m_xml = "spidersEnabled";
@ -15116,18 +15169,19 @@ void Parms::init ( ) {
m->m_sprpp = 0;
m++;
/*
m->m_title = "format of the returned search results";
m->m_desc = "X is 0 to get back results in regular html, and 8 to "
"get back results in XML.";
m->m_desc = "X is 0 to get back results in regular html, 1 to "
"get back results in XML, 2 for JSON.";
m->m_def = "0";
m->m_soff = (char *)&si.m_xml - y;
m->m_type = TYPE_LONG;
m->m_soff = (char *)&si.m_formatStr - y;
m->m_type = TYPE_STRING;//CHAR;
m->m_sparm = 1;
m->m_scgi = "xml";
m->m_scgi = "format";
m->m_smin = 0;
m->m_smax = 12;
m++;
*/
m->m_title = "highlight query terms in summaries.";
m->m_desc = "Use to disable or enable "
View File
@ -3043,6 +3043,8 @@ void doneGettingNotifyUrlWrapper ( void *state , TcpSocket *sock ) {
ei->m_finalCallback ( ei->m_finalState );
}
bool printCrawlDetailsInJson ( SafeBuf &sb , CollectionRec *cx ) ;
// . return false if would block, true otherwise
// . used to send email and get a url when a crawl hits a maxToCrawl
// or maxToProcess limitation.
@ -3103,15 +3105,38 @@ bool sendNotification ( EmailInfo *ei ) {
if ( url && url[0] ) {
log("build: sending url notification to %s for coll \"%s\"",
url,crawl);
Url uu; uu.set ( url );
SafeBuf fullReq;
fullReq.safePrintf("POST %s HTTP/1.0\r\n"
"User-Agent: Crawlbot/2.0\r\n"
"Accept: */*\r\n"
"Host: "
, uu.getPath()
);
fullReq.safeMemcpy ( uu.getHost() , uu.getHostLen() );
// make custom headers
SafeBuf custom;
custom.safePrintf ( "X-Crawl-Name: %s\r\n"
fullReq.safePrintf ("X-Crawl-Name: %s\r\n"
// last \r\n is added in HttpRequest.cpp
"X-Crawl-Status: %s"// \r\n" // hdrs
"X-Crawl-Status: %s\r\n" // hdrs
, cr->m_diffbotCrawlName.getBufStart()
, ei->m_spiderStatusMsg.getBufStart()
);
// also in post body
SafeBuf postContent;
// the collection details
printCrawlDetailsInJson ( postContent , cr );
// content-length of it
fullReq.safePrintf("Content-Length: %li\r\n",
postContent.length());
// type is json
fullReq.safePrintf("Content-Type: application/json\r\n");
fullReq.safePrintf("\r\n");
// then the post content
fullReq.safeMemcpy ( &postContent );
fullReq.nullTerm();
// send the request (doPost below makes it a POST)
if ( ! g_httpServer.getDoc ( url ,
0 , // ip
@ -3129,8 +3154,9 @@ bool sendNotification ( EmailInfo *ei ) {
"HTTP/1.0", // proto
true , // doPost
NULL, // cookie
custom.getBufStart(),
NULL ) ) // fullRequest
NULL , // custom hdrs
fullReq.getBufStart() ,
NULL ) )
ei->m_notifyBlocked++;
}
View File
@ -712,6 +712,9 @@ bool PosdbTable::allocTopTree ( ) {
// return false;
if ( m_r->m_getDocIdScoringInfo ) {
m_scoreInfoBuf.setLabel ("scinfobuf" );
// . for holding the scoring info
// . add 1 for the \0 safeMemcpy() likes to put at the end so
// it will not realloc on us
@ -731,6 +734,10 @@ bool PosdbTable::allocTopTree ( ) {
// compute. so this could easily get into the megabytes, most
// of the time we will not need nearly that much however.
numPairs *= xx;
m_pairScoreBuf.setLabel ( "pairbuf" );
m_singleScoreBuf.setLabel ("snglbuf" );
// but alloc it just in case
if ( ! m_pairScoreBuf.reserve (numPairs * sizeof(PairScore) ) )
return false;
@ -786,7 +793,7 @@ bool PosdbTable::allocTopTree ( ) {
slots = 20000000;
}
// each site hash is 4 bytes
if ( ! m_siteHashList.reserve ( slots ) )
if ( ! m_siteHashList.reserve ( slots ,"shshbuf" ) )
return false;
// quad # of sites to have space in between
if ( ! m_dt.set(4,0,slots,NULL,0,false,0,"pdtdt"))
@ -1005,7 +1012,7 @@ void PosdbTable::evalSlidingWindow ( char **ptrs ,
for ( long i = 0 ; i < maxi ; i++ ) {
// skip if to the left of a pipe operator
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( m_bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
//if ( ptrs[i] ) wpi = ptrs[i];
// if term does not occur in body, sub-in the best term
@ -1027,7 +1034,7 @@ void PosdbTable::evalSlidingWindow ( char **ptrs ,
for ( ; j < maxj ; j++ ) {
// skip if to the left of a pipe operator
if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( m_bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
// TODO: use a cache using wpi/wpj as the key.
//if ( ptrs[j] ) wpj = ptrs[j];
@ -4097,6 +4104,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
long nrg = 0;
// assume not sorting by a numeric termlist
m_sortByTermNum = -1;
//for ( long i = 0 ; i < m_msg2->getNumLists() ; i++ ) {
for ( long i = 0 ; i < m_q->m_numTerms ; i++ ) {
QueryTerm *qt = &m_q->m_qterms[i];
@ -4111,6 +4121,10 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_qpos = wordNum;
qti->m_wikiPhraseId = qw->m_wikiPhraseId;
qti->m_quotedStartId = qw->m_quoteStart;
// is it gbsortby:?
if ( qt->m_fieldCode == FIELD_GBSORTBY ||
qt->m_fieldCode == FIELD_GBREVSORTBY )
m_sortByTermNum = i;
// count
long nn = 0;
// also add in bigram lists
@ -4226,6 +4240,18 @@ bool PosdbTable::setQueryTermInfo ( ) {
if ( qt->m_piped ) qti->m_bigramFlags[nn] |= BF_PIPED;
// is it a negative term?
if ( qt->m_termSign=='-')qti->m_bigramFlags[nn]|=BF_NEGATIVE;
// numeric posdb termlist flags. instead of word position
// they have a float stored there for sorting etc.
if (qt->m_fieldCode == FIELD_GBSORTBY )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBREVSORTBY )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBNUMBERMIN )
qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBNUMBERMAX )
qti->m_bigramFlags[nn]|=BF_NUMBER;
// only really add if useful
// no, because when inserting NEW (related) terms that are
// not currently in the document, this list may initially
@ -4912,15 +4938,27 @@ void PosdbTable::intersectLists10_r ( ) {
// sites right now. this hash table must have been pre-allocated
// in Posdb::allocTopTree() above since we might be in a thread.
//
RdbList *whiteLists = m_msg2->m_whiteLists;
long nw = m_msg2->m_w;
RdbList *whiteLists = NULL;
long nw = 0;
if ( m_msg2 ) {
whiteLists = m_msg2->m_whiteLists;
nw = m_msg2->m_w;
}
for ( long i = 0 ; ! m_addedSites && i < nw ; i++ ) {
RdbList *list = &whiteLists[i];
if ( list->isEmpty() ) continue;
// sanity test
long long d1 = g_posdb.getDocId(list->getList());
if ( d1 > m_msg2->m_docIdEnd ) { char *xx=NULL;*xx=0; }
if ( d1 < m_msg2->m_docIdStart ) { char *xx=NULL;*xx=0; }
if ( d1 > m_msg2->m_docIdEnd ) {
log("posdb: d1=%lli > %lli",
d1,m_msg2->m_docIdEnd);
//char *xx=NULL;*xx=0;
}
if ( d1 < m_msg2->m_docIdStart ) {
log("posdb: d1=%lli < %lli",
d1,m_msg2->m_docIdStart);
//char *xx=NULL;*xx=0;
}
// first key is always 18 bytes cuz it has the termid
// scan recs in the list
for ( ; ! list->isExhausted() ; list->skipCurrentRecord() ) {
@ -5049,6 +5087,8 @@ void PosdbTable::intersectLists10_r ( ) {
QueryTermInfo *qti = &qip[i];
// skip if negative query term
if ( qti->m_bigramFlags[0] & BF_NEGATIVE ) continue;
// skip if numeric field like gbsortby:price gbmin.price:1.23
if ( qti->m_bigramFlags[0] & BF_NUMBER ) continue;
// set it
if ( qti->m_wikiPhraseId == 1 ) continue;
// stop
@ -5298,6 +5338,9 @@ void PosdbTable::intersectLists10_r ( ) {
long nnn = m_numQueryTermInfos;
if ( ! m_r->m_doMaxScoreAlgo ) nnn = 0;
// do not do it if we got a gbsortby: field
if ( m_sortByTermNum >= 0 ) nnn = 0;
/*
// skip all this if getting score of just one docid on special
// posdb termlists that are 6-byte only keys
@ -5584,6 +5627,8 @@ void PosdbTable::intersectLists10_r ( ) {
pass0++;
if ( m_sortByTermNum >= 0 ) goto skipScoringFilter;
// test why we are slow
//if ( (s_sss++ % 8) != 0 ) { docIdPtr += 6; fail0++; goto docIdLoop;}
@ -5743,6 +5788,8 @@ void PosdbTable::intersectLists10_r ( ) {
}
}
skipScoringFilter:
pass++;
skipPreAdvance:
@ -5770,7 +5817,12 @@ void PosdbTable::intersectLists10_r ( ) {
// mini merge buf:
mptr = mbuf;
// merge each set of sublists
// . merge each set of sublists
// . like we merge a term's list with its two associated bigram
// lists, if there, the left bigram and right bigram list.
// . and merge all the synonym lists for that term together as well.
// so if the term is 'run' we merge it with the lists for
// 'running' 'ran' etc.
for ( long j = 0 ; j < m_numQueryTermInfos ; j++ ) {
// get the query term info
QueryTermInfo *qti = &qip[j];
@ -6045,12 +6097,12 @@ void PosdbTable::intersectLists10_r ( ) {
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// skip if not part of score
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
// and pair it with each other possible query term
for ( long j = i+1 ; j < m_numQueryTermInfos ; j++ ) {
// skip if not part of score
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
// but if they are in the same wikipedia phrase
// then try to keep their positions as in the query.
// so for 'time enough for love' ideally we want
@ -6126,7 +6178,7 @@ void PosdbTable::intersectLists10_r ( ) {
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
float sts;
// skip if to the left of a pipe operator
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
// sometimes there is no wordpos subtermlist for this docid
// because it just has the bigram, like "streetlight" and not
// the word "light" by itself for the query 'street light'
@ -6218,7 +6270,7 @@ void PosdbTable::intersectLists10_r ( ) {
//
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// skip if to the left of a pipe operator
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
// skip wordposition until it in the body
while ( xpos[i] &&!s_inBody[g_posdb.getHashGroup(xpos[i])]) {
// advance
@ -6269,7 +6321,9 @@ void PosdbTable::intersectLists10_r ( ) {
minx = -1;
for ( long x = 0 ; x < m_numQueryTermInfos ; x++ ) {
// skip if to the left of a pipe operator
if ( bflags[x] & (BF_PIPED|BF_NEGATIVE) ) continue;
// and numeric posdb termlists do not have word positions,
// they store a float there.
if ( bflags[x] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
if ( ! xpos[x] ) continue;
if ( xpos[x] && minx == -1 ) {
minx = x;
@ -6298,7 +6352,8 @@ void PosdbTable::intersectLists10_r ( ) {
long k;
for ( k = 0 ; k < m_numQueryTermInfos ; k++ ) {
// skip if to the left of a pipe operator
if ( bflags[k] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[k] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) )
continue;
if ( xpos[k] ) break;
}
// all lists are now exhausted
@ -6337,12 +6392,12 @@ void PosdbTable::intersectLists10_r ( ) {
for ( long i = 0 ; i < m_numQueryTermInfos ; i++ ) {
// skip if to the left of a pipe operator
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[i] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
for ( long j = i+1 ; j < m_numQueryTermInfos ; j++ ) {
// skip if to the left of a pipe operator
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE) ) continue;
if ( bflags[j] & (BF_PIPED|BF_NEGATIVE|BF_NUMBER) ) continue;
//
// get score for term pair from non-body occurring terms
@ -6404,6 +6459,12 @@ void PosdbTable::intersectLists10_r ( ) {
m_r->m_language == docLang)
score *= SAMELANGMULT;
//
// if we have a gbsortby:price term then score exclusively on that
//
if ( m_sortByTermNum >= 0 )
score = g_posdb.getFloat ( miniMergedList[m_sortByTermNum] );
// . seoDebug hack so we can set "dcs"
// . we only come here if we actually made it into m_topTree
if ( secondPass || m_r->m_seoDebug ) {
Posdb.h
View File
@ -99,6 +99,7 @@ float getTermFreqWeight ( long long termFreq , long long numDocsInColl );
#define BF_SYNONYM 0x04
#define BF_NEGATIVE 0x08 // query word has a negative sign before it
#define BF_BIGRAM 0x10 // term is a bigram of two adjacent query words
#define BF_NUMBER 0x20 // is it like gbsortby:price? numeric?
void printTermList ( long i, char *list, long listSize ) ;
@ -197,6 +198,23 @@ class Posdb {
if ( langId & 0x20 ) kp->n0 |= 0x08;
}
// set the word position bits et al to this float
void setFloat ( void *vkp , float f ) {
*(float *)(((char *)vkp) + 2) = f; };
// and read the float as well
float getFloat ( void *vkp ) {
return *(float *)(((char *)vkp) + 2); };
void setAlignmentBit ( void *vkp , char val ) {
char *p = (char *)vkp;
if ( val ) p[1] = p[1] | 0x02;
else p[1] = p[1] & 0xfd;
};
bool isAlignmentBitClear ( void *vkp ) {
return ( ( ((char *)vkp)[1] & 0x02 ) == 0x00 );
};
void makeStartKey ( void *kp, long long termId ,
long long docId=0LL){
@ -427,7 +445,7 @@ class PosdbList : public RdbList {
#include "Query.h" // MAX_QUERY_TERMS, qvec_t
// max # search results that can be viewed without using TopTree
#define MAX_RESULTS 1000
//#define MAX_RESULTS 1000
class PosdbTable {
@ -575,6 +593,7 @@ class PosdbTable {
class Msg39Request *m_r;
long m_sortByTermNum;
// the new intersection/scoring algo
void intersectLists10_r ( );
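setFloat()/getFloat() above store the sort value where word-position bits normally live in a posdb key: a raw float at byte offset 2. A self-contained round trip (using memcpy instead of the raw pointer cast to stay alignment-safe; the 18-byte buffer is just illustrative):

#include <assert.h>
#include <stdio.h>
#include <string.h>

// mimic Posdb::setFloat()/getFloat(): the float lives at key byte 2
static void setFloat ( void *vkp , float f ) {
	memcpy ( ((char *)vkp) + 2 , &f , sizeof(float) );
}
static float getFloat ( void *vkp ) {
	float f;
	memcpy ( &f , ((char *)vkp) + 2 , sizeof(float) );
	return f;
}

int main ( ) {
	char key[18];               // illustrative posdb key buffer
	memset ( key , 0 , 18 );
	setFloat ( key , 19.99f );  // e.g. a price for gbsortby:
	assert ( getFloat ( key ) == 19.99f );
	printf ( "stored %.2f\n" , getFloat ( key ) );
	return 0;
}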
View File
@ -162,6 +162,11 @@ bool PostQueryRerank::set2 ( long resultsWanted ) {
m_pageUrl = (Url *)mcalloc( sizeof(Url)*m_maxResultsToRerank,
"pqrpageUrls" );
if ( ! m_pageUrl ) {
log("pqr: had out of memory error");
return false;
}
return true;
}
View File
@ -500,6 +500,11 @@ bool Process::isAnyTreeSaving ( ) {
Rdb *rdb = m_rdbs[i];
if ( rdb->m_isCollectionLess ) continue;
if ( rdb->isSavingTree() ) return true;
// we also just disable writing below in Process.cpp
// while saving other files. so hafta check that as well
// since we use isAnyTreeSaving() to determine if we can
// write to the tree or not.
if ( ! rdb->isWritable() ) return true;
}
return false;
}
@ -1064,7 +1069,10 @@ void processSleepWrapper ( int fd , void *state ) {
//if ( ! isClockInSync() && ! g_hostdb.m_myHost->m_isProxy ) return;
// get time the day started
long now = getTimeLocal();//GlobalNoCore();
long now;
if ( g_hostdb.m_myHost->m_isProxy ) now = getTimeLocal();
else now = getTimeGlobal();
// set this for the first time
if ( g_process.m_lastSaveTime == 0 )
g_process.m_lastSaveTime = now;
View File
@ -60,6 +60,7 @@ struct StateControl{
HttpRequest m_hr;
Host *m_forwardHost;
float m_pending;
bool m_isEventGuru;
};
#define UIF_ADMIN 0x01
@ -370,6 +371,8 @@ bool Proxy::handleRequest (TcpSocket *s){
char *host = hr.getHost();
char *hdom = host;
if ( strncasecmp(hdom,"www.",4) == 0 ) hdom += 4;
if ( strncasecmp(hdom,"www2.",5) == 0 ) hdom += 5;
if ( strncasecmp(hdom,"www1.",5) == 0 ) hdom += 5;
// auto redirect eventguru.com to www.eventguru.com so cookies
// are consistent
if ( ! redir &&
@ -387,9 +390,19 @@ bool Proxy::handleRequest (TcpSocket *s){
redirLen = gbstrlen(redir);
}
bool isEventGuru = false;
if ( strcasecmp(hdom,"eventguru.com") == 0 )
isEventGuru = true;
#ifdef MATTWELLS
#define HTTPS_REDIR 1
#endif
if ( redirLen > 0 && redir ) {
//redirect:
#ifdef HTTPS_REDIR
redirect:
#endif
HttpMime m;
m.makeRedirMime (redir,redirLen);
// . move the reply to a send buffer
@ -431,6 +444,10 @@ bool Proxy::handleRequest (TcpSocket *s){
char *path = hr.getPath();
//long pathLen = hr.getPathLen();
// serve events on the gigablast.com domain:
if ( path && strncmp(path,"/events",7) == 0 )
isEventGuru = true;
/*
bool badPage = false;
if ( n < 0 ) badPage = true;
@ -502,6 +519,32 @@ bool Proxy::handleRequest (TcpSocket *s){
if ( ! strncmp(path,"/?id=" ,5 ) ) handleIt = false;
// log the request iff filename does not end in .gif .jpg .
char *f = NULL;
long flen = 0;
if ( isEventGuru ) {
f = hr.getFilename();
flen = hr.getFilenameLen();
}
// proxy will handle eventguru images i guess
bool isGif = ( f && flen >= 4 && strncmp(&f[flen-4],".gif",4) == 0 );
bool isJpg = ( f && flen >= 4 && strncmp(&f[flen-4],".jpg",4) == 0 );
bool isBmp = ( f && flen >= 4 && strncmp(&f[flen-4],".bmp",4) == 0 );
bool isPng = ( f && flen >= 4 && strncmp(&f[flen-4],".png",4) == 0 );
bool isIco = ( f && flen >= 4 && strncmp(&f[flen-4],".ico",4) == 0 );
bool isPic = (isGif || isJpg || isBmp || isPng || isIco);
// use event guru favicon?
//if ( isEventGuru && isIco && strcmp(f,"favicon.ico") == 0 ) {
// f = "eventguru_favicon.ico";
// flen = gbstrlen(f);
//}
// eventguru.com host: in mime?
if ( isEventGuru && ! isPic )
handleIt = false;
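The five one-off extension tests above all reduce to a single suffix check. A small helper sketch (endsWith() is illustrative, not a Gigablast function):

#include <stdio.h>
#include <string.h>

// true if f (of length flen) ends with suf
static bool endsWith ( const char *f , long flen , const char *suf ) {
	long slen = strlen ( suf );
	if ( flen < slen ) return false;
	return strncmp ( f + flen - slen , suf , slen ) == 0;
}

int main ( ) {
	static const char *s_picExts[] = {".gif",".jpg",".bmp",".png",".ico"};
	const char *f = "logo.png";
	long flen = strlen ( f );
	bool isPic = false;
	for ( long i = 0 ; i < 5 ; i++ )
		if ( endsWith ( f , flen , s_picExts[i] ) ) isPic = true;
	printf ( "%s isPic=%d\n" , f , (int)isPic );
	return 0;
}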
// only proxy holds the accounting info
if ( ! strncmp ( path ,"/account", 8 ) ) {
printRequest(s, &hr);
@ -515,12 +558,14 @@ bool Proxy::handleRequest (TcpSocket *s){
if ( tcp == &g_httpServer.m_ssltcp ) max = g_conf.m_httpsMaxSockets;
else max = g_conf.m_httpMaxSockets;
#ifdef _HTTPS_REDIR_
#ifdef HTTPS_REDIR
// if hitting root page then tell them to go to https
// if not autobanned... but if it is an autobanned request on root
// page it should have go the turing test above!
if ( n == PAGE_ROOT &&
! g_isYippy &&
// not event guru homepage
! isEventGuru &&
// if not already on https
tcp != &g_httpServer.m_ssltcp &&
// do not redirect http://www.gigablast.com/?c=dmoz3 (directory)!
@ -1265,6 +1310,8 @@ bool Proxy::forwardRequest ( StateControl *stC ) {
p[5] = '9';
break;
}
// code is invalid if it is not for an old client
//if ( userId32b == 0 ) code = NULL;
}
@ -1665,7 +1712,7 @@ void Proxy::gotReplyPage ( void *state, UdpSlot *slot ) {
// do not print login bars in the xml!! do not print for ixquick
// which gets results in html...
if ( ! stC->m_raw && ! stC->m_ch )
if ( ! stC->m_raw && ! stC->m_ch && ! stC->m_isEventGuru )
newReply = storeLoginBar ( reply ,
size , // transmit size
size , // allocsize
@ -5153,11 +5200,16 @@ void Proxy::printUsers ( SafeBuf *sb ) {
// but if admin we should still have set our cookie
// adminsessid to our current session id so we know we are
// also the admin!
sb->safePrintf("<td><a href=/account?login=%s&password=%s>"
"%s</td>"
sb->safePrintf("<td><nobr>%li. "
"<a href=/account?login=%s&password=%s>"
"%s</a></nobr></td>"
,i
,ui->m_login
,ui->m_password
,ui->m_login);
,ui->m_login
//,ui->m_userId32
);
}
sb->safePrintf("</tr>\n");
sb->safePrintf("</table>\n");
}
View File
@ -2199,6 +2199,10 @@ bool Query::setQWords ( char boolFlag ,
fieldCode == FIELD_IP ||
fieldCode == FIELD_ISCLEAN ||
fieldCode == FIELD_QUOTA ||
fieldCode == FIELD_GBSORTBY ||
fieldCode == FIELD_GBREVSORTBY ||
fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX ||
fieldCode == FIELD_GBAD ) {
// find first space -- that terminates the field value
char *end =
@ -2210,6 +2214,15 @@ bool Query::setQWords ( char boolFlag ,
ignoreTilSpace = true;
// the hash
unsigned long long wid = hash64 ( w , wlen, 0LL );
// i've decided not to make
// gbsortby:products.offerPrice case sensitive
if ( fieldCode == FIELD_GBSORTBY ||
fieldCode == FIELD_GBREVSORTBY ||
fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX )
wid = hash64Lower_utf8 ( w , wlen , 0LL );
// should we have normalized before hashing?
if ( fieldCode == FIELD_URL ||
fieldCode == FIELD_LINK ||
@ -3032,6 +3045,12 @@ struct QueryField g_fields[] = {
{"gbgigabitvector", FIELD_GBGIGABITVECTOR, false,""},
{"gbsamplevector", FIELD_GBSAMPLEVECTOR, false,""},
{"gbcontenthash", FIELD_GBCONTENTHASH, false,""},
{"gbsortby", FIELD_GBSORTBY, false,""},
{"gbrevsortby", FIELD_GBREVSORTBY, false,""},
{"gbnumbermin", FIELD_GBNUMBERMIN, false,""},
{"gbnumbermax", FIELD_GBNUMBERMAX, false,""},
{"gbcountry",FIELD_GBCOUNTRY,false,""},
{"gbad",FIELD_GBAD,false,""},
View File
@ -103,7 +103,11 @@ typedef unsigned long long qvec_t;
#define FIELD_GBCSENUM 50
#define FIELD_GBSECTIONHASH 51
#define FIELD_GBDOCID 52
#define FIELD_GBCONTENTHASH 53
#define FIELD_GBCONTENTHASH 53 // for deduping at spider time
#define FIELD_GBSORTBY 54 // i.e. gbsortby:price -> numeric termlist
#define FIELD_GBREVSORTBY 55 // i.e. gbrevsortby:price -> low to high
#define FIELD_GBNUMBERMIN 56
#define FIELD_GBNUMBERMAX 57
#define FIELD_GBOTHER 92
View File
@ -1,7 +1,7 @@
open-source-search-engine
=========================
An open source web and enterprise search engine. As can be seen http://www.gigablast.com/
An open source web and enterprise search engine, as can be seen on http://www.gigablast.com/.
RUNNING GIGABLAST
-----------------
Rdb.cpp
View File
@ -132,7 +132,7 @@ bool Rdb::init ( char *dir ,
// sanity
if ( ! dir ) { char *xx=NULL;*xx=0; }
// this is the working dir, all collection repositories are subdirs
m_dir.set ( dir );
//m_dir.set ( dir );
// catdb, statsdb, accessdb, facebookdb, syncdb
m_isCollectionLess = isCollectionLess;
// save the dbname NULL terminated into m_dbname/m_dbnameLen
@ -466,6 +466,11 @@ bool Rdb::updateToRebuildFiles ( Rdb *rdb2 , char *coll ) {
// . if this rdb is collectionless we set m_collectionlessBase in addBase()
bool Rdb::addColl ( char *coll ) {
collnum_t collnum = g_collectiondb.getCollnum ( coll );
return addColl2 ( collnum );
}
bool Rdb::addColl2 ( collnum_t collnum ) {
// catdb,statsbaccessdb,facebookdb,syncdb
if ( m_isCollectionLess )
collnum = (collnum_t)0;
@ -477,6 +482,12 @@ bool Rdb::addColl ( char *coll ) {
"breech maximum number of collections, %lli.",
m_dbname,collnum,maxColls);
}
CollectionRec *cr = g_collectiondb.m_recs[collnum];
char *coll = NULL;
if ( cr ) coll = cr->m_coll;
// . ensure no previous one exists
// . well it will be there but will be uninitialized, m_rdb will be NULL
RdbBase *base = getBase ( collnum );
@ -506,8 +517,9 @@ bool Rdb::addColl ( char *coll ) {
if(m_useTree) tree = &m_tree;
else buckets = &m_buckets;
// init it
if ( ! base->init ( m_dir.getDir() ,
// . init it
// . g_hostdb.m_dir should end in /
if ( ! base->init ( g_hostdb.m_dir, // m_dir.getDir() ,
m_dbname ,
m_dedup ,
m_fixedDataSize ,
@ -527,15 +539,16 @@ bool Rdb::addColl ( char *coll ) {
m_biasDiskPageCache ) ) {
logf(LOG_INFO,"db: %s: Failed to initialize db for "
"collection \"%s\".", m_dbname,coll);
exit(-1);
//exit(-1);
return false;
}
// . set CollectionRec::m_numPos/NegKeysInTree[rdbId]
// . these counts are now stored in the CollectionRec and not
// in RdbTree since the # of collections can be huge!
CollectionRec *cr = g_collectiondb.m_recs[collnum];
m_tree.setNumKeys ( cr );
if ( m_useTree ) {
m_tree.setNumKeys ( cr );
}
//if ( (long)collnum >= m_numBases ) m_numBases = (long)collnum + 1;
// Success
@ -544,7 +557,7 @@ bool Rdb::addColl ( char *coll ) {
bool Rdb::resetColl ( collnum_t collnum , collnum_t newCollnum ) {
char *coll = g_collectiondb.m_recs[collnum]->m_coll;
//char *coll = g_collectiondb.m_recs[collnum]->m_coll;
// remove these collnums from tree
if(m_useTree) m_tree.delColl ( collnum );
@ -552,11 +565,48 @@ bool Rdb::resetColl ( collnum_t collnum , collnum_t newCollnum ) {
// . close all files, set m_numFiles to 0 in RdbBase
// . TODO: what about outstanding merge or dump operations?
RdbBase *base = getBase ( collnum );
base->reset( );
// . it seems like we can't really recycle this too easily
// because reset() is not resetting filenames or the directory name?
// just nuke it and rebuild using addColl2()...
RdbBase *oldBase = getBase ( collnum );
mdelete (oldBase, sizeof(RdbBase), "Rdb Coll");
delete (oldBase);
// update this as well
base->m_collnum = newCollnum;
//base->reset( );
// NULL it out...
CollectionRec *oldcr = g_collectiondb.getRec(collnum);
oldcr->m_bases[(unsigned char)m_rdbId] = NULL;
char *coll = oldcr->m_coll;
char *msg = "deleted";
// if just resetting recycle base
if ( collnum != newCollnum ) {
addColl2 ( newCollnum );
// make a new base now
//RdbBase *newBase = mnew
// new cr
//CollectionRec *newcr = g_collectiondb.getRec(newCollnum);
// update this as well
//base->m_collnum = newCollnum;
// and the array
//newcr->m_bases[(unsigned char)m_rdbId] = base;
msg = "moved";
}
log("rdb: %s base from collrec "
"rdb=%s rdbid=%li coll=%s collnum=%li newcollnum=%li",
msg,m_dbname,(long)m_rdbId,coll,(long)collnum,
(long)newCollnum);
// new dir. otherwise RdbDump will try to dump out the recs to
// the old dir and it will end up coring
//char tmp[1024];
//sprintf(tmp , "%scoll.%s.%li",g_hostdb.m_dir,coll,(long)newCollnum );
//m_dir.set ( tmp );
// move the files into trash
// nuke it on disk
@ -597,19 +647,6 @@ bool Rdb::delColl ( char *coll ) {
// move all files to trash and clear the tree/buckets
resetColl ( collnum , collnum );
mdelete (base, sizeof(RdbBase), "Rdb Coll");
delete (base);
//m_bases[collnum] = NULL;
CollectionRec *cr = g_collectiondb.getRec(collnum);
// NULL it out...
cr->m_bases[(unsigned char)m_rdbId] = NULL;
log("rdb: deleted base from collrec "
"rdb=%s rdbid=%li coll=%s collnum=%li base=0x%lx",
m_dbname,(long)m_rdbId,coll,(long)collnum,(long)base);
// remove these collnums from tree
//if(m_useTree) m_tree.delColl ( collnum );
//else m_buckets.delColl ( collnum );
@ -921,7 +958,8 @@ bool Rdb::saveMaps ( bool useThread ) {
// shut it down
RdbBase *base = getBase(i);
//if ( m_bases[i] ) m_bases[i]->closeMaps ( m_urgent );
if ( base ) base->closeMaps ( m_urgent );
//if ( base ) base->closeMaps ( m_urgent );
if ( base ) base->saveMaps ( useThread );
}
return true;
}
@ -1242,6 +1280,7 @@ bool Rdb::gotTokenForDump ( ) {
m_dumpCollnum = (collnum_t)-1;
// clear this for dumpCollLoop()
g_errno = 0;
m_dumpErrno = 0;
m_fn = -1000;
// this returns false if blocked, which means we're ok, so we ret true
if ( ! dumpCollLoop ( ) ) return true;
@ -1414,9 +1453,16 @@ bool Rdb::dumpCollLoop ( ) {
// error?
if ( g_errno ) {
log("rdb: error dumping = %s",mstrerror(g_errno));
log("rdb: error dumping = %s . coll deleted from under us?",
mstrerror(g_errno));
// shit, what to do here? this is causing our RdbMem
// to get corrupted!
// because if we end up continuing it calls doneDumping()
// and updates RdbMem! maybe set a permanent error then!
// and if that is there do not clear RdbMem!
m_dumpErrno = g_errno;
// for now core out
char *xx=NULL;*xx=0;
//char *xx=NULL;*xx=0;
}
// loop back up since we did not block
@ -1437,11 +1483,12 @@ void Rdb::doneDumping ( ) {
// msg
//log(LOG_INFO,"db: Done dumping %s to %s (#%li): %s.",
// m_dbname,m_files[n]->getFilename(),n,mstrerror(g_errno));
log(LOG_INFO,"db: Done dumping %s: %s.",m_dbname,mstrerror(g_errno));
log(LOG_INFO,"db: Done dumping %s: %s.",m_dbname,
mstrerror(m_dumpErrno));
// give the token back so someone else can dump or merge
//g_msg35.releaseToken();
// free mem in the primary buffer
if ( ! g_errno ) m_mem.freeDumpedMem();
if ( ! m_dumpErrno ) m_mem.freeDumpedMem();
// . tell RdbDump it is done
// . we have to set this here otherwise RdbMem's memory ring buffer
// will think the dumping is no longer going on and use the primary
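The new m_dumpErrno makes the first dump error sticky: g_errno gets cleared by later operations, but doneDumping() still has to know the dump failed so it does not free records out of RdbMem that never actually reached disk. The pattern in isolation (names mirror the diff; the i/o is faked):

#include <stdio.h>

static long g_errno     = 0; // mimics the global error code
static long m_dumpErrno = 0; // sticky: first error of this dump

static void dumpOneChunk ( bool fail ) {
	g_errno = fail ? 5 : 0;  // pretend an i/o error on failure
	if ( g_errno && ! m_dumpErrno ) m_dumpErrno = g_errno;
	g_errno = 0;             // later code may clear the global...
}

static void doneDumping ( ) {
	// ...but the sticky copy still gates the memory release
	if ( ! m_dumpErrno ) printf ( "freeing dumped mem\n" );
	else printf ( "dump failed (%li), keeping mem\n" , m_dumpErrno );
}

int main ( ) {
	m_dumpErrno = 0;         // reset per dump, as in gotTokenForDump()
	dumpOneChunk ( false );
	dumpOneChunk ( true );   // error strikes mid-dump
	doneDumping ( );
	return 0;
}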
@ -2839,6 +2886,12 @@ void Rdb::enableWrites () {
else m_buckets.enableWrites();
}
bool Rdb::isWritable ( ) {
if(m_useTree) return m_tree.m_isWritable;
return m_buckets.m_isWritable;
}
bool Rdb::needsSave() {
if(m_useTree) return m_tree.m_needsSave;
else return m_buckets.needsSave();
Rdb.h
View File
@ -10,7 +10,7 @@
#include "RdbMem.h"
#include "RdbCache.h"
#include "RdbDump.h"
#include "Dir.h"
//#include "Dir.h"
#include "RdbBuckets.h"
// . each Rdb instance has an ID
@ -86,6 +86,7 @@ class Rdb {
~Rdb ( );
bool addColl ( char *coll );
bool addColl2 ( collnum_t collnum );
bool delColl ( char *coll );
bool resetColl ( collnum_t collnum , collnum_t newCollnum ) ;
@ -164,7 +165,8 @@ class Rdb {
bool deleteRecord ( collnum_t collnum , char *key );
// get the directory name where this rdb stores it's files
char *getDir ( ) { return m_dir.getDirname(); };
//char *getDir ( ) { return m_dir.getDirname(); };
char *getDir ( ) { return g_hostdb.m_dir; };
char *getStripeDir ( ) { return g_conf.m_stripeDir; };
long getFixedDataSize ( ) { return m_fixedDataSize; };
@ -185,7 +187,7 @@ class Rdb {
void disableWrites ();
void enableWrites ();
bool isWritable ( ) ;
RdbBase *getBase ( collnum_t collnum ) ;
long getNumBases ( ) { return g_collectiondb.m_numRecs; };
@ -352,7 +354,7 @@ class Rdb {
bool m_dedup;
long m_fixedDataSize;
Dir m_dir;
//Dir m_dir;
char m_dbname [32];
long m_dbnameLen;
@ -394,6 +396,8 @@ class Rdb {
long m_numFilesToMerge ;
long m_mergeStartFileNum ;
long m_dumpErrno;
// a dummy data string for deleting records when m_fixedDataSize > 0
char *m_dummy;
long m_dummySize ; // size of that dummy data
View File
@ -127,8 +127,15 @@ bool RdbBase::init ( char *dir ,
// set all our contained classes
//m_dir.set ( dir );
// set all our contained classes
// . "tmp" is bogus
// . /home/mwells/github/coll.john-test1113.654coll.john-test1113.655
char tmp[1024];
sprintf ( tmp , "%scoll.%s.%li" , dir , coll , (long)collnum );
// debug
log("base: adding new base for dir=%s coll=%s collnum=%li db=%s",
dir,coll,(long)collnum,dbname);
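Each base's directory is now derived from the working dir plus the collection name and collnum, e.g. coll.main.0 under g_hostdb.m_dir. The naming in isolation (using snprintf to bound the write; the real code sprintf's into a tmp[1024]):

#include <stdio.h>

int main ( ) {
	char tmp[1024];
	const char *dir  = "/home/user/gigablast/"; // placeholder; ends in '/'
	const char *coll = "main";
	long collnum = 0;
	// yields "/home/user/gigablast/coll.main.0"
	snprintf ( tmp , sizeof(tmp) , "%scoll.%s.%li" , dir , coll , collnum );
	printf ( "%s\n" , tmp );
	return 0;
}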
// catdb is collection independent
// make a special subdir to store the map and data files in if
@ -261,7 +268,8 @@ bool RdbBase::init ( char *dir ,
// we can't merge more than MAX_RDB_FILES files at a time
if ( minToMergeArg > MAX_RDB_FILES ) minToMergeArg = MAX_RDB_FILES;
m_minToMergeArg = minToMergeArg;
// set our m_files array
// . set our m_files array
// . m_dir is bogus causing this to fail
if ( ! setFiles () ) return false;
//long dataMem;
// if we're in read only mode, don't bother with *ANY* trees
@ -491,9 +499,11 @@ bool RdbBase::removeRebuildFromFilename ( BigFile *f ) {
bool RdbBase::setFiles ( ) {
// set our directory class
if ( ! m_dir.open ( ) )
// we are getting this from a bogus m_dir
return log("db: Had error opening directory %s", getDir());
// note it
logf(LOG_INFO,"db: Loading files for %s.",m_dbname );
logf(LOG_INFO,"db: Loading files for %s coll=%s (%li).",
m_dbname,m_coll,(long)m_collnum );
// . set our m_files array
// . addFile() will return -1 and set g_errno on error
// . the lower the fileId the older the data
@ -600,6 +610,8 @@ bool RdbBase::setFiles ( ) {
return false;
}
m_dir.close();
if ( ! converting ) return true;
// now if we are converting old titledb names to new...
@ -723,7 +735,6 @@ long RdbBase::addFile ( long id , bool isNew , long mergeNum , long id2 ,
sprintf ( name , "%s%04li.map", m_dbname, id );
m->set ( getDir() , name , m_fixedDataSize , m_useHalfKeys , m_ks ,
m_pageSize );
if ( ! isNew ) logf(LOG_INFO,"db: Adding %s.", name );
if ( ! isNew && ! m->readMap ( f ) ) {
// if out of memory, do not try to regen for that
if ( g_errno == ENOMEM ) return -1;
@ -759,6 +770,8 @@ long RdbBase::addFile ( long id , bool isNew , long mergeNum , long id2 ,
g_statsdb.m_disabled = false;
if ( ! status ) return log("db: Save failed.");
}
if ( ! isNew ) logf(LOG_INFO,"db: Added %s for collnum=%li pages=%li",
name ,(long)m_collnum,m->getNumPages());
// open this big data file for reading only
if ( ! isNew ) {
if ( mergeNum < 0 )
@ -1603,7 +1616,8 @@ void RdbBase::gotTokenForMerge ( ) {
return;
}
// make a log note
log(LOG_INFO,"merge: Resuming killed merge for %s.",m_dbname);
log(LOG_INFO,"merge: Resuming killed merge for %s coll=%s.",
m_dbname,m_coll);
// compute the total size of merged file
mint = 0;
long mm = 0;
View File
@ -416,6 +416,9 @@ bool RdbCache::getRecord ( collnum_t collnum ,
if ( maxAge == 0 ) return false;
// bail if no cache
if ( m_numPtrsMax <= 0 ) return false;
// if init() called failed because of oom...
if ( ! m_ptrs )
return log("cache: getRecord: failed because oom");
// time it -- debug
long long t = 0LL ;
if ( g_conf.m_logTimingDb ) t = gettimeofdayInMillisecondsLocal();
View File
@ -32,6 +32,10 @@ RdbTree::RdbTree () {
m_useProtection = false;
m_pickRight = false;
m_gettingList = 0;
// before resetting... we have to set this so clear() won't breach buffers
m_rdbId = -1;
reset();
}
@ -125,10 +129,6 @@ bool RdbTree::set ( long fixedDataSize ,
// sanity
if ( rdbId < -1 ) { char *xx=NULL;*xx=0; }
if ( rdbId >= RDB_END ) { char *xx=NULL;*xx=0; }
// is it a valid one
m_isRealTree = true;
if ( m_rdbId <= RDB_NONE ) m_isRealTree = false;
if ( m_rdbId >= RDB_END ) m_isRealTree = false;
// if its doledb, set it
//if ( dbname && strcmp(dbname,"doledb") == 0 ) m_rdbId = RDB_DOLEDB;
// adjust m_maxMem to virtual infinity if it was -1
@ -151,7 +151,7 @@ bool RdbTree::set ( long fixedDataSize ,
// initiate protection
if ( m_useProtection ) protect();
// allocate the nodes
return growTree ( maxNumNodes );
return growTree ( maxNumNodes , 0 );
}
void RdbTree::reset ( ) {
@ -273,11 +273,12 @@ long RdbTree::clear ( ) {
// clear tree counts for all collections!
long nc = g_collectiondb.m_numRecs;
// BUT only if we are an Rdb::m_tree!!!
if ( ! m_isRealTree ) nc = 0;
if ( m_rdbId == -1 ) nc = 0;
// otherwise, we overwrite stuff in CollectionRec we shouldn't
for ( long i = 0 ; i < nc ; i++ ) {
CollectionRec *cr = g_collectiondb.getRec(i);
if ( ! cr ) continue;
//if ( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
cr->m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
cr->m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
}
@ -547,7 +548,8 @@ long RdbTree::addNode ( collnum_t collnum ,
// collections using the same Rdb::m_tree!
// crap, when fixing a tree this will segfault because
// m_recs[collnum] is NULL.
if ( m_isRealTree && g_collectiondb.m_recs[collnum] ) {
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[collnum]->
m_numNegKeysInTree[(unsigned char)m_rdbId] =0;
g_collectiondb.m_recs[collnum]->
@ -629,7 +631,8 @@ long RdbTree::addNode ( collnum_t collnum ,
// collections using the same Rdb::m_tree!
// crap, when fixing a tree this will segfault because
// m_recs[collnum] is NULL.
if ( m_isRealTree && g_collectiondb.m_recs[collnum] ) {
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[collnum]->
m_numNegKeysInTree[(unsigned char)m_rdbId]++;
}
@ -639,7 +642,8 @@ long RdbTree::addNode ( collnum_t collnum ,
//m_numPosKeysPerColl[collnum]++;
// crap, when fixing a tree this will segfault because
// m_recs[collnum] is NULL.
if ( m_isRealTree && g_collectiondb.m_recs[collnum] ) {
if ( m_rdbId >= 0 && g_collectiondb.m_recs[collnum] ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[collnum]->
m_numPosKeysInTree[(unsigned char)m_rdbId]++;
}
@ -834,14 +838,14 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
if ( KEYNEG(m_keys,i,m_ks) ) {
m_numNegativeKeys--;
//m_numNegKeysPerColl[m_collnums[i]]--;
if ( m_isRealTree )
if ( m_rdbId >= 0 )
g_collectiondb.m_recs[m_collnums[i]]->
m_numNegKeysInTree[(unsigned char)m_rdbId]--;
}
else {
m_numPositiveKeys--;
//m_numPosKeysPerColl[m_collnums[i]]--;
if ( m_isRealTree )
if ( m_rdbId >= 0 )
g_collectiondb.m_recs[m_collnums[i]]->
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
}
@ -868,7 +872,8 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
m_numPositiveKeys = 0;
//m_numNegKeysPerColl[m_collnums[i]] = 0;
//m_numPosKeysPerColl[m_collnums[i]] = 0;
if ( m_isRealTree ) {
if ( m_rdbId >= 0 ) {
//if ( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[m_collnums[i]]->
m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
g_collectiondb.m_recs[m_collnums[i]]->
@ -937,16 +942,20 @@ void RdbTree::deleteNode ( long i , bool freeData ) {
if ( KEYNEG(m_keys,i,m_ks) ) {
m_numNegativeKeys--;
//m_numNegKeysPerColl[m_collnums[i]]--;
if ( m_isRealTree )
if ( m_rdbId >= 0 ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[m_collnums[i]]->
m_numNegKeysInTree[(unsigned char)m_rdbId]--;
}
}
else {
m_numPositiveKeys--;
//m_numPosKeysPerColl[m_collnums[i]]--;
if ( m_isRealTree )
if ( m_rdbId >= 0 ) {
//if( ((unsigned char)m_rdbId)>=RDB_END){char *xx=NULL;*xx=0; }
g_collectiondb.m_recs[m_collnums[i]]->
m_numPosKeysInTree[(unsigned char)m_rdbId]--;
}
}
// debug step -- check chain from iparent down making sure that
// all kids don't have -2 for their parent... seems to be a rare bug
@ -1310,7 +1319,7 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
// . grow tree to "n" nodes
// . this will now actually grow from a current size to a new one
bool RdbTree::growTree ( long nn ) {
bool RdbTree::growTree ( long nn , long niceness ) {
// if we're that size, bail
if ( m_numNodes == nn ) return true;
@ -1337,27 +1346,35 @@ bool RdbTree::growTree ( long nn ) {
long cs = sizeof(collnum_t);
cp =(collnum_t *)mrealloc (m_collnums, on*cs,nn*cs,m_allocName);
if ( ! cp ) goto error;
QUICKPOLL(niceness);
kp = (char *) mrealloc ( m_keys , on*k , nn*k , m_allocName );
if ( ! kp ) goto error;
QUICKPOLL(niceness);
lp = (long *) mrealloc ( m_left , on*4 , nn*4 , m_allocName );
if ( ! lp ) goto error;
QUICKPOLL(niceness);
rp = (long *) mrealloc ( m_right , on*4 , nn*4 , m_allocName );
if ( ! rp ) goto error;
QUICKPOLL(niceness);
pp = (long *) mrealloc ( m_parents , on*4 , nn*4 , m_allocName );
if ( ! pp ) goto error;
QUICKPOLL(niceness);
// deal with data, sizes and depth arrays on a basis of need
if ( m_fixedDataSize != 0 ) {
dp =(char **)mrealloc (m_data , on*d,nn*d,m_allocName);
if ( ! dp ) goto error;
QUICKPOLL(niceness);
}
if ( m_fixedDataSize == -1 ) {
sp =(long *)mrealloc (m_sizes , on*4,nn*4,m_allocName);
if ( ! sp ) goto error;
QUICKPOLL(niceness);
}
if ( m_doBalancing ) {
tp =(char *)mrealloc (m_depth , on ,nn ,m_allocName);
if ( ! tp ) goto error;
QUICKPOLL(niceness);
}
// re-assign
@ -1385,6 +1402,7 @@ bool RdbTree::growTree ( long nn ) {
// protect it from writes
if ( m_useProtection ) protect ( );
QUICKPOLL(niceness);
return true;
error:
@ -1399,41 +1417,49 @@ bool RdbTree::growTree ( long nn ) {
ss = (collnum_t *)mrealloc ( cp , nn*cs , on*cs , m_allocName);
if ( ! ss ) { char *xx = NULL; *xx = 0; }
m_collnums = ss;
QUICKPOLL(niceness);
}
if ( kp ) {
kk = (char *)mrealloc ( kp, nn*k, on*k, m_allocName );
if ( ! kk ) { char *xx = NULL; *xx = 0; }
m_keys = kk;
QUICKPOLL(niceness);
}
if ( lp ) {
x = (long *)mrealloc ( lp , nn*4 , on*4 , m_allocName );
if ( ! x ) { char *xx = NULL; *xx = 0; }
m_left = x;
QUICKPOLL(niceness);
}
if ( rp ) {
x = (long *)mrealloc ( rp , nn*4 , on*4 , m_allocName );
if ( ! x ) { char *xx = NULL; *xx = 0; }
m_right = x;
QUICKPOLL(niceness);
}
if ( pp ) {
x = (long *)mrealloc ( pp , nn*4 , on*4 , m_allocName );
if ( ! x ) { char *xx = NULL; *xx = 0; }
m_parents = x;
QUICKPOLL(niceness);
}
if ( dp && m_fixedDataSize != 0 ) {
p = (char **)mrealloc ( dp , nn*d , on*d , m_allocName );
if ( ! p ) { char *xx = NULL; *xx = 0; }
m_data = p;
QUICKPOLL(niceness);
}
if ( sp && m_fixedDataSize == -1 ) {
x = (long *)mrealloc ( sp , nn*4 , on*4 , m_allocName );
if ( ! x ) { char *xx = NULL; *xx = 0; }
m_sizes = x;
QUICKPOLL(niceness);
}
if ( tp && m_doBalancing ) {
s = (char *)mrealloc ( tp , nn , on , m_allocName );
if ( ! s ) { char *xx = NULL; *xx = 0; }
m_depth = s;
QUICKPOLL(niceness);
}
return log("db: Failed to grow tree for %s from %li to %li bytes: %s.",
@ -2612,7 +2638,7 @@ bool RdbTree::fastLoad ( BigFile *f , RdbMem *stack ) {
if ( m_numNodes < minUnusedNode ) {
log(LOG_INIT,
"db: Growing tree to make room for %s",f->getFilename());
if ( ! growTree ( minUnusedNode ) ) {
if ( ! growTree ( minUnusedNode , 0 ) ) {
f->close();
m_isLoading = false;
return log("db: Failed to grow tree: %s.",
@ -3050,14 +3076,14 @@ void RdbTree::cleanTree ( ) { // char **bases ) {
}
long RdbTree::getNumNegativeKeys ( collnum_t collnum ) {
if ( ! m_isRealTree ) { char *xx=NULL;*xx=0; }
if ( m_rdbId < 0 ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = g_collectiondb.m_recs[collnum];
if ( ! cr ) return 0;
return cr->m_numNegKeysInTree[(unsigned char)m_rdbId];
}
long RdbTree::getNumPositiveKeys ( collnum_t collnum ) {
if ( ! m_isRealTree ) { char *xx=NULL;*xx=0; }
if ( m_rdbId < 0 ) { char *xx=NULL;*xx=0; }
CollectionRec *cr = g_collectiondb.m_recs[collnum];
if ( ! cr ) return 0;
return cr->m_numPosKeysInTree[(unsigned char)m_rdbId];
@ -3067,6 +3093,8 @@ void RdbTree::setNumKeys ( CollectionRec *cr ) {
if ( ! cr ) return;
if ( ((unsigned char)m_rdbId) >= RDB_END ) { char *xx=NULL;*xx=0; }
collnum_t collnum = cr->m_collnum;
cr->m_numNegKeysInTree[(unsigned char)m_rdbId] = 0;
cr->m_numPosKeysInTree[(unsigned char)m_rdbId] = 0;
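The growTree() changes above thread a niceness through and drop a QUICKPOLL() after every mrealloc(): growing a large tree means several big copies back to back, and the poll gives the event loop a chance to breathe between them. The shape of the pattern (QUICKPOLL is mocked as a function here; in Gigablast it is a macro that may service pending work for nice callers; error recovery is elided):

#include <stdio.h>
#include <stdlib.h>

// mock of the QUICKPOLL(niceness) macro: a yield point that only
// fires for nice (non-critical) callers
static void QUICKPOLL ( long niceness ) {
	if ( niceness > 0 ) printf ( "yield point\n" );
}

int main ( ) {
	long on = 1000 , nn = 2000 , niceness = 1;
	long *left  = (long *)malloc ( on * 4 );
	long *right = (long *)malloc ( on * 4 );
	if ( ! left || ! right ) return 1;
	// grow one array, breathe, grow the next -- never two large
	// copies back to back without a poll in between
	left  = (long *)realloc ( left  , nn * 4 );
	QUICKPOLL ( niceness );
	right = (long *)realloc ( right , nn * 4 );
	QUICKPOLL ( niceness );
	free ( left );
	free ( right );
	return 0;
}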
View File
@ -360,7 +360,7 @@ class RdbTree {
// need to pass this file to the fastSave() thread
//BigFile *m_saveFile;
char m_rdbId;
char m_isRealTree;
//char m_isRealTree;
char m_dir[128];
char m_dbname[32];
char m_memTag[16];
@ -401,7 +401,7 @@ class RdbTree {
// . returns true if tree doesn't need to grow/shrink
// . re-allocs the m_keys,m_data,m_sizes,m_leftNodes,m_rightNodes
// . used for growing AND shrinking the table
bool growTree ( long newNumNodes );
bool growTree ( long newNumNodes , long niceness );
// are we responsible for freeing nodes' data
bool m_ownData;
View File
@ -22,11 +22,12 @@
// 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,0,0,0,0,0,0,0,0
// };
SafeBuf::SafeBuf(long initSize) {
SafeBuf::SafeBuf(long initSize, char *label ) {
if(initSize <= 0) initSize = 1;
m_capacity = initSize;
m_length = 0;
m_buf = (char*)mrealloc(NULL, 0, m_capacity, "SafeBuf");
m_label = label;
m_buf = (char*)mrealloc(NULL, 0, m_capacity, m_label );
if(!m_buf) m_capacity = 0;
m_usingStack = false;
m_encoding = csUTF8;
@ -39,6 +40,11 @@ SafeBuf::SafeBuf() {
m_buf = NULL;
m_usingStack = false;
m_encoding = csUTF8;
m_label = NULL;
}
void SafeBuf::setLabel ( char *label ) {
m_label = label;
}
SafeBuf::SafeBuf(char* stackBuf, long cap) {
@ -47,6 +53,7 @@ SafeBuf::SafeBuf(char* stackBuf, long cap) {
m_buf = stackBuf;
m_length = 0;
m_encoding = csUTF8;
m_label = NULL;
}
SafeBuf::SafeBuf(char *heapBuf, long bufMax, long bytesInUse, bool ownData) {
@ -292,8 +299,14 @@ bool SafeBuf::advance ( long i ) {
return true;
}
bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
if ( ! label ) label = "SafeBuf";
bool SafeBuf::reserve(long i , char *label, bool clearIt ) {
// if we don't already have a label and they provided one, use it
if ( ! m_label ) {
if ( label ) m_label = label;
else m_label = "SafeBuf";
}
if(m_length + i > m_capacity) {
char *tmpBuf = m_buf;
long tmpCap = m_capacity;
@ -301,7 +314,7 @@ bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
m_buf = NULL;
m_capacity += i;
//if(m_capacity < 8) m_capacity = 8;
m_buf = (char*)mrealloc(m_buf, 0, m_capacity, label);
m_buf = (char*)mrealloc(m_buf, 0, m_capacity,m_label);
if(!m_buf) {
m_buf = tmpBuf;
m_capacity = tmpCap;
@ -320,7 +333,7 @@ bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
}
m_capacity += i;
//if(m_capacity < 8) m_capacity = 8;
m_buf = (char*)mrealloc(m_buf, tmpCap, m_capacity,label);
m_buf = (char*)mrealloc(m_buf, tmpCap, m_capacity,m_label);
if(!m_buf) {
m_buf = tmpBuf;
m_capacity = tmpCap;
@ -344,11 +357,11 @@ bool SafeBuf::reserve(long i, char *label, bool clearIt ) {
//reserve this many bytes, if we need to alloc, we double the
//buffer size.
bool SafeBuf::reserve2x(long i) {
bool SafeBuf::reserve2x(long i, char *label) {
//watch out for overflow!
if((m_capacity << 1) + i < 0) return false;
if(i + m_length >= m_capacity)
return reserve(m_capacity + i);
return reserve(m_capacity + i,label);
else return true;
}
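The label threaded through the constructor, setLabel() and reserve() tags each SafeBuf's allocations in the memory tables, so a leak shows up under a name like "scinfobuf" instead of a generic "SafeBuf". The idea in miniature (LabeledBuf is a toy, not the real class):

#include <stdio.h>
#include <stdlib.h>

// toy labeled buffer in the spirit of mrealloc(..., label): every
// allocation is attributed to a name in the memory tables
struct LabeledBuf {
	char       *m_buf;
	long        m_capacity;
	const char *m_label;
};

static bool reserveBuf ( LabeledBuf *b , long need , const char *label ) {
	// keep the first label we are given, like SafeBuf::reserve()
	if ( ! b->m_label ) b->m_label = label ? label : "SafeBuf";
	char *nb = (char *)realloc ( b->m_buf , b->m_capacity + need );
	if ( ! nb ) return false;
	b->m_buf       = nb;
	b->m_capacity += need;
	printf ( "alloc %li bytes for \"%s\"\n" , need , b->m_label );
	return true;
}

int main ( ) {
	LabeledBuf b = { NULL , 0 , NULL };
	reserveBuf ( &b , 1024 , "scinfobuf" );
	reserveBuf ( &b , 2048 , NULL ); // label sticks: still "scinfobuf"
	free ( b.m_buf );
	return 0;
}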
@ -369,8 +382,8 @@ long SafeBuf::dumpToFile(char *filename ) {
filename);
return -1;
}
logf(LOG_DEBUG, "test: safebuf %li bytes written to %s",m_length,
filename);
//logf(LOG_DEBUG, "test: safebuf %li bytes written to %s",m_length,
// filename);
retry23:
long bytes = write(fd, (char*)m_buf, m_length) ;
if ( bytes != m_length ) {
@ -972,7 +985,8 @@ bool SafeBuf::htmlEncode(char *s, long len, bool encodePoundSign ,
// . sanity check
if ( m_encoding == csUTF16 ) { char *xx = NULL; *xx = 0; }
// alloc some space if we need to. add a byte for NULL termination.
if(m_length+len+1>=m_capacity && !reserve(m_capacity+len))return false;
if(m_length+len+1>=m_capacity && !reserve(m_capacity+len+1))
return false;
// tmp vars
char *t = m_buf + m_length;
char *tend = m_buf + m_capacity;
@ -2517,7 +2531,11 @@ bool SafeBuf::decodeJSON ( long niceness ) {
// . this is used by xmldoc.cpp to PARTIALLY decode a json buf so we do not
// index letters in escapes like \n \r \f \t \uxxxx \\ \/
// . SO we do keep \"
bool SafeBuf::safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness) {
// . so when indexing a doc we set decodeAll to FALSE, but if you want to
// decode quotation marks as well then set decodeAll to TRUE!
bool SafeBuf::safeDecodeJSONToUtf8 ( char *json,
long jsonLen,
long niceness ) {
// how much space to reserve for the copy?
long need = jsonLen;
@ -2579,6 +2597,15 @@ bool SafeBuf::safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness) {
src += 2;
continue;
}
// we do not decode quotation marks when indexing
// the doc so we can preserve json names/value pair
// information for indexing purposes. however,
// Title.cpp DOES want to decode quotations.
if ( src[1] == '\"' ) { // && decodeAll ) {
*dst++ = '\"';
src += 2;
continue;
}
// utf8? if not, just skip the slash
if ( src[1] != 'u' ) {
// no, keep the slash so if we have /"
@ -3155,3 +3182,49 @@ bool SafeBuf::htmlDecode ( char *src,
// good to go
return true;
}
void SafeBuf::replaceChar ( char src , char dst ) {
char *px = m_buf;
char *pxEnd = m_buf + m_length;
for ( ; px < pxEnd ; px++ ) if ( *px == src ) *px = dst;
}
// encode a double quote char to two double quote chars
bool SafeBuf::csvEncode ( char *s , long len , long niceness ) {
if ( ! s ) return true;
// assume all chars are double quotes and will have to be encoded
long need = len * 2 + 1;
if ( ! reserve ( need ) ) return false;
// tmp vars
char *dst = m_buf + m_length;
//char *dstEnd = m_buf + m_capacity;
// scan through all
char *send = s + len;
for ( ; s < send ; s++ ) {
// breathe
QUICKPOLL ( niceness );
// convert it?
if ( *s == '\"' ) {
*dst++ = '\"';
*dst++ = '\"';
continue;
}
//if ( *s == '\\' ) {
// *dst++ = '\\';
// *dst++ = '\\';
// continue;
//}
*dst++ = *s;
}
m_length += dst - (m_buf + m_length);
nullTerm();
return true;
}
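csvEncode() performs the standard CSV escape: every embedded double quote becomes two double quotes, and the caller then wraps the whole field in quotes when emitting the .csv row (see the new search.csv path alias in Pages.cpp above). What it produces, in sketch form:

#include <stdio.h>

// same escape csvEncode() performs: '"' -> '""'
static void csvEscape ( char *dst , const char *s ) {
	for ( ; *s ; s++ ) {
		if ( *s == '\"' ) *dst++ = '\"';
		*dst++ = *s;
	}
	*dst = '\0';
}

int main ( ) {
	char out[64];
	csvEscape ( out , "say \"hi\" now" );
	// the caller wraps the escaped field in quotes for the .csv row
	printf ( "\"%s\"\n" , out ); // prints: "say ""hi"" now"
	return 0;
}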
View File
@ -8,12 +8,14 @@
struct SafeBuf {
//*TRUCTORS
SafeBuf();
SafeBuf(long initSize);
SafeBuf(long initSize, char *label = NULL);
//be careful with passing in a stackBuf! it could go out
//of scope independently of the safebuf.
SafeBuf(char* stackBuf, long cap);
SafeBuf(char *heapBuf, long bufMax, long bytesInUse, bool ownData);
~SafeBuf();
void setLabel ( char *label );
// CAUTION: BE CAREFUL WHEN USING THE FOLLOWING TWO FUNCTIONS!!
// setBuf() allows you reset the contents of the SafeBuf to either
@ -59,6 +61,7 @@ struct SafeBuf {
bool convertJSONtoXML ( long niceness , long startConvertPos );
bool safeDecodeJSONToUtf8 ( char *json, long jsonLen, long niceness);
// bool decodeAll = false );
bool decodeJSONToUtf8 ( long niceness );
bool decodeJSON ( long niceness );
@ -96,6 +99,9 @@ struct SafeBuf {
bool safeStrcpy ( char *s ) ;
bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
bool safeUtf8ToJSON ( char *utf8 ) ;
bool csvEncode ( char *s , long len , long niceness = 0 );
//bool pushLong ( long val ) { return safeMemcpy((char *)&val,4); }
bool cat(SafeBuf& c);
// . only cat the sections/tag that start with "tagFilter"
@ -106,10 +112,11 @@ struct SafeBuf {
void reset() { m_length = 0; }
void purge(); // Clear all data and free all allocated memory
bool advance ( long i ) ;
// . if clearIt is true we init the new buffer space to zeroes
// . used by Collectiondb.cpp
bool reserve(long i, char *label=NULL , bool clearIt = false );
bool reserve2x(long i);
bool reserve2x(long i, char *label = NULL );
char *makeSpace ( long size ) {
if ( ! reserve ( size ) ) return NULL;
@ -143,6 +150,7 @@ struct SafeBuf {
char *t , long tlen ,
long niceness ,
long startOff = 0 );
void replaceChar ( char src , char dst );
bool copyToken(char* s);
//output encoding
bool setEncoding(short cs);
@ -326,6 +334,7 @@ struct SafeBuf {
long m_capacity;
long m_length;
char *m_buf;
char *m_label;
bool m_usingStack;
short m_encoding; // output charset

View File

@ -342,7 +342,7 @@ if (! cr->hasSearchPermission ( sock, encapIp ) ) {
// we need to get some cgi values in order to correct the defaults
// based on if we're doing an xml feed, have a site: query, etc.
long xml = r->getLong ( "xml" , 0 ); // was "raw"
//long xml = r->getLong ( "xml" , 0 ); // was "raw"
long siteLen = 0; r->getString ("site",&siteLen);
long sitesLen = 0;
char *sites = r->getString ("sites",&sitesLen,NULL);
@ -353,8 +353,11 @@ if (! cr->hasSearchPermission ( sock, encapIp ) ) {
! m_whiteListBuf.nullTerm() ) )
return log("query: unable to strcpy whitelist");
char format = getFormatFromRequest ( r );
// now override automatic defaults for special cases
if ( xml > 0 ) {
if ( format != FORMAT_HTML ) {
m_familyFilter = 0;
// this is causing me a headache when on when i dont know it
m_restrictIndexdbForQuery = false;
@ -365,6 +368,8 @@ if (! cr->hasSearchPermission ( sock, encapIp ) ) {
m_spellCheck = 0;
m_refs_numToGenerate = 0;
m_refs_docsToScan = 0;
// default scoring info to off
m_getDocIdScoringInfo = false;
}
else if ( m_siteLen > 0 ) {
m_restrictIndexdbForQuery = false;
@ -654,18 +659,19 @@ if (! cr->hasSearchPermission ( sock, encapIp ) ) {
// use "&dg=1" to debug gigabits
m_debugGigabits = r->getLong("dg",0);
// override
m_format = format;
// . omit scoring info from the xml feed for now
// . we have to roll this out to gk144 net i think
if ( xml > 0 )
m_getDocIdScoringInfo = 0;
//if ( m_format != FORMAT_HTML )
// m_getDocIdScoringInfo = 0;
// turn off by default!
if ( ! r->getLong("gigabits",0) ) {
m_numTopicGroups = 0;
}
//////////////////////////////////////
//
// transform input into classes
@ -709,7 +715,8 @@ if (! cr->hasSearchPermission ( sock, encapIp ) ) {
// . returns false and sets g_errno on error
// . sets m_qbuf1 and m_qbuf2
if ( ! setQueryBuffers ( r ) ) return false;
if ( ! setQueryBuffers (r) )
return log("query: setQueryBuffers: %s",mstrerror(g_errno));
/* --- Virtual host language detection --- */
if(r->getHost()) {
@ -1089,10 +1096,11 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
// if (qcs == csUTF8) {qcs = csISOLatin1;goto doOver;}
// if (qcs != csISOLatin1) {qcs = csUTF8;goto doOver;}
//}
// append plus terms
if ( m_plusLen > 0 ) {
char *s = m_plus, *send = m_plus + m_plusLen;
char *s = m_plus;
char *send = m_plus + m_plusLen;
//if ( p > pstart && p < pend ) *p++ = ' ';
//if ( p2 > pstart2 && p2 < pend2) *p2++ = ' ';
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
@ -1108,7 +1116,7 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
} else {
while (!isspace(*s2) && s2 < send) s2++;
}
if (s < send) break;
if (s2 < send) break;
//if (p < pend) *p++ = '+';
//if (p2 < pend2) *p2++ = '+';
m_sbuf1.pushChar('+');
@ -1142,7 +1150,8 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
}
// append minus terms
if ( m_minusLen > 0 ) {
char *s = m_minus, *send = m_minus + m_minusLen;
char *s = m_minus;
char *send = m_minus + m_minusLen;
//if ( p > pstart && p < pend ) *p++ = ' ';
//if ( p2 > pstart2 && p2 < pend2) *p2++ = ' ';
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
@ -1158,7 +1167,7 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
} else {
while (!isspace(*s2) && s2 < send) s2++;
}
if (s < send) break;
if (s2 < send) break;
//if (p < pend) *p++ = '-';
//if (p2 < pend2) *p2++ = '-';
m_sbuf1.pushChar('-');
@ -1202,9 +1211,9 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
}
// null terms
m_sbuf1.pushChar('\0');
m_sbuf2.pushChar('\0');
m_sbuf3.pushChar('\0');
if ( ! m_sbuf1.pushChar('\0') ) return false;
if ( ! m_sbuf2.pushChar('\0') ) return false;
if ( ! m_sbuf3.pushChar('\0') ) return false;
// the natural query
m_displayQuery = m_sbuf2.getBufStart();// + displayQueryOffset;
@ -1239,6 +1248,7 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
long dcatId = -1;
// get the final query
char *q =m_sbuf1.getBufStart();
if ( q ) sscanf(q,"gbpcatid:%li",&pcatId);
if ( q ) sscanf(q,"gbcatid:%li",&dcatId);
// pick the one that is valid
@ -1301,3 +1311,33 @@ uint8_t SearchInput::detectQueryLanguage(void) {
return(lang);
}
char getFormatFromRequest ( HttpRequest *r ) {
char format = FORMAT_HTML;
// what format should search results be in? default is html
char *formatStr = r->getString("format", NULL );
if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
// support old api &xml=1 to mean &format=xml
if ( r->getLong("xml",0) ) {
format = FORMAT_XML;
}
// also support &json=1
if ( r->getLong("json",0) ) {
format = FORMAT_JSON;
}
if ( r->getLong("csv",0) ) {
format = FORMAT_CSV;
}
return format;
}
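// a usage sketch (hypothetical request): for
// GET /search?q=test&format=html&json=1 this returns FORMAT_JSON,
// because the legacy &json=1 flag is checked after, and so overrides,
// the &format= string:
//
//   char format = getFormatFromRequest ( r );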

View File

@ -22,6 +22,8 @@
#define MAX_TOPIC_GROUPS 10
char getFormatFromRequest ( class HttpRequest *r ) ;
// . parameters used to generate a set of related topics (gigabits)
// . you can have Msg24 generate multiple sets of related topics in one call
class TopicGroup {
@ -43,6 +45,11 @@ class TopicGroup {
long m_topicMaxPunctLen;
};
#define FORMAT_HTML 0
#define FORMAT_XML 1
#define FORMAT_JSON 2
#define FORMAT_CSV 3
class SearchInput {
public:
@ -211,7 +218,13 @@ class SearchInput {
// tier sizes can change with different "raw" values, therefore,
// so can search results
long m_xml; // msg40
//long m_xml; // msg40
// can be 0 for FORMAT_HTML, 1 = FORMAT_XML, 2=FORMAT_JSON
//long m_formatStrLen;
//char *m_formatStr;
// can be 0 = FORMAT_HTML, 1 = FORMAT_XML, 2 = FORMAT_JSON, 3 = FORMAT_CSV
char m_format;
// this should be part of the key because it will affect the results!
char m_queryExpansion;

View File

@ -252,6 +252,8 @@ bool Sections::set ( Words *w ,
// breathe
QUICKPOLL(m_niceness);
m_sectionPtrBuf.setLabel("psectbuf");
// separate buf now for section ptr for each word
if ( ! m_sectionPtrBuf.reserve ( nw *4 ) ) return true;
m_sectionPtrs = (Section **)m_sectionPtrBuf.getBufStart();
@ -260,6 +262,8 @@ bool Sections::set ( Words *w ,
// allocate m_sectionBuf
m_sections = NULL;
m_sectionBuf.setLabel ( "sectbuf" );
if ( ! m_sectionBuf.reserve ( need ) )
return true;
@ -15160,6 +15164,9 @@ bool Sections::print2 ( SafeBuf *sbuf ,
// save ptrs
m_sbuf = sbuf;
m_sbuf->setLabel ("sectprnt");
//m_pt = pt;
//m_et = et;
//m_at = at;

View File

@ -1000,6 +1000,8 @@ bool Speller::loadUnifiedDict() {
bool needRebuild = false;
m_unifiedBuf.setLabel("unibuf");
// this MUST be there
if ( m_unifiedBuf.fillFromFile(g_hostdb.m_dir,
"unifiedDict-buf.txt" ) == 0 )

File diff suppressed because it is too large Load Diff

View File

@ -45,6 +45,9 @@
#define SP_ADMIN_PAUSED 8 // g_conf.m_spideringEnabled = false
#define SP_COMPLETED 9 // crawl is done, and no repeatCrawl is scheduled
bool testPatterns ( ) ;
bool doesStringContainPattern ( char *content , char *pattern ) ;
bool getSpiderStatusMsg ( class CollectionRec *cx ,
class SafeBuf *msg ,
long *status ) ;
@ -603,6 +606,8 @@ class SpiderRequest {
long m_hasContactInfoValid :1;
long m_isContactyValid :1;
long m_hasAddressValid :1;
//long m_matchesUrlCrawlPattern :1;
//long m_matchesUrlProcessPattern:1;
long m_hasTODValid :1;
long m_hasSiteVenueValid :1;
long m_siteNumInlinksValid :1;
@ -832,8 +837,8 @@ class SpiderReply {
// was the request an injection request
long m_fromInjectionRequest :1;
// did we TRY to send it to the diffbot backend filter? might be err?
long m_sentToDiffbot:1;
long m_reserved2 :1;
long m_sentToDiffbot :1;
long m_hadDiffbotError :1;
long m_reserved3 :1;
long m_reserved4 :1;
@ -1111,6 +1116,7 @@ class SpiderColl {
key_t m_waitingTreeKey;
bool m_waitingTreeKeyValid;
long m_scanningIp;
bool m_gotNewRequestsForScanningIp;
// start key for reading doledb
key_t m_msg5StartKey;
@ -1125,7 +1131,7 @@ class SpiderColl {
// for reading lists from spiderdb
Msg5 m_msg5;
bool m_gettingList;
bool m_gettingList1;
// how many outstanding spiders a priority has
long m_outstandingSpiders[MAX_SPIDER_PRIORITIES];
@ -1276,7 +1282,7 @@ class SpiderLoop {
bool printLockTable ( );
long getNumSpidersOutPerIp ( long firstIp ) ;
long getNumSpidersOutPerIp ( long firstIp , collnum_t collnum ) ;
// free all XmlDocs and m_list
void reset();
@ -1301,7 +1307,7 @@ class SpiderLoop {
// . returns true and sets g_errno on error
bool spiderUrl9 ( class SpiderRequest *sreq ,
key_t *doledbKey ,
char *coll ,
collnum_t collnum,//char *coll ,
long sameIpWaitTime , // in milliseconds
long maxSpidersOutPerIp );
@ -1312,7 +1318,8 @@ class SpiderLoop {
// state memory for calling SpiderUrl2() (maybe also getLocks()!)
SpiderRequest *m_sreq;
char *m_coll;
//char *m_coll;
collnum_t m_collnum;
char *m_content;
long m_contentLen;
char m_contentHasMime;
@ -1354,7 +1361,7 @@ class SpiderLoop {
class SpiderColl *m_sc;
// used to avoid calling getRec() twice!
bool m_gettingList;
//bool m_gettingList0;
long m_outstanding1;
bool m_gettingDoledbList;

View File

@ -499,7 +499,7 @@ void drawLine2 ( SafeBuf &sb ,
sb.safePrintf("<div style=\"position:absolute;"
"left:%li;"
"top:%li;"
"background-color:#%lx;"
"background-color:#%06lx;"
"z-index:-5;"
"min-height:%lipx;"
"min-width:%lipx;\"></div>\n"

View File

@ -25,7 +25,7 @@ class StatPoint {
#define MAX_POINTS 6000
#define MAX_WIDTH 6
#define DY 600 // pixels vertical
#define DY 1000 // pixels vertical
#define DX 1000 // pixels across
#define DT (20*1000) // time window, 20 seconds
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line

View File

@ -526,16 +526,16 @@ bool Statsdb::makeGIF ( long t1Arg ,
#define MAX_POINTS 6000
#define MAX_WIDTH 6
#define DY 600 // pixels vertical
#define DX 1000 // pixels across
#define MAX_LINES (DY / (MAX_WIDTH+1)) // leave free pixel above each line
#define DY2 600 // pixels vertical
#define DX2 1000 // pixels across
#define MAX_LINES2 (DY2 / (MAX_WIDTH+1)) // leave free pixel above each line
long Statsdb::getImgHeight() {
return (long)DY + m_by * 2;
return (long)DY2 + m_by * 2;
}
long Statsdb::getImgWidth() {
return (long)DX + m_bx * 2;
return (long)DX2 + m_bx * 2;
}
// these are used for storing the "events"
@ -599,7 +599,7 @@ bool Statsdb::gifLoop ( ) {
// gif size
//char tmp[64];
// dimensions of the gif
//sprintf ( tmp , "%lix%li", (long)DX+m_bx*2 , (long)DY+m_by*2 );
//sprintf ( tmp , "%lix%li", (long)DX2+m_bx*2 , (long)DY2+m_by*2 );
//GIFPlotter::parampl ( "BITMAPSIZE" , (void *)tmp );
// create one
//GIFPlotter plotter ( NULL , m_fd , NULL );
@ -607,7 +607,7 @@ bool Statsdb::gifLoop ( ) {
//plotter.openpl ( );
// define the space with boundaries 100 unit wide boundaries
//plotter.space ( 0 , 0 , DX + m_bx * 2 , DY + m_by * 2 );
//plotter.space ( 0 , 0 , DX2 + m_bx * 2 , DY2 + m_by * 2 );
// line thickness in user coordinates (pixels for us)
//plotter.linewidth ( 1 );
@ -628,7 +628,7 @@ bool Statsdb::gifLoop ( ) {
"z-index:-10;"
// the tick marks we print below are based on it
// being a window of the last 20 seconds... and using
// DX pixels
// DX2 pixels
"min-width:%lipx;"
"min-height:%lipx;"
//"width:100%%;"
@ -637,15 +637,15 @@ bool Statsdb::gifLoop ( ) {
"margin-bottom:10px;"
"margin-right:10px;"
"margin-left:10px;\">"
,(long)DX + 2 *m_bx
,(long)DY + 2*m_by);
,(long)DX2 + 2 *m_bx
,(long)DY2 + 2*m_by);
// draw the x-axis
//plotter.line ( m_bx , m_by , DX + m_bx , m_by );
//plotter.line ( m_bx , m_by , DX2 + m_bx , m_by );
// 10 x-axis tick marks
for ( int x = DX/20 ; x <= DX ; x += DX/20 ) {
for ( int x = DX2/20 ; x <= DX2 ; x += DX2/20 ) {
// tick mark
//plotter.line ( x , -20 , x , 20 );
m_gw.safePrintf("<div style=\"position:absolute;"
@ -657,7 +657,7 @@ bool Statsdb::gifLoop ( ) {
"min-width:3px;\"></div>\n"
, m_bx + (long)x-1
);
long xv = (long)(dt * (long long)x/(long long)DX)-(long)dt;
long xv = (long)(dt * (long long)x/(long long)DX2)-(long)dt;
// LABEL
m_gw.safePrintf("<div style=\"position:absolute;"
"left:%li;"
@ -780,8 +780,8 @@ bool Statsdb::gifLoop ( ) {
// ensure at least 3 units wide for visibility
if ( x2 < x1 + 10 ) x2 = x1 + 10;
// . flip the y so we don't have to scroll the browser down
// . DY does not include the axis and tick marks
//long fy1 = DY - y1 + m_by ;
// . DY2 does not include the axis and tick marks
//long fy1 = DY2 - y1 + m_by ;
// plot it
//plotter.line ( x1 , fy1 , x2 , fy1 );
drawLine3 ( m_gw , x1 , x2 , y1 , c1 , pp->m_thickness );
@ -918,7 +918,7 @@ char *Statsdb::plotGraph ( char *pstart ,
// . the minimum difference between ymax and ymin is minDiff.
// . this prevents us from zooming in too close!
float minDiff = (float)DY * label->m_minRes ;
float minDiff = (float)DY2 * label->m_minRes ;
// we are already scaled!
float ourDiff = (ymax - ymin) ;
@ -976,14 +976,14 @@ char *Statsdb::plotGraph ( char *pstart ,
float y1 = lasty;
// normalize y into pixel space
y2 = ((float)DY * (y2 - ymin)) / (ymax-ymin);
y2 = ((float)DY2 * (y2 - ymin)) / (ymax-ymin);
// set lasts for next iteration of this loop
lastx = x2;
lasty = y2;
// . flip the y so we don't have to scroll the browser down
// . DY does not include the axis and tick marks
// . DY2 does not include the axis and tick marks
// . do not flip y any more for statsdb graphs
long fy1 = (long)(y1+.5);// + m_by ;
long fy2 = (long)(y2+.5);// + m_by ;
@ -1011,7 +1011,7 @@ char *Statsdb::plotGraph ( char *pstart ,
// plot it
// BUT only iff not more than 5 seconds difference
//float secondsPerPixel = (m_t2-m_t1)/(float)DX;
//float secondsPerPixel = (m_t2-m_t1)/(float)DX2;
// avoid this for now. mdw oct 14 2013.
//float dt = (x2 - x1) * secondsPerPixel;
@ -1068,7 +1068,7 @@ void Statsdb::drawHR ( float z ,
long color ) {
// convert into yspace
float z2 = ((float)DY * (float)(z - ymin)) /(float)(ymax-ymin);
float z2 = ((float)DY2 * (float)(z - ymin)) /(float)(ymax-ymin);
// avoid collisions with other graphs
z2 += zoff;
// border
@ -1076,7 +1076,7 @@ void Statsdb::drawHR ( float z ,
// round off error
z2 += 0.5;
// for adjustment
float ptsPerPixel = (ymax-ymin)/ (float)DY;
float ptsPerPixel = (ymax-ymin)/ (float)DY2;
// make an adjustment to the label then! -- Commented out because it's currently not used.
float zadj = zoff * ptsPerPixel;
@ -1088,9 +1088,9 @@ void Statsdb::drawHR ( float z ,
// ((color >> 0) & 0xff) << 8 );
// horizontal line
//plotter->line ( m_bx, (long)z2 , DX + m_bx, (long)z2 );
//plotter->line ( m_bx, (long)z2 , DX2 + m_bx, (long)z2 );
long width = 1;
drawLine3 ( m_gw, 0, DX , (long)z2,color, width);
drawLine3 ( m_gw, 0, DX2 , (long)z2,color, width);
// make label
@ -1364,7 +1364,7 @@ bool Statsdb::addPoint ( long x ,
class StatState *ss ) {
// convert x into pixel position
float xf = (float)DX * (float)(x - m_t1) / (float)(m_t2 - m_t1);
float xf = (float)DX2 * (float)(x - m_t1) / (float)(m_t2 - m_t1);
// round it to nearest pixel
long x2 = (long)(xf + .5) ;//+ m_bx;
// make this our y pos
@ -1446,12 +1446,12 @@ bool Statsdb::addEventPoint ( long t1 ,
long thickness ) {
// convert t1 into pixel position
float af = (float)DX * (float)(t1 - m_t1) / (float)(m_t2 - m_t1);
float af = (float)DX2 * (float)(t1 - m_t1) / (float)(m_t2 - m_t1);
// round it to nearest pixel
long a = (long)(af + .5) ;//+ m_bx;
// convert t2 into pixel position
//float bf = (float)DX * (float)(t2 - m_t1) / (float)(m_t2 - m_t1);
//float bf = (float)DX2 * (float)(t2 - m_t1) / (float)(m_t2 - m_t1);
// round it to nearest pixel
//long b = (long)(bf + .5) + m_bx;
//if ( a > b ) { char *xx=NULL;*xx=0; }
@ -1468,7 +1468,7 @@ bool Statsdb::addEventPoint ( long t1 ,
}
// go down each line of points
for ( long i = 0 ; i < MAX_LINES ; i++ ) {
for ( long i = 0 ; i < MAX_LINES2 ; i++ ) {
// breathe
QUICKPOLL ( m_niceness );
// . is there room for us in this line?

View File

@ -429,6 +429,8 @@ char *getSourceString ( char source ) {
if ( source == SOURCE_BIGRAM ) return "bigram";
if ( source == SOURCE_TRIGRAM ) return "trigram";
if ( source == SOURCE_WIKTIONARY_EN ) return "wiktionary-en";
// the thing we are hashing is a "number"
if ( source == SOURCE_NUMBER ) return "number";
return "unknown";
}

View File

@ -15,6 +15,7 @@
#define SOURCE_GENERATED 4
#define SOURCE_BIGRAM 5
#define SOURCE_TRIGRAM 6
#define SOURCE_NUMBER 7
// per word!
#define MAX_SYNS 64

View File

@ -2513,6 +2513,13 @@ bool Msg8a::getTagRec ( Url *url ,
TagRec *tagRec ,
bool doInheritance ,
char rdbId ) {
CollectionRec *cr = g_collectiondb.getRec ( coll );
if ( ! cr ) {
g_errno = ENOCOLLREC;
return true;
}
// reset tag rec
tagRec->reset();//m_numListPtrs = 0;

View File

@ -780,7 +780,10 @@ TcpSocket *TcpServer::getNewSocket ( ) {
log("tcp: using statically linked libc that only supports "
"an fd of up to %li, but got an fd = %li. fd_set is "
"only geared for 1024 bits of file descriptors for "
"doing poll() in Loop.cpp",
"doing poll() in Loop.cpp. Ensure 'ulimit -a' limits "
"open files to 1024. "
"Check open fds using ls /proc/<gb-pid>/fds/ and ensure "
"they are all BELOW 1024.",
(long)MAX_NUM_FDS,(long)sd);
char *xx=NULL;*xx=0;
}
@ -1092,7 +1095,7 @@ bool TcpServer::closeLeastUsed ( long maxIdleTime ) {
// . g_errno will be set by Loop if there was a kinda socket reset error
void readSocketWrapper ( int sd , void *state ) {
// debug msg
// log("........... TcpServer::readSocketWrapper\n");
//log("........... TcpServer::readSocketWrapper\n");
// extract our this ptr
TcpServer *THIS = (TcpServer *)state;
// get a TcpSocket from sd
@ -1239,8 +1242,13 @@ long TcpServer::readSocket ( TcpSocket *s ) {
// do the read
int n;
if (m_useSSL)
n = SSL_read ( s->m_ssl, s->m_readBuf + s->m_readOffset, avail );
if (m_useSSL) {
//long long now1 = gettimeofdayInMilliseconds();
n = SSL_read(s->m_ssl, s->m_readBuf + s->m_readOffset, avail );
//long long now2 = gettimeofdayInMilliseconds();
//long long took = now2 - now1 ;
//if ( took >= 2 ) log("tcp: ssl_read took %llims", took);
}
else
n = ::read ( s->m_sd, s->m_readBuf + s->m_readOffset, avail );
@ -1483,8 +1491,13 @@ long TcpServer::writeSocket ( TcpSocket *s ) {
// send this piece
int n;
retry10:
if (m_useSSL)
if (m_useSSL) {
//long long now1 = gettimeofdayInMilliseconds();
n = SSL_write ( s->m_ssl, msg + s->m_sendOffset, toSend );
//long long now2 = gettimeofdayInMilliseconds();
//long long took = now2 - now1 ;
//if ( took >= 2 ) log("tcp: ssl_write took %llims", took);
}
else
n = ::send ( s->m_sd , msg + s->m_sendOffset , toSend , 0 );
// cancel harmless errors, return -1 on severe ones
@ -1626,8 +1639,12 @@ connected:
int r;
s->m_ssl = SSL_new(m_ctx);
SSL_set_fd(s->m_ssl, s->m_sd);
//long long now1 = gettimeofdayInMilliseconds();
SSL_set_connect_state(s->m_ssl);
r = SSL_connect(s->m_ssl);
//long long now2 = gettimeofdayInMilliseconds();
//long long took = now2 - now1 ;
//if ( took >= 2 ) log("tcp: ssl_connect took %llims", took);
if (!s->m_ssl) {
log("ssl: SSL is NULL after connect.");
char *xx = NULL; *xx = 0;
@ -2092,9 +2109,19 @@ bool TcpServer::sslAccept ( TcpSocket *s ) {
}
//log("ssl: SSL_accept %li",newsd);
long long now1 = gettimeofdayInMilliseconds();
retry19:
// javier put this in here, but it was not non-blocking!!!
// . javier put this in here, but it was not non-blocking!!!
// . it is non-blocking now, however, when it does block and
// complete the accept it takes 10ms on sp1, a server from ~2009
// using a custom build of the latest libssl.a from about 2013.
// . this accept needs to be put in a thread then, maybe multiple
// threads
int r = SSL_accept(s->m_ssl);
long long now2 = gettimeofdayInMilliseconds();
long long took = now2 - now1 ;
if ( took >= 2 )
log("tcp: ssl_accept %li took %llims", (long)newsd, took);
// did it block?
if ( r < 0 && errno == EINTR ) goto retry19;
// copy errno to g_errno
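// the timing guard above assumes a millisecond wall clock; a sketch
// of what such a helper typically looks like (the shipped
// implementation may differ):
//
//   #include <sys/time.h>
//   long long gettimeofdayInMilliseconds ( ) {
//           struct timeval tv;
//           gettimeofday ( &tv , NULL );
//           return (long long)tv.tv_sec * 1000 + tv.tv_usec / 1000;
//   }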
@ -2103,7 +2130,7 @@ bool TcpServer::sslAccept ( TcpSocket *s ) {
if ( g_errno == SSL_ERROR_WANT_READ ||
g_errno == SSL_ERROR_WANT_WRITE ||
g_errno == EAGAIN ) {
//log("ssl: SSL_accept blocked %li",newsd);
//log("ssl: SSL_accept would block %li",newsd);
return true;
}
// any other?
@ -2117,8 +2144,9 @@ bool TcpServer::sslAccept ( TcpSocket *s ) {
}
// log this so we can monitor if we get too many of these per second
// because they take like 10ms each on sp1!!! mdw
log("ssl: SSL_accept (~10ms) completed %li",newsd);
// because they take like 10ms each on sp1!!! (even with non-blocking
// sockets, they'll block for 10ms) - mdw 2013
//log("ssl: SSL_accept (~10ms) completed %li",newsd);
// ok, we got it
s->m_sockState = ST_READING;
return true;

View File

@ -126,7 +126,7 @@ void Test::removeFiles ( ) {
// . kinda like Collectiondb::deleteRec() i guess but we need to
// preserve the parms!!
// . deletetagdb = false
g_collectiondb.resetColl ( "test" , NULL ); // false );
g_collectiondb.resetColl ( "test" , NULL , true );
// reset event count
//g_collectiondb.countEvents();

View File

@ -303,6 +303,10 @@ bool Threads::init ( ) {
// generic multipurpose
if ( ! g_threads.registerType (GENERIC_THREAD,100/*maxThreads*/,100) )
return log("thread: Failed to register thread type." );
// for calling SSL_accept() which blocks for 10ms even when the
// socket is non-blocking...
//if (!g_threads.registerType (SSLACCEPT_THREAD,20/*maxThreads*/,100))
// return log("thread: Failed to register thread type." );
#ifndef PTHREADS
@ -884,20 +888,28 @@ bool ThreadQueue::timedCleanUp ( long maxNiceness ) {
#ifdef PTHREADS
// . join up with that thread
// . damn, sometimes he can block forever on his
// call to sigqueue(),
long status = pthread_join ( t->m_joinTid , NULL );
if ( status != 0 ) {
log("threads: pthread_join %li = %s (%li)",
(long)t->m_joinTid,mstrerror(status),status);
}
// debug msg
if ( g_conf.m_logDebugThread )
log(LOG_DEBUG,"thread: joined1 with t=0x%lx "
"jointid=0x%lx.",
(long)t,(long)t->m_joinTid);
// if pthread_create() failed it returns the errno and we
// leave m_needsJoin false, so do not try to join
// a thread we did not create, lest pthread_join()
// cores
if ( t->m_needsJoin ) {
// . join up with that thread
// . damn, sometimes he can block forever on his
// call to sigqueue(),
long status = pthread_join ( t->m_joinTid , NULL );
if ( status != 0 ) {
log("threads: pthread_join %li = %s (%li)",
(long)t->m_joinTid,mstrerror(status),
status);
}
// debug msg
if ( g_conf.m_logDebugThread )
log(LOG_DEBUG,"thread: joined1 with t=0x%lx "
"jointid=0x%lx.",
(long)t,(long)t->m_joinTid);
}
#else
again:
@ -1211,20 +1223,22 @@ bool ThreadQueue::cleanUp ( ThreadEntry *tt , long maxNiceness ) {
#ifdef PTHREADS
// . join up with that thread
// . damn, sometimes he can block forever on his
// call to sigqueue(),
long status = pthread_join ( t->m_joinTid , NULL );
if ( status != 0 ) {
log("threads: pthread_join2 %li = %s (%li)",
(long)t->m_joinTid,mstrerror(status),status);
if ( t->m_needsJoin ) {
// . join up with that thread
// . damn, sometimes he can block forever on his
// call to sigqueue(),
long status = pthread_join ( t->m_joinTid , NULL );
if ( status != 0 ) {
log("threads: pthread_join2 %li = %s (%li)",
(long)t->m_joinTid,mstrerror(status),
status);
}
// debug msg
if ( g_conf.m_logDebugThread )
log(LOG_DEBUG,"thread: joined2 with t=0x%lx "
"jointid=0x%lx.",
(long)t,(long)t->m_joinTid);
}
// debug msg
if ( g_conf.m_logDebugThread )
log(LOG_DEBUG,"thread: joined2 with t=0x%lx "
"jointid=0x%lx.",
(long)t,(long)t->m_joinTid);
#else
again:
@ -1591,7 +1605,7 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
// return if the max is already launched
if ( active >= m_maxLaunched ) return false;
// do not launch a low priority merge, addlists or filter thread if we
// do not launch a low priority merge, intersect or filter thread if we
// have high priority cpu threads already going on. this way a
// low priority spider thread will not launch if a high priority
// cpu-based thread of any kind (right now just MERGE or INTERSECT)
@ -1642,7 +1656,7 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
// i dunno what the point of this was... so i commented it out
//long max2 = g_conf.m_queryMaxDiskThreads ;
//if ( max2 <= 0 ) max2 = 1;
// only do this check if we're a addlists thread queue
// only do this check if we're an addlists/intersect thread queue
//if (m_threadType == INTERSECT_THREAD&& hiActive >= max2)return false;
// loop through candidates
@ -2008,7 +2022,26 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
//
#else
pthread_create ( &t->m_joinTid , &s_attr, startUp2 , t) ;
// assume it does not go through
t->m_needsJoin = false;
// pthread inherits our sigmask, so don't let it handle sigalrm
// signals in Loop.cpp, it'll screw things up. that handler
// is only meant to be called by the main process. if we end up
// double calling it, this thread may think g_callback is non-null
// then it gets set to NULL, then the thread cores! seen it...
sigset_t sigs;
sigemptyset ( &sigs );
sigaddset ( &sigs , SIGALRM );
if ( sigprocmask ( SIG_BLOCK , &sigs , NULL ) < 0 )
log("threads: failed to block sig");
// this returns 0 on success, or the errno otherwise
g_errno = pthread_create ( &t->m_joinTid , &s_attr, startUp2 , t) ;
if ( sigprocmask ( SIG_UNBLOCK , &sigs , NULL ) < 0 )
log("threads: failed to unblock sig");
#endif
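// the mask-inherit trick in isolation (hypothetical helper, not
// shipped code): block SIGALRM in the creator, create the thread so
// it inherits the blocked mask, then restore the creator's mask.
// pthread_sigmask() is the per-thread equivalent of sigprocmask():
//
//   int create_without_sigalrm ( pthread_t *tid ,
//                                void *(*fn)(void *) , void *arg ) {
//           sigset_t sigs , old;
//           sigemptyset ( &sigs );
//           sigaddset ( &sigs , SIGALRM );
//           pthread_sigmask ( SIG_BLOCK , &sigs , &old );
//           int rc = pthread_create ( tid , NULL , fn , arg );
//           pthread_sigmask ( SIG_SETMASK , &old , NULL );
//           return rc; // 0 on success, else an errno value
//   }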
@ -2020,6 +2053,8 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
// return true on successful creation of the thread
if ( g_errno == 0 ) {
// good stuff, the thread needs a join now
t->m_needsJoin = true;
if ( count > 0 )
log("thread: Call to clone looped %li times.",count);
return true;
@ -2047,6 +2082,11 @@ bool ThreadQueue::launchThread ( ThreadEntry *te ) {
#ifndef PTHREADS
hadError:
#endif
if ( g_errno )
log("thread: pthread_create had error = %s",
mstrerror(g_errno));
// it didn't launch, did it? dec the count.
m_launched--;
// priority-based LOCAL & GLOBAL launch counts
@ -2326,7 +2366,7 @@ const char *ThreadQueue::getThreadType ( ) {
const char *s = "unknown";
if ( m_threadType == DISK_THREAD ) s = "disk";
if ( m_threadType == MERGE_THREAD ) s = "merge";
if ( m_threadType == INTERSECT_THREAD ) s = "addlists";
if ( m_threadType == INTERSECT_THREAD ) s = "intersectlists";
if ( m_threadType == FILTER_THREAD ) s = "filter";
if ( m_threadType == SAVETREE_THREAD ) s = "savetree";
if ( m_threadType == UNLINK_THREAD ) s = "unlink";

View File

@ -21,6 +21,7 @@ pid_t getpidtid();
#define SAVETREE_THREAD 4
#define UNLINK_THREAD 5
#define GENERIC_THREAD 6
//#define SSLACCEPT_THREAD 7
#define GB_SIGRTMIN (SIGRTMIN+4)
#define MAX_NICENESS 2
// . a ThreadQueue has a list of thread entries
@ -54,6 +55,7 @@ class ThreadEntry {
long m_stackSize ;
long m_si ; // s_stackPtrs[i] = m_stack
bool m_needsJoin;
pthread_t m_joinTid;
};

View File

@ -113,8 +113,17 @@ bool Title::setTitle ( XmlDoc *xd ,
char *val = NULL;
long vlen;
// look for the "title:" field in json then use that
if ( xd->m_contentType == CT_JSON )
val = getJSONFieldValue ( xd->ptr_utf8Content,"title",&vlen);
SafeBuf jsonTitle;
if ( xd->m_contentType == CT_JSON ) {
char *jt;
jt = getJSONFieldValue(xd->ptr_utf8Content,"title",&vlen);
if ( jt && vlen > 0 ) {
jsonTitle.safeDecodeJSONToUtf8 (jt, vlen, m_niceness);
//true ); // decodeAll?
jsonTitle.nullTerm();
val = jsonTitle.getBufStart();
}
}
// if we had a title: field in the json...
if ( val ) {
char *dst = NULL;
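// a decode sketch (hypothetical raw json value): \uXXXX and \"
// escapes are resolved before the title is used:
//
//   SafeBuf jsonTitle;
//   char *jt = "Caf\\u00e9 \\\"Central\\\"";  // bytes as in the doc
//   jsonTitle.safeDecodeJSONToUtf8 ( jt , strlen(jt) , 0 );
//   jsonTitle.nullTerm ();   // jsonTitle now holds: Café "Central"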

View File

@ -22,6 +22,8 @@ Wiktionary::Wiktionary () {
// . now m_langTable just maps to langId, no POS bits...
//m_langTable.set ( 6 , 1,0,NULL,0,false,0 ,"wkt-lang");
m_synTable.set ( 6 , 4,0,NULL,0,true,0 ,"wkt-synt");
m_synBuf.setLabel("synbuf");
}
void Wiktionary::reset() {
@ -47,6 +49,11 @@ Wiktionary::~Wiktionary () {
bool Wiktionary::test ( ) {
// test words parsing here
//Words w;
//w.set9 ("get $4,500.00 now",0);
// test it out!
char *str = "love";//pie"; //forsake";
//long long wid = hash64Lower_utf8(str);

View File

@ -416,7 +416,38 @@ bool Words::addWords(char *s,long nodeLen,bool computeWordIds, long niceness) {
}
// . c#, j#, ...
if ( s[i]=='#' && !is_alnum_utf8(&s[i+1]) ) i++;
// comma is ok in a number like 12,345: ,ddd followed by a non-digit
if ( s[i]==',' &&
i-j <= 3 &&
is_digit(s[i-1]) ) {
// if word so far is 2 or 3 chars, make sure they are all digits
if ( i-j >= 2 && ! is_digit(s[i-2]) ) goto nogo;
if ( i-j >= 3 && ! is_digit(s[i-3]) ) goto nogo;
// scan forward
subloop:
if ( s[i] == ',' &&
is_digit(s[i+1]) &&
is_digit(s[i+2]) &&
is_digit(s[i+3]) &&
! is_digit(s[i+4]) ) {
i += 4;
goto subloop;
}
}
// decimal point?
if ( s[i] == '.' &&
is_digit(s[i-1]) &&
is_digit(s[i+1]) ) {
// allow the decimal point
i++;
// skip over string of digits
while ( is_digit(s[i]) ) i++;
}
nogo:
// allow for words like we're dave's and i'm
if(s[i]=='\''&&s[i+1]&&is_alnum_utf8(&s[i+1])&&!hadApostrophe){
i++;
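// a standalone sketch of the acceptance rule above (simplified,
// assumed semantics): commas are kept only between groups of three
// digits and a '.' only between digits, so "12,345.67" stays one
// token:
//
//   #include <ctype.h>
//   int numberTokenLen ( const char *s ) {
//           int i = 0;
//           while ( isdigit(s[i]) ) i++;
//           while ( s[i]==',' && isdigit(s[i+1]) && isdigit(s[i+2]) &&
//                   isdigit(s[i+3]) && ! isdigit(s[i+4]) )
//                   i += 4;
//           if ( s[i]=='.' && i > 0 && isdigit(s[i+1]) ) {
//                   i++;
//                   while ( isdigit(s[i]) ) i++;
//           }
//           return i; // "12,345.67" -> 9 ; "1,23" -> 1
//   }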

File diff suppressed because it is too large Load Diff

View File

@ -630,6 +630,8 @@ class XmlDoc {
long *getIp ( ) ;
long *gotIp ( bool save ) ;
bool *getIsAllowed ( ) ;
long *getFinalCrawlDelay();
long m_finalCrawlDelay;
//long getTryAgainTimeDelta() {
// if ( ! m_tryAgainTimeDeltaValid ) { char *xx=NULL;*xx=0;}
// return m_tryAgainTimeDelta;
@ -752,6 +754,7 @@ class XmlDoc {
bool hashDMOZCategories ( class HashTableX *table ) ;
bool hashLinks ( class HashTableX *table ) ;
bool hashUrl ( class HashTableX *table ) ;
bool hashDateNumbers ( class HashTableX *tt ) ;
bool hashSections ( class HashTableX *table ) ;
bool hashIncomingLinkText ( class HashTableX *table ,
bool hashAnomalies ,
@ -849,6 +852,15 @@ class XmlDoc {
long niceness );
bool hashNumber ( char *beginBuf ,
char *buf ,
long bufLen ,
class HashInfo *hi ) ;
bool hashNumber2 ( float f ,
class HashInfo *hi ,
char *gbsortByStr ) ;
// print out for PageTitledb.cpp and PageParser.cpp
bool printDoc ( class SafeBuf *pbuf );
bool printMenu ( class SafeBuf *pbuf );
@ -1159,6 +1171,7 @@ class XmlDoc {
*/
bool m_httpStatusValid;
bool m_crawlDelayValid;
bool m_finalCrawlDelayValid;
bool m_titleRecKeyValid;
bool m_adVectorValid;
bool m_wikiDocIdsValid;
@ -1279,6 +1292,7 @@ class XmlDoc {
bool m_replyValid;
bool m_recycleDiffbotReplyValid;
bool m_diffbotReplyValid;
bool m_tokenizedDiffbotReplyValid;
//bool m_diffbotUrlCrawlPatternMatchValid;
//bool m_diffbotUrlProcessPatternMatchValid;
//bool m_diffbotPageProcessPatternMatchValid;
@ -1480,6 +1494,7 @@ class XmlDoc {
char m_isWWWDup;
char m_calledMsg0b;
Url m_tmpUrl;
SafeBuf m_tmpsb1;
SafeBuf m_tmpsb2;
SafeBuf m_turkBuf;
@ -1548,9 +1563,9 @@ class XmlDoc {
//
XmlDoc *m_dx;
char *m_diffbotObj;
char *m_diffbotObjEnd;
char m_diffbotSavedChar;
SafeBuf m_diffbotReply;
SafeBuf *m_tokenizedDiffbotReplyPtr;
SafeBuf m_tokenizedDiffbotReply;
long m_diffbotReplyError;
bool m_recycleDiffbotReply;
//bool m_diffbotUrlCrawlPatternMatch;
@ -1562,6 +1577,7 @@ class XmlDoc {
SafeBuf m_diffbotApiUrl;
bool *getRecycleDiffbotReply ( ) ;
SafeBuf *getTokenizedDiffbotReply ( ) ;
SafeBuf *getDiffbotReply ( ) ;
//bool doesUrlMatchDiffbotCrawlPattern() ;
//bool doesUrlMatchDiffbotProcessPattern() ;

View File

@ -382,9 +382,9 @@ void gotDocWrapper ( void *state , TcpSocket *s ) {
// parse status message out of response
// HTTP/1.0
while ( p < pend && !is_space(*p) ) p++;
while ( p < pend && !isspace(*p) ) p++;
// skip space
while ( p < pend && is_space(*p) ) p++;
while ( p < pend && isspace(*p) ) p++;
// copy to end of line
while (p < pend && mlen < 255 && *p != '\r' && *p != '\n'){
message[mlen++] = *p;

View File

@ -4,7 +4,12 @@
<notifyUrl><![CDATA[ccc]]></>
<collectiveRespiderFrequency>0.000000</>
<collectiveCrawlDelay>0.250000</>
<diffbotApiUrl><![CDATA[]]></>
<diffbotUrlCrawlPattern><![CDATA[]]></>
<diffbotUrlProcessPattern><![CDATA[]]></>
<diffbotPageProcessPattern><![CDATA[]]></>
<diffbotUrlCrawlRegEx><![CDATA[]]></>
<diffbotUrlProcessRegEx><![CDATA[]]></>
<diffbotOnlyProcessIfNew>1</>
<diffbotSeeds><![CDATA[]]></>
<isCustomCrawl>0</>
@ -79,6 +84,9 @@
# The spider round number.
<spiderRoundNum>0</>
# The spider status number.
<spiderStatus>0</>
# Do searches for queries in this hosts part of the query log.
<scrapingEnabledProcog>0</>
@ -326,12 +334,12 @@
<maxRobotstxtCacheAge>86400</>
# Only spider URLs scheduled to be spidered at this time or after. In UTC.
<spiderStartTime>17 Jan 1970 20:00 UTC</>
<spiderStartTime>24 Jan 1970 20:00 UTC</>
# Only spider URLs scheduled to be spidered at this time or before. If "use
# current time" is true then the current local time is used for this value
# instead. in UTC.
<spiderEndTime>01 Jan 1970 08:00 UTC</>
<spiderEndTime>08 Jan 1970 08:00 UTC</>
# Use the current time as the spider end time?
<useCurrentTime>1</>
@ -812,7 +820,7 @@
<highlightQueryTermsInRelatedPagesSummary>0</>
# Truncates a related page title after this many charaters and adds ...
<numberOfCharactersToDisplayInTitleBeforeTruncating>50</>
<numberOfCharactersToDisplayInTitleBeforeTruncating>0</>
# Use the search results' links in order to generate related pages.
<useResultsPagesAsReferences>0</>
@ -1017,173 +1025,23 @@
<filterExpression><![CDATA[hopcount&gt;=3]]></>
<filterExpression><![CDATA[isnew]]></>
<filterExpression><![CDATA[default]]></>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>1</>
<spidersEnabled>0</>
<spidersEnabled>1</>
<spidersEnabled>0</>
<spidersEnabled>1</>
<spidersEnabled>0</>
<spidersEnabled>1</>
<spidersEnabled>0</>
<spidersEnabled>1</>
<spidersEnabled>0</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>1.000000</>
<filterFrequency>0.000000</>
<filterFrequency>7.000000</>
<filterFrequency>0.000000</>
<filterFrequency>10.000000</>
<filterFrequency>0.000000</>
<filterFrequency>20.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>0.000000</>
<filterFrequency>30.000000</>
<filterFrequency>30.000000</>
# Use <harvestLinks> tag.
# Use <spidersEnabled> tag.
# Use <filterFrequency> tag.
# Do not allow more than this many outstanding spiders for all urls in this
# priority.
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>4</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>2</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>1</>
<maxSpidersPerRule>99</>
<maxSpidersPerRule>99</>
# Use <maxSpidersPerRule> tag.
# Allow this many spiders per IP.
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
<maxSpidersPerIp>1</>
# Use <maxSpidersPerIp> tag.
# Wait at least this long before downloading urls from the same IP address.
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<spiderIpWait>1000</>
<filterPriority>80</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>0</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>-3</>
<filterPriority>3</>
<filterPriority>45</>
<filterPriority>85</>
<filterPriority>50</>
<filterPriority>48</>
<filterPriority>49</>
<filterPriority>47</>
<filterPriority>40</>
<filterPriority>39</>
<filterPriority>30</>
<filterPriority>29</>
<filterPriority>20</>
<filterPriority>19</>
<filterPriority>1</>
<filterPriority>0</>
<diffbotAPI><![CDATA[]]></>
# Use <spiderIpWait> tag.
# Use <filterPriority> tag.
# Use <diffbotAPI> tag.

66
errnotest.cpp Normal file
View File

@ -0,0 +1,66 @@
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>
#include <sched.h>
#include <unistd.h>
#include <assert.h>
static int s_called = 0;
#define MAX_PID 32767
static int s_errno ;
static int s_errnos [ MAX_PID + 1 ];
static long s_bad = 0;
static long s_badPid = -1;
// WARNING: you MUST compile with -DREENTRANT for this to work
int *__errno_location (void) {
long pid = (long) getpid();
s_called++;
if ( pid <= (long)MAX_PID ) return &s_errnos[pid];
s_bad++;
s_badPid = pid;
return &s_errno;
}
//extern __thread int errno;
int g_errno = 0;
int startup ( void *state ) {
char buf[5];
// this sets errno, but does not seem to call our __errno_location
// override, BUT does not seem to affect "errno" in main() either!
// maybe this is the TLS support?
int bytes = read(-9,buf,5);
//errno = 7; // E2BIG;
//assert ( errno && bytes == -1 );
g_errno = errno;
return 0;
}
int main() {
errno = 10; // EINVAL;
g_errno = 10;
char stack[10000];
pid_t pid = clone( startup ,
stack + 10000 ,
//CLONE_SETTLS |
CLONE_VM | SIGCHLD,
NULL );
int status;
waitpid ( pid , &status, 0 );
if ( s_called ) fprintf(stderr,"__errno_location() was called %i "
"times\n",s_called);
if ( errno != 10 ) fprintf(stderr,"errno=%i (failed)\n",errno);
else fprintf(stderr,"errno=%i (success)\n",errno);
if ( g_errno == 10 || g_errno == 0 )
fprintf(stderr,"gerrno=%i (failed)\n",g_errno);
else
fprintf(stderr,"gerrno=%i (success)\n",g_errno);
}

View File

@ -999,14 +999,27 @@ long long atoll2 ( const char *s, long len ) {
double atof2 ( const char *s, long len ) {
// skip over spaces
const char *end = s + len;
while ( s < end && is_wspace_a ( *s ) ) s++;
while ( s < end && is_wspace_a ( *s ) ) { s++; len--; }
// return 0 if all spaces
if ( s == end ) return 0;
char buf[128];
char tmpBuf[128];
if ( len >= 128 ) len = 127;
strncpy ( buf , s , len );
buf[len] = '\0';
return atof ( buf );
//strncpy ( dst , s , len );
const char *p = s;
const char *srcEnd = s + len;
char *dst = tmpBuf;
// remove commas
for ( ; p < srcEnd ; p++ ) {
// skip commas
if ( *p == ',' ) continue;
// otherwise store it
*dst++ = *p;
}
// null term
*dst = '\0';
//buf[len] = '\0';
return atof ( tmpBuf );
}
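// a usage sketch (values assumed): the comma-stripping pass lets
// price-style strings parse whole:
//
//   double d = atof2 ( "4,500.00" , 8 );   // -> 4500.0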
double atod2 ( char *s, long len ) {

View File

@ -57,7 +57,7 @@
<doNarrowSearch>0</>
# Overrides all spidering for all collections on just this host.
<localSpideringEnabled>1</>
<localSpideringEnabled>0</>
# Overrides all add urls for all collections on just this host.
<localAddUrlEnabled>1</>
@ -73,10 +73,10 @@
<qaSearchTestEnabled>1</>
# Enable spidering on all hosts
<allSpidersOn>1</>
<allSpidersOn>0</>
# Disable spidering on all hosts
<allSpidersOff>1</>
<allSpidersOff>0</>
# Serves ads unless pure=1 is in cgi parms.
<adFeedEnabled>0</>
@ -385,7 +385,7 @@
# Maximum number of threads to use per Gigablast process for intersecting
# docid lists. Generally, set this to the number of CPUs on the machine.
<maxCpuThreads>1</>
<maxCpuThreads>10</>
# Maximum number of pages to index or delete from index per second for all
# hosts combined.

View File

@ -78,21 +78,19 @@ You will need the following packages installed<br>
2. Edit hosts.conf so the working directory is not /home/mwells/github/ but
rather your current working directory, where the 'gb' binary resides.
<br><br>
3. Run './gb 0' to start a single gigablast node.
3. Run './gb 0' to start a single gigablast node which listens on port 8000.
<br><br>
4. Access the server with your browser on port 8000 (default port). You can change this default port in the gb.conf file.
4. The first time you run it you will have to wait for it to build some binary data files from the wiktionary- and wikipedia-based txt files that it uses to do synonyms and phrasing.
<br><br>
5. The first time you run it you will have to wait for it to build some binary data files from the txt files it uses that are based on wiktionary and wikipedia that it uses to do synonyms and phrasing.
5. Re-run it after it builds those binaries.
<br><br>
6. Re-run it after it builds those binaries.
6. Check out the <a href=http://127.0.0.1:8000/master>Master Controls</a>. As part of Gigablast's security, you need to connect to port 8000 from a local IP address or from an IP address on the same C-Class. Consider using an ssh tunnel if your browser's IP is not on the same C-Class as the server's, i.e. from your browser machine, ssh to the machine running the gb server: <i>ssh someservername.com -L 8000:127.0.0.1:8000</i>. Then in your browser go to the <a href=http://127.0.0.1:8000/master>Master Controls</a>.
<br><br>
7. Check out the <a href=http://127.0.0.1:8000/master>Master Controls</a>. You need to connect to port 8000 from a local IP address or from an IP address on the same C-Class as part of Gigablast's security. Consider using an ssh tunnel if your browser's IP is not on the same C-Class. i.e. From your browser machine, ssh to the machine running the gb server: <i>ssh someservername.com -L 8000:127.0.0.1:8000</i> . Then on your browser go to the <a href=http://127.0.0.1:8000/master>Master Controls</a>.
7. Click on the <a href=http://127.0.0.1:8000/admin/inject?c=main>inject</a> menu and inject a URL into the index. It might be slow because it uses Google's public DNS servers, 8.8.8.8 and 8.8.4.4, as specified in the Master Controls. You should change those to your own local bind9 server for speed.
<br><br>
8. Click on the <a href=http://127.0.0.1:8000/admin/inject?c=main>inject</a> menu and inject a URL into the index. It might be slow because it uses Google's public DNSes as specified in the Master Controls as 8.8.8.8 and 8.8.4.4. You should change those to your own local bind9 server for speed.
8. When the injection completes, try a <a href=http://127.0.0.1:8000/>search</a> for the document you injected.
<br><br>
9. When the injection completes, try a <a href=http://127.0.0.1:8000/>search</a> for the document you injected.
<br><br>
10. Turn on spiders on the <a href=http://127.0.0.1:8000>Master Controls</a> page so that it will begin spidering the outlinks of the page you injected.
9. <a href=http://127.0.0.1:8000/master?se=1>Turn on spiders</a> on the <a href=http://127.0.0.1:8000/master>Master Controls</a> page so that it will begin spidering the outlinks of the page you injected.
<br>

115
html/adv.html Normal file
View File

@ -0,0 +1,115 @@
<?xml version="1.0" encoding="iso-8859-1"?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title>Gigablast Advanced Search</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
<meta name="MSSmartTagsPreventParsing" content="true" />
<meta http-equiv="imagetoolbar" content="no" />
<link href="stylesmain.css" rel="stylesheet" type="text/css" />
<script type="text/javascript">
<!--
function x(){document.f.q.focus();}
// -->
</script>
</head>
<body onload="x()">
<a href="/" target="_top"><img src="logo-small.png" alt="Gigablast Logo" title="Return to Basic Search" border="0" style="margin-bottom:15px;" /></a>
<h2>Advanced Search</h2>
<form method="get" action="/search">
<table width="605" border="0" align="center" cellpadding="5" cellspacing="3">
<tbody>
<tr align="left" valign="middle">
<th colspan="3">Search for...</th>
</tr>
<tr align="left" valign="middle">
<td><strong>all</strong> of these words</td>
<td><input type="text" name="plus" size="40" /></td>
<td><input type="submit" value="Search" /></td>
</tr>
<tr align="left" valign="middle">
<td>this <strong>exact phrase</strong></td>
<td colspan="2"><input type="text" name="quote1" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td>and this <strong>exact phrase</strong></td>
<td colspan="2"><input type="text" name="quote2" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td><strong>any</strong> of these words</td>
<td colspan="2"><input type="text" name="q" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td><strong>none</strong> of these words</td>
<td colspan="2"><input type="text" name="minus" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td>In this language:
</td>
<td colspan="2">
<select name=gblang>
<option value=0>Any</option>
<option value=1>English</option>
<option value=2>French</option>
<option value=3>Spanish</option>
<option value=4>Russian</option>
<option value=5>Turkish</option>
<option value=6>Japanese</option>
<option value=7>ChineseTrad</option>
<option value=8>ChineseSimp</option>
<option value=9>Korean</option>
<option value=10>German</option>
<option value=11>Dutch</option>
<option value=12>Italian</option>
<option value=13>Finnish</option>
<option value=14>Swedish</option>
<option value=15>Norwegian</option>
<option value=16>Portuguese</option>
<option value=17>Vietnamese</option>
<option value=18>Arabic</option>
<option value=19>Hebrew</option>
<option value=20>Indonesian</option>
<option value=21>Greek</option>
<option value=22>Thai</option>
<option value=23>Hindi</option>
<option value=24>Bengala</option>
<option value=25>Polish</option>
<option value=26>Tagalog</option>
</select>
</td>
</tr>
<tr align="left" valign="middle">
<td>Restrict to this URL</td>
<td colspan="2"><input type="text" name="url" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td>Pages that link to this URL</td>
<td colspan="2"><input type="text" name="link" size="40" /></td>
</tr>
<tr align="left" valign="middle">
<td>Site Clustering</td>
<td colspan="2"><input type="radio" name="sc" value="1" checked="checked" />yes&nbsp;&nbsp;&nbsp;<input type="radio" name="sc" value="0" />no</td>
</tr>
<tr align="left" valign="middle">
<td>Number of summary excerpts</td>
<td colspan="2"><input type="radio" name="ns" value="0" />0&nbsp;&nbsp;&nbsp;<input type="radio" name="ns" value="1" />1&nbsp;&nbsp;&nbsp;<input type="radio" name="ns" value="2" />2&nbsp;&nbsp;&nbsp;<input type="radio" name="ns" value="3" checked="checked" />3&nbsp;&nbsp;&nbsp;<input type="radio" name="ns" value="4" />4&nbsp;&nbsp;&nbsp;<input type="radio" name="ns" value="5" />5</td>
</tr>
<tr align="left" valign="middle">
<td>Results per Page</td>
<td colspan="2"><input type="radio" name="n" value="10" checked="checked" />10&nbsp;&nbsp;<input type="radio" name="n" value="20" />20&nbsp;&nbsp;<input type="radio" name="n" value="30" />30&nbsp;&nbsp;<input type="radio" name="n" value="40" />40&nbsp;&nbsp;<input type="radio" name="n" value="50" />50&nbsp;&nbsp;<input type="radio" name="n" value="100" />100</td>
</tr>
<tr align="left" valign="middle">
<td>Restrict to these Sites</td>
<td colspan="2"><textarea rows="10" cols="40" name="sites"></textarea></td>
</tr>
</tbody>
</table>
</form>
<div id="footer">Copyright &copy; 2010-2020 <a href="http://www.gigablast.com" target="_top">Gigablast,
Inc.</a> All rights reserved.</div>
</body>
</html>

BIN
html/dollargear.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.9 KiB

BIN
html/eventguru.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 39 KiB

BIN
html/gears.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 9.6 KiB

View File

@ -34,7 +34,7 @@ Gigablast - The Private Search Engine</font>
Gigablast does not give your IP address to any third parties, nor allow any third party to deduce what queries might be coming from your IP address. Read the text below to understand what we mean by <i>deduce</i>.
<br><br>
In the summer of 2013 <a href=https://en.wikipedia.org/wiki/Edward_Snowden>Edward Snowden</a>, an ex-NSA contractor, described a secret NSA project known as <a href=https://en.wikipedia.org/wiki/PRISM_%28surveillance_program%29>project PRISM</a>. This project by the NSA wire taps not just live data traversing the internet, but also has automated access to large data repositories controlled by major internet companies. The data repositories consist of anything from search engine query logs and private emails to chat histories, among others. With today's fairly accurate audio-to-text transcription software, even services like <a href=https://en.wikipedia.org/wiki/Skype>Skype</a> audio and video calls are being tapped.
In the summer of 2013 <a href=https://en.wikipedia.org/wiki/Edward_Snowden>Edward Snowden</a>, an ex-NSA contractor, described a secret NSA project known as <a href=https://en.wikipedia.org/wiki/PRISM_%28surveillance_program%29>project PRISM</a>, and more recently <a href="http://www.washingtonpost.com/world/national-security/nsa-infiltrates-links-to-yahoo-google-data-centers-worldwide-snowden-documents-say/2013/10/30/e51d661e-4166-11e3-8b74-d89d714ca4dd_story.html">Muscular</a>. These NSA projects wiretap not just live data traversing the internet, but also have automated access to large data repositories controlled by major internet companies. The data repositories consist of anything from search engine query logs and private emails to chat histories, among others. With today's fairly accurate audio-to-text transcription software, even services like <a href=https://en.wikipedia.org/wiki/Skype>Skype</a> audio and video calls are being tapped.
<br><br>
Such data access makes it very easy for government agencies like the NSA to set up large search engines that index these data streams and execute a list of queries on such search engines in order to profile and flag individuals for further examination.
<br><br>

View File

@ -1,18 +1,18 @@
User-Agent: googlebot
Disallow: /search
Disallow: /search?
User-Agent: bingbot
Disallow: /search
Disallow: /search?
User-Agent: msnbot
Disallow: /search
Disallow: /search?
User-Agent: slurp
Disallow: /search
Disallow: /search?
User-Agent: gigabot
Disallow: /search
Disallow: /search?
User-Agent: *
Disallow: /search
Disallow: /search?

View File

@ -41,8 +41,9 @@ counts as a single query.
<!--<li>Gigablast has many powerful <a href="/features.html">features</a>.
<br><br>-->
<li><a href=https://www.gigablast.com/account.html>Sign up now</a> to start accessing the feed.
</ul>
<br><br>
<li>You can use the search results however you want. You can rearrange them, embed ads, etc.
</ul>
</td>
</tr>
</table>

View File

@ -2454,9 +2454,21 @@ int main ( int argc , char *argv[] ) {
if ( setrlimit(RLIMIT_CORE,&lim) )
log("db: setrlimit: %s.", mstrerror(errno) );
// limit fds
//lim.rlim_cur = lim.rlim_max = 511;
//if ( setrlimit(RLIMIT_NOFILE,&lim))
// log("db: setrlimit2: %s.", mstrerror(errno) );
// try to prevent a core on systems where the default is above 1024
// because our FD_ISSET() libc function will core! (it's older)
long NOFILE = 1024;
lim.rlim_cur = lim.rlim_max = NOFILE;
if ( setrlimit(RLIMIT_NOFILE,&lim))
log("db: setrlimit RLIMIT_NOFILE %li: %s.",
NOFILE,mstrerror(errno) );
struct rlimit rlim;
getrlimit ( RLIMIT_NOFILE,&rlim);
if ( (long)rlim.rlim_max > NOFILE || (long)rlim.rlim_cur > NOFILE ) {
log("db: setrlimit RLIMIT_NOFILE failed!");
char *xx=NULL;*xx=0;
}
log("db: RLIMIT_NOFILE = %li",(long)rlim.rlim_max);
//exit(0);
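// the guard this limit buys us (illustrative, not shipped code):
// glibc's fd_set is a fixed 1024-bit bitmap, so FD_SET/FD_ISSET on a
// descriptor >= FD_SETSIZE reads or writes past it:
//
//   #include <sys/select.h>
//   int safeFdSet ( int sd , fd_set *set ) {
//           if ( sd < 0 || sd >= FD_SETSIZE ) return -1;
//           FD_SET ( sd , set );
//           return 0;
//   }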
// . disable o/s's and hard drive's read ahead
// . set multcount to 16 --> 1 interrupt for every 16 sectors read
// . multcount of 16 reduces OS overhead by 30%-50% (more throughput)

View File

@ -154,6 +154,13 @@ int main ( int argc , char *argv[] ) {
printf("%s\n", out );
}
// encoded
char dst[MAX_URL_LEN+200];
urlEncode ( dst,MAX_URL_LEN+100,
u.getUrl(), u.getUrlLen(),
false ); // are we encoding a request path?
printf("encoded: %s\n",dst);
// the probable docid
long long pd = g_titledb.getProbableDocId(&u);
printf("pdocid: %llu\n", pd );