Merge branch 'testing' into diffbot-testing

commit 23d26e26ba
Author: Matt Wells
Date:   2014-09-30 16:02:07 -07:00
55 changed files with 3185 additions and 698 deletions


@@ -1686,6 +1686,7 @@ CollectionRec::CollectionRec() {
  // inits for sortbydatetable
  m_inProgress = false;
  m_msg5 = NULL;
+ m_importState = NULL;
  // JAB - track which regex parsers have been initialized
  //log(LOG_DEBUG,"regex: %p initalizing empty parsers", m_pRegExParser);


@@ -501,6 +501,7 @@ class CollectionRec {
  char m_enforceNewQuotas ;
  char m_doIpLookups ; // considered iff using proxy
  char m_useRobotsTxt ;
+ char m_forceUseFloaters ;
  //char m_restrictDomain ; // say on same domain as seeds?
  char m_doTuringTest ; // for addurl
  char m_applyFilterToText ; // speeds us up

@@ -566,6 +567,8 @@ class CollectionRec {
  long m_numImportInjects;
  class ImportState *m_importState;
+ SafeBuf m_collectionPasswords;
+ SafeBuf m_collectionIps;
  // from Conf.h
  long m_posdbMinFilesToMerge ;

Conf.cpp

@@ -88,17 +88,96 @@ bool Conf::isMasterAdmin ( TcpSocket *s , HttpRequest *r ) {
  }
  */
- bool Conf::isCollAdmin ( TcpSocket *socket , HttpRequest *hr ) {
- // until we have coll tokens use this...
- return isRootAdmin ( socket , hr );
+ bool isInWhiteSpaceList ( char *p , char *buf ) {
+ if ( ! p ) return false;
+ char *match = strstr ( buf , p );
+ if ( ! match ) return false;
+ long len = gbstrlen(p);
+ // ensure book-ended by whitespace
+ if ( match &&
+ (match == buf || is_wspace_a(match[-1])) &&
+ (!match[len] || is_wspace_a(match[len])) )
+ return true;
+ // no match
+ return false;
  }
+ bool Conf::isCollAdmin ( TcpSocket *sock , HttpRequest *hr ) {
+ // until we have coll tokens use this...
+ //return isRootAdmin ( socket , hr );
+ // root always does
+ if ( isRootAdmin ( sock , hr ) ) return true;
+ CollectionRec *cr = g_collectiondb.getRec ( hr , true );
+ if ( ! cr ) return false;
+ return isCollAdmin2 ( sock , hr , cr );
+ }
+ bool Conf::isCollAdminForColl ( TcpSocket *sock, HttpRequest *hr, char *coll ){
+ CollectionRec *cr = g_collectiondb.getRec ( coll );
+ if ( ! cr ) return false;
+ return isCollAdmin2 ( sock , hr , cr );
+ }
+ bool Conf::isCollAdmin2 ( TcpSocket *sock ,
+ HttpRequest *hr ,
+ CollectionRec *cr ) {
+ if ( ! cr ) return false;
+ //long page = g_pages.getDynamicPageNumber(hr);
+ // never for main or dmoz! must be root!
+ if ( strcmp(cr->m_coll,"main")==0 ) return false;
+ if ( strcmp(cr->m_coll,"dmoz")==0 ) return false;
+ // empty password field? then allow them through
+ if ( cr->m_collectionPasswords.length() <= 0 &&
+ cr->m_collectionIps .length() <= 0 )
+ return true;
+ // a good ip?
+ char *p = iptoa(sock->m_ip);
+ char *buf = cr->m_collectionIps.getBufStart();
+ if ( isInWhiteSpaceList ( p , buf ) ) return true;
+ // if they got the password, let them in
+ p = hr->getString("pwd");
+ if ( ! p ) p = hr->getString("password");
+ if ( ! p ) p = hr->getStringFromCookie("pwd");
+ if ( ! p ) return false;
+ buf = cr->m_collectionPasswords.getBufStart();
+ if ( isInWhiteSpaceList ( p , buf ) ) return true;
+ // the very act of just knowing the collname of a guest account
+ // is good enough to update it
+ //if ( strncmp ( cr->m_coll , "guest_" , 6 ) == 0 )
+ // return true;
+ return false;
+ }
  // . is user a root administrator?
  // . only need to be from root IP *OR* have password, not both
  bool Conf::isRootAdmin ( TcpSocket *socket , HttpRequest *hr ) {
  // totally open access?
- if ( m_numConnectIps <= 0 && m_numMasterPwds <= 0 )
+ //if ( m_numConnectIps <= 0 && m_numMasterPwds <= 0 )
+ if ( m_connectIps.length() <= 0 &&
+ m_masterPwds.length() <= 0 )
  return true;
  // coming from root gets you in
@@ -114,7 +193,9 @@ bool Conf::isRootAdmin ( TcpSocket *socket , HttpRequest *hr ) {
  bool Conf::hasRootPwd ( HttpRequest *hr ) {
- if ( m_numMasterPwds == 0 ) return false;
+ //if ( m_numMasterPwds == 0 ) return false;
+ if ( m_masterPwds.length() <= 0 )
+ return false;
  char *p = hr->getString("pwd");
@@ -124,43 +205,46 @@ bool Conf::hasRootPwd ( HttpRequest *hr ) {
  if ( ! p ) return false;
- for ( long i = 0 ; i < m_numMasterPwds ; i++ ) {
- if ( strcmp ( m_masterPwds[i], p ) != 0 ) continue;
- // we got a match
- return true;
- }
- return false;
+ char *buf = m_masterPwds.getBufStart();
+ return isInWhiteSpaceList ( p , buf );
  }
  // . check this ip in the list of admin ips
  bool Conf::isRootIp ( unsigned long ip ) {
  //if ( m_numMasterIps == 0 ) return false;
- if ( m_numConnectIps == 0 ) return false;
- for ( long i = 0 ; i < m_numConnectIps ; i++ )
- if ( m_connectIps[i] == (long)ip )
- return true;
+ //if ( m_numConnectIps == 0 ) return false;
+ if ( m_connectIps.length() <= 0 ) return false;
+ // for ( long i = 0 ; i < m_numConnectIps ; i++ )
+ // if ( m_connectIps[i] == (long)ip )
+ // return true;
  //if ( ip == atoip("10.5.0.2",8) ) return true;
- // no match
- return false;
+ char *p = iptoa(ip);
+ char *buf = m_connectIps.getBufStart();
+ return isInWhiteSpaceList ( p , buf );
  }
  bool Conf::isConnectIp ( unsigned long ip ) {
- for ( long i = 0 ; i < m_numConnectIps ; i++ ) {
- if ( m_connectIps[i] == (long)ip )
- return true;
- // . 1.2.3.0 ips mean the whole block
- // . the high byte in the long is the Least Signficant Byte
- if ( (m_connectIps[i] >> 24) == 0 &&
- (m_connectIps[i] & 0x00ffffff) ==
- ((long)ip & 0x00ffffff) )
- return true;
- }
+ return isRootIp(ip);
+ // for ( long i = 0 ; i < m_numConnectIps ; i++ ) {
+ // if ( m_connectIps[i] == (long)ip )
+ // return true;
+ // // . 1.2.3.0 ips mean the whole block
+ // // . the high byte in the long is the Least Signficant Byte
+ // if ( (m_connectIps[i] >> 24) == 0 &&
+ // (m_connectIps[i] & 0x00ffffff) ==
+ // ((long)ip & 0x00ffffff) )
+ // return true;
+ // }
  // no match
- return false;
+ //return false;
  }
  // . set all member vars to their default values
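The access-control code above replaces the fixed-size master-password and connect-IP arrays with whitespace-separated lists kept in SafeBufs, and isInWhiteSpaceList() is the single matching primitive they all share. A minimal standalone sketch of that check, using strlen() and isspace() as stand-ins for gbstrlen() and is_wspace_a() (an assumption about their behavior, not the actual Conf.cpp code):

    // Sketch only: mirrors the book-ending logic of isInWhiteSpaceList().
    #include <cstring>
    #include <cctype>
    #include <cstdio>

    static bool inWhiteSpaceList ( const char *p , const char *buf ) {
            if ( ! p || ! buf ) return false;
            const char *match = strstr ( buf , p );
            if ( ! match ) return false;
            size_t len = strlen ( p );
            // the hit must be book-ended by whitespace or the ends of the
            // list, so "1.2.3.4" does not match inside "11.2.3.45"
            if ( ( match == buf || isspace ( (unsigned char)match[-1] ) ) &&
                 ( ! match[len] || isspace ( (unsigned char)match[len] ) ) )
                    return true;
            return false;
    }

    int main () {
            const char *ips = "1.2.3.4 10.5.0.2\n192.168.1.7";
            printf ( "%d\n" , inWhiteSpaceList ( "10.5.0.2" , ips ) ); // 1
            printf ( "%d\n" , inWhiteSpaceList ( "0.5.0.2" , ips ) );  // 0
            return 0;
    }

The same helper backs collection passwords, collection IPs, master passwords and connect IPs, so one whitespace-delimited format covers all four lists.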

Conf.h

@@ -50,6 +50,10 @@ class Conf {
  Conf();
  bool isCollAdmin ( TcpSocket *socket , HttpRequest *hr ) ;
+ bool isCollAdminForColl (TcpSocket *sock, HttpRequest *hr,char *coll );
+ bool isCollAdmin2 (TcpSocket *socket , HttpRequest *hr,
+ class CollectionRec *cr) ;
  bool isRootAdmin ( TcpSocket *socket , HttpRequest *hr ) ;
  //bool isMasterAdmin ( class TcpSocket *s , class HttpRequest *r );
@@ -686,14 +690,17 @@ class Conf {
  // programmer reminders.
  bool m_logReminders;
- long m_numMasterPwds;
- char m_masterPwds[MAX_MASTER_PASSWORDS][PASSWORD_MAX_LEN];
+ //long m_numMasterPwds;
+ //char m_masterPwds[MAX_MASTER_PASSWORDS][PASSWORD_MAX_LEN];
+ SafeBuf m_masterPwds;
  //long m_numMasterIps;
  //long m_masterIps[MAX_MASTER_IPS];
  // these are the new master ips
- long m_numConnectIps;
- long m_connectIps [ MAX_CONNECT_IPS ];
+ //long m_numConnectIps;
+ //long m_connectIps [ MAX_CONNECT_IPS ];
+ SafeBuf m_connectIps;
  // should we generate similarity/content vector for titleRecs lacking?
  // this takes a ~100+ ms, very expensive, so it is just meant for


@@ -618,27 +618,41 @@ bool File::closeLeastUsed () {
  long File::getFileSize ( ) {
  // allow the substitution of another filename
- struct stat stats;
- stats.st_size = 0;
- int status = stat ( m_filename , &stats );
+ //struct stat stats;
+ //stats.st_size = 0;
+ //int status = stat ( m_filename , &stats );
+ FILE *fd = fopen ( m_filename , "r" );
+ if ( ! fd ) {
+ log("disk: error getFileSize(%s) : %s",
+ m_filename , strerror(g_errno));
+ return -1;
+ }
+ fseek(fd,0,SEEK_END);
+ long fileSize = ftell ( fd );
+ fclose ( fd );
+ return fileSize;
  // return the size if the status was ok
- if ( status == 0 ) return stats.st_size;
+ //if ( status == 0 ) return stats.st_size;
  // copy errno to g_errno
- g_errno = errno;
+ //g_errno = errno;
  // return 0 and reset g_errno if it just does not exist
- if ( g_errno == ENOENT ) { g_errno = 0; return 0; }
+ //if ( g_errno == ENOENT ) { g_errno = 0; return 0; }
  // resource temporarily unavailable (for newer libc)
- if ( g_errno == EAGAIN ) { g_errno = 0; return 0; }
+ //if ( g_errno == EAGAIN ) { g_errno = 0; return 0; }
  // log & return -1 on any other error
- log("disk: error getFileSize(%s) : %s",m_filename , strerror(g_errno));
- return -1;
+ //log("disk: error getFileSize(%s) : %s",m_filename,strerror(g_errno));
+ //return -1;
  }
  // . return 0 on error
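The stat()-based size lookup above is replaced with the stdio seek-to-end idiom. A self-contained sketch of that idiom, using plain errno instead of Gigablast's g_errno (an illustration, not the File class itself):

    #include <cstdio>
    #include <cstring>
    #include <cerrno>

    // Returns the file size in bytes, or -1 if the file cannot be opened.
    long getFileSizeSketch ( const char *filename ) {
            FILE *fd = fopen ( filename , "r" );
            if ( ! fd ) {
                    fprintf ( stderr , "disk: error getFileSize(%s) : %s\n" ,
                              filename , strerror ( errno ) );
                    return -1;
            }
            fseek ( fd , 0 , SEEK_END );    // jump to the end of the stream
            long fileSize = ftell ( fd );   // byte offset of the end == size
            fclose ( fd );
            return fileSize;
    }

One behavioral difference visible in the diff: the old code returned 0 when stat() failed with ENOENT or EAGAIN, while the new code returns -1 whenever fopen() fails, so callers that treated 0 as "missing file" now see an error instead.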


@@ -432,7 +432,6 @@ bool HashTableX::load ( char *dir , char *filename , SafeBuf *fillBuf ) {
  // both return false and set g_errno on error, true otherwise
  bool HashTableX::load ( char *dir, char *filename, char **tbuf, long *tsize ) {
- reset();
  File f;
  f.set ( dir , filename );
  if ( ! f.doesExist() ) return false;
@@ -447,10 +446,27 @@ bool HashTableX::load ( char *dir, char *filename, char **tbuf, long *tsize ) {
  off += 4;
  if ( ! f.read ( &numSlotsUsed , 4 , off ) ) return false;
  off += 4;
- if ( ! f.read ( &m_ks , 4 , off ) ) return false;
+ long ks;
+ if ( ! f.read ( &ks , 4 , off ) ) return false;
  off += 4;
- if ( ! f.read ( &m_ds , 4 , off ) ) return false;
+ long ds;
+ if ( ! f.read ( &ds , 4 , off ) ) return false;
  off += 4;
+ // bogus key size?
+ if ( ks <= 0 ) {
+ log("htable: reading hashtable from %s%s: "
+ "bogus keysize of %li",
+ dir,filename,ks );
+ return false;
+ }
+ // just in case m_ks was already set, call reset() down here
+ reset();
+ m_ks = ks;
+ m_ds = ds;
  if ( ! setTableSize ( numSlots , NULL , 0 ) ) return false;
  if ( ! f.read ( m_keys , numSlots * m_ks , off ) ) return false;
  off += numSlots * m_ks;
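The load path now reads the key and data sizes into locals, validates them, and only then calls reset() and commits them to m_ks/m_ds, so a corrupt file can no longer clobber a live table. A hypothetical standalone reader for the same header layout (four consecutive 4-byte fields: numSlots, numSlotsUsed, key size, data size; the exact file offset of the first field is not shown in the hunk, so starting at 0 is an assumption):

    #include <cstdio>
    #include <cstdint>

    // Sketch only: read and sanity-check the hash table header fields,
    // which the load path above treats as four consecutive 4-byte values.
    bool readHashTableHeader ( const char *path , int32_t *numSlots ,
                               int32_t *numSlotsUsed ,
                               int32_t *ks , int32_t *ds ) {
            FILE *f = fopen ( path , "rb" );
            if ( ! f ) return false;
            int32_t hdr[4];
            bool ok = ( fread ( hdr , 4 , 4 , f ) == 4 );
            fclose ( f );
            if ( ! ok ) return false;
            *numSlots = hdr[0]; *numSlotsUsed = hdr[1];
            *ks = hdr[2]; *ds = hdr[3];
            // mirror the new check: a non-positive key size means corruption
            if ( *ks <= 0 ) return false;
            return true;
    }

Moving reset() below the validation also fixes the case where a bad file would have wiped the in-memory table before the error was detected.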


@@ -893,11 +893,11 @@ bool HttpRequest::set ( char *origReq , long origReqLen , TcpSocket *sock ) {
  // connectips/adminips
- for ( long i = 0 ; i < g_conf.m_numConnectIps ; i++ ) {
- if ( sock->m_ip != g_conf.m_connectIps[i] ) continue;
- m_isLocal = true;
- break;
- }
+ // for ( long i = 0 ; i < g_conf.m_numConnectIps ; i++ ) {
+ // if ( sock->m_ip != g_conf.m_connectIps[i] ) continue;
+ // m_isLocal = true;
+ // break;
+ // }
  // roadrunner ip
  // if ( sock && strncmp(iptoa(sock->m_ip),"66.162.42.131",13) == 0)


@@ -1194,8 +1194,8 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
  //if ( ! strncmp ( path ,"/help.html", pathLen ) )
  // return sendPageAbout ( s , r , path );
- if ( ! strncmp ( path ,"/adv.html", pathLen ) )
- return sendPageAdvanced ( s , r );
+ //if ( ! strncmp ( path ,"/adv.html", pathLen ) )
+ // return sendPageAdvanced ( s , r );
  //if ( ! strncmp ( path ,"/about.html", pathLen ) )
  // return sendPageAbout ( s , r );
@@ -1208,6 +1208,9 @@ bool HttpServer::sendReply ( TcpSocket *s , HttpRequest *r , bool isAdmin) {
  if ( ! strncmp ( path ,"/widgets.html", pathLen ) )
  return sendPageWidgets ( s , r );
+ if ( ! strncmp ( path ,"/adv.html", pathLen ) )
+ return sendPagePretty ( s , r,"adv.html","advanced");
  // who uses gigablast?
  if ( ! strncmp ( path ,"/users.html", pathLen ) )
  return sendPagePretty ( s , r,"users.html","users"); // special


@@ -190,10 +190,12 @@ class HttpServer {
  //header to reflect the new size and encoding
  TcpSocket *unzipReply(TcpSocket* s);
- float getCompressionRatio()
- {return (float)m_uncompressedBytes/m_bytesDownloaded;}
+ float getCompressionRatio() {
+ if ( m_bytesDownloaded )
+ return (float)m_uncompressedBytes/m_bytesDownloaded;
+ else
+ return 0.0;
+ };
  //this is for low priority requests which come in while we are
  //in a quickpoll
@@ -225,8 +227,8 @@ class HttpServer {
  void *states[MAX_DOWNLOADS];
  tcp_callback_t callbacks[MAX_DOWNLOADS];
- long m_bytesDownloaded;
- long m_uncompressedBytes;
+ long long m_bytesDownloaded;
+ long long m_uncompressedBytes;
  //QueuedRequest m_requestQueue[MAX_REQUEST_QUEUE];
  //long m_lastSlotUsed;
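Two small robustness fixes here: getCompressionRatio() no longer divides by a zero m_bytesDownloaded, and the byte counters widen from long to long long. With the -m32 build used in the Makefile a long is 32 bits, so the running totals would top out below roughly 2.1 GB of downloaded data; a quick illustration of the headroom gained (plain C++, not HttpServer code):

    #include <cstdio>
    #include <climits>

    int main () {
            // a signed 32-bit counter can only count this many bytes
            printf ( "32-bit max: %d bytes (~%.2f GiB)\n" ,
                     INT_MAX , INT_MAX / 1073741824.0 );
            // the widened 64-bit counter effectively never wraps
            printf ( "64-bit max: %lld bytes\n" , LLONG_MAX );
            return 0;
    }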


@@ -2,6 +2,7 @@ SHELL = /bin/bash
  CC=g++
+ # remove dlstubs.o for CYGWIN
  OBJS = UdpSlot.o Rebalance.o \
  Msg13.o Mime.o IndexReadInfo.o \
  PageGet.o PageHosts.o PageIndexdb.o \
@@ -96,7 +97,7 @@ CPPFLAGS = -m32 -g -Wall -pipe -fno-stack-protector -Wno-write-strings -Wstrict-
  LIBS= -L. ./libz.a ./libssl.a ./libcrypto.a ./libiconv.a ./libm.a ./libstdc++.a -lpthread
  # use this for compiling on CYGWIN: (only for 32bit cygwin right now and
  # you have to install the packages that have these libs.
- #LIBS= -lz -lm -lpthread -lssl -lcrypto -iconv -lz
+ #LIBS= -lz -lm -lpthread -lssl -lcrypto -liconv
  endif
@@ -567,10 +568,10 @@ master-rpm:
  # deb-master
  master-deb:
  # need to change in changelog too!! dont' forget!!!
- git archive --format=tar --prefix=gb-1.14/ master > ../gb_1.14.orig.tar
+ git archive --format=tar --prefix=gb-1.16/ master > ../gb_1.16.orig.tar
  rm -rf debian
  # change "-p gb_1.0" to "-p gb_1.1" to update version for example
- dh_make -e gigablast@mail.com -p gb_1.14 -f ../gb_1.14.orig.tar
+ dh_make -e gigablast@mail.com -p gb_1.16 -f ../gb_1.16.orig.tar
  # zero this out, it is just filed with the .txt files erroneously and it'll
  # try to automatiicaly install in /usr/docs/
  rm debian/docs
@@ -595,7 +596,7 @@ master-deb:
  # upload den
  scp gb*.deb gk268:/w/html/
  # alien it
- sudo alien --to-rpm gb_1.14-1_i386.deb
+ sudo alien --to-rpm gb_1.16-1_i386.deb
  # upload rpm
  scp gb*.rpm gk268:/w/html/


@@ -732,9 +732,13 @@ void downloadTheDocForReals2 ( Msg13Request *r ) {
  // user can turn off proxy use with this switch
  if ( ! g_conf.m_useProxyIps ) useProxies = false;
+ // for diffbot turn ON if use robots is off
+ if ( r->m_forceUseFloaters ) useProxies = true;
  // we gotta have some proxy ips that we can use
  if ( ! g_conf.m_proxyIps.hasDigits() ) useProxies = false;
  // we did not need a spider proxy ip so send this reuest to a host
  // to download the url
  if ( ! useProxies ) {


@@ -97,6 +97,7 @@ public:
  long m_isSquidProxiedUrl:1;
  long m_foundInCache:1;
+ long m_forceUseFloaters:1;
  //long m_testParserEnabled:1;
  //long m_testSpiderEnabled:1;


@@ -105,6 +105,7 @@ Msg40::Msg40() {
  m_numPrintedSoFar = 0;
  m_lastChunk = false;
  m_didSummarySkip = false;
+ m_omitCount = 0;
  //m_numGigabitInfos = 0;
  }
@@ -159,6 +160,9 @@ bool Msg40::getResults ( SearchInput *si ,
  bool forward ,
  void *state ,
  void (* callback) ( void *state ) ) {
+ m_omitCount = 0;
  // warning
  //if ( ! si->m_coll2 ) log(LOG_LOGIC,"net: NULL collection. msg40.");
  if ( si->m_collnumBuf.length() < (long)sizeof(collnum_t) )
@@ -2404,6 +2408,9 @@ bool Msg40::gotSummary ( ) {
  // how many docids are visible? (unfiltered)
  //long visible = m_filterStats[CR_OK];
+ m_omitCount = 0;
  // count how many are visible!
  long visible = 0;
  // loop over each clusterLevel and set it
@@ -2412,6 +2419,8 @@ bool Msg40::gotSummary ( ) {
  char *level = &m_msg3a.m_clusterLevels[i];
  // on CR_OK
  if ( *level == CR_OK ) visible++;
+ // otherwise count as ommitted
+ else m_omitCount++;
  }
  // do we got enough search results now?
@@ -2464,10 +2473,16 @@ bool Msg40::gotSummary ( ) {
  // if we do not have enough visible, try to get more
  if ( visible < m_docsToGetVisible && m_msg3a.m_moreDocIdsAvail &&
+ // do not spin too long in this!
+ // TODO: fix this better somehow later
+ m_docsToGet <= 1000 &&
  // doesn't work on multi-coll just yet, it cores
  m_numCollsToSearch == 1 ) {
  // can it cover us?
- long need = m_msg3a.m_docsToGet + 20;
+ //long need = m_msg3a.m_docsToGet + 20;
+ long need = m_docsToGet + 20;
+ // increase by 25 percent as well
+ need *= 1.25;
  // note it
  log("msg40: too many summaries invisible. getting more "
  "docids from msg3a merge and getting summaries. "
@@ -2479,20 +2494,31 @@ bool Msg40::gotSummary ( ) {
  m_numReplies, m_numRequests);
  // get more
  //m_docsToGet = need;
- // merge more
- m_msg3a.m_docsToGet = need;
- m_msg3a.mergeLists();
- // rellaoc the msg20 array
- if ( ! reallocMsg20Buf() ) return true;
+ // get more!
+ //m_msg3a.m_docsToGet = need;
+ m_docsToGet = need;
  // reset this before launch
  m_numReplies = 0;
  m_numRequests = 0;
  // reprocess all!
  m_lastProcessedi = -1;
+ // let's do it all from the top!
+ return getDocIds ( true ) ;
+ //m_msg3a.mergeLists();
+ // rellaoc the msg20 array
+ //if ( ! reallocMsg20Buf() ) return true;
+ // reset this before launch
+ //m_numReplies = 0;
+ //m_numRequests = 0;
+ // reprocess all!
+ //m_lastProcessedi = -1;
  // now launch!
- if ( ! launchMsg20s ( true ) ) return false;
+ //if ( ! launchMsg20s ( true ) ) return false;
  // all done, call callback
- return true;
+ //return true;
  }
  /*
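When too many results are filtered out, the retry path now bumps m_docsToGet and restarts the whole docid fetch via getDocIds(true) instead of re-merging in place, and it stops retrying once m_docsToGet exceeds 1000. Each pass asks for (docsToGet + 20) * 1.25 docids; a tiny sketch of that growth schedule (the starting value of 10 is an assumption, not from the diff):

    #include <cstdio>

    int main () {
            long docsToGet = 10;            // hypothetical initial request size
            while ( docsToGet <= 1000 ) {   // the new cap on retrying
                    long need = docsToGet + 20;
                    need *= 1.25;           // grow by 25 percent, as in the diff
                    printf ( "retry asks for %ld docids\n" , need );
                    docsToGet = need;
            }
            return 0;
    }

The sequence runs roughly 10, 37, 71, 113, ..., 1162, so the loop stays bounded even when nearly every result is being omitted.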


@@ -220,6 +220,7 @@ class Msg40 {
  HashTableX m_facetTextTable;
  SafeBuf m_facetTextBuf;
  bool m_firstTime;
+ long m_omitCount;
  bool printFacetTables ( class SafeBuf *sb ) ;
  bool printFacetsForTable ( SafeBuf *sb , QueryTerm *qt );


@@ -245,6 +245,11 @@ class Multicast {
  long m_hack32;
  long long m_hack64;
+ // more hack stuff used by PageInject.cpp
+ long m_hackFileId;
+ long long m_hackFileOff;
+ class ImportState *m_importState;
  // hacky crunk use by seo pipeline in xmldoc.cpp
  //void *m_hackxd;
  //void *m_hackHost;


@@ -81,6 +81,9 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
  char *action = r->getString("action",NULL);
  char *addColl = r->getString("addcoll",NULL);
+ // add our ip to the list
+ //char *ips = r->getString("collips",NULL);
+ //char *pwds = r->getString("collpwd",NULL);
  char buf [ 64*1024 ];
@@ -88,7 +91,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
  //
- // CLOUD SEARCH ENGIEN SUPPORT - GIGABOT ERRORS
+ // CLOUD SEARCH ENGINE SUPPORT - GIGABOT ERRORS
  //
  SafeBuf gtmp;
@@ -239,6 +242,41 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
  "</tr>"
  , LIGHT_BLUE
  );
+ // collection pwds
+ p.safePrintf(
+ "<tr bgcolor=#%s>"
+ "<td><b>collection passwords"
+ "</b>"
+ "<br><font size=1>List of white space separated "
+ "passwords allowed to adminster collection."
+ "</font>"
+ "</td>\n"
+ "<td><input type=text name=collpwd "
+ "size=60>"
+ "</td>"
+ "</tr>"
+ , LIGHT_BLUE
+ );
+ // ips box for security
+ p.safePrintf(
+ "<tr bgcolor=#%s>"
+ "<td><b>collection ips"
+ "</b>"
+ "<br><font size=1>List of white space separated "
+ "IPs allowed to adminster collection."
+ "</font>"
+ "</td>\n"
+ "<td><input type=text name=collips "
+ "size=60>"
+ "</td>"
+ "</tr>"
+ , LIGHT_BLUE
+ );
  // now list collections from which to copy the config
  //p.safePrintf (
  // "<tr><td><b>copy configuration from this "


@@ -622,6 +622,8 @@ bool processLoop ( void *state ) {
  // do not show header for json object display
  if ( xd->m_contentType == CT_JSON )
  includeHeader = false;
+ if ( xd->m_contentType == CT_XML )
+ includeHeader = false;
  if ( format == FORMAT_XML ) includeHeader = false;
  if ( format == FORMAT_JSON ) includeHeader = false;
@@ -868,6 +870,10 @@ bool processLoop ( void *state ) {
  // calculate bufLen
  //long bufLen = p - buf;
+ /*
+ MDW: return the xml page as is now. 9/28/2014
  long ct = xd->m_contentType;
  // now filter the entire buffer to escape out the xml tags
@@ -890,6 +896,7 @@ bool processLoop ( void *state ) {
  //bufLen = newbuf.length();
  sb->stealBuf ( &newbuf );
  }
+ */
  // now encapsulate it in html head/tail and send it off
  // sendErr:


@@ -234,10 +234,10 @@ skipReplaceHost:
  "<b>mem used</a></td>"
  "<td><a href=\"/admin/hosts?c=%s&sort=10\">"
- "<b>cpu</a></td>"
+ "<b>cpu used</a></td>"
  "<td><a href=\"/admin/hosts?c=%s&sort=17\">"
- "<b>disk</a></td>"
+ "<b>disk used</a></td>"
  "<td><a href=\"/admin/hosts?c=%s&sort=14\">"
  "<b>max ping1</a></td>"
@@ -1224,13 +1224,13 @@ skipReplaceHost:
  "</tr>\n"
  "<tr class=poo>"
- "<td>cpu usage</td>"
+ "<td>cpu used</td>"
  "<td>Percentage of cpu resources in use by the gb process."
  "</td>"
  "</tr>\n"
  "<tr class=poo>"
- "<td>disk usage</td>"
+ "<td>disk used</td>"
  "<td>Percentage of disk in use. When this gets close to "
  "100%% you need to do something."
  "</td>"


@@ -418,21 +418,67 @@ bool Msg7::inject ( char *coll ,
  }
  // returns false if would block
- bool Msg7::injectTitleRec ( void *state ,
- void (*callback)(void *state) ,
- CollectionRec *cr ) {
- m_state = state;
- m_callback = callback;
+ // bool Msg7::injectTitleRec ( void *state ,
+ // void (*callback)(void *state) ,
+ // CollectionRec *cr ) {
+ static void sendReply ( UdpSlot *slot ) {
+ if ( g_errno )
+ g_udpServer.sendErrorReply(slot,g_errno);
+ else
+ g_udpServer.sendReply_ass(NULL,0,NULL,0,slot);
+ }
+ // when XmlDoc::inject() complets it calls this
+ void doneInjectingWrapper10 ( void *state ) {
+ XmlDoc *xd = (XmlDoc *)state;
+ UdpSlot *slot = (UdpSlot *)xd->m_slot;
+ long err = g_errno;
+ mdelete ( xd, sizeof(XmlDoc) , "PageInject" );
+ delete (xd);
+ g_errno = err;
+ sendReply ( slot );
+ }
+ void handleRequest7 ( UdpSlot *slot , long netnice ) {
+ //m_state = state;
+ //m_callback = callback;
  // shortcut
- XmlDoc *xd = &m_xd;
- xd->reset();
+ XmlDoc *xd;
+ try { xd = new (XmlDoc); }
+ catch ( ... ) {
+ g_errno = ENOMEM;
+ log("PageInject: import failed: new(%i): %s",
+ (int)sizeof(XmlDoc),mstrerror(g_errno));
+ sendReply(slot);
+ return;
+ }
+ mnew ( xd, sizeof(XmlDoc) , "PageInject" );
+ //xd->reset();
+ char *titleRec = slot->m_readBuf;
+ long titleRecSize = slot->m_readBufSize;
+ long collnum = *(long *)titleRec;
+ titleRec += 4;
+ titleRecSize -= 4;
+ CollectionRec *cr = g_collectiondb.m_recs[collnum];
+ if ( ! cr ) {
+ sendReply(slot);
+ return;
+ }
  // if injecting a titlerec from an import operation use set2()
  //if ( m_sbuf.length() > 0 ) {
- xd->set2 ( m_sbuf.getBufStart() ,
- m_sbuf.length() ,
+ xd->set2 ( titleRec,//m_sbuf.getBufStart() ,
+ titleRecSize,//m_sbuf.length() ,
  cr->m_coll ,
  NULL, // pbuf
  MAX_NICENESS ,
@@ -442,14 +488,20 @@ bool Msg7::injectTitleRec ( void *state ,
  // call this when done indexing
  //xd->m_masterState = this;
  //xd->m_masterLoop = doneInjectingWrapper9;
- xd->m_state = this;
- xd->m_callback1 = doneInjectingWrapper9;
+ xd->m_state = xd;//this;
+ xd->m_callback1 = doneInjectingWrapper10;
  xd->m_isImporting = true;
  xd->m_isImportingValid = true;
+ // hack this
+ xd->m_slot = slot;
  // then index it
  if ( ! xd->indexDoc() )
- return false;
- return true;
+ // return if would block
+ return;
+ // all done?
+ //return true;
+ sendReply ( slot );
  }
@@ -795,7 +847,7 @@ class ImportState {
  public:
  // available msg7s to use
- class Msg7 **m_ptrs;
+ class Multicast *m_ptrs;
  long m_numPtrs;
  // collection we are importing INTO
@@ -811,7 +863,7 @@ public:
  bool m_loadedPlaceHolder;
  long long m_bfFileSize;
- class Msg7 *getAvailMsg7();
+ class Multicast *getAvailMulticast();// Msg7();
  void saveFileBookMark ( );//class Msg7 *msg7 );
@@ -837,14 +889,11 @@ ImportState::ImportState () {
  void ImportState::reset() {
  for ( long i = 0 ; i < m_numPtrs ; i++ ) {
- Msg7 *msg7 = m_ptrs[i];
- if ( ! msg7 ) continue;
- msg7->reset();
- mdelete ( msg7, sizeof(Msg7) , "PageInject" );
- delete (msg7);
+ Multicast *mcast = &m_ptrs[i];
+ mcast->destructor();
  //m_ptrs[i] = NULL;
  }
- mfree ( m_ptrs , MAXINJECTSOUT * sizeof(Msg7 *) , "ism7f" );
+ mfree ( m_ptrs , MAXINJECTSOUT * sizeof(Multicast) , "ism7f" );
  m_ptrs = NULL;
  m_numPtrs = 0;
  m_fileOffset = 0LL;
@@ -868,6 +917,8 @@ bool resumeImports ( ) {
  if ( s_tried ) return true;
  s_tried = true;
+ if ( g_hostdb.m_hostId != 0 ) return true;
  for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
  CollectionRec *cr = g_collectiondb.m_recs[i];
  if ( ! cr ) continue;
@@ -1016,7 +1067,7 @@ bool ImportState::setCurrentTitleFileAndOffset ( ) {
  return true;//&m_bf;
  }
- void gotMsg7ReplyWrapper ( void *state ) ;
+ void gotMulticastReplyWrapper ( void *state , void *state2 ) ;
  //
@@ -1036,7 +1087,7 @@ bool ImportState::importLoop ( ) {
  CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
- if ( ! cr ) {
+ if ( ! cr || g_hostdb.m_hostId != 0 ) {
  // if coll was deleted!
  log("import: collnum %li deleted while importing into",
  (long)m_collnum);
@@ -1059,6 +1110,20 @@ bool ImportState::importLoop ( ) {
  }
+ if ( ! cr->m_importEnabled ) {
+ // wait for all to return
+ if ( out > 0 ) return false;
+ // then delete it
+ log("import: collnum %li import loop disabled",
+ (long)m_collnum);
+ mdelete ( this, sizeof(ImportState) , "impstate");
+ delete (this);
+ return true;
+ }
  // scan each titledb file scanning titledb0001.dat first,
  // titledb0003.dat second etc.
@@ -1082,16 +1147,23 @@ bool ImportState::importLoop ( ) {
  long long saved = m_fileOffset;
- Msg7 *msg7;
+ //Msg7 *msg7;
  //GigablastRequest *gr;
- SafeBuf *sbuf = NULL;
+ //SafeBuf *sbuf = NULL;
  long need = 12;
  long dataSize = -1;
- XmlDoc xd;
- key128_t tkey;
+ //XmlDoc xd;
+ key_t tkey;
  bool status;
+ SafeBuf tmp;
+ SafeBuf *sbuf = &tmp;
+ long long docId;
+ long shardNum;
+ long key;
+ Multicast *mcast;
+ char *req;
+ long reqSize;
  if ( m_fileOffset >= m_bfFileSize ) {
  log("inject: import: done processing file %li %s",
@@ -1100,7 +1172,7 @@ bool ImportState::importLoop ( ) {
  }
  // read in title rec key and data size
- status = m_bf.read ( &tkey, 12 , m_fileOffset );
+ status = m_bf.read ( &tkey, sizeof(key_t) , m_fileOffset );
  //if ( n != 12 ) goto nextFile;
  if ( g_errno ) {
@@ -1127,6 +1199,7 @@ bool ImportState::importLoop ( ) {
  m_fileOffset += 4;
  need += 4;
  need += dataSize;
+ need += 4; // collnum, first 4 bytes
  if ( dataSize < 0 || dataSize > 500000000 ) {
  log("main: could not scan in titledb rec of "
  "corrupt dataSize of %li. BAILING ENTIRE "
@@ -1137,19 +1210,20 @@ bool ImportState::importLoop ( ) {
  //gr = &msg7->m_gr;
  //XmlDoc *xd = getAvailXmlDoc();
- msg7 = getAvailMsg7();
+ //msg7 = getAvailMsg7();
+ mcast = getAvailMulticast();
  // if none, must have to wait for some to come back to us
- if ( ! msg7 ) {
+ if ( ! mcast ) {
  // restore file offset
  //m_fileOffset = saved;
  // no, must have been a oom or something
- log("import: import no msg7 available");
+ log("import: import no mcast available");
  return true;//false;
  }
  // this is for holding a compressed titlerec
- sbuf = &msg7->m_sbuf;//&gr->m_sbuf;
+ //sbuf = &mcast->m_sbuf;//&gr->m_sbuf;
  // point to start of buf
  sbuf->reset();
@@ -1157,6 +1231,9 @@ bool ImportState::importLoop ( ) {
  // ensure we have enough room
  sbuf->reserve ( need );
+ // collnum first 4 bytes
+ sbuf->pushLong( (long)m_collnum );
  // store title key
  sbuf->safeMemcpy ( &tkey , sizeof(key_t) );
@@ -1175,8 +1252,8 @@ bool ImportState::importLoop ( ) {
  "file. %s. Skipping file %s",
  mstrerror(g_errno),m_bf.getFilename());
  // essentially free up this msg7 now
- msg7->m_inUse = false;
- msg7->reset();
+ //msg7->m_inUse = false;
+ //msg7->reset();
  goto nextFile;
  }
  // advance
@@ -1193,8 +1270,8 @@ bool ImportState::importLoop ( ) {
  // we use this so we know where the doc we are injecting
  // was in the foregien titledb file. so we can update our bookmark
  // code.
- msg7->m_hackFileOff = saved;//m_fileOffset;
- msg7->m_hackFileId = m_bfFileId;
+ mcast->m_hackFileOff = saved;//m_fileOffset;
+ mcast->m_hackFileId = m_bfFileId;
  //
  // inject a title rec buf this time, we are doing an import
@@ -1243,21 +1320,55 @@ bool ImportState::importLoop ( ) {
  //
  //m_fileOffset += need;
+ // get docid from key
+ docId = g_titledb.getDocIdFromKey ( &tkey );
+ // get shard that holds the titlerec for it
+ shardNum = g_hostdb.getShardNumFromDocId ( docId );
+ // for selecting which host in the shard receives it
+ key = (long)docId;
  m_numOut++;
  // then index it. master callback will be called
  //if ( ! xd->index() ) return false;
  // TODO: make this forward the request to an appropriate host!!
  // . gr->m_sbuf is set to the titlerec so this should handle that
  // and use XmlDoc::set4() or whatever
- if ( msg7->injectTitleRec ( msg7 , // state
- gotMsg7ReplyWrapper , // callback
- cr )) {
- // it didn't block somehow...
- msg7->m_inUse = false;
- msg7->gotMsg7Reply();
+ // if ( msg7->injectTitleRec ( msg7 , // state
+ // gotMsg7ReplyWrapper , // callback
+ // cr )) {
+ // // it didn't block somehow...
+ // msg7->m_inUse = false;
+ // msg7->gotMsg7Reply();
+ // }
+ req = sbuf->getBufStart();
+ reqSize = sbuf->length();
+ if ( reqSize != need ) { char *xx=NULL;*xx=0 ; }
+ // do not free it, let multicast free it after sending it
+ sbuf->detachBuf();
+ if ( ! mcast->send ( req ,
+ reqSize ,
+ 0x07 ,
+ true , // ownmsg?
+ shardNum,
+ false, // send to whole shard?
+ key , // for selecting host in shard
+ mcast , // state
+ NULL , // state2
+ gotMulticastReplyWrapper ,
+ 999999 ) ) { // total timeout in seconds
+ log("import: import mcast had error: %s",mstrerror(g_errno));
+ m_numIn++;
  }
  goto INJECTLOOP;
@@ -1288,43 +1399,37 @@ bool ImportState::importLoop ( ) {
  return true;
  }
- void gotMsg7ReplyWrapper ( void *state ) {
- Msg7 *msg7 = (Msg7 *)state;
- msg7->gotMsg7Reply();
- ImportState *is = msg7->m_importState;
- if ( ! is->importLoop() ) return;
- log("inject: import is done");
- mdelete ( is, sizeof(ImportState) , "impstate");
- delete (is);
- }
- void Msg7::gotMsg7Reply ( ) {
- if ( m_inUse ) { char *xx=NULL;*xx=0; }
- ImportState *is = m_importState;
+ void gotMulticastReplyWrapper ( void *state , void *state2 ) {
+ Multicast *mcast = (Multicast *)state;
+ //msg7->gotMsg7Reply();
+ ImportState *is = mcast->m_importState;
  is->m_numIn++;
  log("import: imported %lli docs (off=%lli)",
  is->m_numIn,is->m_fileOffset);
- // if we were the least far ahead of scanning the files
- // then save our position in case server crashes so we can
- // resume
- //is->saveFileBookMark ( this );
- }
+ if ( ! is->importLoop() ) return;
+ // we will be called again when this multicast reply comes in...
+ if ( is->m_numIn < is->m_numOut ) return;
+ log("inject: import is done");
+ CollectionRec *cr = g_collectiondb.getRec ( is->m_collnum );
+ // signify to qa.cpp that we are done
+ if ( cr ) cr->m_importState = NULL;
+ mdelete ( is, sizeof(ImportState) , "impstate");
+ delete (is);
+ }
  // . return NULL with g_errno set on error
  // . importLoop() calls this to get a msg7 to inject a doc from the foreign
  // titledb file into our local collection
- Msg7 *ImportState::getAvailMsg7 ( ) {
+ Multicast *ImportState::getAvailMulticast() { // Msg7 ( ) {
  //static XmlDoc **s_ptrs = NULL;
@@ -1334,11 +1439,11 @@ Msg7 *ImportState::getAvailMsg7 ( ) {
  // each msg7 has an xmldoc doc in it
  if ( ! m_ptrs ) {
  long max = (long)MAXINJECTSOUT;
- m_ptrs=(Msg7 **)mcalloc(sizeof(Msg7 *)* max,"sxdp");
+ m_ptrs=(Multicast *)mcalloc(sizeof(Multicast)* max,"sxdp");
  if ( ! m_ptrs ) return NULL;
  m_numPtrs = max;//(long)MAXINJECTSOUT;
- //for ( long i = 0 ; i < MAXINJECTSOUT ;i++ )
- // m_ptrs[i].constructor();
+ for ( long i = 0 ; i < m_numPtrs ;i++ )
+ m_ptrs[i].constructor();
  }
  // respect the user limit for this coll
@@ -1351,24 +1456,11 @@ Msg7 *ImportState::getAvailMsg7 ( ) {
  // find one not in use and return it
  for ( long i = 0 ; i < m_numPtrs ; i++ ) {
  // point to it
- Msg7 *m7 = m_ptrs[i];
- // if NULL then init it and use it
- if ( ! m7 ) {
- try { m7 = new (Msg7); }
- catch ( ... ) {
- g_errno = ENOMEM;
- log("PageInject: new(%li): %s",
- (long)sizeof(Msg7),mstrerror(g_errno));
- return NULL;
- }
- mnew ( m7, sizeof(Msg7) , "dmsg7");
- // assign so we can delete later
- m_ptrs[i] = m7;
- }
- if ( m7->m_inUse ) continue;
- m7->m_inUse = true;
- m7->m_importState = this;
- return m7;
+ Multicast *mcast = &m_ptrs[i];
+ if ( mcast->m_inUse ) continue;
+ //m7->m_inUse = true;
+ mcast->m_importState = this;
+ return mcast;
  }
  // none avail
  g_errno = 0;
@@ -1376,6 +1468,7 @@ Msg7 *ImportState::getAvailMsg7 ( ) {
  }
  void saveImportStates ( ) {
+ if ( g_hostdb.m_myHost->m_hostId != 0 ) return;
  for ( long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
  CollectionRec *cr = g_collectiondb.m_recs[i];
  if ( ! cr ) continue;
@@ -1396,22 +1489,20 @@ void ImportState::saveFileBookMark ( ) { //Msg7 *msg7 ) {
  // if there is one outstanding the preceeded us, we can't update
  // the bookmark just yet.
  for ( long i = 0 ; i < m_numPtrs ; i++ ) {
- Msg7 *m7 = m_ptrs[i];
- // can be null if never used
- if ( ! m7 ) continue;
- if ( ! m7->m_inUse ) continue;
+ Multicast *mcast = &m_ptrs[i];
+ if ( ! mcast->m_inUse ) continue;
  if ( minOff == -1 ) {
- minOff = m7->m_hackFileOff;
- minFileId = m7->m_hackFileId;
+ minOff = mcast->m_hackFileOff;
+ minFileId = mcast->m_hackFileId;
  continue;
  }
- if ( m7->m_hackFileId > minFileId )
+ if ( mcast->m_hackFileId > minFileId )
  continue;
- if ( m7->m_hackFileId == minFileId &&
- m7->m_hackFileOff > minOff )
+ if ( mcast->m_hackFileId == minFileId &&
+ mcast->m_hackFileOff > minOff )
  continue;
- minOff = m7->m_hackFileOff;
- minFileId = m7->m_hackFileId;
+ minOff = mcast->m_hackFileOff;
+ minFileId = mcast->m_hackFileId;
  }
  char fname[256];
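The import path now serializes each foreign titlerec into a request whose first 4 bytes are the target collnum, followed by the titledb key and the record, and forwards it as message type 0x07 to the shard that owns the docid. A standalone sketch of just the request layout (std::vector in place of SafeBuf, a made-up 12-byte key struct standing in for key_t, and the dataSize-plus-body tail is an assumption, since the hunk only shows the collnum and key being copied):

    #include <vector>
    #include <cstdint>

    // Hypothetical stand-in for the 12-byte titledb key (key_t).
    struct TitleKey { char bytes[12]; };

    std::vector<char> buildImportRequest ( int32_t collnum ,
                                           const TitleKey &tkey ,
                                           const char *data , int32_t dataSize ) {
            std::vector<char> req;
            req.reserve ( 4 + sizeof(TitleKey) + 4 + dataSize );
            // collnum first 4 bytes, as in sbuf->pushLong((long)m_collnum)
            req.insert ( req.end() , (char *)&collnum , (char *)&collnum + 4 );
            // then the titledb key, as in sbuf->safeMemcpy(&tkey,sizeof(key_t))
            req.insert ( req.end() , tkey.bytes , tkey.bytes + sizeof(TitleKey) );
            // then the record size and the compressed titlerec body (assumed)
            req.insert ( req.end() , (char *)&dataSize , (char *)&dataSize + 4 );
            req.insert ( req.end() , data , data + dataSize );
            return req;
    }

On the receiving side, handleRequest7() peels the 4-byte collnum off the front and hands the remaining bytes to XmlDoc::set2(), which is why the sender's "need" computation accounts for those extra 4 bytes.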


@@ -1,6 +1,8 @@
  #ifndef GBINJECT_H
  #define GBINJECT_H
+ void handleRequest7 ( class UdpSlot *slot , long netnice ) ;
  bool sendPageInject ( class TcpSocket *s, class HttpRequest *hr );
  bool resumeImports ( ) ;
@@ -36,8 +38,8 @@ public:
  void *m_state;
  void (* m_callback )(void *state);
- long long m_hackFileOff;
- long m_hackFileId;
+ //long long m_hackFileOff;
+ //long m_hackFileId;
  //long m_crawlbotAPI;
@@ -63,9 +65,9 @@ public:
  void (*callback)(void *state) );
- bool injectTitleRec ( void *state ,
- void (*callback)(void *state) ,
- class CollectionRec *cr );
+ //bool injectTitleRec ( void *state ,
+ // void (*callback)(void *state) ,
+ // class CollectionRec *cr );
  void gotMsg7Reply ();


@@ -171,8 +171,58 @@ void doneReindexing ( void *state ) {
  //
  /////
+ HttpRequest *hr = &gr->m_hr;
+ char format = hr->getReplyFormat();
  SafeBuf sb;
+ char *ct = "text/html";
+ if ( format == FORMAT_JSON ) ct = "application/json";
+ if ( format == FORMAT_XML ) ct = "text/xml";
+ if ( format == FORMAT_XML ) {
+ sb.safePrintf("<response>\n"
+ "\t<statusCode>0</statusCode>\n"
+ "\t<statusMsg>Success</statusMsg>\n"
+ "\t<matchingResults>%li</matchingResults>\n"
+ "</response>"
+ , st->m_msg1c.m_numDocIdsAdded
+ );
+ g_httpServer.sendDynamicPage ( gr->m_socket,
+ sb.getBufStart(),
+ sb.length(),
+ -1,
+ false,ct);
+ mdelete ( st , sizeof(State13) , "PageTagdb" );
+ delete (st);
+ return;
+ }
+ if ( format == FORMAT_JSON ) {
+ sb.safePrintf("{\"response\":{\n"
+ "\t\"statusCode\":0,\n"
+ "\t\"statusMsg\":\"Success\",\n"
+ "\t\"matchingResults\":%li\n"
+ "}\n"
+ "}\n"
+ , st->m_msg1c.m_numDocIdsAdded
+ );
+ g_httpServer.sendDynamicPage ( gr->m_socket,
+ sb.getBufStart(),
+ sb.length(),
+ -1,
+ false,ct);
+ mdelete ( st , sizeof(State13) , "PageTagdb" );
+ delete (st);
+ return;
+ }
  g_pages.printAdminTop ( &sb , gr->m_socket , &gr->m_hr );
  sb.safePrintf("<style>"


@ -38,6 +38,9 @@ static void gotState ( void *state ) ;
static bool gotResults ( void *state ) ; static bool gotResults ( void *state ) ;
bool replaceParm ( char *cgi , SafeBuf *newUrl , HttpRequest *hr ) ; bool replaceParm ( char *cgi , SafeBuf *newUrl , HttpRequest *hr ) ;
bool replaceParm2 ( char *cgi , SafeBuf *newUrl ,
char *oldUrl , long oldUrlLen ) ;
bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) ; bool printCSVHeaderRow ( SafeBuf *sb , State0 *st ) ;
@ -48,6 +51,8 @@ bool printPairScore ( SafeBuf *sb , SearchInput *si , PairScore *ps ,
bool printScoresHeader ( SafeBuf *sb ) ; bool printScoresHeader ( SafeBuf *sb ) ;
bool printMetaContent ( Msg40 *msg40 , long i ,State0 *st, SafeBuf *sb );
bool printSingleScore ( SafeBuf *sb , SearchInput *si , SingleScore *ss , bool printSingleScore ( SafeBuf *sb , SearchInput *si , SingleScore *ss ,
Msg20Reply *mr , Msg40 *msg40 ) ; Msg20Reply *mr , Msg40 *msg40 ) ;
@ -2275,6 +2280,18 @@ bool printSearchResultsHeader ( State0 *st ) {
} }
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t<numResultsOmitted>%li"
"</numResultsOmitted>\n",
msg40->m_omitCount);
if ( si->m_format == FORMAT_JSON )
sb->safePrintf("\"numResultsOmitted\":%li,\n",
msg40->m_omitCount);
//bool xml = si->m_xml; //bool xml = si->m_xml;
@ -2531,7 +2548,8 @@ bool printSearchResultsHeader ( State0 *st ) {
Query qq3; Query qq3;
Query *qq2; Query *qq2;
bool firstIgnored; bool firstIgnored;
bool isAdmin = si->m_isRootAdmin; //bool isAdmin = si->m_isRootAdmin;
bool isAdmin = (si->m_isRootAdmin || si->m_isCollAdmin);
if ( si->m_format != FORMAT_HTML ) isAdmin = false; if ( si->m_format != FORMAT_HTML ) isAdmin = false;
// otherwise, we had no error // otherwise, we had no error
@ -3012,6 +3030,45 @@ bool printSearchResultsTail ( State0 *st ) {
args.safePrintf("&sites=%s",si->m_sites); args.safePrintf("&sites=%s",si->m_sites);
if ( si->m_format == FORMAT_HTML &&
msg40->m_omitCount ) { // && firstNum == 0 ) {
// . add our cgi to the original url
// . so if it has &qlang=de and they select &qlang=en
// we have to replace it... etc.
SafeBuf newUrl;
// show banned results
replaceParm2 ("sb=1",
&newUrl,
hr->m_origUrlRequest,
hr->m_origUrlRequestLen );
// no deduping by summary or content hash etc.
SafeBuf newUrl2;
replaceParm2("dr=0",&newUrl2,newUrl.getBufStart(),
newUrl.length());
// and no site clustering
SafeBuf newUrl3;
replaceParm2 ( "sc=0", &newUrl3 , newUrl2.getBufStart(),
newUrl2.length());
// start at results #0 again
SafeBuf newUrl4;
replaceParm2 ( "s=0", &newUrl4 , newUrl3.getBufStart(),
newUrl3.length());
sb->safePrintf("<center>"
"<i>"
"%li results were omitted because they "
"were considered duplicates, banned, <br>"
"or "
"from the same site as other results. "
"<a href=%s>Click here to show all results</a>."
"</i>"
"</center>"
"<br><br>"
, msg40->m_omitCount
, newUrl4.getBufStart() );
}
if ( firstNum > 0 && if ( firstNum > 0 &&
(si->m_format == FORMAT_HTML || (si->m_format == FORMAT_HTML ||
si->m_format == FORMAT_WIDGET_IFRAME //|| si->m_format == FORMAT_WIDGET_IFRAME //||
@ -3075,7 +3132,9 @@ bool printSearchResultsTail ( State0 *st ) {
// print try this search on... // print try this search on...
// an additional <br> if we had a Next or Prev results link // an additional <br> if we had a Next or Prev results link
if ( sb->length() > remember ) sb->safeMemcpy ("<br>" , 4 ); if ( sb->length() > remember &&
si->m_format == FORMAT_HTML )
sb->safeMemcpy ("<br>" , 4 );
// //
// END PRINT PREV 10 NEXT 10 links! // END PRINT PREV 10 NEXT 10 links!
@ -3107,7 +3166,7 @@ bool printSearchResultsTail ( State0 *st ) {
sb->safePrintf("<input name=c type=hidden value=\"%s\">",coll); sb->safePrintf("<input name=c type=hidden value=\"%s\">",coll);
} }
bool isAdmin = si->m_isRootAdmin; bool isAdmin = (si->m_isRootAdmin || si->m_isCollAdmin);
if ( si->m_format != FORMAT_HTML ) isAdmin = false; if ( si->m_format != FORMAT_HTML ) isAdmin = false;
if ( isAdmin && banSites.length() > 0 ) if ( isAdmin && banSites.length() > 0 )
@ -3554,6 +3613,12 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
long long d = msg40->getDocId(ix); long long d = msg40->getDocId(ix);
// do not print if it is a summary dup or had some error
// long level = (long)msg40->getClusterLevel(ix);
// if ( level != CR_OK &&
// level != CR_INDENT )
// return true;
if ( si->m_docIdsOnly ) { if ( si->m_docIdsOnly ) {
@ -3618,7 +3683,9 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
} }
// just print cached web page? // just print cached web page?
if ( mr->ptr_content ) { if ( mr->ptr_content &&
si->m_format == FORMAT_JSON &&
strstr(mr->ptr_ubuf,"-diffbotxyz") ) {
// for json items separate with \n,\n // for json items separate with \n,\n
if ( si->m_format != FORMAT_HTML && *numPrintedSoFar > 0 ) if ( si->m_format != FORMAT_HTML && *numPrintedSoFar > 0 )
@ -3627,8 +3694,11 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// a dud? just print empty {}'s // a dud? just print empty {}'s
if ( mr->size_content == 1 ) if ( mr->size_content == 1 )
sb->safePrintf("{}"); sb->safePrintf("{}");
// if it's a diffbot object just print it out directly
// into the json. it is already json.
else else
sb->safeStrcpy ( mr->ptr_content ); sb->safeStrcpy ( mr->ptr_content );
// . let's hack the spidertime onto the end // . let's hack the spidertime onto the end
// . so when we sort by that using gbsortby:spiderdate // . so when we sort by that using gbsortby:spiderdate
@ -3682,6 +3752,27 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
} }
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t<result>\n" );
if ( si->m_format == FORMAT_JSON ) {
if ( *numPrintedSoFar != 0 ) sb->safePrintf(",\n");
sb->safePrintf("\t{\n" );
}
if ( mr->ptr_content && si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<content><![CDATA[" );
sb->cdataEncode ( mr->ptr_content );
sb->safePrintf("]]></content>\n");
}
if ( mr->ptr_content && si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"content\":\"" );
sb->jsonEncode ( mr->ptr_content );
sb->safePrintf("\",\n");
}
Highlight hi; Highlight hi;
// get the url // get the url
@ -3703,7 +3794,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// indent it if level is 2 // indent it if level is 2
bool indent = false; bool indent = false;
bool isAdmin = si->m_isRootAdmin; bool isAdmin = (si->m_isRootAdmin || si->m_isCollAdmin);
if ( si->m_format == FORMAT_XML ) isAdmin = false; if ( si->m_format == FORMAT_XML ) isAdmin = false;
//unsigned long long lastSiteHash = siteHash; //unsigned long long lastSiteHash = siteHash;
@ -3747,15 +3838,6 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
return true; return true;
} }
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t<result>\n" );
if ( si->m_format == FORMAT_JSON ) {
if ( *numPrintedSoFar != 0 ) sb->safePrintf(",\n");
sb->safePrintf("\t{\n" );
}
// the score if admin // the score if admin
/* /*
if ( isAdmin ) { if ( isAdmin ) {
@ -4354,10 +4436,21 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
} }
// new line if not xml // new line if not xml. even summary is empty we need it too like
if ( si->m_format == FORMAT_HTML && strLen ) // when showing xml docs - MDW 9/28/2014
if ( si->m_format == FORMAT_HTML ) // && strLen )
sb->safePrintf("<br>\n"); sb->safePrintf("<br>\n");
/////////
//
// meta tag values for &dt=keywords ...
//
/////////
if ( mr->ptr_dbuf && mr->size_dbuf>1 )
printMetaContent ( msg40 , ix,st,sb);
//////////// ////////////
// //
// . print DMOZ topics under the summary // . print DMOZ topics under the summary
@ -4678,7 +4771,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( isAdmin && si->m_format == FORMAT_HTML ) { if ( si->m_format == FORMAT_HTML ) {
long lang = mr->m_language; long lang = mr->m_language;
if ( lang ) sb->safePrintf(" - %s",getLanguageString(lang)); if ( lang ) sb->safePrintf(" - %s",getLanguageString(lang));
uint16_t cc = mr->m_computedCountry; uint16_t cc = mr->m_computedCountry;
@ -4826,7 +4919,8 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
"urls="); "urls=");
sb->urlEncode ( url , gbstrlen(url) , false ); sb->urlEncode ( url , gbstrlen(url) , false );
unsigned long long rand64 = gettimeofdayInMillisecondsLocal(); unsigned long long rand64 = gettimeofdayInMillisecondsLocal();
sb->safePrintf("&rand64=%llu\">respider</a>\n",rand64); sb->safePrintf("&c=%s&rand64=%llu\">respider</a>\n",
coll,rand64);
} }
if ( si->m_format == FORMAT_HTML ) { if ( si->m_format == FORMAT_HTML ) {
@ -4955,6 +5049,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
coll );//, dbuf ); coll );//, dbuf );
} }
if ( si->m_format == FORMAT_HTML && ( isAdmin || cr->m_isCustomCrawl)){ if ( si->m_format == FORMAT_HTML && ( isAdmin || cr->m_isCustomCrawl)){
char *un = ""; char *un = "";
long banVal = 1; long banVal = 1;
@ -6308,7 +6403,7 @@ bool printScoresHeader ( SafeBuf *sb ) {
"<td>spam</td>" "<td>spam</td>"
"<td>inlnkPR</td>" // nlinkSiteRank</td>" "<td>inlnkPR</td>" // nlinkSiteRank</td>"
"<td>termFreq</td>" "<td>termFreq</td>"
"</tr>" "</tr>\n"
); );
return true; return true;
} }
@ -6532,9 +6627,9 @@ bool printSingleScore ( SafeBuf *sb ,
sb->safePrintf("<tr>" sb->safePrintf("<tr>"
"<td rowspan=2>%.03f</td>" "<td rowspan=2>%.03f</td>\n"
"<td>%s <font color=orange>%.1f" "<td>%s <font color=orange>%.1f"
"</font></td>" "</font></td\n>"
// wordpos // wordpos
"<td>" "<td>"
"<a href=\"/get?d=" "<a href=\"/get?d="
@ -6548,17 +6643,17 @@ bool printSingleScore ( SafeBuf *sb ,
"hipos=%li&c=%s#hipos\">" "hipos=%li&c=%s#hipos\">"
,(long)ss->m_wordPos ,(long)ss->m_wordPos
,si->m_cr->m_coll); ,si->m_cr->m_coll);
sb->safePrintf("%li</a></td>" sb->safePrintf("%li</a></td>\n"
"<td>%s <font color=blue>%.1f" "<td>%s <font color=blue>%.1f"
"</font></td>" // syn "</font></td>\n" // syn
// wikibigram?/weight // wikibigram?/weight
"<td>%s <font color=green>%.02f</font></td>" "<td>%s <font color=green>%.02f</font></td>\n"
//"<td>%li/<font color=green>%f" //"<td>%li/<font color=green>%f"
//"</font></td>" // diversity //"</font></td>" // diversity
"<td>%li <font color=purple>" "<td>%li <font color=purple>"
"%.02f</font></td>" // density "%.02f</font></td>\n" // density
, (long)ss->m_wordPos , (long)ss->m_wordPos
, syn , syn
, sw // synonym weight , sw // synonym weight
@ -6572,7 +6667,7 @@ bool printSingleScore ( SafeBuf *sb ,
if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) { if ( ss->m_hashGroup == HASHGROUP_INLINKTEXT ) {
sb->safePrintf("<td>&nbsp;</td>" sb->safePrintf("<td>&nbsp;</td>"
"<td>%li <font color=red>%.02f" "<td>%li <font color=red>%.02f"
"</font></td>" // wordspam "</font></td>\n" // wordspam
, (long)ss->m_wordSpamRank , (long)ss->m_wordSpamRank
, wsw , wsw
); );
@ -6580,7 +6675,7 @@ bool printSingleScore ( SafeBuf *sb ,
else { else {
sb->safePrintf("<td>%li <font color=red>%.02f" sb->safePrintf("<td>%li <font color=red>%.02f"
"</font></td>" // wordspam "</font></td>" // wordspam
"<td>&nbsp;</td>" "<td>&nbsp;</td>\n"
, (long)ss->m_wordSpamRank , (long)ss->m_wordSpamRank
, wsw , wsw
); );
@ -6588,8 +6683,8 @@ bool printSingleScore ( SafeBuf *sb ,
} }
sb->safePrintf("<td id=tf>%lli <font color=magenta>" sb->safePrintf("<td id=tf>%lli <font color=magenta>"
"%.02f</font></td>" // termfreq "%.02f</font></td>\n" // termfreq
"</tr>" "</tr>\n"
, tf , tf
, tfw , tfw
); );
@ -6624,7 +6719,7 @@ bool printSingleScore ( SafeBuf *sb ,
"<font color=magenta>%.02f</font>" "<font color=magenta>%.02f</font>"
//" / ( 3.0 )" //" / ( 3.0 )"
// end formula // end formula
"</td></tr>" "</td></tr>\n"
, ss->m_finalScore , ss->m_finalScore
//, (long)MAXWORDPOS+1 //, (long)MAXWORDPOS+1
, hgw , hgw
@ -7298,6 +7393,11 @@ bool printLogoAndSearchBox ( SafeBuf *sb , HttpRequest *hr , long catId ,
long qlen; long qlen;
char *qstr = hr->getString("q",&qlen,"",NULL); char *qstr = hr->getString("q",&qlen,"",NULL);
sb->htmlEncode ( qstr , qlen , false ); sb->htmlEncode ( qstr , qlen , false );
// if it was an advanced search, this can be empty
if ( qlen == 0 && si->m_displayQuery )
sb->htmlEncode ( si->m_displayQuery );
sb->safePrintf ("\">" sb->safePrintf ("\">"
//"<input type=submit value=\"Search\" border=0>" //"<input type=submit value=\"Search\" border=0>"
@ -8677,40 +8777,66 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
n++; n++;
// family filter
s_mi[n].m_menuNum = 8;
s_mi[n].m_title = "Family Filter Off";
s_mi[n].m_cgi = "ff=0";
s_mi[n].m_icon = NULL;
n++;
s_mi[n].m_menuNum = 8;
s_mi[n].m_title = "Family Filter On";
s_mi[n].m_cgi = "ff=1";
s_mi[n].m_icon = NULL;
n++;
// META TAGS
s_mi[n].m_menuNum = 9;
s_mi[n].m_title = "No Meta Tags";
s_mi[n].m_cgi = "dt=";
s_mi[n].m_icon = NULL;
n++;
s_mi[n].m_menuNum = 9;
s_mi[n].m_title = "Show Meta Tags";
s_mi[n].m_cgi = "dt=keywords+description";
s_mi[n].m_icon = NULL;
n++;
// ADMIN // ADMIN
s_mi[n].m_menuNum = 8; s_mi[n].m_menuNum = 10;
s_mi[n].m_title = "Show Admin View"; s_mi[n].m_title = "Show Admin View";
s_mi[n].m_cgi = "admin=1"; s_mi[n].m_cgi = "admin=1";
s_mi[n].m_icon = NULL; s_mi[n].m_icon = NULL;
n++; n++;
s_mi[n].m_menuNum = 8; s_mi[n].m_menuNum = 10;
s_mi[n].m_title = "Show User View"; s_mi[n].m_title = "Show User View";
s_mi[n].m_cgi = "admin=0"; s_mi[n].m_cgi = "admin=0";
s_mi[n].m_icon = NULL; s_mi[n].m_icon = NULL;
n++; n++;
s_mi[n].m_menuNum = 9; s_mi[n].m_menuNum = 11;
s_mi[n].m_title = "Action"; s_mi[n].m_title = "Action";
s_mi[n].m_cgi = ""; s_mi[n].m_cgi = "";
s_mi[n].m_icon = NULL; s_mi[n].m_icon = NULL;
n++; n++;
s_mi[n].m_menuNum = 9; s_mi[n].m_menuNum = 11;
s_mi[n].m_title = "Respider all results"; s_mi[n].m_title = "Respider all results";
s_mi[n].m_cgi = "/admin/reindex"; s_mi[n].m_cgi = "/admin/reindex";
s_mi[n].m_icon = NULL; s_mi[n].m_icon = NULL;
n++; n++;
s_mi[n].m_menuNum = 9; s_mi[n].m_menuNum = 11;
s_mi[n].m_title = "Delete all results"; s_mi[n].m_title = "Delete all results";
s_mi[n].m_cgi = "/admin/reindex"; s_mi[n].m_cgi = "/admin/reindex";
s_mi[n].m_icon = NULL; s_mi[n].m_icon = NULL;
n++; n++;
s_mi[n].m_menuNum = 9; s_mi[n].m_menuNum = 11;
s_mi[n].m_title = "Scrape from google/bing"; s_mi[n].m_title = "Scrape from google/bing";
s_mi[n].m_cgi = "/admin/inject"; s_mi[n].m_cgi = "/admin/inject";
s_mi[n].m_icon = NULL; s_mi[n].m_icon = NULL;
@ -8729,10 +8855,12 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
for ( long i = 0 ; i <= s_mi[s_num-1].m_menuNum ; i++ ) { for ( long i = 0 ; i <= s_mi[s_num-1].m_menuNum ; i++ ) {
// after menus 4 and 8 make a new line // after menus 4 and 8 make a new line
if ( i == 5 ) sb->safePrintf("<br><br>"); if ( i == 5 ) sb->safePrintf("<br><br>");
if ( i == 9 ) sb->safePrintf("<br><br>");
printMenu ( sb , i , hr ); printMenu ( sb , i , hr );
} }
sb->safePrintf("</div>\n"); sb->safePrintf("</div>\n");
sb->safePrintf("<br>\n");
return true; return true;
} }
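
For reference, the filters bar above is driven by a flat s_mi[] table in which each entry carries an m_menuNum, and the loop shown in this hunk walks the menu numbers in order, breaking to a new row after certain groups. Below is a minimal standalone sketch of that grouping pattern using only the standard library; the MenuItem struct and the sample rows are illustrative stand-ins, not the real Gigablast types.

#include <cstdio>

// illustrative stand-in for the s_mi[] table entries
struct MenuItem {
    int         menuNum; // which drop-down this item belongs to
    const char *title;   // label shown in the menu
    const char *cgi;     // cgi string appended to the search url
};

static MenuItem s_items[] = {
    { 8, "Family Filter Off", "ff=0" },
    { 8, "Family Filter On" , "ff=1" },
    { 9, "No Meta Tags"     , "dt="  },
    { 9, "Show Meta Tags"   , "dt=keywords+description" },
};

int main() {
    int n = (int)(sizeof(s_items)/sizeof(s_items[0]));
    int maxMenu = s_items[n-1].menuNum;
    // one pass per menu number, like printSearchFiltersBar() does
    for ( int m = 0 ; m <= maxMenu ; m++ ) {
        bool printed = false;
        for ( int i = 0 ; i < n ; i++ ) {
            if ( s_items[i].menuNum != m ) continue;
            printf("menu %d: %s (%s)\n", m, s_items[i].title, s_items[i].cgi);
            printed = true;
        }
        if ( printed ) printf("----\n"); // group separator, like <br><br>
    }
    return 0;
}
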
@ -8829,6 +8957,9 @@ bool printMenu ( SafeBuf *sb , long menuNum , HttpRequest *hr ) {
//" onmouseout=\"" //" onmouseout=\""
//"this.style.display='none';\"" //"this.style.display='none';\""
// if clicking on scrollbar do not hide menu!
" onmousedown=\"inmenuclick=1;\" "
">" ">"
, mi->m_menuNum , mi->m_menuNum
); );
@ -8923,6 +9054,7 @@ bool printMenu ( SafeBuf *sb , long menuNum , HttpRequest *hr ) {
, frontTag , frontTag
, first->m_title , first->m_title
, backTag , backTag
// print triangle
,0xe2 ,0xe2
,0x96 ,0x96
,0xbc ,0xbc
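
The three bytes 0xe2 0x96 0xbc fed to the %c%c%c format above are the UTF-8 encoding of U+25BC, the black down-pointing triangle used as the drop-down arrow. A tiny self-contained sketch of the same trick:

#include <cstdio>

int main() {
    // U+25BC "black down-pointing triangle" is E2 96 BC in UTF-8;
    // printing the three raw bytes with %c%c%c, as the code above does,
    // renders the arrow on a UTF-8 terminal or web page.
    printf("menu title %c%c%c\n", 0xe2, 0x96, 0xbc);
    return 0;
}
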
@ -8937,6 +9069,15 @@ bool replaceParm ( char *cgi , SafeBuf *newUrl , HttpRequest *hr ) {
// get original request url. this is not \0 terminated // get original request url. this is not \0 terminated
char *src = hr->m_origUrlRequest; char *src = hr->m_origUrlRequest;
long srcLen = hr->m_origUrlRequestLen; long srcLen = hr->m_origUrlRequestLen;
return replaceParm2 ( cgi ,newUrl, src, srcLen );
}
bool replaceParm2 ( char *cgi , SafeBuf *newUrl ,
char *oldUrl , long oldUrlLen ) {
char *src = oldUrl;
long srcLen = oldUrlLen;
char *srcEnd = src + srcLen; char *srcEnd = src + srcLen;
char *equal = strstr(cgi,"="); char *equal = strstr(cgi,"=");
@ -8985,3 +9126,90 @@ bool replaceParm ( char *cgi , SafeBuf *newUrl , HttpRequest *hr ) {
if ( ! newUrl->nullTerm() ) return false; if ( ! newUrl->nullTerm() ) return false;
return true; return true;
} }
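
replaceParm2() above rewrites one cgi parameter (for example "ff=1") inside a previously built URL, either swapping out the existing value or appending the parameter with the right delimiter. Below is a minimal standalone sketch of that idea with std::string, assuming a plain "?a=b&c=d" query and ignoring URL-encoding edge cases; the function name and sample URLs are illustrative only.

#include <string>
#include <cstdio>

// replace "name=value" in url if the name is present, otherwise append it
static std::string replaceParmSketch(const std::string &url,
                                     const std::string &cgi) {
    std::string::size_type eq = cgi.find('=');
    if (eq == std::string::npos) return url;       // need "name=value"
    std::string name = cgi.substr(0, eq + 1);      // e.g. "ff="
    // look for "?ff=" or "&ff="
    std::string::size_type pos = url.find("?" + name);
    if (pos == std::string::npos) pos = url.find("&" + name);
    if (pos == std::string::npos) {
        // not present: append with the right delimiter
        char delim = (url.find('?') == std::string::npos) ? '?' : '&';
        return url + delim + cgi;
    }
    // present: keep the delimiter, drop the old value up to the next '&'
    std::string::size_type valStart = pos + 1 + name.size();
    std::string::size_type valEnd = url.find('&', valStart);
    if (valEnd == std::string::npos) valEnd = url.size();
    return url.substr(0, pos + 1) + cgi + url.substr(valEnd);
}

int main() {
    printf("%s\n", replaceParmSketch("/search?q=test&ff=0", "ff=1").c_str());
    printf("%s\n", replaceParmSketch("/search?q=test", "ff=1").c_str());
    return 0;
}
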
bool printMetaContent ( Msg40 *msg40 , long i , State0 *st, SafeBuf *sb ) {
// store the user-requested meta tags content
SearchInput *si = &st->m_si;
char *pp = si->m_displayMetas;
char *ppend = pp + gbstrlen(si->m_displayMetas);
Msg20 *m = msg40->m_msg20[i];//getMsg20(i);
Msg20Reply *mr = m->m_r;
char *dbuf = mr->ptr_dbuf;//msg40->getDisplayBuf(i);
long dbufLen = mr->size_dbuf-1;//msg40->getDisplayBufLen(i);
char *dbufEnd = dbuf + (dbufLen-1);
char *dptr = dbuf;
//bool printedSomething = false;
// loop over the names of the requested meta tags
while ( pp < ppend && dptr < dbufEnd ) {
// . ensure last byte of dbuf is \0
// provided dbufLen > 0
// . this ensures sprintf and gbstrlen won't
// crash on dbuf/dptr
if ( dbuf [ dbufLen ] != '\0' ) {
log(LOG_LOGIC,"query: Meta tag buffer has no \\0.");
break;
}
// skip initial spaces
while ( pp < ppend && is_wspace_a(*pp) ) pp++;
// break if done
if ( ! *pp ) break;
// that's the start of the meta tag name
char *ss = pp;
// . find end of that meta tag name
// . can end in :<integer> -- specifies max len
while ( pp < ppend && ! is_wspace_a(*pp) &&
*pp != ':' ) pp++;
// save current char
char c = *pp;
char *cp = pp;
// NULL terminate the name
*pp++ = '\0';
// if ':' was specified, skip the rest
if ( c == ':' ) while ( pp < ppend && ! is_wspace_a(*pp)) pp++;
// print the name
//long sslen = gbstrlen ( ss );
//long ddlen = gbstrlen ( dptr );
long ddlen = dbufLen;
//if ( p + sslen + ddlen + 100 > pend ) continue;
// newspaperarchive wants tags printed even if no value
// make sure the meta tag content isn't malformed
for ( long ti = 0; ti < ddlen; ti++ ) {
if ( dptr[ti] == '"' ||
dptr[ti] == '>' ||
dptr[ti] == '<' ||
dptr[ti] == '\r' ||
dptr[ti] == '\n' ||
dptr[ti] == '\0' ) {
ddlen = ti;
break;
}
}
if ( ddlen > 0 ) {
// ship it out
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf ( "\t\t<display name=\"%s\">"
"<![CDATA[", ss );
sb->cdataEncode ( dptr, ddlen );
sb->safePrintf ( "]]></display>\n" );
}
else if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf ( "\t\t\"display.%s\":\"",ss);
sb->jsonEncode ( dptr, ddlen );
sb->safePrintf ( "\",\n");
}
// otherwise, print as html with the tag name in color
else {
sb->safePrintf("<font color=#c62939>"
"<b>%s</b>: ", ss );
sb->safeMemcpy ( dptr, ddlen );
sb->safePrintf ( "</font><br>" );
}
}
// restore tag name buffer
*cp = c;
// point to next content of tag to display
dptr += ddlen + 1;
}
return true;
}
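
printMetaContent() above walks two buffers in lockstep: the requested tag names in si->m_displayMetas (whitespace separated, each optionally suffixed with ":<maxlen>") and the matching NUL-separated values in ptr_dbuf. Below is a minimal standalone sketch of that two-cursor walk, with the buffer layout inferred from the code above; the sample strings are made up for illustration.

#include <cstdio>
#include <cstring>
#include <cctype>

int main() {
    // requested meta tag names, whitespace separated, optional ":<maxlen>"
    const char *names = "keywords description:32";
    // one NUL-terminated value per requested name, back to back
    const char  vals[] = "foo,bar\0a short description\0";
    const char *dptr = vals;
    const char *p = names;
    while (*p) {
        while (*p && isspace((unsigned char)*p)) p++;     // skip spaces
        if (!*p) break;
        const char *start = p;                            // tag name start
        while (*p && !isspace((unsigned char)*p) && *p != ':') p++;
        int nameLen = (int)(p - start);
        if (*p == ':')                                    // skip ":<maxlen>"
            while (*p && !isspace((unsigned char)*p)) p++;
        printf("%.*s: %s\n", nameLen, start, dptr);       // name -> value
        dptr += strlen(dptr) + 1;                         // next NUL chunk
    }
    return 0;
}
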

View File

@ -654,10 +654,10 @@ bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
{"SEARCH","/"}, {"SEARCH","/"},
// {"IMAGES","/?searchtype=images"}, {"DISCUSSIONS","/?searchtype=discussions"},
// {"PRODUCTS","/?searchtype=products"}, {"PRODUCTS","/?searchtype=products"},
// {"ARTICLES","/?searchtype=articles"}, {"ARTICLES","/?searchtype=articles"},
// {"DISCUSSIONS","/?searchtype=discussions"}, {"IMAGES","/?searchtype=images"},
{"DIRECTORY","/Top"}, {"DIRECTORY","/Top"},
{"ADVANCED","/adv.html"}, {"ADVANCED","/adv.html"},
@ -679,7 +679,7 @@ bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
// first the nav column // first the nav column
// //
sb->safePrintf( sb->safePrintf(
"<TD bgcolor=#f3c714 " // yellow/gold "<TD bgcolor=#%s " // f3c714 " // yellow/gold
"valign=top " "valign=top "
"style=\"width:210px;" "style=\"width:210px;"
"border-right:3px solid blue;" "border-right:3px solid blue;"
@ -699,6 +699,7 @@ bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
"width:100px;" "width:100px;"
"height:100px;" "height:100px;"
"\">" "\">"
, GOLD
, coll , coll
); );
@ -707,7 +708,8 @@ bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
"height=57 src=/computer2.png>"); "height=57 src=/computer2.png>");
else else
sb->safePrintf("<br style=line-height:10px;>" sb->safePrintf("<br style=line-height:10px;>"
"<img width=54 height=79 src=/rocket.jpg>" "<img border=0 "
"width=54 height=79 src=/rocket.jpg>"
); );
sb->safePrintf ( "</div>" sb->safePrintf ( "</div>"
@ -725,6 +727,10 @@ bool printLeftColumnRocketAndTabs ( SafeBuf *sb ,
if ( isSearchResultsPage && i >= 5 ) break; if ( isSearchResultsPage && i >= 5 ) break;
if ( i >= 1 && i <= 4 &&
cr->m_diffbotApiUrl.length()<= 0 )
continue;
char delim = '?'; char delim = '?';
if ( strstr ( mi[i].m_url,"?") ) delim = '&'; if ( strstr ( mi[i].m_url,"?") ) delim = '&';
@ -1042,14 +1048,16 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
// put search box in a box // put search box in a box
sb.safePrintf("<div style=" sb.safePrintf("<div style="
"background-color:#fcc714;" "background-color:#%s;"//fcc714;"
"border-style:solid;" "border-style:solid;"
"border-width:3px;" "border-width:3px;"
"border-color:blue;" "border-color:blue;"
//"background-color:blue;" //"background-color:blue;"
"padding:20px;" "padding:20px;"
"border-radius:20px;" "border-radius:20px;"
">"); ">"
,GOLD
);
sb.safePrintf("<input name=q type=text " sb.safePrintf("<input name=q type=text "
@ -1113,8 +1121,79 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
sb.safePrintf("\n"); sb.safePrintf("\n");
} }
// always show the option to add Gigablast to their list of
// search engines in their browser
sb.safePrintf("<br>"
//"<br>"
"<script>\n"
"function addEngine() {\n"
"if (window.external && "
"('AddSearchProvider' in window.external)) {\n"
// Firefox 2 and IE 7, OpenSearch
"window.external.AddSearchProvider('http://"
"www.gigablast.com/searchbar.xml');\n"
"}\n"
"else if (window.sidebar && ('addSearchEngine' "
"in window.sidebar)) {\n"
// Firefox <= 1.5, Sherlock
"window.sidebar.addSearchEngine('http://"
"www.gigablast.com/searchbar.xml',"
//"example.com/search-plugin.src',"
"'http://www.gigablast.com/rocket.jpg'," //guru.png
"'Search Plugin', '');\n"
"}\n"
"else {"
// No search engine support (IE 6, Opera, etc).
"alert('No search engine support');\n"
"}\n"
// do not ask again if they tried to add it
// meta cookie should store this
//"document.getElementById('addedse').value='1';\n"
// NEVER ask again! permanent cookie
"document.cookie = 'didse=3';"
// make it invisible again
//"var e = document.getElementById('addse');\n"
//"e.style.display = 'none';\n"
"}\n"
"</script>\n"
"<center>"
"<a onclick='addEngine();' style="
"cursor:pointer;"
"cursor:hand;"
"color:blue;"
">"
"<img height=16 width=16 border=0 src=/rocket16.png>"
"<font color=#505050>"
"%c%c%c "
"</font>"
"&nbsp; "
"Add Gigablast to your browser's "
"search engines"
"</a>"
"</center>"
"<br>"
"<br>"
// print triangle
,0xe2
,0x96
,0xbc
);
// print any red boxes we might need to // print any red boxes we might need to
if ( printRedBox2 ( &sb , true ) ) if ( printRedBox2 ( &sb , sock , r ) ) // true ) )
sb.safePrintf("<br>\n"); sb.safePrintf("<br>\n");
/* /*
@ -1280,9 +1359,6 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
*/ */
/*
// //
// begin new stuff // begin new stuff
// //
@ -1352,6 +1428,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
sb.safePrintf("</div>"); sb.safePrintf("</div>");
/*
sb.safePrintf("<div class=grad style=\"border-radius:200px;border-color:blue;border-style:solid;border-width:3px;padding:12px;width:280px;height:280px;display:inline-block;z-index:105;color:black;margin-left:-50px;position:absolute;margin-top:50px;background-color:lightgray;\">"); sb.safePrintf("<div class=grad style=\"border-radius:200px;border-color:blue;border-style:solid;border-width:3px;padding:12px;width:280px;height:280px;display:inline-block;z-index:105;color:black;margin-left:-50px;position:absolute;margin-top:50px;background-color:lightgray;\">");
@ -1378,6 +1455,7 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
sb.safePrintf("</div>"); sb.safePrintf("</div>");
sb.safePrintf("</div>"); sb.safePrintf("</div>");
*/
sb.safePrintf("<div class=grad style=\"border-radius:300px;border-color:blue;border-style:solid;border-width:3px;padding:12px;width:240px;height:240px;display:inline-block;z-index:110;color:black;margin-left:-240px;position:absolute;margin-top:230px;background-color:lightgray;\">"); sb.safePrintf("<div class=grad style=\"border-radius:300px;border-color:blue;border-style:solid;border-width:3px;padding:12px;width:240px;height:240px;display:inline-block;z-index:110;color:black;margin-left:-240px;position:absolute;margin-top:230px;background-color:lightgray;\">");
@ -1399,16 +1477,49 @@ bool printWebHomePage ( SafeBuf &sb , HttpRequest *r , TcpSocket *sock ) {
sb.safePrintf("</div>"); sb.safePrintf("</div>");
//
// donate with paypal bubble
//
sb.safePrintf("<div class=grad style=\"border-radius:300px;border-color:blue;border-style:solid;border-width:3px;padding:12px;width:180px;height:180px;display:inline-block;z-index:120;color:black;margin-left:10px;position:absolute;margin-top:270px;background-color:lightgray;\">");
sb.safePrintf("<br>");
sb.safePrintf("<b>");
sb.safePrintf("<font style=font-size:18px;margin-left:40px;>");
sb.safePrintf("Contribute");
sb.safePrintf("</font>");
sb.safePrintf("<br>");
sb.safePrintf("<br>");
sb.safePrintf("</b>");
sb.safePrintf("<div style=margin-left:15px;margin-right:5px;>");
//sb.safePrintf("</TD></TR></TABLE></body></html>"); sb.safePrintf(
"Help Gigablast development with PayPal."
"<br>"
"<br>"
// BEGIN PAYPAL DONATE BUTTON
"<form action=\"https://www.paypal.com/cgi-bin/webscr\" method=\"post\" target=\"_top\">"
"<input type=\"hidden\" name=\"cmd\" value=\"_donations\">"
"<input type=\"hidden\" name=\"business\" value=\"2SFSFLUY3KS9Y\">"
"<input type=\"hidden\" name=\"lc\" value=\"US\">"
"<input type=\"hidden\" name=\"item_name\" value=\"Gigablast, Inc.\">"
"<input type=\"hidden\" name=\"currency_code\" value=\"USD\">"
"<input type=\"hidden\" name=\"bn\" value=\"PP-DonationsBF:btn_donateCC_LG.gif:NonHosted\">"
"<input type=\"image\" src=\"https://www.paypalobjects.com/en_US/i/btn/btn_donateCC_LG.gif\" border=\"0\" name=\"submit\" alt=\"PayPal - The safer, easier way to pay online!\" height=47 width=147>"
"<img alt=\"\" border=\"0\" src=\"https://www.paypalobjects.com/en_US/i/scr/pixel.gif\" width=\"1\" height=\"1\">"
"</form>"
// END PAYPAL BUTTON
"</center></div></center>"
//"</td>\n"
);
// //
// end new stuff // end new stuff
// //
*/
sb.safePrintf("\n"); sb.safePrintf("\n");
sb.safePrintf("\n"); sb.safePrintf("\n");
@ -1466,14 +1577,16 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
// put search box in a box // put search box in a box
sb.safePrintf("<div style=" sb.safePrintf("<div style="
"background-color:#fcc714;" "background-color:#%s;" // fcc714;"
"border-style:solid;" "border-style:solid;"
"border-width:3px;" "border-width:3px;"
"border-color:blue;" "border-color:blue;"
//"background-color:blue;" //"background-color:blue;"
"padding:20px;" "padding:20px;"
"border-radius:20px;" "border-radius:20px;"
">"); ">"
, GOLD
);
sb.safePrintf("<input name=urls type=text " sb.safePrintf("<input name=urls type=text "
@ -1637,14 +1750,16 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
// put search box in a box // put search box in a box
sb.safePrintf("<div style=" sb.safePrintf("<div style="
"background-color:#fcc714;" "background-color:#%s;" // fcc714;"
"border-style:solid;" "border-style:solid;"
"border-width:3px;" "border-width:3px;"
"border-color:blue;" "border-color:blue;"
//"background-color:blue;" //"background-color:blue;"
"padding:20px;" "padding:20px;"
"border-radius:20px;" "border-radius:20px;"
">"); ">"
,GOLD
);
sb.safePrintf("<input name=q type=text " sb.safePrintf("<input name=q type=text "
@ -2627,7 +2742,7 @@ void resetPageAddUrl ( ) {
s_htable.reset(); s_htable.reset();
} }
/*
bool sendPageAdvanced ( TcpSocket *sock , HttpRequest *hr ) { bool sendPageAdvanced ( TcpSocket *sock , HttpRequest *hr ) {
SafeBuf sb; SafeBuf sb;
@ -2802,7 +2917,7 @@ bool sendPageAdvanced ( TcpSocket *sock , HttpRequest *hr ) {
return true; return true;
} }
*/
bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) { bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
@ -2833,7 +2948,7 @@ bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
// yellow/gold bar // yellow/gold bar
"<tr>" "<tr>"
"<td colspan=2 bgcolor=#f3c714>" "<td colspan=2 bgcolor=#%s>" // f3c714>"
"<b>" "<b>"
"Basic Query Syntax" "Basic Query Syntax"
"</b>" "</b>"
@ -2946,6 +3061,7 @@ bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
// " </tr>" // " </tr>"
// "" // ""
// "" // ""
, GOLD
); );
@ -3002,7 +3118,7 @@ bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
"cellpadding=5 cellspacing=0 border=0>" "cellpadding=5 cellspacing=0 border=0>"
// yellow/gold bar // yellow/gold bar
"<tr>" "<tr>"
"<td colspan=2 bgcolor=#f3c714>" "<td colspan=2 bgcolor=#%s>"//f3c714>"
"<b>" "<b>"
"%s" "%s"
"</b>" "</b>"
@ -3014,6 +3130,7 @@ bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
"<th><font color=33dcff>" "<th><font color=33dcff>"
"Description</font></th>" "Description</font></th>"
"</tr>\n" "</tr>\n"
, GOLD
, g_fields[i].m_title , g_fields[i].m_title
); );
} }
@ -3055,7 +3172,7 @@ bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
// yellow/gold bar // yellow/gold bar
"<tr>" "<tr>"
"<td colspan=2 bgcolor=#f3c714>" "<td colspan=2 bgcolor=#%s>" // f3c714>"
"<b>" "<b>"
"Boolean Queries" "Boolean Queries"
"</b>" "</b>"
@ -3160,6 +3277,7 @@ bool sendPageHelp ( TcpSocket *sock , HttpRequest *hr ) {
//"</td></tr>" //"</td></tr>"
//"</table>" //"</table>"
//"<br>" //"<br>"
, GOLD
); );

View File

@ -983,6 +983,19 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
"<td colspan=2 class=hdrow>" "<td colspan=2 class=hdrow>"
"<center><b>Network</b></td></tr>\n" "<center><b>Network</b></td></tr>\n"
"<tr class=poo><td><b>http server "
"bytes downloaded</b>"
"</td><td>%llu</td></tr>\n"
"<tr class=poo><td><b>http server "
"bytes downloaded (uncompressed)</b>"
"</td><td>%llu</td></tr>\n"
"<tr class=poo><td><b>http server "
"compression ratio</b>"
"</td><td>%.02f</td></tr>\n"
"<tr class=poo><td><b>ip1 bytes/packets in</b>" "<tr class=poo><td><b>ip1 bytes/packets in</b>"
"</td><td>%llu / %llu</td></tr>\n" "</td><td>%llu / %llu</td></tr>\n"
@ -1007,6 +1020,11 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
, ,
TABLE_STYLE, TABLE_STYLE,
g_httpServer.m_bytesDownloaded,
g_httpServer.m_uncompressedBytes,
g_httpServer.getCompressionRatio(),
g_udpServer.m_eth0BytesIn, g_udpServer.m_eth0BytesIn,
g_udpServer.m_eth0PacketsIn, g_udpServer.m_eth0PacketsIn,
@ -1030,6 +1048,15 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
p.safePrintf ( p.safePrintf (
"\t<networkStats>\n" "\t<networkStats>\n"
"\t\t<httpServerBytesDownloaded>%llu"
"</httpServerBytesDownloaded>\n"
"\t\t<httpServerBytesDownloadedUncompressed>%llu"
"</httpServerBytesDownloadedUncompressed>\n"
"\t\t<httpServerCompressionRatio>%.02f"
"</httpServerCompressionRatio>\n"
"\t\t<ip1BytesIn>%llu</ip1BytesIn>\n" "\t\t<ip1BytesIn>%llu</ip1BytesIn>\n"
"\t\t<ip1PacketsIn>%llu</ip1PacketsIn>\n" "\t\t<ip1PacketsIn>%llu</ip1PacketsIn>\n"
@ -1053,6 +1080,12 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
"\t</networkStats>\n" "\t</networkStats>\n"
, ,
g_httpServer.m_bytesDownloaded,
g_httpServer.m_uncompressedBytes,
g_httpServer.getCompressionRatio(),
g_udpServer.m_eth0BytesIn, g_udpServer.m_eth0BytesIn,
g_udpServer.m_eth0PacketsIn, g_udpServer.m_eth0PacketsIn,
@ -1075,6 +1108,12 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
p.safePrintf ( p.safePrintf (
"\t\"networkStats\":{\n" "\t\"networkStats\":{\n"
"\t\t\"httpServerBytesDownloaded\":%llu,\n"
"\t\t\"httpServerBytesDownloadedUncompressed\""
":%llu,\n"
"\t\t\"httpServerCompressionRatio\":%.02f,\n"
"\t\t\"ip1BytesIn\":%llu,\n" "\t\t\"ip1BytesIn\":%llu,\n"
"\t\t\"ip1PacketsIn\":%llu,\n" "\t\t\"ip1PacketsIn\":%llu,\n"
@ -1098,6 +1137,12 @@ bool sendPageStats ( TcpSocket *s , HttpRequest *r ) {
"\t},\n" "\t},\n"
, ,
g_httpServer.m_bytesDownloaded,
g_httpServer.m_uncompressedBytes,
g_httpServer.getCompressionRatio(),
g_udpServer.m_eth0BytesIn, g_udpServer.m_eth0BytesIn,
g_udpServer.m_eth0PacketsIn, g_udpServer.m_eth0PacketsIn,
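
The stats hunks above expose g_httpServer.m_bytesDownloaded, m_uncompressedBytes and getCompressionRatio() in the HTML, XML and JSON renderings of the Network table. getCompressionRatio() itself is not part of this diff, so the sketch below is only an assumption of how such a ratio is usually derived from the two counters, guarding against a zero denominator.

#include <cstdio>

// assumed definition: uncompressed size divided by bytes on the wire,
// returning 0 before anything has been downloaded (not the real method)
static double compressionRatio(unsigned long long bytesDownloaded,
                               unsigned long long uncompressedBytes) {
    if (bytesDownloaded == 0) return 0.0;
    return (double)uncompressedBytes / (double)bytesDownloaded;
}

int main() {
    // e.g. 25 MB on the wire that inflated to 100 MB of page data
    printf("compression ratio = %.02f\n",
           compressionRatio(25ULL * 1024 * 1024, 100ULL * 1024 * 1024));
    return 0;
}
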

451
Pages.cpp
View File

@ -41,17 +41,20 @@ class WebPage {
// otherwise you'll get a malformed error when running // otherwise you'll get a malformed error when running
static long s_numPages = 0; static long s_numPages = 0;
static WebPage s_pages[] = { static WebPage s_pages[] = {
/*
// dummy pages // dummy pages
{ PAGE_NOHOSTLINKS , "nohostlinks", 0, "host links", 0, 0, { PAGE_NOHOSTLINKS , "nohostlinks", 0, "host links", 0, 0,
"dummy page - if set in the users row then host links will not be " "dummy page - if set in the users row then host links will not be "
" shown", " shown",
NULL, 0 ,NULL,NULL,PG_NOAPI}, NULL, 0 ,NULL,NULL,
PG_NOAPI},
{ PAGE_ADMIN , "colladmin", 0, "master=0", 0, 0, { PAGE_ADMIN , "colladmin", 0, "master=0", 0, 0,
"dummy page - if set in the users row then user will have master=0 and " "dummy page - if set in the users row then user will have master=0 and "
" collection links will be highlighted in red", " collection links will be highlighted in red",
NULL, 0 ,NULL,NULL,PG_NOAPI}, NULL, 0 ,NULL,NULL,
PG_NOAPI},
//{ PAGE_QUALITY , "quality", 0, "quality", 0, 0, //{ PAGE_QUALITY , "quality", 0, "quality", 0, 0,
// "dummy page - if set in the users row then \"Quality Control\"" // "dummy page - if set in the users row then \"Quality Control\""
@ -61,14 +64,19 @@ static WebPage s_pages[] = {
"dummy page - if set in the users row then page function is" "dummy page - if set in the users row then page function is"
" called directly and not through g_parms.setFromRequest", " called directly and not through g_parms.setFromRequest",
NULL, 0 ,NULL,NULL,PG_NOAPI}, NULL, 0 ,NULL,NULL,PG_NOAPI},
*/
// publicly accessible pages // publicly accessible pages
{ PAGE_ROOT , "index.html" , 0 , "root" , 0 , 0 , { PAGE_ROOT , "index.html" , 0 , "root" , 0 , 0 ,
"search page to query", "search page to query",
sendPageRoot , 0 ,NULL,NULL,PG_NOAPI}, sendPageRoot , 0 ,NULL,NULL,
PG_NOAPI},
{ PAGE_RESULTS , "search" , 0 , "search" , 0 , 0 , { PAGE_RESULTS , "search" , 0 , "search" , 0 , 0 ,
"search results page", "search results page",
sendPageResults, 0 ,NULL,NULL,0}, sendPageResults, 0 ,NULL,NULL,
0},
//{ PAGE_WIDGET , "widget" , 0 , "widget" , 0 , 0 , //{ PAGE_WIDGET , "widget" , 0 , "widget" , 0 , 0 ,
// "widget page", // "widget page",
// sendPageWidget, 0 ,NULL,NULL,PG_NOAPI}, // sendPageWidget, 0 ,NULL,NULL,PG_NOAPI},
@ -77,25 +85,33 @@ static WebPage s_pages[] = {
// api use PAGE_ADDURL2 which is /admin/addurl. so we set PG_NOAPI here // api use PAGE_ADDURL2 which is /admin/addurl. so we set PG_NOAPI here
{ PAGE_ADDURL , "addurl" , 0 , "add url" , 0 , 0 , { PAGE_ADDURL , "addurl" , 0 , "add url" , 0 , 0 ,
"Page where you can add url for spidering", "Page where you can add url for spidering",
sendPageAddUrl, 0 ,NULL,NULL,PG_NOAPI}, sendPageAddUrl, 0 ,NULL,NULL,
PG_NOAPI},
{ PAGE_GET , "get" , 0 , "get" , 0 , 0 , { PAGE_GET , "get" , 0 , "get" , 0 , 0 ,
//USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_CLIENT, //USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_CLIENT,
"gets cached web page", "gets cached web page",
sendPageGet , 0 ,NULL,NULL,0}, sendPageGet , 0 ,NULL,NULL,
0},
{ PAGE_LOGIN , "login" , 0 , "login" , 0 , 0 , { PAGE_LOGIN , "login" , 0 , "login" , 0 , 0 ,
//USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_SPAM | USER_CLIENT, //USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_SPAM | USER_CLIENT,
"login", "login",
sendPageLogin, 0 ,NULL,NULL,PG_NOAPI}, sendPageLogin, 0 ,NULL,NULL,
PG_NOAPI},
{ PAGE_DIRECTORY , "dir" , 0 , "directory" , 0 , 0 , { PAGE_DIRECTORY , "dir" , 0 , "directory" , 0 , 0 ,
//USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_CLIENT, //USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_CLIENT,
"directory", "directory",
// until api is ready, take this out of the menu // until api is ready, take this out of the menu
sendPageDirectory , 0 ,NULL,NULL,PG_NOAPI}, sendPageDirectory , 0 ,NULL,NULL,
PG_NOAPI},
{ PAGE_REPORTSPAM , "reportspam" , 0 , "report spam" , 0 , 0 , { PAGE_REPORTSPAM , "reportspam" , 0 , "report spam" , 0 , 0 ,
//USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_PROXY | USER_CLIENT, //USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_PROXY | USER_CLIENT
"report spam", "report spam",
sendPageReportSpam , 0 ,NULL,NULL,PG_NOAPI}, sendPageReportSpam , 0 ,NULL,NULL,PG_NOAPI},
//{ PAGE_WORDVECTOR, "vec" , 0 , "word vectors" , 0 , 1 , //{ PAGE_WORDVECTOR, "vec" , 0 , "word vectors" , 0 , 1 ,
// //USER_PUBLIC | USER_MASTER | USER_ADMIN , // //USER_PUBLIC | USER_MASTER | USER_ADMIN ,
// "word vectors", // "word vectors",
@ -103,115 +119,142 @@ static WebPage s_pages[] = {
// use post now for the "site list" which can be big // use post now for the "site list" which can be big
{ PAGE_BASIC_SETTINGS, "admin/settings", 0 , "settings",1, M_POST , { PAGE_BASIC_SETTINGS, "admin/settings", 0 , "settings",1, M_POST ,
"basic settings", sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI}, "basic settings", sendPageGeneric , 0 ,NULL,NULL,
PG_NOAPI|PG_COLLADMIN},
{ PAGE_BASIC_STATUS, "admin/status", 0 , "status",1, 0 , { PAGE_BASIC_STATUS, "admin/status", 0 , "status",1, 0 ,
"basic status", sendPageBasicStatus , 0 ,NULL,NULL,PG_STATUS}, "basic status", sendPageBasicStatus , 0 ,NULL,NULL,
PG_STATUS|PG_COLLADMIN},
//{ PAGE_BASIC_DIFFBOT, "admin/diffbot", 0 , "diffbot",1, 0 , //{ PAGE_BASIC_DIFFBOT, "admin/diffbot", 0 , "diffbot",1, 0 ,
// "Basic diffbot page.", sendPageBasicDiffbot , 0 ,NULL,NULL,PG_NOAPI}, // "Basic diffbot page.", sendPageBasicDiffbot , 0 ,
{ PAGE_BASIC_SECURITY, "admin/security", 0 , "security",1, 0 , //NULL,NULL,PG_NOAPI},
"basic security", sendPageGeneric , 0 ,NULL,NULL,0},
{ PAGE_BASIC_SECURITY,
"admin/collectionpasswords", 0,"collection passwords",0,0,
"passwords", sendPageGeneric , 0 ,NULL,NULL,
PG_COLLADMIN},
{ PAGE_BASIC_SEARCH, "", 0 , "search",1, 0 , { PAGE_BASIC_SEARCH, "", 0 , "search",1, 0 ,
"basic search", sendPageRoot , 0 ,NULL,NULL,PG_NOAPI}, "basic search", sendPageRoot , 0 ,NULL,NULL,
PG_NOAPI},
{ PAGE_HOSTS , "admin/hosts" , 0 , "hosts" , 0 , 0 , { PAGE_HOSTS , "admin/hosts" , 0 , "hosts" , 0 , 0 ,
//USER_MASTER | USER_PROXY, //USER_MASTER | USER_PROXY,
"hosts status", "hosts status", sendPageHosts , 0 ,NULL,NULL,
sendPageHosts , 0 ,NULL,NULL,PG_STATUS}, PG_STATUS|PG_ROOTADMIN},
{ PAGE_MASTER , "admin/master" , 0 , "master controls" , 1 , 0 , { PAGE_MASTER , "admin/master" , 0 , "master controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY , //USER_MASTER | USER_PROXY ,
"master controls", "master controls", sendPageGeneric , 0 ,NULL,NULL,
sendPageGeneric , 0 ,NULL,NULL,0}, PG_ROOTADMIN},
// use POST for html head/tail and page root html. might be large. // use POST for html head/tail and page root html. might be large.
{ PAGE_SEARCH , "admin/search" , 0 , "search controls" ,1,M_POST, { PAGE_SEARCH , "admin/search" , 0 , "search controls" ,1,M_POST,
//USER_ADMIN | USER_MASTER , //USER_ADMIN | USER_MASTER ,
"search controls", "search controls", sendPageGeneric , 0 ,NULL,NULL,
sendPageGeneric , 0 ,NULL,NULL,0}, 0},
// use post now for the "site list" which can be big // use post now for the "site list" which can be big
{ PAGE_SPIDER , "admin/spider" , 0 , "spider controls" ,1,M_POST, { PAGE_SPIDER , "admin/spider" , 0 , "spider controls" ,1,M_POST,
//USER_ADMIN | USER_MASTER | USER_PROXY , //USER_ADMIN | USER_MASTER | USER_PROXY ,
"spider controls", "spider controls", sendPageGeneric , 0 ,NULL,NULL,
sendPageGeneric , 0 ,NULL,NULL,0}, PG_COLLADMIN},
{ PAGE_SPIDERPROXIES,"admin/proxies" , 0 , "proxies" , 1 , 0, { PAGE_SPIDERPROXIES,"admin/proxies" , 0 , "proxies" , 1 , 0,
"proxies", sendPageGeneric , 0,NULL,NULL,0 } , "proxies", sendPageGeneric , 0,NULL,NULL,
PG_ROOTADMIN } ,
{ PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 , { PAGE_LOG , "admin/log" , 0 , "log controls" , 1 , 0 ,
//USER_MASTER | USER_PROXY, //USER_MASTER | USER_PROXY,
"log controls", "log controls", sendPageGeneric , 0 ,NULL,NULL,
sendPageGeneric , 0 ,NULL,NULL,0}, PG_ROOTADMIN},
{ PAGE_SECURITY, "admin/security2", 0 , "security" , 1 , 0 ,
{ PAGE_ROOTPASSWORDS, "admin/rootpasswords",
0 , "root passwords" , 1 , 0 ,
//USER_MASTER | USER_PROXY , //USER_MASTER | USER_PROXY ,
"advanced security", "root passwords",
sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI}, sendPageGeneric , 0 ,NULL,NULL,
PG_ROOTADMIN},
{ PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 , { PAGE_ADDCOLL , "admin/addcoll" , 0 , "add collection" , 1 , 0 ,
//USER_MASTER , //USER_MASTER ,
"add a new collection", "add a new collection",
sendPageAddColl , 0 ,NULL,NULL,0}, sendPageAddColl , 0 ,NULL,NULL,
PG_ROOTADMIN},
{ PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0, { PAGE_DELCOLL , "admin/delcoll" , 0 , "delete collections" , 1 ,0,
//USER_MASTER , //USER_MASTER ,
"delete a collection", "delete a collection",
sendPageDelColl , 0 ,NULL,NULL,0}, sendPageDelColl , 0 ,NULL,NULL,
PG_COLLADMIN},
{ PAGE_CLONECOLL, "admin/clonecoll" , 0 , "clone collection" , 1 ,0, { PAGE_CLONECOLL, "admin/clonecoll" , 0 , "clone collection" , 1 ,0,
//USER_MASTER , //USER_MASTER ,
"clone one collection's settings to another", "clone one collection's settings to another",
sendPageCloneColl , 0 ,NULL,NULL,0}, sendPageCloneColl , 0 ,NULL,NULL,
PG_ROOTADMIN},
{ PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 , { PAGE_REPAIR , "admin/repair" , 0 , "repair" , 1 , 0 ,
//USER_MASTER ,
"repair data", "repair data",
sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI}, //USER_MASTER ,
// { PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1, sendPageGeneric , 0 ,NULL,NULL,
// "what sites can be spidered", PG_ROOTADMIN },
// sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI}, // sendPageBasicSettings
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 ,M_POST, { PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 ,M_POST,
//USER_ADMIN | USER_MASTER ,
"prioritize urls for spidering", "prioritize urls for spidering",
// until we get this working, set PG_NOAPI sendPageGeneric , 0 ,NULL,NULL,
sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI}, PG_NOAPI|PG_COLLADMIN},
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0,M_MULTI , { PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0,M_MULTI ,
//USER_ADMIN | USER_MASTER , //USER_ADMIN | USER_MASTER ,
"inject url in the index here", "inject url in the index here",
sendPageInject , 2 } , sendPageInject , 2 ,NULL,NULL,
0} ,
// this is the addurl page for the admin! // this is the addurl page for the admin!
{ PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 , { PAGE_ADDURL2 , "admin/addurl" , 0 , "add urls" , 0 , 0 ,
"add url page for admin", "add url page for admin",
sendPageAddUrl2 , 0 ,NULL,NULL,0}, sendPageAddUrl2 , 0 ,NULL,NULL,
PG_COLLADMIN},
{ PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 , { PAGE_REINDEX , "admin/reindex" , 0 , "query reindex" , 0 , 0 ,
//USER_ADMIN | USER_MASTER, //USER_ADMIN | USER_MASTER,
"query delete/reindex", "query delete/reindex",
sendPageReindex , 0 ,NULL,NULL,0}, sendPageReindex , 0 ,NULL,NULL,
PG_COLLADMIN},
// master admin pages // master admin pages
{ PAGE_STATS , "admin/stats" , 0 , "stats" , 0 , 0 , { PAGE_STATS , "admin/stats" , 0 , "stats" , 0 , 0 ,
//USER_MASTER | USER_PROXY , //USER_MASTER | USER_PROXY ,
"general statistics", "general statistics",
sendPageStats , 0 ,NULL,NULL,PG_STATUS}, sendPageStats , 0 ,NULL,NULL,
PG_STATUS|PG_ROOTADMIN},
{ PAGE_GRAPH , "admin/graph" , 0 , "graph" , 0 , 0 , { PAGE_GRAPH , "admin/graph" , 0 , "graph" , 0 , 0 ,
//USER_MASTER , //USER_MASTER ,
"query stats graph", "query stats graph",
sendPageGraph , 2 /*niceness*/ ,NULL,NULL,PG_STATUS|PG_NOAPI}, sendPageGraph , 2 ,NULL,NULL,
PG_STATUS|PG_NOAPI|PG_ROOTADMIN},
{ PAGE_PERF , "admin/perf" , 0 , "performance" , 0 , 0 , { PAGE_PERF , "admin/perf" , 0 , "performance" , 0 , 0 ,
//USER_MASTER | USER_PROXY , //USER_MASTER | USER_PROXY ,
"function performance graph", "function performance graph",
sendPagePerf , 0 ,NULL,NULL,PG_STATUS|PG_NOAPI}, sendPagePerf , 0 ,NULL,NULL,
PG_STATUS|PG_NOAPI|PG_ROOTADMIN},
{ PAGE_SOCKETS , "admin/sockets" , 0 , "sockets" , 0 , 0 , { PAGE_SOCKETS , "admin/sockets" , 0 , "sockets" , 0 , 0 ,
//USER_MASTER | USER_PROXY, //USER_MASTER | USER_PROXY,
"sockets", "sockets",
sendPageSockets , 0 ,NULL,NULL,PG_STATUS|PG_NOAPI}, sendPageSockets , 0 ,NULL,NULL,
PG_STATUS|PG_NOAPI|PG_ROOTADMIN},
{ PAGE_LOGVIEW , "admin/logview" , 0 , "log view" , 0 , 0 , { PAGE_LOGVIEW , "admin/logview" , 0 , "log view" , 0 , 0 ,
//USER_MASTER , //USER_MASTER ,
"logview", "logview",
sendPageLogView , 0 ,NULL,NULL,PG_STATUS|PG_NOAPI}, sendPageLogView , 0 ,NULL,NULL,
PG_STATUS|PG_NOAPI|PG_ROOTADMIN},
// { PAGE_SYNC , "master/sync" , 0 , "sync" , 0 , 0 , // { PAGE_SYNC , "master/sync" , 0 , "sync" , 0 , 0 ,
// //USER_MASTER , // //USER_MASTER ,
// "sync", // "sync",
@ -220,19 +263,21 @@ static WebPage s_pages[] = {
{ PAGE_AUTOBAN ,"admin/autoban" , 0 , "autoban" , 1 , M_POST , { PAGE_AUTOBAN ,"admin/autoban" , 0 , "autoban" , 1 , M_POST ,
//USER_MASTER | USER_PROXY , //USER_MASTER | USER_PROXY ,
"autobanned ips", "autobanned ips",
sendPageAutoban , 0 ,NULL,NULL,PG_NOAPI}, sendPageAutoban , 0 ,NULL,NULL,
/* PG_NOAPI|PG_ROOTADMIN},
{ PAGE_SPIDERLOCKS,"admin/spiderlocks" , 0 , "spider locks" , 0 , 0 ,
USER_MASTER , sendPageSpiderLocks , 0 ,NULL,NULL,PG_NOAPI},
*/
{ PAGE_PROFILER , "admin/profiler" , 0 , "profiler" , 0 ,M_POST, { PAGE_PROFILER , "admin/profiler" , 0 , "profiler" , 0 ,M_POST,
//USER_MASTER , //USER_MASTER ,
"profiler", "profiler",
sendPageProfiler , 0 ,NULL,NULL,PG_NOAPI}, sendPageProfiler , 0 ,NULL,NULL,
PG_NOAPI|PG_ROOTADMIN},
{ PAGE_THREADS , "admin/threads" , 0 , "threads" , 0 , 0 , { PAGE_THREADS , "admin/threads" , 0 , "threads" , 0 , 0 ,
//USER_MASTER , //USER_MASTER ,
"threads", "threads",
sendPageThreads , 0 ,NULL,NULL,PG_STATUS|PG_NOAPI}, sendPageThreads , 0 ,NULL,NULL,
PG_STATUS|PG_NOAPI|PG_ROOTADMIN},
//{ PAGE_THESAURUS, "admin/thesaurus", 0 , "thesaurus", 0 , 0 , //{ PAGE_THESAURUS, "admin/thesaurus", 0 , "thesaurus", 0 , 0 ,
// //USER_MASTER , // //USER_MASTER ,
// "thesaurus", // "thesaurus",
@ -246,38 +291,51 @@ static WebPage s_pages[] = {
// sendPageOverview , 0 ,NULL,NULL,PG_NOAPI}, // sendPageOverview , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_QA , "admin/qa" , 0 , "qa" , 0 , 0 , { PAGE_QA , "admin/qa" , 0 , "qa" , 0 , 0 ,
"quality assurance", sendPageQA , 0 ,NULL,NULL,PG_NOAPI}, "quality assurance",
sendPageQA , 0 ,NULL,NULL,
PG_NOAPI|PG_ROOTADMIN},
{ PAGE_IMPORT , "admin/import" , 0 , "import" , 0 , 0 , { PAGE_IMPORT , "admin/import" , 0 , "import" , 0 , 0 ,
"import documents from another cluster", "import documents from another cluster",
sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI}, sendPageGeneric , 0 ,NULL,NULL,
PG_NOAPI|PG_ROOTADMIN},
{ PAGE_API , "admin/api" , 0 , "api" , 0 , 0 , { PAGE_API , "admin/api" , 0 , "api" , 0 , 0 ,
//USER_MASTER | USER_ADMIN , //USER_MASTER | USER_ADMIN ,
"api", sendPageAPI , 0 ,NULL,NULL,PG_NOAPI}, "api",
sendPageAPI , 0 ,NULL,NULL,
PG_NOAPI|PG_COLLADMIN},
{ PAGE_RULES , "admin/siterules", 0 , "site rules", 1, M_POST, { PAGE_RULES , "admin/siterules", 0 , "site rules", 1, M_POST,
//USER_ADMIN | USER_MASTER , //USER_ADMIN | USER_MASTER ,
"site rules", "site rules",
sendPageGeneric , 0,NULL,NULL,PG_NOAPI}, sendPageGeneric , 0,NULL,NULL,
PG_NOAPI},
{ PAGE_INDEXDB , "admin/indexdb" , 0 , "indexdb" , 0 , 0, { PAGE_INDEXDB , "admin/indexdb" , 0 , "indexdb" , 0 , 0,
//USER_MASTER , //USER_MASTER ,
"indexdb", "indexdb",
sendPageIndexdb , 0 ,NULL,NULL,PG_NOAPI}, sendPageIndexdb , 0 ,NULL,NULL,
PG_NOAPI|PG_ROOTADMIN},
{ PAGE_TITLEDB , "admin/titledb" , 0 , "titledb" , 0 , 0, { PAGE_TITLEDB , "admin/titledb" , 0 , "titledb" , 0 , 0,
//USER_MASTER , //USER_MASTER ,
"titledb", "titledb",
sendPageTitledb , 2,NULL,NULL,PG_NOAPI}, sendPageTitledb , 2,NULL,NULL,
PG_NOAPI|PG_ROOTADMIN},
// 1 = usePost // 1 = usePost
{ PAGE_CRAWLBOT , "crawlbot" , 0 , "crawlbot" , 1 , 0, { PAGE_CRAWLBOT , "crawlbot" , 0 , "crawlbot" , 1 , 0,
"simplified spider controls", "simplified spider controls",
sendPageCrawlbot , 0 ,NULL,NULL,PG_NOAPI}, sendPageCrawlbot , 0 ,NULL,NULL,
PG_NOAPI|PG_ROOTADMIN},
{ PAGE_SPIDERDB , "admin/spiderdb" , 0 , "spider queue" , 0 , 0 , { PAGE_SPIDERDB , "admin/spiderdb" , 0 , "spider queue" , 0 , 0 ,
//USER_ADMIN | USER_MASTER , //USER_ADMIN | USER_MASTER ,
"spider queue", "spider queue",
sendPageSpiderdb , 0 ,NULL,NULL,PG_STATUS|PG_NOAPI}, sendPageSpiderdb , 0 ,NULL,NULL,
PG_STATUS|PG_NOAPI|PG_ROOTADMIN},
//{ PAGE_PRIORITIES, "admin/priorities" , 0 , "priority controls",1,1, //{ PAGE_PRIORITIES, "admin/priorities" , 0 , "priority controls",1,1,
// //USER_ADMIN | USER_MASTER , // //USER_ADMIN | USER_MASTER ,
// "spider priorities", // "spider priorities",
@ -290,33 +348,45 @@ static WebPage s_pages[] = {
#ifndef CYGWIN #ifndef CYGWIN
{ PAGE_SEO, "seo",0,"seo" , 0 , 0 , { PAGE_SEO, "seo",0,"seo" , 0 , 0 ,
"SEO info", "SEO info",
sendPageSEO , 2 ,NULL,NULL,PG_NOAPI}, sendPageSEO , 2 ,NULL,NULL,
PG_NOAPI|PG_ROOTADMIN},
#else #else
{ PAGE_SEO, "seo",0,"seo" , 0 , 0 , { PAGE_SEO, "seo",0,"seo" , 0 , 0 ,
"SEO info", "SEO info",
sendPageResults , 0 ,NULL,NULL,PG_NOAPI}, sendPageResults , 0 ,NULL,NULL,
PG_NOAPI|PG_ROOTADMIN},
#endif #endif
{ PAGE_ACCESS , "admin/access" , 0 , "access" , 1 , M_POST, { PAGE_ACCESS , "admin/access" , 0 , "access" , 1 , M_POST,
//USER_ADMIN | USER_MASTER , //USER_ADMIN | USER_MASTER ,
"access password, ip, admin ips etc. all goes in here", "access password, ip, admin ips etc. all goes in here",
sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI}, sendPageGeneric , 0 ,NULL,NULL,
PG_NOAPI|PG_ROOTADMIN},
{ PAGE_SEARCHBOX , "admin/searchbox", 0 , "search" , 0 , 0 , { PAGE_SEARCHBOX , "admin/searchbox", 0 , "search" , 0 , 0 ,
//USER_ADMIN | USER_MASTER , //USER_ADMIN | USER_MASTER ,
"search box", "search box",
sendPageResults , 0 ,NULL,NULL,PG_NOAPI}, sendPageResults , 0 ,NULL,NULL,
PG_NOAPI},
{ PAGE_PARSER , "admin/parser" , 0 , "parser" , 0,M_POST, { PAGE_PARSER , "admin/parser" , 0 , "parser" , 0,M_POST,
//USER_MASTER , //USER_MASTER ,
"page parser", "page parser",
sendPageParser , 2 ,NULL,NULL,PG_NOAPI}, sendPageParser , 2 ,NULL,NULL,
PG_NOAPI|PG_COLLADMIN},
{ PAGE_SITEDB , "admin/tagdb" , 0 , "tagdb" , 0 , M_POST, { PAGE_SITEDB , "admin/tagdb" , 0 , "tagdb" , 0 , M_POST,
//USER_MASTER | USER_ADMIN, //USER_MASTER | USER_ADMIN,
"add/remove/get tags for sites/urls", "add/remove/get tags for sites/urls",
sendPageTagdb , 0 ,NULL,NULL,PG_NOAPI}, sendPageTagdb , 0 ,NULL,NULL,
PG_NOAPI|PG_COLLADMIN},
{ PAGE_CATDB , "admin/catdb" , 0 , "catdb" , 0,M_POST, { PAGE_CATDB , "admin/catdb" , 0 , "catdb" , 0,M_POST,
//USER_MASTER | USER_ADMIN, //USER_MASTER | USER_ADMIN,
"catdb", "catdb",
sendPageCatdb , 0 ,NULL,NULL,PG_NOAPI}, sendPageCatdb , 0 ,NULL,NULL,
PG_NOAPI|PG_ROOTADMIN},
//{ PAGE_LOGIN2 , "admin/login" , 0 , "login" , 0 , 0, //{ PAGE_LOGIN2 , "admin/login" , 0 , "login" , 0 , 0,
// //USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_SPAM | USER_CLIENT, // //USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_SPAM | USER_CLIENT,
//"login link - also logoffs user", //"login link - also logoffs user",
@ -525,7 +595,11 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
//Host *h = g_hostdb.m_myHost; //Host *h = g_hostdb.m_myHost;
// now use this... // now use this...
bool isAdmin = g_conf.isRootAdmin ( s , r ); bool isRootAdmin = g_conf.isRootAdmin ( s , r );
CollectionRec *cr = g_collectiondb.getRec ( r , true );
//////////////////// ////////////////////
//////////////////// ////////////////////
@ -534,10 +608,14 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
// //
//////////////////// ////////////////////
//////////////////// ////////////////////
if ( ! publicPage && ! isAdmin )
return sendPageLogin ( s , r );
if ( page == PAGE_CRAWLBOT && ! isAdmin ) // no longer, we let anyone snoop around to check out the gui
//char guest = r->getLong("guest",0);
//if ( ! publicPage && ! isRootAdmin && ! guest )
// return sendPageLogin ( s , r );
if ( page == PAGE_CRAWLBOT && ! isRootAdmin )
log("pages: accessing a crawlbot page without admin privs. " log("pages: accessing a crawlbot page without admin privs. "
"no parms can be changed."); "no parms can be changed.");
@ -655,6 +733,39 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
// } // }
//
// CLOUD SEARCH ENGINE SUPPORT
//
// if not the root admin, only allow the user to change settings, etc.
// if the collection rec is a guest collection. i.e. in the cloud.
//
//bool isRootAdmin = g_conf.isRootAdmin(sock,hr);
bool isRootColl = false;
if ( cr && strcmp(cr->m_coll,"main")==0 ) isRootColl = true;
if ( cr && strcmp(cr->m_coll,"dmoz")==0 ) isRootColl = true;
if ( cr && strcmp(cr->m_coll,"demo")==0 ) isRootColl = true;
// the main,dmoz and demo collections are root admin only
// if ( ! isRootAdmin && isRootColl ) {
// g_errno = ENOPERM;
// return log("parms: root admin can only change main/dmoz/demo"
// " collections.");
// }
// just knowing the collection name is enough for a cloud user to
// modify the collection's parms. however, to modify the master
// controls or stuff in g_conf, you have to be root admin.
if ( ! g_conf.m_allowCloudUsers && ! isRootAdmin ) {
//g_errno = ENOPERM;
//return log("parms: permission denied for user");
return sendPageLogin ( s , r );
}
// get safebuf stored in TcpSocket class // get safebuf stored in TcpSocket class
SafeBuf *parmList = &s->m_handyBuf; SafeBuf *parmList = &s->m_handyBuf;
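
The cloud-search-engine block above boils down to a small policy: the root admin can always proceed, and otherwise the request is only let through when g_conf.m_allowCloudUsers is on, with the per-collection permission checks happening later in convertHttpRequestToParmList(). Below is a minimal sketch of that gate with plain booleans standing in for the real TcpSocket/HttpRequest checks; the function name and sample calls are illustrative.

#include <cstdio>

// returns true if the request may proceed to parm handling, mirroring
// the simplified reading of the cloud gate above (an assumption, not
// a copy of the real control flow)
static bool allowRequest(bool allowCloudUsers, bool isRootAdmin) {
    if (isRootAdmin)     return true;  // root admin always gets through
    if (allowCloudUsers) return true;  // cloud users may edit coll parms
    return false;                      // otherwise bounce to the login page
}

int main() {
    printf("cloud off, non-root: %d\n", allowRequest(false, false)); // 0
    printf("cloud on,  non-root: %d\n", allowRequest(true,  false)); // 1
    printf("root admin:          %d\n", allowRequest(false, true));  // 1
    return 0;
}
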
@ -668,13 +779,12 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
//////// ////////
// . convert http request to list of parmdb records // . convert http request to list of parmdb records
// . will only add parm recs we have permission to modify // . will only add parm recs we have permission to modify!!!
// . if no collection supplied will just return true with no g_errno // . if no collection supplied will just return true with no g_errno
if ( isAdmin && if ( //isRootAdmin &&
! g_parms.convertHttpRequestToParmList ( r, parmList, page, s)) ! g_parms.convertHttpRequestToParmList ( r, parmList, page, s))
return g_httpServer.sendErrorReply(s,505,mstrerror(g_errno)); return g_httpServer.sendErrorReply(s,505,mstrerror(g_errno));
// . add parmList using Parms::m_msg4 to all hosts! // . add parmList using Parms::m_msg4 to all hosts!
// . returns true and sets g_errno on error // . returns true and sets g_errno on error
// . returns false if would block // . returns false if would block
@ -682,7 +792,7 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
// . so then doneBroadcastingParms() is called when all hosts // . so then doneBroadcastingParms() is called when all hosts
// have received the updated parms, unless a host is dead, // have received the updated parms, unless a host is dead,
// in which case he should sync up when he comes back up // in which case he should sync up when he comes back up
if ( isAdmin && if ( //isCollAdmin &&
! g_parms.broadcastParmList ( parmList , ! g_parms.broadcastParmList ( parmList ,
s , // state is socket i guess s , // state is socket i guess
doneBroadcastingParms ) ) doneBroadcastingParms ) )
@ -960,6 +1070,10 @@ bool printTopNavButton ( char *text,
"border-style:solid;" "border-style:solid;"
//"margin-bottom:-3px;" //"margin-bottom:-3px;"
"border-color:blue;" "border-color:blue;"
// fix for msie. no this is bad for firefox
//"padding-bottom:7px;"
// fix msie this way:
"border-bottom-width:4px;"
"border-bottom-color:white;" "border-bottom-color:white;"
//"overflow-y:hidden;" //"overflow-y:hidden;"
//"overflow-x:hidden;" //"overflow-x:hidden;"
@ -1190,7 +1304,7 @@ bool Pages::printAdminTop (SafeBuf *sb ,
// //
// first the nav column // first the nav column
// //
sb->safePrintf("<TD bgcolor=#f3c714 " // yellow/gold sb->safePrintf("<TD bgcolor=#%s "//f3c714 " // yellow/gold
"valign=top " "valign=top "
"style=\"" "style=\""
"width:210px;" "width:210px;"
@ -1213,13 +1327,15 @@ bool Pages::printAdminTop (SafeBuf *sb ,
"height:100px;" "height:100px;"
"\">" "\">"
"<br style=line-height:10px;>" "<br style=line-height:10px;>"
"<img width=54 height=79 alt=HOME src=/rocket.jpg>" "<img width=54 height=79 alt=HOME border=0 "
"src=/rocket.jpg>"
"</div>" "</div>"
"</a>" "</a>"
"</center>" "</center>"
"<br>" "<br>"
"<br>" "<br>"
, GOLD
,coll ,coll
); );
@ -1302,6 +1418,27 @@ bool Pages::printAdminTop (SafeBuf *sb ,
// collection navbar // collection navbar
status&=printCollectionNavBar ( sb, page , username, coll,pwd, qs,s,r); status&=printCollectionNavBar ( sb, page , username, coll,pwd, qs,s,r);
// count the statuses
long emptyCount = 0;
long doneCount = 0;
long activeCount = 0;
long pauseCount = 0;
for (long i = 0 ; i < g_collectiondb.m_numRecs ; i++ ) {
CollectionRec *cc = g_collectiondb.m_recs[i];
if ( ! cc ) continue;
CrawlInfo *ci = &cc->m_globalCrawlInfo;
if ( cc->m_spideringEnabled &&
! ci->m_hasUrlsReadyToSpider &&
ci->m_urlsHarvested )
emptyCount++;
else if ( ! ci->m_hasUrlsReadyToSpider )
doneCount++;
else if (cc->m_spideringEnabled && ci->m_hasUrlsReadyToSpider )
activeCount++;
else if (!cc->m_spideringEnabled && ci->m_hasUrlsReadyToSpider)
pauseCount++;
}
sb->safePrintf("</div>"); sb->safePrintf("</div>");
@ -1313,21 +1450,27 @@ bool Pages::printAdminTop (SafeBuf *sb ,
); );
sb->safePrintf( sb->safePrintf(
"<font color=black>" "<font color=black>"
"&#x25cf;</font> spider is done" "&#x25cf;</font> spider is done (%li)"
"<br>" "<br>"
"<font color=orange>" "<font color=orange>"
"&#x25cf;</font> spider is paused" "&#x25cf;</font> spider is paused (%li)"
"<br>" "<br>"
"<font color=green>" "<font color=green>"
"&#x25cf;</font> spider is active" "&#x25cf;</font> spider is active (%li)"
"<br>" "<br>"
"<font color=gray>" "<font color=gray>"
"&#x25cf;</font> spider queue is empty" "&#x25cf;</font> spider queue empty (%li)"
"<br>" "<br>"
"</div>" "</div>"
,doneCount
,pauseCount
,activeCount
,emptyCount
); );
@ -1339,14 +1482,31 @@ bool Pages::printAdminTop (SafeBuf *sb ,
// //
// the controls will go here // the controls will go here
sb->safePrintf("<TD valign=top >" sb->safePrintf("<TD valign=top>"
// MDW 9/27/2014: tried to fix that blue border
// in MSIE but could not easily make it go away.
// seems like the table cell truncates the div's
// left border below even if i put a z-index:1000;
// on there.
// "style="
// "border-color:green;"
// "border-left-width:3px;"
// "border-style:solid;"
// "margin-left:-30px;"
// ">"
"<div style=\"padding-left:20px;" "<div style=\"padding-left:20px;"
"margin-left:-3px;" "margin-left:-3px;"
"border-color:#f3c714;" "border-color:#%s;"//f3c714;"
"border-width:3px;" "border-width:3px;"
"border-left-width:3px;" // make this from 3px to 4px for msie
"border-left-width:4px;"
// another msie fix:
//"position:absolute;"
"border-top-width:0px;" "border-top-width:0px;"
"border-right-width:0px;" "border-right-width:0px;"
"border-bottom-color:blue;" "border-bottom-color:blue;"
@ -1354,8 +1514,11 @@ bool Pages::printAdminTop (SafeBuf *sb ,
"border-style:solid;" "border-style:solid;"
"padding:4px;" "padding:4px;"
"background-color:#f3c714;\" " // yellow/gold "background-color:#%s;\" "//f3c714;\" " // yellow/gold
"id=prepane>"); "id=prepane>"
, GOLD
, GOLD
);
// logout link on far right // logout link on far right
sb->safePrintf("<div align=right " sb->safePrintf("<div align=right "
@ -1392,7 +1555,7 @@ bool Pages::printAdminTop (SafeBuf *sb ,
//sb->safePrintf ("</td></tr></table><br/>\n");//<br/>\n"); //sb->safePrintf ("</td></tr></table><br/>\n");//<br/>\n");
SafeBuf mb; SafeBuf mb;
bool added = printRedBox ( &mb ); bool added = printRedBox ( &mb , s , r );
// print emergency msg box // print emergency msg box
if ( added ) if ( added )
@ -2229,6 +2392,7 @@ bool Pages::printAdminLinks ( SafeBuf *sb,
if ( i == PAGE_API ) continue; if ( i == PAGE_API ) continue;
if ( i == PAGE_SEARCHBOX ) continue; if ( i == PAGE_SEARCHBOX ) continue;
if ( i == PAGE_TITLEDB ) continue; if ( i == PAGE_TITLEDB ) continue;
if ( i == PAGE_IMPORT ) continue;
// move these links to the coll nav bar on the left // move these links to the coll nav bar on the left
if ( i == PAGE_ADDCOLL ) continue; if ( i == PAGE_ADDCOLL ) continue;
if ( i == PAGE_DELCOLL ) continue; if ( i == PAGE_DELCOLL ) continue;
@ -2511,7 +2675,7 @@ bool Pages::printCollectionNavBar ( SafeBuf *sb ,
// every other coll in a darker div // every other coll in a darker div
if ( (row % 2) == 0 ) if ( (row % 2) == 0 )
sb->safePrintf("</div>"); sb->safePrintf("</div>\n");
else else
sb->safePrintf("<br>\n"); sb->safePrintf("<br>\n");
} }
@ -3170,7 +3334,8 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
"<td>STRING</td>" "<td>STRING</td>"
"<td>output format</td>" "<td>output format</td>"
"<td>html</td>" "<td>html</td>"
"<td>Display output in this format.</td>" "<td>Display output in this format. Can be "
"<i>html</i>, <i>json</i> or <i>xml</i>.</td>"
"</tr>" "</tr>"
, blues[count%2] , blues[count%2]
, count , count
@ -3243,7 +3408,7 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
// dup page fix. so we should 'masterpwd' and 'masterip' // dup page fix. so we should 'masterpwd' and 'masterip'
// in the list now. // in the list now.
if ( pageNum == PAGE_SECURITY ) pageNum = PAGE_BASIC_SECURITY; //if ( pageNum ==PAGE_SECURITY ) pageNum = PAGE_BASIC_SECURITY;
if ( pageNum != PAGENUM ) continue; if ( pageNum != PAGENUM ) continue;
@ -3629,7 +3794,9 @@ bool sendPageLogin ( TcpSocket *socket , HttpRequest *hr ) {
"<input type=submit value=ok border=0 onclick=\"" "<input type=submit value=ok border=0 onclick=\""
"document.cookie='pwd='+document.getElementById('ppp')" "document.cookie='pwd='+document.getElementById('ppp')"
".value+" ".value+"
"';expires=0';" // fix so cookies work for msie. expires= is wrong i guess.
//"';expires=9999999';"
"';max-age=9999999';"
"\"></td>" "\"></td>"
"</tr></table>" "</tr></table>"
"</center>" "</center>"
@ -3649,10 +3816,10 @@ bool sendPageLogin ( TcpSocket *socket , HttpRequest *hr ) {
NULL);// cookie NULL);// cookie
} }
bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage ) { bool printRedBox2 ( SafeBuf *sb , TcpSocket *sock , HttpRequest *hr ) {
SafeBuf mb; SafeBuf mb;
// return false if no red box // return false if no red box
if ( ! printRedBox ( &mb , isRootWebPage ) ) return false; if ( ! printRedBox ( &mb , sock , hr ) ) return false;
// otherwise, print it // otherwise, print it
sb->safeStrcpy ( mb.getBufStart() ); sb->safeStrcpy ( mb.getBufStart() );
// return true since we printed one // return true since we printed one
@ -3660,7 +3827,7 @@ bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage ) {
} }
// emergency message box // emergency message box
bool printRedBox ( SafeBuf *mb , bool isRootWebPage ) { bool printRedBox ( SafeBuf *mb , TcpSocket *sock , HttpRequest *hr ) {
PingServer *ps = &g_pingServer; PingServer *ps = &g_pingServer;
@ -3685,12 +3852,14 @@ bool printRedBox ( SafeBuf *mb , bool isRootWebPage ) {
mb->safePrintf("<div style=max-width:500px;>"); mb->safePrintf("<div style=max-width:500px;>");
long page = g_pages.getDynamicPageNumber ( hr );
// are we just starting off? give them a little help. // are we just starting off? give them a little help.
CollectionRec *cr = g_collectiondb.getRec("main"); CollectionRec *crm = g_collectiondb.getRec("main");
if ( g_collectiondb.m_numRecs == 1 && if ( g_collectiondb.m_numRecs == 1 &&
cr && crm &&
isRootWebPage && page == PAGE_ROOT && // isRootWebPage &&
cr->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) { crm->m_globalCrawlInfo.m_pageDownloadAttempts == 0 ) {
if ( adds ) mb->safePrintf("<br>"); if ( adds ) mb->safePrintf("<br>");
adds++; adds++;
mb->safePrintf("%s",box); mb->safePrintf("%s",box);
@ -3702,18 +3871,36 @@ bool printRedBox ( SafeBuf *mb , bool isRootWebPage ) {
mb->safePrintf("%s",boxEnd); mb->safePrintf("%s",boxEnd);
} }
if ( isRootWebPage ) { if ( page == PAGE_ROOT ) { // isRootWebPage ) {
mb->safePrintf("</div>"); mb->safePrintf("</div>");
return (bool)adds; return (bool)adds;
} }
if ( g_conf.m_numConnectIps == 0 && g_conf.m_numMasterPwds == 0 ) { if ( g_conf.m_masterPwds.length() == 0 ) {
if ( adds ) mb->safePrintf("<br>"); if ( adds ) mb->safePrintf("<br>");
adds++; adds++;
mb->safePrintf("%s",box); mb->safePrintf("%s",box);
mb->safePrintf("URGENT. Please specify a password " mb->safePrintf("URGENT. Please specify a ROOT password "
"or IP address in the " "or IP address in the "
"<a href=/admin/security>security</a> " "<a href=/admin/rootpassword>root "
"password</a> "
"table. Right now anybody might be able "
"to access the Gigablast admin controls.");
mb->safePrintf("%s",boxEnd);
}
CollectionRec *cr = g_collectiondb.getRec ( hr );
if ( cr &&
cr->m_collectionPasswords.length() == 0 &&
cr->m_collectionIps.length() == 0 ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
mb->safePrintf("%s",box);
mb->safePrintf("URGENT. Please specify a COLLECTION password "
"or IP address in the "
"<a href=/admin/collectionpasswords>"
"password</a> "
"table. Right now anybody might be able " "table. Right now anybody might be able "
"to access the Gigablast admin controls."); "to access the Gigablast admin controls.");
mb->safePrintf("%s",boxEnd); mb->safePrintf("%s",boxEnd);
@ -3792,6 +3979,42 @@ bool printRedBox ( SafeBuf *mb , bool isRootWebPage ) {
mb->safePrintf("%s",boxEnd); mb->safePrintf("%s",boxEnd);
} }
WebPage *wp = g_pages.getPage(page);
if ( wp &&
(wp->m_pgflags & (PG_ROOTADMIN|PG_COLLADMIN)) &&
! g_conf.isRootAdmin(sock,hr) &&
! g_conf.isCollAdmin(sock,hr) ) {
if ( adds ) mb->safePrintf("<br>");
adds++;
mb->safePrintf("%s",box);
char *ff = "admin/settings";
if ( wp ) ff = wp->m_filename;
mb->safePrintf("You have no write access to these "
"controls. Please enter the collection or "
"root password to get access: "
"<form method=GET action=\"/%s\" name=xyz>"
"<input type=password id=ppp name=xpwd size=20>"
"<input type=submit value=ok "
"border=0 onclick=\""
"document.cookie='pwd='+"
"document.getElementById('ppp')"
".value+"
"';max-age=9999999';"
"\">"
"</form>"
, ff );
mb->safePrintf("%s",boxEnd);
}
if ( ps->m_numHostsDead ) { if ( ps->m_numHostsDead ) {
if ( adds ) mb->safePrintf("<br>"); if ( adds ) mb->safePrintf("<br>");
adds++; adds++;

23
Pages.h
View File

@ -5,8 +5,13 @@
#ifndef _PAGES_H_ #ifndef _PAGES_H_
#define _PAGES_H_ #define _PAGES_H_
bool printRedBox2 ( SafeBuf *sb , bool isRootWebPage = false ) ; bool printRedBox2 ( SafeBuf *sb ,
bool printRedBox ( SafeBuf *mb , bool isRootWebPage = false ) ; class TcpSocket *sock ,
class HttpRequest *hr );
bool printRedBox ( SafeBuf *mb ,
class TcpSocket *sock ,
class HttpRequest *hr );
// for PageEvents.cpp and Accessdb.cpp // for PageEvents.cpp and Accessdb.cpp
//#define RESULTSWIDTHSTR "550px" //#define RESULTSWIDTHSTR "550px"
@ -17,6 +22,8 @@ bool printRedBox ( SafeBuf *mb , bool isRootWebPage = false ) ;
#include "SafeBuf.h" #include "SafeBuf.h"
#include "PageCrawlBot.h" // sendPageCrawlBot() #include "PageCrawlBot.h" // sendPageCrawlBot()
#define GOLD "f3c734"
#define LIGHTER_BLUE "e8e8ff" #define LIGHTER_BLUE "e8e8ff"
#define LIGHT_BLUE "d0d0e0" #define LIGHT_BLUE "d0d0e0"
#define DARK_BLUE "c0c0f0" #define DARK_BLUE "c0c0f0"
@ -102,6 +109,8 @@ bool sendPageQA ( TcpSocket *sock , HttpRequest *hr ) ;
// values for WebPage::m_flags // values for WebPage::m_flags
#define PG_NOAPI 0x01 #define PG_NOAPI 0x01
#define PG_STATUS 0x02 #define PG_STATUS 0x02
#define PG_COLLADMIN 0x04
#define PG_ROOTADMIN 0x08
// . description of a dynamic page // . description of a dynamic page
// . we have a static array of these in Pages.cpp // . we have a static array of these in Pages.cpp
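
PG_COLLADMIN and PG_ROOTADMIN join PG_NOAPI and PG_STATUS as bits in WebPage::m_flags, and printRedBox() above tests them with a combined mask before prompting for a password. A tiny sketch of that bitmask pattern using the same flag values; the sample flags variable is illustrative.

#include <cstdio>

#define PG_NOAPI     0x01
#define PG_STATUS    0x02
#define PG_COLLADMIN 0x04   // page needs collection-admin rights
#define PG_ROOTADMIN 0x08   // page needs root-admin rights

int main() {
    long flags = PG_STATUS | PG_ROOTADMIN;          // e.g. a root-only page
    // same kind of test printRedBox() uses before demanding a password
    if (flags & (PG_ROOTADMIN | PG_COLLADMIN))
        printf("page is admin-only, prompt for a password\n");
    if (flags & PG_NOAPI)
        printf("page is hidden from the api docs\n");
    return 0;
}
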
@ -308,13 +317,13 @@ extern class Pages g_pages;
// . some pages also have urls like /search to mean page=0 // . some pages also have urls like /search to mean page=0
enum { enum {
// dummy pages // dummy pages
PAGE_NOHOSTLINKS = 0, //PAGE_NOHOSTLINKS = 0,
PAGE_ADMIN , //PAGE_ADMIN ,
//PAGE_QUALITY , //PAGE_QUALITY ,
PAGE_PUBLIC , //PAGE_PUBLIC ,
// public pages // public pages
PAGE_ROOT , PAGE_ROOT =0,
PAGE_RESULTS , PAGE_RESULTS ,
//PAGE_WIDGET, //PAGE_WIDGET,
PAGE_ADDURL , // 5 PAGE_ADDURL , // 5
@ -339,7 +348,7 @@ enum {
PAGE_SPIDER , PAGE_SPIDER ,
PAGE_SPIDERPROXIES , PAGE_SPIDERPROXIES ,
PAGE_LOG , PAGE_LOG ,
PAGE_SECURITY , // 19 PAGE_ROOTPASSWORDS , // 19
PAGE_ADDCOLL , //20 PAGE_ADDCOLL , //20
PAGE_DELCOLL , PAGE_DELCOLL ,
PAGE_CLONECOLL , PAGE_CLONECOLL ,

475
Parms.cpp
View File

@ -234,7 +234,7 @@ bool CommandRemoveConnectIpRow ( char *rec ) {
for ( long i = 0 ; i < g_parms.m_numParms ; i++ ) { for ( long i = 0 ; i < g_parms.m_numParms ; i++ ) {
Parm *m = &g_parms.m_parms[i]; Parm *m = &g_parms.m_parms[i];
// parm must be a url filters parm // parm must be a url filters parm
if ( m->m_page != PAGE_SECURITY ) continue; if ( m->m_page != PAGE_ROOTPASSWORDS ) continue;
// must be an array! // must be an array!
if ( ! m->isArray() ) continue; if ( ! m->isArray() ) continue;
// sanity check // sanity check
@ -263,7 +263,7 @@ bool CommandRemovePasswordRow ( char *rec ) {
for ( long i = 0 ; i < g_parms.m_numParms ; i++ ) { for ( long i = 0 ; i < g_parms.m_numParms ; i++ ) {
Parm *m = &g_parms.m_parms[i]; Parm *m = &g_parms.m_parms[i];
// parm must be a url filters parm // parm must be a url filters parm
if ( m->m_page != PAGE_SECURITY ) continue; if ( m->m_page != PAGE_ROOTPASSWORDS ) continue;
// must be an array! // must be an array!
if ( ! m->isArray() ) continue; if ( ! m->isArray() ) continue;
// sanity check // sanity check
@ -1164,11 +1164,14 @@ bool Parms::sendPageGeneric ( TcpSocket *s , HttpRequest *r ) {
char format = r->getReplyFormat(); char format = r->getReplyFormat();
char guide = r->getLong("guide",0);
// //
// CLOUD SEARCH ENGINE SUPPORT // CLOUD SEARCH ENGINE SUPPORT
// //
char *action = r->getString("action",NULL); char *action = r->getString("action",NULL);
if ( page == PAGE_BASIC_SETTINGS && if ( page == PAGE_BASIC_SETTINGS &&
guide &&
// this is non-null if handling a submit request // this is non-null if handling a submit request
action && action &&
format == FORMAT_HTML ) { format == FORMAT_HTML ) {
@ -1299,14 +1302,14 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
if ( page == PAGE_LOG ) tt = "Log Controls"; if ( page == PAGE_LOG ) tt = "Log Controls";
if ( page == PAGE_MASTER ) tt = "Master Controls"; if ( page == PAGE_MASTER ) tt = "Master Controls";
if ( page == PAGE_INJECT ) tt = "Inject Url"; if ( page == PAGE_INJECT ) tt = "Inject Url";
if ( page == PAGE_SECURITY ) tt = "Security"; if ( page == PAGE_ROOTPASSWORDS ) tt = "Root Passwords";
if ( page == PAGE_ADDURL2 ) tt = "Add Urls"; if ( page == PAGE_ADDURL2 ) tt = "Add Urls";
if ( page == PAGE_SPIDER ) tt = "Spider Controls"; if ( page == PAGE_SPIDER ) tt = "Spider Controls";
if ( page == PAGE_SEARCH ) tt = "Search Controls"; if ( page == PAGE_SEARCH ) tt = "Search Controls";
if ( page == PAGE_ACCESS ) tt = "Access Controls"; if ( page == PAGE_ACCESS ) tt = "Access Controls";
if ( page == PAGE_FILTERS ) tt = "Url Filters"; if ( page == PAGE_FILTERS ) tt = "Url Filters";
if ( page == PAGE_BASIC_SETTINGS ) tt = "Settings"; if ( page == PAGE_BASIC_SETTINGS ) tt = "Settings";
if ( page == PAGE_BASIC_SECURITY ) tt = "Security"; if ( page == PAGE_BASIC_SECURITY ) tt = "Collection Passwords";
//if ( page == PAGE_SITES ) tt = "Site List"; //if ( page == PAGE_SITES ) tt = "Site List";
//if ( page == PAGE_PRIORITIES ) tt = "Priority Controls"; //if ( page == PAGE_PRIORITIES ) tt = "Priority Controls";
//if ( page == PAGE_RULES ) tt = "Site Rules"; //if ( page == PAGE_RULES ) tt = "Site Rules";
@ -1329,6 +1332,8 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
if ( format == FORMAT_XML || format == FORMAT_JSON ) { if ( format == FORMAT_XML || format == FORMAT_JSON ) {
char *coll = g_collectiondb.getDefaultColl(r); char *coll = g_collectiondb.getDefaultColl(r);
CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true); CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true);
bool isRootAdmin = g_conf.isRootAdmin ( s , r );
bool isCollAdmin = g_conf.isCollAdmin ( s , r );
g_parms.printParms2 ( sb , g_parms.printParms2 ( sb ,
page , page ,
cr , cr ,
@ -1336,7 +1341,9 @@ bool Parms::printParmTable ( SafeBuf *sb , TcpSocket *s , HttpRequest *r ) {
1 , // long pd , print desc? 1 , // long pd , print desc?
false , // isCrawlbot false , // isCrawlbot
format , format ,
NULL ); // TcpSocket *sock NULL , // TcpSocket *sock
isRootAdmin ,
isCollAdmin );
return true; return true;
} }
@ -1705,12 +1712,16 @@ bool Parms::printParms (SafeBuf* sb, TcpSocket *s , HttpRequest *r) {
long pd = r->getLong("pd",1); long pd = r->getLong("pd",1);
char *coll = g_collectiondb.getDefaultColl(r); char *coll = g_collectiondb.getDefaultColl(r);
CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true); CollectionRec *cr = g_collectiondb.getRec(coll);//2(r,true);
bool isRootAdmin = g_conf.isRootAdmin ( s , r );
bool isCollAdmin = g_conf.isCollAdmin ( s , r );
//char *coll = r->getString ( "c" ); //char *coll = r->getString ( "c" );
//if ( ! coll || ! coll[0] ) coll = "main"; //if ( ! coll || ! coll[0] ) coll = "main";
//CollectionRec *cr = g_collectiondb.getRec ( coll ); //CollectionRec *cr = g_collectiondb.getRec ( coll );
// if "main" collection does not exist, try another // if "main" collection does not exist, try another
//if ( ! cr ) cr = getCollRecFromHttpRequest ( r ); //if ( ! cr ) cr = getCollRecFromHttpRequest ( r );
printParms2 ( sb, page, cr, nc, pd,0,0 , s); printParms2 ( sb, page, cr, nc, pd,0,0 , s,isRootAdmin,isCollAdmin);
return true; return true;
} }
@ -1723,7 +1734,9 @@ bool Parms::printParms2 ( SafeBuf* sb ,
long pd , long pd ,
bool isCrawlbot , bool isCrawlbot ,
char format , // bool isJSON , char format , // bool isJSON ,
TcpSocket *sock ) { TcpSocket *sock ,
bool isRootAdmin ,
bool isCollAdmin ) {
bool status = true; bool status = true;
s_count = 0; s_count = 0;
// background color // background color
@ -1736,13 +1749,12 @@ bool Parms::printParms2 ( SafeBuf* sb ,
if ( cr ) coll = cr->m_coll; if ( cr ) coll = cr->m_coll;
// page aliases // page aliases
if ( page == PAGE_BASIC_SECURITY ) //if ( page == PAGE_BASIC_SECURITY )
page = PAGE_SECURITY; // page = PAGE_ROOTPASSWORDS;
GigablastRequest gr; GigablastRequest gr;
g_parms.setToDefault ( (char *)&gr , OBJ_GBREQUEST , NULL); g_parms.setToDefault ( (char *)&gr , OBJ_GBREQUEST , NULL);
// find in parms list // find in parms list
for ( long i = 0 ; i < m_numParms ; i++ ) { for ( long i = 0 ; i < m_numParms ; i++ ) {
// get it // get it
@ -1824,7 +1836,7 @@ bool Parms::printParms2 ( SafeBuf* sb ,
sb->safePrintf ( "%s" , m->m_desc ); sb->safePrintf ( "%s" , m->m_desc );
// print users current ip if showing the list // print users current ip if showing the list
// of "Master IPs" for admin access // of "Master IPs" for admin access
if ( m->m_page == PAGE_SECURITY && if ( m->m_page == PAGE_ROOTPASSWORDS &&
sock && sock &&
m->m_title && m->m_title &&
strstr(m->m_title,"IP") ) strstr(m->m_title,"IP") )
@ -1852,7 +1864,9 @@ bool Parms::printParms2 ( SafeBuf* sb ,
bg,nc,pd, bg,nc,pd,
false, false,
isCrawlbot, isCrawlbot,
format);//isJSON); format,
isRootAdmin,
isCollAdmin);
continue; continue;
} }
// if not first in a row, skip it, we printed it already // if not first in a row, skip it, we printed it already
@ -1872,7 +1886,9 @@ bool Parms::printParms2 ( SafeBuf* sb ,
status &=printParm(sb,NULL,&m_parms[k],k, status &=printParm(sb,NULL,&m_parms[k],k,
newj,jend,(char *)THIS,coll,NULL, newj,jend,(char *)THIS,coll,NULL,
bg,nc,pd, j==size-1, bg,nc,pd, j==size-1,
isCrawlbot,format);//isJSON) isCrawlbot,format,
isRootAdmin,
isCollAdmin);
} }
} }
// end array table // end array table
@ -1901,7 +1917,9 @@ bool Parms::printParm ( SafeBuf* sb,
bool lastRow , bool lastRow ,
bool isCrawlbot , bool isCrawlbot ,
//bool isJSON ) { //bool isJSON ) {
char format ) { char format ,
bool isRootAdmin ,
bool isCollAdmin ) {
bool status = true; bool status = true;
// do not print if no permissions // do not print if no permissions
//if ( m->m_perms != 0 && !g_users.hasPermission(username,m->m_perms) ) //if ( m->m_perms != 0 && !g_users.hasPermission(username,m->m_perms) )
@ -1961,7 +1979,7 @@ bool Parms::printParm ( SafeBuf* sb,
page == PAGE_SPIDER || page == PAGE_SPIDER ||
page == PAGE_SPIDERPROXIES || page == PAGE_SPIDERPROXIES ||
page == PAGE_FILTERS || page == PAGE_FILTERS ||
page == PAGE_SECURITY || page == PAGE_ROOTPASSWORDS ||
page == PAGE_REPAIR || page == PAGE_REPAIR ||
page == PAGE_LOG ) { page == PAGE_LOG ) {
sb->safePrintf ( "\t\t<currentValue><![CDATA["); sb->safePrintf ( "\t\t<currentValue><![CDATA[");
@ -1994,7 +2012,7 @@ bool Parms::printParm ( SafeBuf* sb,
page == PAGE_SPIDER || page == PAGE_SPIDER ||
page == PAGE_SPIDERPROXIES || page == PAGE_SPIDERPROXIES ||
page == PAGE_FILTERS || page == PAGE_FILTERS ||
page == PAGE_SECURITY || page == PAGE_ROOTPASSWORDS ||
page == PAGE_REPAIR || page == PAGE_REPAIR ||
page == PAGE_LOG ) { page == PAGE_LOG ) {
sb->safePrintf ( "\t\t\"currentValue\":\""); sb->safePrintf ( "\t\t\"currentValue\":\"");
@ -2390,6 +2408,19 @@ bool Parms::printParm ( SafeBuf* sb,
strcmp(m->m_title,"url filters profile")==0) strcmp(m->m_title,"url filters profile")==0)
// url filters profile drop down "ufp" // url filters profile drop down "ufp"
printDropDownProfile ( sb , "ufp" , cr );//*s ); printDropDownProfile ( sb , "ufp" , cr );//*s );
// do not expose master passwords or IPs to non-root admins
else if ( ( m->m_flags & PF_PRIVATE ) &&
m->m_obj == OBJ_CONF &&
! isRootAdmin )
return true;
// do not expose master passwords or IPs to non-root admins
else if ( ( m->m_flags & PF_PRIVATE ) &&
m->m_obj == OBJ_COLL &&
! isCollAdmin )
return true;
else if ( t == TYPE_RETRIES ) else if ( t == TYPE_RETRIES )
printDropDown ( 4 , sb , cgi , *s , false , false ); printDropDown ( 4 , sb , cgi , *s , false , false );
else if ( t == TYPE_FILEUPLOADBUTTON ) { else if ( t == TYPE_FILEUPLOADBUTTON ) {
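The two PF_PRIVATE branches added above reduce to a simple visibility rule: a private g_conf parm (root passwords and root IPs) only renders for a root admin, and a private collection parm (collection passwords and IPs) only renders for that collection's admin. A minimal stand-alone sketch of the rule follows; the PF_PRIVATE bit value and the OBJ_* constants are stubbed locally for illustration and are not the real Parms.h definitions.

// Sketch only: condenses the PF_PRIVATE checks added above into one helper.
// Constant values are placeholders, not taken from Parms.h.
#include <cstdio>

#define PF_PRIVATE 0x01                 // placeholder bit value
enum { OBJ_CONF = 1 , OBJ_COLL = 2 };   // placeholder object codes

static bool canShowParm ( long flags , long obj ,
                          bool isRootAdmin , bool isCollAdmin ) {
	// public parms are always printable
	if ( ! ( flags & PF_PRIVATE ) ) return true;
	// private g_conf parms need root-level access
	if ( obj == OBJ_CONF ) return isRootAdmin;
	// private collection parms need collection-level access
	if ( obj == OBJ_COLL ) return isCollAdmin;
	return true;
}

int main ( ) {
	// a collection admin sees coll-level secrets but not root-level ones
	printf ( "conf secret shown: %d\n" ,
	         canShowParm ( PF_PRIVATE , OBJ_CONF , false , true ) );
	printf ( "coll secret shown: %d\n" ,
	         canShowParm ( PF_PRIVATE , OBJ_COLL , false , true ) );
	return 0;
}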
@ -2742,10 +2773,10 @@ bool Parms::printParm ( SafeBuf* sb,
// do not allow removal of last default url filters rule // do not allow removal of last default url filters rule
//if ( lastRow && !strcmp(m->m_cgi,"fsp")) show = false; //if ( lastRow && !strcmp(m->m_cgi,"fsp")) show = false;
char *suffix = ""; char *suffix = "";
if ( m->m_page == PAGE_SECURITY && if ( m->m_page == PAGE_ROOTPASSWORDS &&
m->m_type == TYPE_IP ) m->m_type == TYPE_IP )
suffix = "ip"; suffix = "ip";
if ( m->m_page == PAGE_SECURITY && if ( m->m_page == PAGE_ROOTPASSWORDS &&
m->m_type == TYPE_STRINGNONEMPTY ) m->m_type == TYPE_STRINGNONEMPTY )
suffix = "pwd"; suffix = "pwd";
if ( show ) if ( show )
@ -4993,7 +5024,7 @@ void Parms::init ( ) {
"assigns a url or site to a ruleset. Each tagdb record is " "assigns a url or site to a ruleset. Each tagdb record is "
"about 100 bytes or so."; "about 100 bytes or so.";
m->m_off = (char *)&g_conf.m_tagdbMaxTreeMem - g; m->m_off = (char *)&g_conf.m_tagdbMaxTreeMem - g;
m->m_def = "31028000"; m->m_def = "1028000";
m->m_type = TYPE_LONG; m->m_type = TYPE_LONG;
m->m_flags = PF_NOSYNC|PF_NOAPI; m->m_flags = PF_NOSYNC|PF_NOAPI;
m->m_page = PAGE_NONE; m->m_page = PAGE_NONE;
@ -6416,7 +6447,8 @@ void Parms::init ( ) {
m++; m++;
m->m_title = "percent similar dedup summary"; m->m_title = "percent similar dedup summary";
m->m_desc = "If document summary is this percent similar " m->m_desc = "If document summary (and title) are "
"this percent similar "
"to a document summary above it, then remove it from the " "to a document summary above it, then remove it from the "
"search results. 100 means only to remove if exactly the " "search results. 100 means only to remove if exactly the "
"same. 0 means no summary deduping. You must also supply " "same. 0 means no summary deduping. You must also supply "
@ -6790,6 +6822,7 @@ void Parms::init ( ) {
m->m_flags = PF_API; m->m_flags = PF_API;
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
/* /*
@ -8690,6 +8723,17 @@ void Parms::init ( ) {
m++; m++;
m->m_title = "use cache";
m->m_desc = "Use 0 if Gigablast should not read or write from "
"any caches at any level.";
m->m_def = "-1";
m->m_off = (char *)&si.m_useCache - y;
m->m_type = TYPE_CHAR;
m->m_cgi = "usecache";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
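As a hedged usage example (the /search result endpoint and a collection named "main" are assumed here), a request that bypasses every cache layer would pass the new cgi parameter explicitly:

/search?c=main&q=gigablast&usecache=0

Leaving usecache at its default of -1 presumably defers to the separate read/write cache parms defined next.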
m->m_title = "read from cache"; m->m_title = "read from cache";
m->m_desc = "Should we read search results from the cache? Set " m->m_desc = "Should we read search results from the cache? Set "
"to false to fix dmoz bug."; "to false to fix dmoz bug.";
@ -8704,17 +8748,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m++; m++;
m->m_title = "use cache";
m->m_desc = "Use 0 if Gigablast should not read or write from "
"any caches at any level.";
m->m_def = "-1";
m->m_off = (char *)&si.m_useCache - y;
m->m_type = TYPE_CHAR;
m->m_cgi = "usecache";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "write to cache"; m->m_title = "write to cache";
m->m_desc = "Use 0 if Gigablast should not write to " m->m_desc = "Use 0 if Gigablast should not write to "
"any caches at any level."; "any caches at any level.";
@ -8768,6 +8801,7 @@ void Parms::init ( ) {
m->m_sprpp = 0; m->m_sprpp = 0;
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
m->m_title = "restrict search to pages that link to this url"; m->m_title = "restrict search to pages that link to this url";
@ -8783,7 +8817,8 @@ void Parms::init ( ) {
m++; m++;
m->m_title = "search for this phrase quoted"; m->m_title = "search for this phrase quoted";
m->m_desc = "The phrase which will be quoted."; m->m_desc = "The phrase which will be quoted in the query. From the "
"advanced search page, adv.html.";
m->m_off = (char *)&si.m_quote1 - y; m->m_off = (char *)&si.m_quote1 - y;
m->m_type = TYPE_CHARPTR;//STRING; m->m_type = TYPE_CHARPTR;//STRING;
//m->m_size = 512; //m->m_size = 512;
@ -8792,10 +8827,12 @@ void Parms::init ( ) {
m->m_sprpp = 0; m->m_sprpp = 0;
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
m->m_title = "search for this second phrase quoted"; m->m_title = "search for this second phrase quoted";
m->m_desc = "The phrase which will be quoted."; m->m_desc = "The phrase which will be quoted in the query. From the "
"advanced search page, adv.html.";
m->m_off = (char *)&si.m_quote2 - y; m->m_off = (char *)&si.m_quote2 - y;
m->m_type = TYPE_CHARPTR;//STRING; m->m_type = TYPE_CHARPTR;//STRING;
//m->m_size = 512; //m->m_size = 512;
@ -8804,6 +8841,7 @@ void Parms::init ( ) {
m->m_sprpp = 0; m->m_sprpp = 0;
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
/* /*
@ -8836,7 +8874,8 @@ void Parms::init ( ) {
m++; m++;
m->m_title = "require these query terms"; m->m_title = "require these query terms";
m->m_desc = "Returned results will have all the words in X."; m->m_desc = "Returned results will have all the words in X. "
"From the advanced search page, adv.html.";
m->m_off = (char *)&si.m_plus - y; m->m_off = (char *)&si.m_plus - y;
m->m_def = NULL; m->m_def = NULL;
m->m_type = TYPE_CHARPTR;//STRING; m->m_type = TYPE_CHARPTR;//STRING;
@ -8846,10 +8885,12 @@ void Parms::init ( ) {
m->m_sprpp = 0; m->m_sprpp = 0;
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
m->m_title = "avoid these query terms"; m->m_title = "avoid these query terms";
m->m_desc = "Returned results will NOT have any of the words in X."; m->m_desc = "Returned results will NOT have any of the words in X. "
"From the advanced search page, adv.html.";
m->m_off = (char *)&si.m_minus - y; m->m_off = (char *)&si.m_minus - y;
m->m_type = TYPE_CHARPTR;//STRING; m->m_type = TYPE_CHARPTR;//STRING;
m->m_cgi = "minus"; m->m_cgi = "minus";
@ -8858,6 +8899,7 @@ void Parms::init ( ) {
m->m_sprpp = 0; m->m_sprpp = 0;
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
m->m_title = "format of the returned search results"; m->m_title = "format of the returned search results";
@ -8869,6 +8911,7 @@ void Parms::init ( ) {
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_cgi = "format"; m->m_cgi = "format";
m->m_flags = PF_NOAPI; // already in the api, so don't repeat
m++; m++;
m->m_title = "family filter"; m->m_title = "family filter";
@ -8899,10 +8942,8 @@ void Parms::init ( ) {
m++; m++;
m->m_title = "cached page highlight query"; m->m_title = "cached page highlight query";
m->m_desc = "Highlight the terms in this query instead. For " m->m_desc = "Highlight the terms in this query instead.";
"display of the cached page.";
m->m_def = NULL; m->m_def = NULL;
m->m_off = (char *)&si.m_highlightQuery - y; m->m_off = (char *)&si.m_highlightQuery - y;
m->m_type = TYPE_CHARPTR;//STRING; m->m_type = TYPE_CHARPTR;//STRING;
@ -8914,6 +8955,7 @@ void Parms::init ( ) {
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m++; m++;
/* /*
m->m_title = "highlight event date in summaries."; m->m_title = "highlight event date in summaries.";
m->m_desc = "Can be 0 or 1 to respectively disable or enable " m->m_desc = "Can be 0 or 1 to respectively disable or enable "
@ -8942,8 +8984,8 @@ void Parms::init ( ) {
*/ */
m->m_title = "Query match offsets"; m->m_title = "Query match offsets";
m->m_desc = "Return a list of the offsets of each query word" m->m_desc = "Return a list of the offsets of each query word "
"actually matched in the document. 1 means byte offset," "actually matched in the document. 1 means byte offset, "
"and 2 means word offset."; "and 2 means word offset.";
m->m_def = "0"; m->m_def = "0";
m->m_off = (char *)&si.m_queryMatchOffsets - y; m->m_off = (char *)&si.m_queryMatchOffsets - y;
@ -8953,6 +8995,7 @@ void Parms::init ( ) {
m->m_smax = 2; m->m_smax = 2;
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
m->m_title = "boolean status"; m->m_title = "boolean status";
@ -9016,7 +9059,7 @@ void Parms::init ( ) {
"<br><br>\n" "<br><br>\n"
"<b>META</b> is the meta tag name to which Gigablast will " "<b>META</b> is the meta tag name to which Gigablast will "
"restrict the content used to generate the topics. Do not " "restrict the content used to generate the topics. Do not "
"specify thie field to restrict the content to the body of " "specify this field to restrict the content to the body of "
"each document, that is the default.\n" "each document, that is the default.\n"
"<br><br>\n" "<br><br>\n"
"<b>DEL</b> is a single character delimeter which defines " "<b>DEL</b> is a single character delimeter which defines "
@ -9060,21 +9103,46 @@ void Parms::init ( ) {
m++; m++;
*/ */
m->m_title = "niceness";
m->m_desc = "Can be 0 or 1. 0 is usually a faster, high-priority "
"query, 1 is a slower, lower-priority query.";
m->m_def = "0";
m->m_off = (char *)&si.m_niceness - y;
m->m_type = TYPE_LONG;
m->m_cgi = "niceness";
m->m_smin = 0;
m->m_smax = 1;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "debug flag";
m->m_desc = "Is 1 to log debug information, 0 otherwise.";
m->m_def = "0";
m->m_off = (char *)&si.m_debug - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "debug";
//m->m_priv = 1;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "return number of docs per topic"; m->m_title = "return number of docs per topic";
m->m_desc = "Use 1 if you want Gigablast to return the number of " m->m_desc = "Use 1 if you want Gigablast to return the number of "
"documents in the search results that contained each topic."; "documents in the search results that contained each topic "
"(gigabit).";
m->m_def = "1"; m->m_def = "1";
m->m_off = (char *)&si.m_returnDocIdCount - y; m->m_off = (char *)&si.m_returnDocIdCount - y;
m->m_type = TYPE_BOOL; m->m_type = TYPE_BOOL;
m->m_cgi = "rdc"; m->m_cgi = "rdc";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m++; m++;
m->m_title = "return docids per topic"; m->m_title = "return docids per topic";
m->m_desc = "Use 1 if you want Gigablast to return the list of " m->m_desc = "Use 1 if you want Gigablast to return the list of "
"docIds from the search results that contained each topic."; "docIds from the search results that contained each topic "
"(gigabit).";
m->m_def = "0"; m->m_def = "0";
m->m_off = (char *)&si.m_returnDocIds - y; m->m_off = (char *)&si.m_returnDocIds - y;
m->m_type = TYPE_BOOL; m->m_type = TYPE_BOOL;
@ -9085,7 +9153,7 @@ void Parms::init ( ) {
m->m_title = "return popularity per topic"; m->m_title = "return popularity per topic";
m->m_desc = "Use 1 if you want Gigablast to return the popularity " m->m_desc = "Use 1 if you want Gigablast to return the popularity "
"of each topic."; "of each topic (gigabit).";
m->m_def = "0"; m->m_def = "0";
m->m_off = (char *)&si.m_returnPops - y; m->m_off = (char *)&si.m_returnPops - y;
m->m_type = TYPE_BOOL; m->m_type = TYPE_BOOL;
@ -9095,19 +9163,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m++; m++;
m->m_title = "niceness";
m->m_desc = "Can be 0 or 1. 0 is usually a faster, high-priority "
"query, 1 is a slower, lower-priority query.";
m->m_def = "0";
m->m_off = (char *)&si.m_niceness - y;
m->m_type = TYPE_LONG;
m->m_cgi = "niceness";
m->m_smin = 0;
m->m_smax = 1;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
//m->m_title = "compound list max size"; //m->m_title = "compound list max size";
//m->m_desc = "Is the max size in bytes of the compound termlist. " //m->m_desc = "Is the max size in bytes of the compound termlist. "
// "Each document id is 6 bytes."; // "Each document id is 6 bytes.";
@ -9120,23 +9175,12 @@ void Parms::init ( ) {
//m++; //m++;
m->m_title = "debug flag";
m->m_desc = "Is 1 to log debug information, 0 otherwise.";
m->m_def = "0";
m->m_off = (char *)&si.m_debug - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "debug";
//m->m_priv = 1;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "debug gigabits flag"; m->m_title = "debug gigabits flag";
m->m_desc = "Is 1 to log gigabits debug information, 0 otherwise."; m->m_desc = "Is 1 to log gigabits debug information, 0 otherwise.";
m->m_def = "0"; m->m_def = "0";
m->m_off = (char *)&si.m_debugGigabits - y; m->m_off = (char *)&si.m_debugGigabits - y;
m->m_type = TYPE_BOOL; m->m_type = TYPE_BOOL;
m->m_cgi = "debug"; m->m_cgi = "debuggigabits";
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m++; m++;
@ -9161,6 +9205,7 @@ void Parms::init ( ) {
m->m_cgi = "iu"; m->m_cgi = "iu";
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
m->m_title = "image link"; m->m_title = "image link";
@ -9173,6 +9218,7 @@ void Parms::init ( ) {
m->m_cgi = "ix"; m->m_cgi = "ix";
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
m->m_title = "image width"; m->m_title = "image width";
@ -9183,6 +9229,7 @@ void Parms::init ( ) {
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_def = "200"; m->m_def = "200";
m->m_flags = PF_NOAPI;
m++; m++;
m->m_title = "image height"; m->m_title = "image height";
@ -9194,6 +9241,7 @@ void Parms::init ( ) {
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_def = "200"; m->m_def = "200";
m->m_flags = PF_NOAPI;
m++; m++;
// m->m_title = "password"; // m->m_title = "password";
@ -9269,6 +9317,7 @@ void Parms::init ( ) {
m->m_cgi = "gbcountry"; m->m_cgi = "gbcountry";
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
/* /*
@ -9370,6 +9419,7 @@ void Parms::init ( ) {
m->m_cgi = "qcs"; m->m_cgi = "qcs";
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
// buzz // buzz
@ -9381,6 +9431,7 @@ void Parms::init ( ) {
m->m_cgi = "inlinks"; m->m_cgi = "inlinks";
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
// buzz // buzz
@ -9394,6 +9445,7 @@ void Parms::init ( ) {
m->m_cgi = "outlinks"; m->m_cgi = "outlinks";
m->m_page = PAGE_RESULTS; m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI; m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++; m++;
// buzz // buzz
@ -9537,6 +9589,17 @@ void Parms::init ( ) {
m->m_flags = PF_API; m->m_flags = PF_API;
m++; m++;
m->m_title = "query";
m->m_desc = "Highlight this query in the page.";
m->m_def = "";
m->m_type = TYPE_CHARPTR;
m->m_page = PAGE_GET;
m->m_obj = OBJ_GBREQUEST;
m->m_cgi = "q";
m->m_off = (char *)&gr.m_query - (char *)&gr;
m->m_flags = PF_API;
m++;
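A hedged example of the new parm on the /get (cached page) endpoint; everything except the q parameter itself is an assumption, since the rest of the /get request is not shown in this hunk:

/get?c=main&d=1234567&q=gigablast

where d stands in for the docid parameter and q supplies the query whose terms get highlighted in the returned copy.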
/* /*
// for /get // for /get
m->m_title = "query highlighting query"; m->m_title = "query highlighting query";
@ -10071,7 +10134,7 @@ void Parms::init ( ) {
m->m_cgi = "afgdwd"; m->m_cgi = "afgdwd";
m->m_off = (char *)&g_conf.m_gzipDownloads - g; m->m_off = (char *)&g_conf.m_gzipDownloads - g;
m->m_type = TYPE_BOOL; m->m_type = TYPE_BOOL;
m->m_def = "0"; m->m_def = "1";
m->m_page = PAGE_MASTER; m->m_page = PAGE_MASTER;
m->m_obj = OBJ_CONF; m->m_obj = OBJ_CONF;
m++; m++;
@ -14309,7 +14372,7 @@ void Parms::init ( ) {
m->m_title = "directory containing titledb files"; m->m_title = "directory containing titledb files";
m->m_desc = "Import documents contained in titledb files in this " m->m_desc = "Import documents contained in titledb files in this "
"directory."; "directory. This is an ABSOLUTE directory path.";
m->m_cgi = "importdir"; m->m_cgi = "importdir";
m->m_xml = "importDir"; m->m_xml = "importDir";
m->m_page = PAGE_IMPORT; m->m_page = PAGE_IMPORT;
@ -14951,7 +15014,8 @@ void Parms::init ( ) {
m++; m++;
m->m_title = "percent similar dedup summary default value"; m->m_title = "percent similar dedup summary default value";
m->m_desc = "If document summary is this percent similar " m->m_desc = "If document summary (and title) are "
"this percent similar "
"to a document summary above it, then remove it from the " "to a document summary above it, then remove it from the "
"search results. 100 means only to remove if exactly the " "search results. 100 means only to remove if exactly the "
"same. 0 means no summary deduping."; "same. 0 means no summary deduping.";
@ -15991,7 +16055,19 @@ void Parms::init ( ) {
m++; m++;
m->m_title = "use proxies for spidering";
m->m_desc = "If this is true Gigablast will use the proxies "
"listed on the <i>proxies</i> page for spidering for "
"this collection regardless whether the proxies are enabled "
"on the <i>proxies</i> page.";
m->m_cgi = "useproxies";
m->m_off = (char *)&cr.m_forceUseFloaters - x;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE;
m++;
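A hedged example of flipping the new per-collection switch (the exact admin URL is an assumption based on the /admin/&lt;page&gt; pattern used elsewhere in this commit): submitting the spider controls form for a collection would send something like

/admin/spider?c=mycoll&useproxies=1

and, since the parm carries PF_CLONE, the value should also be copied into any clone of that collection.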
/* /*
m->m_title = "add url enabled"; m->m_title = "add url enabled";
@ -17305,14 +17381,14 @@ void Parms::init ( ) {
m->m_def = "1"; m->m_def = "1";
m->m_page = PAGE_SPIDER; m->m_page = PAGE_SPIDER;
m->m_obj = OBJ_COLL; m->m_obj = OBJ_COLL;
m->m_flags = PF_CLONE; m->m_flags = PF_CLONE | PF_HIDDEN;
m++; m++;
m->m_cgi = "apiUrl"; m->m_cgi = "apiUrl";
m->m_desc = "Send every spidered url to this url and index " m->m_desc = "Send every spidered url to this url and index "
"the reply in addition to the normal indexing process. " "the reply in addition to the normal indexing process. "
"Example: by specifying http://api.diffbot.com/v2/" "Example: by specifying http://api.diffbot.com/v3/"
"analyze?mode=auto&token=<yourDiffbotToken> here " "analyze?mode=high-precision&token=<yourDiffbotToken> here "
"you can index the structured JSON replies from diffbot for " "you can index the structured JSON replies from diffbot for "
"every url that is spidered. " "every url that is spidered. "
"Gigablast will automatically " "Gigablast will automatically "
@ -18331,12 +18407,13 @@ void Parms::init ( ) {
///////////// /////////////
/////////////////////////////////////////// ///////////////////////////////////////////
// SECURITY CONTROLS // ROOT PASSWORDS page
/////////////////////////////////////////// ///////////////////////////////////////////
m->m_title = "Master Passwords"; m->m_title = "Root Passwords";
m->m_desc = "Any matching password will have administrative access " m->m_desc = "Whitespace separated list of passwords. "
"Any matching password will have administrative access "
"to Gigablast and all collections."; "to Gigablast and all collections.";
//"If no Admin Password or Admin IP is specified then " //"If no Admin Password or Admin IP is specified then "
//"Gigablast will only allow local IPs to connect to it " //"Gigablast will only allow local IPs to connect to it "
@ -18344,17 +18421,17 @@ void Parms::init ( ) {
m->m_cgi = "masterpwd"; m->m_cgi = "masterpwd";
m->m_xml = "masterPassword"; m->m_xml = "masterPassword";
m->m_obj = OBJ_CONF; m->m_obj = OBJ_CONF;
m->m_max = MAX_MASTER_PASSWORDS;
m->m_off = (char *)&g_conf.m_masterPwds - g; m->m_off = (char *)&g_conf.m_masterPwds - g;
m->m_type = TYPE_STRINGNONEMPTY; m->m_type = TYPE_SAFEBUF; // STRINGNONEMPTY;
m->m_size = PASSWORD_MAX_LEN+1; m->m_page = PAGE_ROOTPASSWORDS;
m->m_page = PAGE_SECURITY; //m->m_max = MAX_MASTER_PASSWORDS;
m->m_addin = 1; // "insert" follows? //m->m_size = PASSWORD_MAX_LEN+1;
m->m_flags = PF_PRIVATE; //m->m_addin = 1; // "insert" follows?
m->m_flags = PF_PRIVATE | PF_TEXTAREA;
m++; m++;
m->m_title = "Master IPs"; m->m_title = "Root IPs";
//m->m_desc = "Allow UDP requests from this list of IPs. Any datagram " //m->m_desc = "Allow UDP requests from this list of IPs. Any datagram "
// "received not coming from one of these IPs, or an IP in " // "received not coming from one of these IPs, or an IP in "
// "hosts.conf, is dropped. If another cluster is accessing this " // "hosts.conf, is dropped. If another cluster is accessing this "
@ -18364,41 +18441,42 @@ void Parms::init ( ) {
// "was disabled in the Master Controls. IPs that have 0 has " // "was disabled in the Master Controls. IPs that have 0 has "
// "their Least Significant Byte are treated as wildcards for " // "their Least Significant Byte are treated as wildcards for "
// "IP blocks. That is, 1.2.3.0 means 1.2.3.*."; // "IP blocks. That is, 1.2.3.0 means 1.2.3.*.";
m->m_desc = "Any IPs in this list will have administrative access " m->m_desc = "Whitespace separated list of Ips. "
"Any IPs in this list will have administrative access "
"to Gigablast and all collections."; "to Gigablast and all collections.";
m->m_cgi = "masterip"; m->m_cgi = "masterip";
m->m_xml = "masterIp"; m->m_xml = "masterIp";
m->m_page = PAGE_SECURITY; m->m_page = PAGE_ROOTPASSWORDS;
m->m_max = MAX_CONNECT_IPS; m->m_off = (char *)&g_conf.m_connectIps - g;
m->m_off = (char *)g_conf.m_connectIps - g; m->m_type = TYPE_SAFEBUF;//IP;
m->m_type = TYPE_IP;
m->m_priv = 2;
m->m_def = ""; m->m_def = "";
m->m_addin = 1; // "insert" follows? //m->m_max = MAX_CONNECT_IPS;
//m->m_priv = 2;
//m->m_addin = 1; // "insert" follows?
//m->m_flags = PF_HIDDEN | PF_NOSAVE; //m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_obj = OBJ_CONF; m->m_obj = OBJ_CONF;
m->m_flags = PF_PRIVATE; m->m_flags = PF_PRIVATE | PF_TEXTAREA;
m++; m++;
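Because both parms are now free-form TYPE_SAFEBUF textareas rather than fixed-size arrays, their values are simply whitespace-separated tokens that serialize under the <masterPassword> and <masterIp> tags named above. A hedged illustration with invented values:

Root Passwords:  s3cretOne s3cretTwo
Root IPs:        10.5.66.24 192.168.1.30

Any request that presents one of those passwords, or that arrives from one of those IPs, gets root-level administrative access to every collection.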
m->m_title = "remove connect ip"; // m->m_title = "remove connect ip";
m->m_desc = "remove a connect ip"; // m->m_desc = "remove a connect ip";
m->m_cgi = "removeip"; // m->m_cgi = "removeip";
m->m_type = TYPE_CMD; // m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE; // m->m_page = PAGE_NONE;
m->m_func = CommandRemoveConnectIpRow; // m->m_func = CommandRemoveConnectIpRow;
m->m_cast = 1; // m->m_cast = 1;
m->m_obj = OBJ_CONF; // m->m_obj = OBJ_CONF;
m++; // m++;
m->m_title = "remove a password"; // m->m_title = "remove a password";
m->m_desc = "remove a password"; // m->m_desc = "remove a password";
m->m_cgi = "removepwd"; // m->m_cgi = "removepwd";
m->m_type = TYPE_CMD; // m->m_type = TYPE_CMD;
m->m_page = PAGE_NONE; // m->m_page = PAGE_NONE;
m->m_func = CommandRemovePasswordRow; // m->m_func = CommandRemovePasswordRow;
m->m_cast = 1; // m->m_cast = 1;
m->m_obj = OBJ_CONF; // m->m_obj = OBJ_CONF;
m++; // m++;
/* /*
@ -18414,7 +18492,7 @@ void Parms::init ( ) {
m->m_perms = PAGE_MASTER; m->m_perms = PAGE_MASTER;
m->m_size = USERS_TEXT_SIZE; m->m_size = USERS_TEXT_SIZE;
m->m_plen = (char *)&g_conf.m_superTurksLen - g; m->m_plen = (char *)&g_conf.m_superTurksLen - g;
m->m_page = PAGE_SECURITY; m->m_page = PAGE_ROOTPASSWORDS;
m->m_flags = PF_HIDDEN | PF_NOSAVE; m->m_flags = PF_HIDDEN | PF_NOSAVE;
m++; m++;
*/ */
@ -18447,7 +18525,7 @@ void Parms::init ( ) {
m->m_perms = PAGE_MASTER; m->m_perms = PAGE_MASTER;
m->m_size = USERS_TEXT_SIZE; m->m_size = USERS_TEXT_SIZE;
m->m_plen = (char *)&g_conf.m_usersLen - g; m->m_plen = (char *)&g_conf.m_usersLen - g;
m->m_page = PAGE_SECURITY; m->m_page = PAGE_ROOTPASSWORDS;
m++; m++;
*/ */
@ -18469,6 +18547,36 @@ void Parms::init ( ) {
m++; m++;
*/ */
m->m_title = "Collection Passwords";
m->m_desc = "Whitespace separated list of passwords. "
"Any matching password will have administrative access "
"to the controls for just this collection.";
m->m_cgi = "collpwd";
m->m_xml = "collectionPasswords";
m->m_obj = OBJ_COLL;
m->m_off = (char *)&cr.m_collectionPasswords - x;
m->m_def = "";
m->m_type = TYPE_SAFEBUF; // STRINGNONEMPTY;
m->m_page = PAGE_BASIC_SECURITY;
m->m_flags = PF_PRIVATE | PF_TEXTAREA;
m++;
m->m_title = "Collection Ips";
m->m_desc = "Whitespace separated list of IPs. "
"Any matching IP will have administrative access "
"to the controls for just this collection.";
m->m_cgi = "collips";
m->m_xml = "collectionIps";
m->m_obj = OBJ_COLL;
m->m_off = (char *)&cr.m_collectionIps - x;
m->m_def = "";
m->m_type = TYPE_SAFEBUF; // STRINGNONEMPTY;
m->m_page = PAGE_BASIC_SECURITY;
m->m_flags = PF_PRIVATE | PF_TEXTAREA;
m++;
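The collection-level counterparts work the same way but only unlock the one collection they belong to, serializing under the <collectionPasswords> and <collectionIps> tags named above. A hedged illustration (invented values): a collection whose Collection Passwords field holds "guestpw teampw" and whose Collection Ips field holds "10.5.66.24 44.55.66.77" lets anyone presenting either password, or connecting from either IP, administer just that collection, while everything else remains root-only.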
////// //////
// END SECURITY CONTROLS // END SECURITY CONTROLS
////// //////
@ -19820,37 +19928,17 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
// false = useDefaultRec? // false = useDefaultRec?
CollectionRec *cr = g_collectiondb.getRec ( hr , false ); CollectionRec *cr = g_collectiondb.getRec ( hr , false );
//
// CLOUD SEARCH ENGINE SUPPORT
//
// if not the root admin only all user to change settings, etc.
// if the collection rec is a guest collection. i.e. in the cloud.
//
bool isRootAdmin = g_conf.isRootAdmin(sock,hr);
bool isRootColl = false;
if ( cr && strcmp(cr->m_coll,"main")==0 ) isRootColl = true;
if ( cr && strcmp(cr->m_coll,"dmoz")==0 ) isRootColl = true;
if ( cr && strcmp(cr->m_coll,"demo")==0 ) isRootColl = true;
// the main,dmoz and demo collections are root admin only
if ( ! isRootAdmin && isRootColl ) {
g_errno = ENOPERM;
return log("parms: root admin can only change main/dmoz/demo"
" collections.");
}
// just knowing the collection name is enough for a cloud user to
// modify the collection's parms. however, to modify the master
// controls or stuff in g_conf, you have to be root admin.
if ( ! g_conf.m_allowCloudUsers && ! isRootAdmin ) {
g_errno = ENOPERM;
return log("parms: permission denied for user");
}
//if ( c ) { //if ( c ) {
// cr = g_collectiondb.getRec ( hr ); // cr = g_collectiondb.getRec ( hr );
// if ( ! cr ) log("parms: coll not found"); // if ( ! cr ) log("parms: coll not found");
//} //}
bool isRootAdmin = g_conf.isRootAdmin ( sock , hr );
// does this user have permission to update the parms?
bool isCollAdmin = g_conf.isCollAdmin ( sock , hr ) ;
// might be g_conf specific, not coll specific // might be g_conf specific, not coll specific
//bool hasPerm = false; //bool hasPerm = false;
// just knowing the collection name of a custom crawl means you // just knowing the collection name of a custom crawl means you
@ -19964,6 +20052,9 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
// skip if not a command parm, like "addcoll" // skip if not a command parm, like "addcoll"
if ( m->m_type != TYPE_CMD ) continue; if ( m->m_type != TYPE_CMD ) continue;
if ( m->m_obj != OBJ_CONF && m->m_obj != OBJ_COLL )
continue;
// //
// HACK // HACK
// //
@ -20042,9 +20133,49 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
// //
// CLOUD SEARCH ENGINE SUPPORT // CLOUD SEARCH ENGINE SUPPORT
// //
//
// if this is the "delcoll" parm then "c" may have been
// excluded from http request, therefore isCollAdmin and
// isRootAdmin may be false, so see if they have permission
// for the "val" collection for this one...
bool hasPerm = false;
if ( m->m_page == PAGE_DELCOLL &&
strcmp(m->m_cgi,"delcoll") == 0 ) {
// permission override for /admin/delcoll cmd & parm
hasPerm = g_conf.isCollAdminForColl (sock,hr,val);
}
// if this IP c-block has already added a collection then do not
// allow it to add another.
if ( m->m_page == PAGE_ADDCOLL &&
g_conf.m_allowCloudUsers &&
! isRootAdmin &&
strcmp(m->m_cgi,"addcoll")==0 ) {
// see if user's c block has already added a collection
long numAdded = 0;
if ( numAdded >= 1 ) {
g_errno = ENOPERM;
log("parms: already added a collection from "
"this cloud user's c-block.");
return false;
}
hasPerm = true;
}
// master controls require root permission // master controls require root permission
if ( m->m_obj == OBJ_CONF && ! isRootAdmin ) if ( m->m_obj == OBJ_CONF && ! isRootAdmin ) {
log("parms: could not run root parm \"%s\" no perm.",
m->m_title);
continue; continue;
}
// need to have permission for collection for collrec parms
if ( m->m_obj == OBJ_COLL && ! isCollAdmin && ! hasPerm ) {
log("parms: could not run coll parm \"%s\" no perm.",
m->m_title);
continue;
}
// add the cmd parm // add the cmd parm
if ( ! addNewParmToList2 ( parmList , if ( ! addNewParmToList2 ( parmList ,
@ -20127,35 +20258,6 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
long occNum; long occNum;
Parm *m = getParmFast1 ( field , &occNum ); Parm *m = getParmFast1 ( field , &occNum );
//
// CLOUD SEARCH ENGINE SUPPORT
//
// master controls require root permission. otherwise, just
// knowing the collection name is enough for a cloud user
// to change settings.
//
if ( m && m->m_obj == OBJ_CONF && ! isRootAdmin )
continue;
//
// CLOUD SEARCH ENGINE SUPPORT
//
// if this IP c-block as already added a collection then do not
// allow it to add another.
//
if ( m && strcmp(m->m_cgi,"addcoll")==0 && ! isRootAdmin ) {
// see if user's c block has already added a collection
long numAdded = 0;
if ( numAdded >= 1 ) {
g_errno = ENOPERM;
log("parms: already added a collection from "
"this cloud user's c-block.");
return false;
}
}
// //
// map "pause" to spidering enabled // map "pause" to spidering enabled
// //
@ -20168,10 +20270,28 @@ bool Parms::convertHttpRequestToParmList (HttpRequest *hr, SafeBuf *parmList,
} }
if ( ! m ) continue; if ( ! m ) continue;
if ( m->m_type == TYPE_CMD ) continue;
if ( m->m_obj == OBJ_NONE ) continue; // skip if IS a command parm, like "addcoll", we did that above
if ( m->m_obj == OBJ_SI ) continue; if ( m->m_type == TYPE_CMD )
continue;
if ( m->m_obj != OBJ_CONF && m->m_obj != OBJ_COLL )
continue;
//
// CLOUD SEARCH ENGINE SUPPORT
//
// master controls require root permission. otherwise, just
// knowing the collection name is enough for a cloud user
// to change settings.
//
if ( m->m_obj == OBJ_CONF && ! isRootAdmin )
continue;
// need to have permission for collection for collrec parms
if ( m->m_obj == OBJ_COLL && ! isCollAdmin )
continue;
// convert spiderRoundStartTime=0 (roundStart=0 roundStart=1) // convert spiderRoundStartTime=0 (roundStart=0 roundStart=1)
// to spiderRoundStartTime=<currenttime>+30secs // to spiderRoundStartTime=<currenttime>+30secs
@ -21283,7 +21403,10 @@ bool Parms::updateParm ( char *rec , WaitEntry *we ) {
if ( collnum >= 0 ) { if ( collnum >= 0 ) {
cr = g_collectiondb.getRec ( collnum ); cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) { if ( ! cr ) {
log("parmdb: invalid collnum for parm"); char *ps = "unknown parm";
if ( parm ) ps = parm->m_title;
log("parmdb: invalid collnum %li for parm \"%s\"",
(long)collnum,ps);
g_errno = ENOCOLLREC; g_errno = ENOCOLLREC;
return true; return true;
} }
@ -21389,7 +21512,7 @@ bool Parms::updateParm ( char *rec , WaitEntry *we ) {
cr->m_regExs[occNum].getLength() == 0 ) cr->m_regExs[occNum].getLength() == 0 )
updateCount = false; updateCount = false;
// and for other pages, like master ips, skip if empty! // and for other pages, like master ips, skip if empty!
// PAGE_PASSWORDS, PAGE_SECURITY, ... // PAGE_PASSWORDS, PAGE_ROOTPASSWORDS, ...
if ( parm->m_page != PAGE_FILTERS && ! changed ) if ( parm->m_page != PAGE_FILTERS && ! changed )
updateCount = false; updateCount = false;
11
Parms.h
View File
@ -159,6 +159,7 @@ class GigablastRequest {
long long m_docId; long long m_docId;
long m_strip; long m_strip;
char m_includeHeader; char m_includeHeader;
char m_highlightQuery;
/////////// ///////////
// //
@ -345,7 +346,9 @@ class Parms {
long pd , long pd ,
bool isCrawlbot , bool isCrawlbot ,
char format, //bool isJSON, char format, //bool isJSON,
TcpSocket *sock TcpSocket *sock,
bool isRootAdmin,
bool isCollAdmin
); );
/* /*
@ -379,8 +382,10 @@ class Parms {
long nc , long nc ,
long pd , long pd ,
bool lastRow , bool lastRow ,
bool isCrawlbot = false, bool isCrawlbot ,//= false,
char format = FORMAT_HTML);//bool isJSON = false ) ; char format , //= FORMAT_HTML,
bool isRootAdmin ,
bool isCollAdmin );
char *getTHIS ( HttpRequest *r , long page ); char *getTHIS ( HttpRequest *r , long page );
View File
@ -4156,8 +4156,8 @@ bool PosdbTable::setQueryTermInfo ( ) {
qti->m_wikiPhraseId = qw->m_wikiPhraseId; qti->m_wikiPhraseId = qw->m_wikiPhraseId;
qti->m_quotedStartId = qw->m_quoteStart; qti->m_quotedStartId = qw->m_quoteStart;
// is it gbsortby:? // is it gbsortby:?
if ( qt->m_fieldCode == FIELD_GBSORTBY || if ( qt->m_fieldCode == FIELD_GBSORTBYFLOAT ||
qt->m_fieldCode == FIELD_GBREVSORTBY ) qt->m_fieldCode == FIELD_GBREVSORTBYFLOAT )
m_sortByTermNum = i; m_sortByTermNum = i;
if ( qt->m_fieldCode == FIELD_GBSORTBYINT || if ( qt->m_fieldCode == FIELD_GBSORTBYINT ||
@ -4314,9 +4314,9 @@ bool PosdbTable::setQueryTermInfo ( ) {
// numeric posdb termlist flags. instead of word position // numeric posdb termlist flags. instead of word position
// they have a float stored there for sorting etc. // they have a float stored there for sorting etc.
if (qt->m_fieldCode == FIELD_GBSORTBY ) if (qt->m_fieldCode == FIELD_GBSORTBYFLOAT )
qti->m_bigramFlags[nn]|=BF_NUMBER; qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBREVSORTBY ) if (qt->m_fieldCode == FIELD_GBREVSORTBYFLOAT )
qti->m_bigramFlags[nn]|=BF_NUMBER; qti->m_bigramFlags[nn]|=BF_NUMBER;
if (qt->m_fieldCode == FIELD_GBNUMBERMIN ) if (qt->m_fieldCode == FIELD_GBNUMBERMIN )
qti->m_bigramFlags[nn]|=BF_NUMBER; qti->m_bigramFlags[nn]|=BF_NUMBER;
View File
@ -33,7 +33,7 @@
// so we do not need to repeat the same link text over and over again. // so we do not need to repeat the same link text over and over again.
// Use M bits to hold # of inlinks the page has for other terms. // Use M bits to hold # of inlinks the page has for other terms.
// NOTE: for inlinktext terms the pattern rank is the siterank of the // NOTE: for inlinktext terms the spam rank is the siterank of the
// inlinker! // inlinker!
// NOTE: densityrank for title is based on # of title words only. same goes // NOTE: densityrank for title is based on # of title words only. same goes
View File
@ -2305,6 +2305,10 @@ bool Query::setQWords ( char boolFlag ,
if ( fieldCode == FIELD_GBNUMBEREQUALFLOAT ) if ( fieldCode == FIELD_GBNUMBEREQUALFLOAT )
ph = hash64 ("gbsortby", 8); ph = hash64 ("gbsortby", 8);
// fix for gbsortbyfloat:product.price
if ( fieldCode == FIELD_GBSORTBYFLOAT )
ph = hash64 ("gbsortby", 8);
if ( fieldCode == FIELD_GBNUMBERMININT ) if ( fieldCode == FIELD_GBNUMBERMININT )
ph = hash64 ("gbsortbyint", 11); ph = hash64 ("gbsortbyint", 11);
if ( fieldCode == FIELD_GBNUMBERMAXINT ) if ( fieldCode == FIELD_GBNUMBERMAXINT )
@ -2346,8 +2350,8 @@ bool Query::setQWords ( char boolFlag ,
fieldCode == FIELD_IP || fieldCode == FIELD_IP ||
fieldCode == FIELD_ISCLEAN || fieldCode == FIELD_ISCLEAN ||
fieldCode == FIELD_QUOTA || fieldCode == FIELD_QUOTA ||
fieldCode == FIELD_GBSORTBY || fieldCode == FIELD_GBSORTBYFLOAT ||
fieldCode == FIELD_GBREVSORTBY || fieldCode == FIELD_GBREVSORTBYFLOAT ||
// gbmin:price:1.23 // gbmin:price:1.23
fieldCode == FIELD_GBNUMBERMIN || fieldCode == FIELD_GBNUMBERMIN ||
fieldCode == FIELD_GBNUMBERMAX || fieldCode == FIELD_GBNUMBERMAX ||
@ -2489,8 +2493,8 @@ bool Query::setQWords ( char boolFlag ,
// i've decided not to make // i've decided not to make
// gbsortby:products.offerPrice // gbsortby:products.offerPrice
// gbmin:price:1.23 case insensitive // gbmin:price:1.23 case insensitive
if ( fieldCode == FIELD_GBSORTBY || if ( fieldCode == FIELD_GBSORTBYFLOAT ||
fieldCode == FIELD_GBREVSORTBY || fieldCode == FIELD_GBREVSORTBYFLOAT ||
fieldCode == FIELD_GBSORTBYINT || fieldCode == FIELD_GBSORTBYINT ||
fieldCode == FIELD_GBREVSORTBYINT ) { fieldCode == FIELD_GBREVSORTBYINT ) {
wid = hash64Lower_utf8 ( w , wlen , 0LL ); wid = hash64Lower_utf8 ( w , wlen , 0LL );
@ -3652,8 +3656,11 @@ struct QueryField g_fields[] = {
"gblang:de", "gblang:de",
"Matches all documents in german. " "Matches all documents in german. "
"The supported language abbreviations " "The supported language abbreviations "
"are at the bottom of the <i>url filters</i> page. Some more " "are at the bottom of the <a href=/admin/filters>url filters</a> "
"common ones are <i>en, es, fr, zh_cn</i>.", "page. Some more "
"common ones are <i>gblang:en, gblang:es, gblang:fr, "
// need quotes for this one!!
"gblang:\"zh_cn\"</i> (note the quotes for zh_cn!).",
NULL, NULL,
0}, 0},
@ -3751,7 +3758,7 @@ struct QueryField g_fields[] = {
{"gbsortbyfloat", {"gbsortbyfloat",
FIELD_GBSORTBY, FIELD_GBSORTBYFLOAT,
false, false,
"cameras gbsortbyfloat:price", "cameras gbsortbyfloat:price",
"Sort all documents that " "Sort all documents that "
@ -3762,7 +3769,7 @@ struct QueryField g_fields[] = {
{"gbsortbyfloat", {"gbsortbyfloat",
FIELD_GBSORTBY, FIELD_GBSORTBYFLOAT,
false, false,
"cameras gbsortbyfloat:product.price", "cameras gbsortbyfloat:product.price",
"Sort all documents that " "Sort all documents that "
@ -3777,7 +3784,7 @@ struct QueryField g_fields[] = {
{"gbrevsortbyfloat", {"gbrevsortbyfloat",
FIELD_GBREVSORTBY, FIELD_GBREVSORTBYFLOAT,
false, false,
"cameras gbrevsortbyfloat:product.price", "cameras gbrevsortbyfloat:product.price",
"Like above example but sorted with highest prices on top.", "Like above example but sorted with highest prices on top.",
@ -3786,7 +3793,7 @@ struct QueryField g_fields[] = {
{"gbsortby", {"gbsortby",
FIELD_GBSORTBY, FIELD_GBSORTBYFLOAT,
false, false,
"dog gbsortbyint:gbspiderdate", "dog gbsortbyint:gbspiderdate",
"Sort the documents that contain 'dog' by " "Sort the documents that contain 'dog' by "
@ -3796,7 +3803,7 @@ struct QueryField g_fields[] = {
QTF_HIDE}, QTF_HIDE},
{"gbrevsortby", {"gbrevsortby",
FIELD_GBREVSORTBY, FIELD_GBREVSORTBYFLOAT,
false, false,
"dog gbrevsortbyint:gbspiderdate", "dog gbrevsortbyint:gbspiderdate",
"Sort the documents that contain 'dog' by " "Sort the documents that contain 'dog' by "
View File
@ -111,8 +111,8 @@ typedef unsigned long long qvec_t;
#define FIELD_GBSECTIONHASH 51 #define FIELD_GBSECTIONHASH 51
#define FIELD_GBDOCID 52 #define FIELD_GBDOCID 52
#define FIELD_GBCONTENTHASH 53 // for deduping at spider time #define FIELD_GBCONTENTHASH 53 // for deduping at spider time
#define FIELD_GBSORTBY 54 // i.e. sortby:price -> numeric termlist #define FIELD_GBSORTBYFLOAT 54 // i.e. sortby:price -> numeric termlist
#define FIELD_GBREVSORTBY 55 // i.e. sortby:price -> low to high #define FIELD_GBREVSORTBYFLOAT 55 // i.e. sortby:price -> low to high
#define FIELD_GBNUMBERMIN 56 #define FIELD_GBNUMBERMIN 56
#define FIELD_GBNUMBERMAX 57 #define FIELD_GBNUMBERMAX 57
#define FIELD_GBPARENTURL 58 #define FIELD_GBPARENTURL 58
View File
@ -360,6 +360,8 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// set m_isRootAdmin to zero if no correct ip or password // set m_isRootAdmin to zero if no correct ip or password
if ( ! g_conf.isRootAdmin ( sock , &m_hr ) ) m_isRootAdmin = 0; if ( ! g_conf.isRootAdmin ( sock , &m_hr ) ) m_isRootAdmin = 0;
// collection admin?
m_isCollAdmin = g_conf.isCollAdmin ( sock , &m_hr );
////////////////////////////////////// //////////////////////////////////////
// //
@ -641,6 +643,42 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
m_sbuf1.safePrintf( "%s", qp ); m_sbuf1.safePrintf( "%s", qp );
} }
// boolean OR terms
bool boolq = false;
char *any = hr->getString("any",NULL);
bool first = true;
if ( any ) {
char *s = any;
char *send = any + gbstrlen(any);
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
while (s < send) {
while (isspace(*s) && s < send) s++;
char *s2 = s+1;
if (*s == '\"') {
// if there's no closing quote just treat
// the end of the line as such
while (*s2 != '\"' && s2 < send) s2++;
if (s2 < send) s2++;
} else {
while (!isspace(*s2) && s2 < send) s2++;
}
if ( first ) m_sbuf1.safeStrcpy("(");
if ( first ) m_sbuf2.safeStrcpy("(");
if ( ! first ) m_sbuf1.safeStrcpy(" OR ");
if ( ! first ) m_sbuf2.safeStrcpy(" OR ");
first = false;
m_sbuf1.safeMemcpy ( s , s2 - s );
m_sbuf2.safeMemcpy ( s , s2 - s );
s = s2 + 1;
}
}
if ( ! first ) m_sbuf1.safeStrcpy(") AND ");
if ( ! first ) m_sbuf2.safeStrcpy(") AND ");
if ( ! first ) boolq = true;
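To make the block above concrete, here is a small stand-alone sketch of the same tokenize-and-OR transformation using std::string in place of SafeBuf (an assumed simplification; the quote and whitespace handling only approximate the loop above):

// Sketch: turn the advanced-search "any of these words" box into a
// parenthesized OR group that gets ANDed onto the rest of the query.
#include <cctype>
#include <cstdio>
#include <string>

static std::string buildAnyClause ( const std::string &any ) {
	std::string out;
	size_t i = 0;
	bool first = true;
	while ( i < any.size() ) {
		// skip leading whitespace before the next token
		while ( i < any.size() && isspace((unsigned char)any[i]) ) i++;
		if ( i >= any.size() ) break;
		size_t j;
		if ( any[i] == '"' ) {
			// keep quoted phrases together; an unclosed quote
			// just runs to the end of the input
			j = any.find ( '"' , i + 1 );
			j = ( j == std::string::npos ) ? any.size() : j + 1;
		}
		else {
			j = i;
			while ( j < any.size() &&
			        ! isspace((unsigned char)any[j]) ) j++;
		}
		out += first ? "(" : " OR ";
		out += any.substr ( i , j - i );
		first = false;
		i = j + 1;
	}
	// close the group and AND it onto whatever query terms follow
	if ( ! first ) out += ") AND ";
	return out;
}

int main ( ) {
	// e.g. any=cats "big dogs" birds  ->  (cats OR "big dogs" OR birds) AND
	printf ( "%s\n" , buildAnyClause ( "cats \"big dogs\" birds" ).c_str() );
	return 0;
}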
// and this // and this
if ( m_secsBack > 0 ) { if ( m_secsBack > 0 ) {
long timestamp = getTimeGlobalNoCore(); long timestamp = getTimeGlobalNoCore();
@ -694,36 +732,65 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
// } // }
if ( m_familyFilter ) { if ( m_familyFilter ) {
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
m_sbuf1.safePrintf("gbisadult:0 | "); //if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
m_sbuf1.safePrintf( "+gbisadult:0");
//m_sbuf2.safePrintf( "+gbisadult:0");
if ( ! boolq ) {
m_sbuf1.safeStrcpy(" |");
//m_sbuf2.safeStrcpy(" |");
}
else {
m_sbuf1.safeStrcpy(" AND ");
//m_sbuf2.safeStrcpy(" AND ");
}
}
// PRE-pend gblang: term
long gblang = hr->getLong("gblang",-1);
if( gblang >= 0 ) {
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
m_sbuf1.safePrintf( "+gblang:%li", gblang );
m_sbuf2.safePrintf( "+gblang:%li", gblang );
if ( ! boolq ) {
m_sbuf1.safeStrcpy(" |");
m_sbuf2.safeStrcpy(" |");
}
else {
m_sbuf1.safeStrcpy(" AND ");
m_sbuf2.safeStrcpy(" AND ");
}
} }
// append gblang: term
// if( m_gblang > 0 ) {
// //if( p > pstart ) *p++ = ' ';
// if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
// //p += sprintf( p, "+gblang:%li |", m_gblang );
// m_sbuf1.safePrintf( "+gblang:%li |", m_gblang );
// }
// bookmark here so we can copy into st->m_displayQuery below // bookmark here so we can copy into st->m_displayQuery below
//long displayQueryOffset = m_sbuf1.length(); //long displayQueryOffset = m_sbuf1.length();
// append url: term // append url: term
if ( m_url && m_url[0] ) { // if ( m_url && m_url[0] ) {
//if ( p > pstart ) *p++ = ' '; // //if ( p > pstart ) *p++ = ' ';
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); // if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
//memcpy ( p , "+url:" , 5 ); p += 5; // //memcpy ( p , "+url:" , 5 ); p += 5;
m_sbuf1.safeStrcpy ( "+url:"); // m_sbuf1.safeStrcpy ( "+url:");
//memcpy ( p , m_url , m_urlLen ); p += m_urlLen; // //memcpy ( p , m_url , m_urlLen ); p += m_urlLen;
m_sbuf1.safeStrcpy ( m_url ); // m_sbuf1.safeStrcpy ( m_url );
} // }
// append url: term // append url: term
if ( m_link && m_link[0] ) { if ( m_link && m_link[0] ) {
//if ( p > pstart ) *p++ = ' '; if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
//memcpy ( p , "+link:" , 6 ); p += 6;
m_sbuf1.safeStrcpy ( "+link:"); m_sbuf1.safeStrcpy ( "+link:");
//memcpy ( p , m_link , m_linkLen ); p += m_linkLen; m_sbuf2.safeStrcpy ( "+link:");
m_sbuf1.safeStrcpy ( m_link ); m_sbuf1.safeStrcpy ( m_link );
m_sbuf2.safeStrcpy ( m_link );
if ( ! boolq ) {
m_sbuf1.safeStrcpy(" |");
m_sbuf2.safeStrcpy(" |");
}
else {
m_sbuf1.safeStrcpy(" AND ");
m_sbuf2.safeStrcpy(" AND ");
}
} }
// append the natural query // append the natural query
if ( m_query && m_query[0] ) { if ( m_query && m_query[0] ) {
@ -757,7 +824,14 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
//*p++ = '+'; //*p++ = '+';
//*p++ = '\"'; //*p++ = '\"';
m_sbuf1.safeStrcpy("+\""); if ( ! boolq ) {
m_sbuf1.safeStrcpy(" +\"");
m_sbuf2.safeStrcpy(" +\"");
}
else {
m_sbuf1.safeStrcpy(" AND \"");
m_sbuf2.safeStrcpy(" AND \"");
}
//p += ucToUtf8(p, pend-p, m_quote1, m_quoteLen1, csStr, 0,0); //p += ucToUtf8(p, pend-p, m_quote1, m_quoteLen1, csStr, 0,0);
m_sbuf1.safeStrcpy ( m_quote1 ); m_sbuf1.safeStrcpy ( m_quote1 );
//memcpy ( p , m_quote1 , m_quoteLen1 ); p += m_quoteLen1 ; //memcpy ( p , m_quote1 , m_quoteLen1 ); p += m_quoteLen1 ;
@ -768,7 +842,6 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
//*p2++ = '+'; //*p2++ = '+';
//*p2++ = '\"'; //*p2++ = '\"';
m_sbuf2.safeStrcpy("+\"");
//p2+=ucToUtf8(p2, pend2-p2, m_quote1, m_quoteLen1, csStr,0,0); //p2+=ucToUtf8(p2, pend2-p2, m_quote1, m_quoteLen1, csStr,0,0);
m_sbuf2.safeStrcpy ( m_quote1 ); m_sbuf2.safeStrcpy ( m_quote1 );
//memcpy ( p2 , m_quote1 , m_quoteLen1 ); p2 += m_quoteLen1 ; //memcpy ( p2 , m_quote1 , m_quoteLen1 ); p2 += m_quoteLen1 ;
@ -785,7 +858,17 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
if ( m_sbuf1.length() ) m_sbuf1.pushChar(' '); if ( m_sbuf1.length() ) m_sbuf1.pushChar(' ');
//*p++ = '+'; //*p++ = '+';
//*p++ = '\"'; //*p++ = '\"';
m_sbuf1.safeStrcpy("+\"");
if ( ! boolq ) {
m_sbuf1.safeStrcpy(" +\"");
m_sbuf2.safeStrcpy(" +\"");
}
else {
m_sbuf1.safeStrcpy(" AND \"");
m_sbuf2.safeStrcpy(" AND \"");
}
//m_sbuf1.safeStrcpy("+\"");
//p += ucToUtf8(p, pend-p, m_quote2, m_quoteLen2, csStr, 0,0); //p += ucToUtf8(p, pend-p, m_quote2, m_quoteLen2, csStr, 0,0);
m_sbuf1.safeStrcpy ( m_quote2 ); m_sbuf1.safeStrcpy ( m_quote2 );
//memcpy ( p , m_quote2 , m_quoteLen2 ); p += m_quoteLen2 ; //memcpy ( p , m_quote2 , m_quoteLen2 ); p += m_quoteLen2 ;
@ -796,7 +879,7 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
if ( m_sbuf2.length() ) m_sbuf2.pushChar(' '); if ( m_sbuf2.length() ) m_sbuf2.pushChar(' ');
//*p2++ = '+'; //*p2++ = '+';
//*p2++ = '\"'; //*p2++ = '\"';
m_sbuf2.safeStrcpy("+\""); //m_sbuf2.safeStrcpy("+\"");
//p2+=ucToUtf8(p2, pend2-p2, m_quote2, m_quoteLen2, csStr,0,0); //p2+=ucToUtf8(p2, pend2-p2, m_quote2, m_quoteLen2, csStr,0,0);
m_sbuf2.safeStrcpy ( m_quote2 ); m_sbuf2.safeStrcpy ( m_quote2 );
//memcpy ( p2 , m_quote2 , m_quoteLen2 ); p2 += m_quoteLen2 ; //memcpy ( p2 , m_quote2 , m_quoteLen2 ); p2 += m_quoteLen2 ;
@ -828,11 +911,20 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
} else { } else {
while (!isspace(*s2) && s2 < send) s2++; while (!isspace(*s2) && s2 < send) s2++;
} }
if (s2 < send) break; //if (s2 < send) break;
//if (p < pend) *p++ = '+'; //if (p < pend) *p++ = '+';
//if (p2 < pend2) *p2++ = '+'; //if (p2 < pend2) *p2++ = '+';
m_sbuf1.pushChar('+'); //m_sbuf1.pushChar('+');
m_sbuf2.pushChar('+'); //m_sbuf2.pushChar('+');
if ( ! boolq ) {
m_sbuf1.safeStrcpy("+");
m_sbuf2.safeStrcpy("+");
}
else {
m_sbuf1.safeStrcpy(" AND ");
m_sbuf2.safeStrcpy(" AND ");
}
//p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0); //p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0);
//p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0); //p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0);
m_sbuf1.safeMemcpy ( s , s2 - s ); m_sbuf1.safeMemcpy ( s , s2 - s );
@ -882,8 +974,18 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
if (s2 < send) break; if (s2 < send) break;
//if (p < pend) *p++ = '-'; //if (p < pend) *p++ = '-';
//if (p2 < pend2) *p2++ = '-'; //if (p2 < pend2) *p2++ = '-';
m_sbuf1.pushChar('-'); // m_sbuf1.pushChar('-');
m_sbuf2.pushChar('-'); // m_sbuf2.pushChar('-');
if ( ! boolq ) {
m_sbuf1.safeStrcpy("-");
m_sbuf2.safeStrcpy("-");
}
else {
m_sbuf1.safeStrcpy(" AND NOT ");
m_sbuf2.safeStrcpy(" AND NOT ");
}
//p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0); //p += ucToUtf8(p, pend-p, s, s2-s, csStr, 0,0);
//p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0); //p2 += ucToUtf8(p2, pend2-p2, s, s2-s, csStr, 0,0);
m_sbuf1.safeMemcpy ( s , s2 - s ); m_sbuf1.safeMemcpy ( s , s2 - s );
@ -923,9 +1025,9 @@ bool SearchInput::setQueryBuffers ( HttpRequest *hr ) {
} }
// null terms // null terms
if ( ! m_sbuf1.pushChar('\0') ) return false; if ( ! m_sbuf1.nullTerm() ) return false;
if ( ! m_sbuf2.pushChar('\0') ) return false; if ( ! m_sbuf2.nullTerm() ) return false;
if ( ! m_sbuf3.pushChar('\0') ) return false; if ( ! m_sbuf3.nullTerm() ) return false;
// the natural query // the natural query
m_displayQuery = m_sbuf2.getBufStart();// + displayQueryOffset; m_displayQuery = m_sbuf2.getBufStart();// + displayQueryOffset;
View File
@ -118,7 +118,7 @@ class SearchInput {
Query *m_q2; Query *m_q2;
char m_isRootAdmin; char m_isRootAdmin;
char m_isCollAdmin;
// these are set from things above // these are set from things above
TopicGroup m_topicGroups [ MAX_TOPIC_GROUPS ];// msg40 TopicGroup m_topicGroups [ MAX_TOPIC_GROUPS ];// msg40
View File
@ -12862,6 +12862,18 @@ bool getSpiderStatusMsg ( CollectionRec *cx , SafeBuf *msg , long *status ) {
"adding new urls, or wait for " "adding new urls, or wait for "
"existing urls to be respidered."); "existing urls to be respidered.");
} }
// let's pass the qareindex() test in qa.cpp... it wasn't updating
// the status to done. it kept saying in progress.
if ( ! cx->m_isCustomCrawl &&
! cx->m_globalCrawlInfo.m_hasUrlsReadyToSpider ) {
//*status = SP_COMPLETED;
return msg->safePrintf ( "Nothing currently "
"available to spider. "
"Change your url filters, try "
"adding new urls, or wait for "
"existing urls to be respidered.");
}
if ( cx->m_spiderStatus == SP_ROUNDDONE ) { if ( cx->m_spiderStatus == SP_ROUNDDONE ) {
View File
@ -337,9 +337,9 @@ bool printSpiderProxyTable ( SafeBuf *sb ) {
// we fetch a test url every minute or so through // we fetch a test url every minute or so through
// each proxy to ensure it is up. typically this should // each proxy to ensure it is up. typically this should
// be your website so you do not make someone angry. // be your website so you do not make someone angry.
"<td><b>test url last download</b></td>" "<td><b>test url last download attempt</b></td>"
// print "FAILED" in red if it failed to download // print "FAILED" in red if it failed to download
"<td><b>test url download time</b></td>" "<td><b>test url download took</b></td>"
"<td><b>last bytes downloaded</b></td>" "<td><b>last bytes downloaded</b></td>"
@ -505,6 +505,9 @@ bool downloadTestUrlFromProxies ( ) {
// only host #0 should do the testing i guess // only host #0 should do the testing i guess
//if ( g_hostdb.m_myHost->m_hostId != 0 ) return true; //if ( g_hostdb.m_myHost->m_hostId != 0 ) return true;
// no need if no url
if ( g_conf.m_proxyTestUrl.length() <= 1 ) return true;
// if host #0 dies then host #1 must take its place managing the // if host #0 dies then host #1 must take its place managing the
// spider proxies // spider proxies
Host *h0 = g_hostdb.getFirstAliveHost(); Host *h0 = g_hostdb.getFirstAliveHost();
@ -706,8 +709,11 @@ void handleRequest54 ( UdpSlot *udpSlot , long niceness ) {
goto redo; goto redo;
} }
// reset minCount so we can take the min over those we check here
minCount = -1;
long long oldest = 0x7fffffffffffffffLL; long long oldest = 0x7fffffffffffffffLL;
SpiderProxy *winnersp = NULL; SpiderProxy *winnersp = NULL;
long count = 0;
// now find the best proxy wih the minCount // now find the best proxy wih the minCount
for ( long i = 0 ; i < s_iptab.getNumSlots() ; i++ ) { for ( long i = 0 ; i < s_iptab.getNumSlots() ; i++ ) {
// skip empty slots // skip empty slots
@ -716,12 +722,7 @@ void handleRequest54 ( UdpSlot *udpSlot , long niceness ) {
SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i); SpiderProxy *sp = (SpiderProxy *)s_iptab.getValueFromSlot(i);
// if it failed the last test, skip it... not here... // if it failed the last test, skip it... not here...
if ( skipDead && sp->m_lastDownloadError ) continue; if ( skipDead && sp->m_lastDownloadError ) continue;
// if all hosts were "dead" because they all had
// m_lastDownloadError set then minCount will be 999999
// and nobody should continue from this statement:
if ( sp->m_countForThisIp > minCount ) continue;
// then go by last download time for this ip
if ( sp->m_lastTimeUsedForThisIp >= oldest ) continue;
// if this proxy was banned by the url's ip... skip it. it is // if this proxy was banned by the url's ip... skip it. it is
// not a candidate... // not a candidate...
if ( skipDead ) { if ( skipDead ) {
@ -730,8 +731,34 @@ void handleRequest54 ( UdpSlot *udpSlot , long niceness ) {
long long h64 = hash64h ( uip , pip ); long long h64 = hash64h ( uip , pip );
if ( s_proxyBannedTable.isInTable ( &h64 ) ) continue; if ( s_proxyBannedTable.isInTable ( &h64 ) ) continue;
} }
// if some proxies are "alive" then only pick from
// the first half of the proxies that are alive (i.e. still
// work). that way, when one of those goes dead we will inc
// the backoff (crawldelay) and a new proxy that we haven't
// used for this url's IP will take its place. and such
// new proxies will only have the new backoff count used
// through them. that way, we don't get ALL of our proxies
// banned at about the same time since we do somewhat uniform
// load balancing over them.
if ( skipDead && count > aliveProxyCandidates / 2 )
continue;
// count the alive/non-banned candidates
count++;
// if all hosts were "dead" because they all had
// m_lastDownloadError set then minCount will be 999999
// and nobody should continue from this statement:
if ( sp->m_countForThisIp > minCount && minCount>=0 ) continue;
// then go by last download time for this ip
if ( sp->m_countForThisIp == minCount && minCount>=0 &&
sp->m_lastTimeUsedForThisIp >= oldest )
continue;
// pick the spider proxy used longest ago // pick the spider proxy used longest ago
oldest = sp->m_lastTimeUsedForThisIp; oldest = sp->m_lastTimeUsedForThisIp;
minCount = sp->m_countForThisIp;
// got a new winner // got a new winner
winnersp = sp; winnersp = sp;
} }
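
A self-contained sketch of the proxy selection policy above: only the first half of the alive candidates are load-balanced over (so when one dies, the proxy that rotates in only sees the new backoff), the lowest per-IP use count wins, and ties go to the proxy used longest ago. SpiderProxy is reduced here to the three fields the loop reads, and aliveCandidates is computed inline, whereas the real handler already has aliveProxyCandidates available.

#include <cstdint>
#include <cstdio>
#include <vector>

struct ProxyInfo {               // reduced stand-in for SpiderProxy
    int     countForThisIp;      // times used for this target IP
    int64_t lastTimeUsedForThisIp;
    bool    dead;                // last test download failed
};

// Pick the least-used proxy for this IP, tie-breaking on the oldest
// last-use time, considering only the first half of the live candidates.
static int pickProxy(const std::vector<ProxyInfo> &proxies, bool skipDead) {
    int aliveCandidates = 0;
    for (const ProxyInfo &p : proxies)
        if (!skipDead || !p.dead) aliveCandidates++;

    int     winner   = -1;
    int     minCount = -1;
    int64_t oldest   = INT64_MAX;
    int     count    = 0;
    for (size_t i = 0; i < proxies.size(); i++) {
        const ProxyInfo &p = proxies[i];
        if (skipDead && p.dead) continue;
        // only load-balance over the first half of the live candidates
        if (skipDead && count > aliveCandidates / 2) continue;
        count++;
        if (minCount >= 0 && p.countForThisIp > minCount) continue;
        if (minCount >= 0 && p.countForThisIp == minCount &&
            p.lastTimeUsedForThisIp >= oldest) continue;
        oldest   = p.lastTimeUsedForThisIp;
        minCount = p.countForThisIp;
        winner   = (int)i;
    }
    return winner;
}

int main() {
    std::vector<ProxyInfo> proxies = {
        {3, 100, false}, {1, 200, false}, {1, 150, false}, {0, 50, true},
    };
    // index 2 is excluded by the first-half rule, so index 1 wins
    printf("winner index = %d\n", pickProxy(proxies, true)); // winner index = 1
}
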

View File

@ -27,6 +27,7 @@ void Summary::reset() {
m_bitScoresBufSize = 0; m_bitScoresBufSize = 0;
} }
m_summaryLen = 0; m_summaryLen = 0;
m_displayLen = 0;
//m_bufMaxLen = 0; //m_bufMaxLen = 0;
//m_bufLen = 0; //m_bufLen = 0;
//m_buf = NULL; //m_buf = NULL;

View File

@ -80,8 +80,8 @@ bool User::verifyPageNum ( uint16_t pageNum ){
} }
// check if pageNum is of dummy page // check if pageNum is of dummy page
bool isDummy = true; bool isDummy = true;
if ( pageNum > PAGE_PUBLIC ) //if ( pageNum > PAGE_PUBLIC )
isDummy = false; isDummy = false;
// //
if ( m_allPages && !isDummy ) if ( m_allPages && !isDummy )
return true; return true;
@ -93,8 +93,9 @@ bool User::verifyPageNum ( uint16_t pageNum ){
long User::firstPage ( ){ long User::firstPage ( ){
// return first allowed page // return first allowed page
for ( uint16_t i = 0; i < m_numPages; i++ ) for ( uint16_t i = 0; i < m_numPages; i++ )
if ( ! (m_pages[i] & 0x8000) && if ( ! (m_pages[i] & 0x8000) ) //&&
(m_pages[i]&0x7fff) > PAGE_PUBLIC ) return m_pages[i]; // (m_pages[i]&0x7fff) > PAGE_PUBLIC )
return m_pages[i];
// if all pages is set then just return the root page // if all pages is set then just return the root page
if ( m_allPages ) return PAGE_ROOT; if ( m_allPages ) return PAGE_ROOT;

81
Xml.cpp
View File

@ -196,6 +196,35 @@ void Xml::reset ( ) {
m_allocSize = 0; m_allocSize = 0;
} }
bool Xml::getCompoundName ( long node , SafeBuf *sb ) {
XmlNode *buf[256];
XmlNode *xn = &m_nodes[node];
long np = 0;
for ( ; xn ; xn = xn->m_parent ) {
if ( ! xn->m_nodeId ) continue;
if ( np >= 256 ) {g_errno = EBUFTOOSMALL;return false;}
buf[np++] = xn;
}
// ignore that initial <?xml ..> tag they all have
if ( np > 0 &&
buf[np-1]->m_tagNameLen == 3 &&
strncasecmp(buf[np-1]->m_tagName,"xml",3) == 0 )
np--;
for ( long i = np - 1 ; i >= 0 ; i-- ) {
XmlNode *xn = buf[i];
sb->safeMemcpy ( xn->m_tagName , xn->m_tagNameLen );
sb->pushChar('.');
}
// remove last '.'
if ( sb->length() ) sb->m_length--;
sb->nullTerm();
return true;
}
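
A standalone sketch of what getCompoundName() produces once XmlNode::m_parent is populated (see the Xml::set() hunk further down): walk the parent chain, drop the leading <?xml?> declaration, and join the remaining tag names with dots. The tiny Node struct is a stand-in for XmlNode.

#include <cstdio>
#include <string>
#include <vector>

struct Node {                 // stand-in for XmlNode
    const char *tagName;      // NULL for text nodes
    Node       *parent;
};

// Build "outer.inner.leaf" by walking the parent chain of a node,
// mirroring the logic of Xml::getCompoundName() above.
static std::string compoundName(Node *n) {
    std::vector<const char *> names;
    for (; n; n = n->parent)
        if (n->tagName) names.push_back(n->tagName);
    // drop the top-level <?xml ...> declaration if present
    if (!names.empty() && std::string(names.back()) == "xml") names.pop_back();
    std::string out;
    for (size_t i = names.size(); i-- > 0; ) {
        out += names[i];
        if (i) out += '.';
    }
    return out;
}

int main() {
    Node xmlDecl = { "xml",     nullptr  };
    Node product = { "product", &xmlDecl };
    Node title   = { "title",   &product };
    Node text    = { nullptr,   &title   };   // the text node itself
    printf("%s\n", compoundName(&text).c_str());  // prints: product.title
}
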
#include "HttpMime.h" // CT_JSON #include "HttpMime.h" // CT_JSON
// "s" must be in utf8 // "s" must be in utf8
@ -258,6 +287,10 @@ bool Xml::set ( char *s ,
return true; return true;
} }
// override
if ( contentType == CT_XML )
pureXml = true;
QUICKPOLL((niceness)); QUICKPOLL((niceness));
long i; long i;
@ -310,6 +343,11 @@ bool Xml::set ( char *s ,
logf(LOG_TIMING, logf(LOG_TIMING,
"build: xml: set: 4c. %llu",gettimeofdayInMilliseconds()); "build: xml: set: 4c. %llu",gettimeofdayInMilliseconds());
XmlNode *parent = NULL;
XmlNode *parentStackStart[256];
XmlNode **parentStackPtr = &parentStackStart[0];
XmlNode **parentStackEnd = &parentStackStart[256];
// . TODO: do this on demand // . TODO: do this on demand
// . now fill our nodes array // . now fill our nodes array
// . loop over the xml // . loop over the xml
@ -320,14 +358,51 @@ bool Xml::set ( char *s ,
QUICKPOLL(niceness); QUICKPOLL(niceness);
// remember oldi // remember oldi
oldi = i; oldi = i;
// convenience ptr
XmlNode *xi = &m_nodes[m_numNodes];
// set that node // set that node
i += m_nodes[m_numNodes].set (&m_xml[i],pureXml,version); i += xi->set (&m_xml[i],pureXml,version);
// set its parent xml node
xi->m_parent = parent;
// if not text node then he's the new parent
if ( pureXml &&
xi->m_nodeId &&
xi->m_nodeId != TAG_COMMENT ) {
// if we are a back tag pop the stack
if ( ! xi->isFrontTag() ) {
// pop old parent
if ( parentStackPtr > parentStackStart )
parent = *(--parentStackPtr);
}
// we are a front tag...
else {
// did we overflow?
if ( parentStackPtr >= parentStackEnd ) {
log("xml: xml parent overflow");
g_errno = EBUFTOOSMALL;
return false;
}
// push the old parent ptr
if ( parent ) *parentStackPtr++ = parent;
// set the new parent to us
parent = xi;
}
}
// in script? // in script?
if ( m_nodes[m_numNodes].m_nodeId != TAG_SCRIPT ) { if ( xi->m_nodeId != TAG_SCRIPT ) {
m_numNodes++; m_numNodes++;
continue; continue;
} }
if ( ! m_nodes[m_numNodes].isFrontTag() ) { if ( ! xi->isFrontTag() ) {
m_numNodes++; m_numNodes++;
continue; continue;
} }

8
Xml.h
View File

@ -86,8 +86,12 @@ class Xml {
// . ie. "xml.country.state.city" // . ie. "xml.country.state.city"
// . fullTag option returns the entire node text // . fullTag option returns the entire node text
// . ie. "<xml>.<country>.<state abbrev="true">.<city arg="foo"> // . ie. "<xml>.<country>.<state abbrev="true">.<city arg="foo">
long getCompoundName ( long n , char *buf , long bufMaxLen, //long getCompoundName ( long n , char *buf , long bufMaxLen,
bool fullTag = false ) ; // bool fullTag = false ) ;
// get the compound name, like "node1.node2.node3\0"
bool getCompoundName ( long node , class SafeBuf *sb ) ;
// . used for parsing xml conf files // . used for parsing xml conf files
// . used for getting the title in an html doc, etc. // . used for getting the title in an html doc, etc.

View File

@ -186,6 +186,8 @@ XmlDoc::~XmlDoc() {
static long long s_lastTimeStart = 0LL; static long long s_lastTimeStart = 0LL;
void XmlDoc::reset ( ) { void XmlDoc::reset ( ) {
m_isImporting = false;
m_printedMenu = false; m_printedMenu = false;
@ -1335,7 +1337,13 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
// similar to set3() above // similar to set3() above
m_setFromDocId = true; m_setFromDocId = true;
// use content and ip from old title rec to save time // use content and ip from old title rec to save time
m_recycleContent = true; // . crap this is making the query reindex not actually
// re-download the content.
// . we already check the m_deleteFromIndex flag below
// in getUtf8Content() and use the old content in that case
// so i'm not sure why we are recycling here, so take
// this out. MDW 9/25/2014.
//m_recycleContent = true;
// sanity // sanity
if ( m_docId == 0LL ) { char *xx=NULL;*xx=0; } if ( m_docId == 0LL ) { char *xx=NULL;*xx=0; }
} }
@ -3192,6 +3200,36 @@ long *XmlDoc::getIndexCode2 ( ) {
if ( m_sreqValid && m_sreq.m_ignoreDocUnchangedError ) if ( m_sreqValid && m_sreq.m_ignoreDocUnchangedError )
check = false; check = false;
if ( check ) { if ( check ) {
// check inlinks now too!
LinkInfo *info1 = getLinkInfo1 ();
if ( ! info1 || info1 == (LinkInfo *)-1 ) return (long *)info1;
LinkInfo *info2 = od->getLinkInfo1 ();
if ( ! info2 || info2 == (LinkInfo *)-1 ) return (long *)info2;
Inlink *k1 = NULL;
Inlink *k2 = NULL;
char *s1, *s2;
long len1,len2;
if ( info1->getNumGoodInlinks() !=
info2->getNumGoodInlinks() )
goto changed;
for ( ; k1=info1->getNextInlink(k1) ,
k2=info2->getNextInlink(k2); ) {
if ( ! k1 )
break;
if ( ! k2 )
break;
if ( k1->m_siteNumInlinks != k2->m_siteNumInlinks )
goto changed;
s1 = k1->ptr_linkText;
len1 = k1->size_linkText - 1; // exclude \0
s2 = k2->ptr_linkText;
len2 = k2->size_linkText - 1; // exclude \0
if ( len1 != len2 )
goto changed;
if ( memcmp(s1,s2,len1) != 0 )
goto changed;
}
// no change in link text, look for change in page content now
long *ch32 = getContentHash32(); long *ch32 = getContentHash32();
if ( ! ch32 || ch32 == (void *)-1 ) return (long *)ch32; if ( ! ch32 || ch32 == (void *)-1 ) return (long *)ch32;
if ( *ch32 == od->m_contentHash32 ) { if ( *ch32 == od->m_contentHash32 ) {
@ -3201,6 +3239,7 @@ long *XmlDoc::getIndexCode2 ( ) {
} }
} }
changed:
// words // words
Words *words = getWords(); Words *words = getWords();
if ( ! words || words == (Words *)-1 ) return (long *)words; if ( ! words || words == (Words *)-1 ) return (long *)words;
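
A reduced sketch of the inlink-change test added above: the old and new versions of the page are treated as different (so the doc-unchanged shortcut is skipped) if the good-inlink count differs or if any inlink's site inlink count or anchor text changed. Inlink is collapsed to the two compared fields; the real code then falls through to the content-hash comparison.

#include <cstdio>
#include <string>
#include <vector>

struct InlinkLite {              // reduced stand-in for Inlink
    int         siteNumInlinks;  // site-level inlink count of the linker
    std::string linkText;        // anchor text pointing at our page
};

// Return true if anything about the inlinks changed between the old
// and new versions of the page, mirroring the goto-changed logic above.
static bool inlinksChanged(const std::vector<InlinkLite> &oldLinks,
                           const std::vector<InlinkLite> &newLinks) {
    if (oldLinks.size() != newLinks.size()) return true;
    for (size_t i = 0; i < oldLinks.size(); i++) {
        if (oldLinks[i].siteNumInlinks != newLinks[i].siteNumInlinks) return true;
        if (oldLinks[i].linkText       != newLinks[i].linkText)       return true;
    }
    return false;
}

int main() {
    std::vector<InlinkLite> a = { {10, "cool page"}, {3, "see this"} };
    std::vector<InlinkLite> b = { {10, "cool page"}, {4, "see this"} };
    printf("changed=%d\n", inlinksChanged(a, b)); // changed=1
}
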
@ -15222,6 +15261,16 @@ char **XmlDoc::getHttpReply2 ( ) {
if ( od ) if ( od )
r->m_contentHash32 = od->m_contentHash32; r->m_contentHash32 = od->m_contentHash32;
// force floater usage on even if "use spider proxies" parms is off
// if we're a diffbot crawl and use robots is off.
//if ( cr && ! cr->m_useRobotsTxt && cr->m_isCustomCrawl )
// r->m_forceUseFloaters = true;
// for beta testing, make it a collection specific parm for diffbot
// so we can turn on manually
if ( cr->m_forceUseFloaters )
r->m_forceUseFloaters = true;
// eventgurubot is the max // eventgurubot is the max
//char *userAgent = g_conf.m_spiderUserAgent; //char *userAgent = g_conf.m_spiderUserAgent;
// hardcode it // hardcode it
@ -15766,7 +15815,10 @@ char **XmlDoc::getContent ( ) {
// if we were set from a title rec use that we do not have the original // if we were set from a title rec use that we do not have the original
// content, and caller should be calling getUtf8Content() anyway!! // content, and caller should be calling getUtf8Content() anyway!!
if ( m_setFromTitleRec ) { char *xx=NULL; *xx=0; } if ( m_setFromTitleRec ) { char *xx=NULL; *xx=0; }
if ( m_setFromDocId ) { char *xx=NULL; *xx=0; }
// query reindex has m_setFromDocId set to true and we WANT to re-download
// the content... so why did i have this here? MDW 9/25/2014
//if ( m_setFromDocId ) { char *xx=NULL; *xx=0; }
// recycle? // recycle?
//if ( m_recycleContent ) { char *xx=NULL; *xx=0; } //if ( m_recycleContent ) { char *xx=NULL; *xx=0; }
@ -17603,7 +17655,9 @@ char **XmlDoc::getUtf8Content ( ) {
// all tags like <title> or <link> to <gbtitle> or <gblink> so we // all tags like <title> or <link> to <gbtitle> or <gblink> so we
// know they are xml tags. because stuff like &lt;br&gt; will // know they are xml tags. because stuff like &lt;br&gt; will
// become <br> and will be within its xml tag like <gbdescription> // become <br> and will be within its xml tag like <gbdescription>
// or <gbtitle> // or <gbtitle>.
// MDW: 9/28/2014. no longer do this since i added hashXmlFields().
/*
if ( m_contentType == CT_XML ) { if ( m_contentType == CT_XML ) {
// count the xml tags // count the xml tags
char *p = m_expandedUtf8Content; char *p = m_expandedUtf8Content;
@ -17659,6 +17713,7 @@ char **XmlDoc::getUtf8Content ( ) {
// free esbuf if we were referencing that to save mem // free esbuf if we were referencing that to save mem
m_esbuf.purge(); m_esbuf.purge();
} }
*/
// richmondspca.org has &quot; in some tags and we do not like // richmondspca.org has &quot; in some tags and we do not like
// expanding that to " because it messes up XmlNode::getTagLen() // expanding that to " because it messes up XmlNode::getTagLen()
@ -17675,11 +17730,15 @@ char **XmlDoc::getUtf8Content ( ) {
// utf8 chars so that Xml::set(), etc. still work properly and don't // utf8 chars so that Xml::set(), etc. still work properly and don't
// add any more html tags than it should // add any more html tags than it should
// . this will decode in place // . this will decode in place
long n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content, // . MDW: 9/28/2014. no longer do for xml docs since i added
m_expandedUtf8Content,//ptr_utf8Content, // hashXmlFields()
m_expandedUtf8ContentSize-1,//size_utf8Content-1, long n = m_expandedUtf8ContentSize - 1;
doSpecial, if ( m_contentType != CT_XML )
m_niceness); n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content,
m_expandedUtf8Content,//ptr_utf8Content,
m_expandedUtf8ContentSize-1,//size_utf8Con
doSpecial,
m_niceness);
// can't exceed this! n does not include the final \0 even though // can't exceed this! n does not include the final \0 even though
// we do right it out. // we do right it out.
@ -17689,12 +17748,14 @@ char **XmlDoc::getUtf8Content ( ) {
// now rss has crap in it like "&amp;nbsp;" so we have to do another // now rss has crap in it like "&amp;nbsp;" so we have to do another
// decoding pass // decoding pass
if ( m_contentType == CT_XML ) // isRSSExt ) // . MDW: 9/28/2014. no longer do for xml docs since i added
n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content, // hashXmlFields()
m_expandedUtf8Content,//ptr_utf8Content, // if ( m_contentType == CT_XML ) // isRSSExt )
n, // n = htmlDecode(m_expandedUtf8Content,//ptr_utf8Content,
false,//doSpecial, // m_expandedUtf8Content,//ptr_utf8Content,
m_niceness); // n,
// false,//doSpecial,
// m_niceness);
// sanity // sanity
if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; } if ( n > m_expandedUtf8ContentSize-1 ) {char *xx=NULL;*xx=0; }
// sanity // sanity
@ -18943,6 +19004,17 @@ char *XmlDoc::getSpiderLinks ( ) {
// m_spiderLinks2 = false; // m_spiderLinks2 = false;
// m_spiderLinksValid = true ; } // m_spiderLinksValid = true ; }
// this slows importing down because we end up doing ip lookups
// for every outlink if "firstip" not in tagdb.
// shoot. set2() already sets m_spiderLinksValid to true so we
// have to override if importing.
if ( m_isImporting && m_isImportingValid ) {
m_spiderLinks = false;
m_spiderLinks2 = false;
m_spiderLinksValid = true;
return &m_spiderLinks2;
}
// return the valid value // return the valid value
if ( m_spiderLinksValid ) return &m_spiderLinks2; if ( m_spiderLinksValid ) return &m_spiderLinks2;
@ -21761,8 +21833,6 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// likewise if there error was ENONCANONICAL treat it like that // likewise if there error was ENONCANONICAL treat it like that
if ( m_indexCode == EDOCNONCANONICAL ) if ( m_indexCode == EDOCNONCANONICAL )
spideringLinks = true; spideringLinks = true;
// //
// . prepare the outlink info if we are adding links to spiderdb! // . prepare the outlink info if we are adding links to spiderdb!
@ -22273,13 +22343,17 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// . LINKDB // . LINKDB
// . linkdb records. assume one per outlink // . linkdb records. assume one per outlink
// . we may index 2 16-byte keys for each outlink // . we may index 2 16-byte keys for each outlink
Links *nl = NULL; if ( spideringLinks ) nl = &m_links; Links *nl2 = NULL;
//if ( spideringLinks ) nl2 = &m_links;
// if injecting, spideringLinks is false, but then we don't
// add the links to linkdb, which causes the qainlinks() test to fail
nl2 = &m_links;
// do not bother if deleting. but we do add simplified redirects // do not bother if deleting. but we do add simplified redirects
// to spiderdb as SpiderRequests now. // to spiderdb as SpiderRequests now.
long code = m_indexCode; long code = m_indexCode;
if ( code == EDOCSIMPLIFIEDREDIR ) code = 0; if ( code == EDOCSIMPLIFIEDREDIR ) code = 0;
if ( code == EDOCNONCANONICAL ) code = 0; if ( code == EDOCNONCANONICAL ) code = 0;
if ( code ) nl = NULL; if ( code ) nl2 = NULL;
//Links *ol = NULL; if ( od ) ol = od->getLinks(); //Links *ol = NULL; if ( od ) ol = od->getLinks();
// . set key/data size // . set key/data size
// . use a 16 byte key, not the usual 12 // . use a 16 byte key, not the usual 12
@ -22288,7 +22362,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
HashTableX kt1; HashTableX kt1;
//HashTableX kt2; //HashTableX kt2;
long nis = 0; long nis = 0;
if ( nl && m_useLinkdb ) nis = nl->getNumLinks() * 4; if ( nl2 && m_useLinkdb ) nis = nl2->getNumLinks() * 4;
// pre-grow table based on # outlinks // pre-grow table based on # outlinks
kt1.set ( sizeof(key224_t),0,nis,NULL,0,false,m_niceness,"link-indx" ); kt1.set ( sizeof(key224_t),0,nis,NULL,0,false,m_niceness,"link-indx" );
// use magic to make fast // use magic to make fast
@ -22307,7 +22381,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// but this will have to be for adding to Linkdb. basically take a // but this will have to be for adding to Linkdb. basically take a
// lot of it from Linkdb::fillLinkdbList() // lot of it from Linkdb::fillLinkdbList()
// . these return false with g_errno set on error // . these return false with g_errno set on error
if ( m_useLinkdb && nl && ! hashLinksForLinkdb(&kt1) ) return NULL; if ( m_useLinkdb && nl2 && ! hashLinksForLinkdb(&kt1) ) return NULL;
//if ( add2 && ol && ! !od->m_skipIndexing && //if ( add2 && ol && ! !od->m_skipIndexing &&
// ol->hash(&kt2,od,m_niceness) ) // ol->hash(&kt2,od,m_niceness) )
// return NULL; // return NULL;
@ -22432,6 +22506,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// if we were set from a titleRec, see if we got // if we were set from a titleRec, see if we got
// a different hash of terms to index this time around... // a different hash of terms to index this time around...
m_setFromTitleRec && m_setFromTitleRec &&
// fix for import log spam
! m_isImporting &&
m_version >= 120 && m_version >= 120 &&
m_metaListCheckSum8 != currentMetaListCheckSum8 ) m_metaListCheckSum8 != currentMetaListCheckSum8 )
log("xmldoc: checksum parsing inconsistency for %s", log("xmldoc: checksum parsing inconsistency for %s",
@ -22931,7 +23007,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// . should also add with a time of now plus 5 seconds to that if // . should also add with a time of now plus 5 seconds to that if
// we spider an outlink linkdb should be update with this doc // we spider an outlink linkdb should be update with this doc
// pointing to it so it can get link text then!! // pointing to it so it can get link text then!!
if ( spideringLinks && nl && ! m_doingConsistencyCheck && if ( spideringLinks && nl2 && ! m_doingConsistencyCheck &&
m_useSpiderdb && ! forDelete ){ m_useSpiderdb && ! forDelete ){
// returns NULL and sets g_errno on error // returns NULL and sets g_errno on error
char *ret = addOutlinkSpiderRecsToMetaList (); char *ret = addOutlinkSpiderRecsToMetaList ();
@ -25894,6 +25970,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// hash diffbot's json output here // hash diffbot's json output here
uint8_t *ct = getContentType(); uint8_t *ct = getContentType();
if ( ! ct ) return NULL; if ( ! ct ) return NULL;
/*
if ( *ct == CT_JSON ) { // && m_isDiffbotJSONObject ) { if ( *ct == CT_JSON ) { // && m_isDiffbotJSONObject ) {
// hash the content type for type:json query // hash the content type for type:json query
if ( ! hashContentType ( table ) ) return NULL; if ( ! hashContentType ( table ) ) return NULL;
@ -25911,6 +25988,7 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// and the json itself // and the json itself
return hashJSON ( table ); return hashJSON ( table );
} }
*/
if ( ! hashContentType ( table ) ) return NULL; if ( ! hashContentType ( table ) ) return NULL;
if ( ! hashUrl ( table ) ) return NULL; if ( ! hashUrl ( table ) ) return NULL;
@ -25936,12 +26014,31 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if ( ! hashNoSplit ( table ) ) return NULL; if ( ! hashNoSplit ( table ) ) return NULL;
// global index unless this is a json object in which case it is // MDW: i think we just inject empty html with a diffbotreply into
// hased above in the call to hashJSON(). this will decrease disk // global index now, so don't need this... 9/28/2014
// usage by about half, posdb* files are pretty big.
if ( cr->m_isCustomCrawl || ! cr->m_indexBody ) return (char *)1;
// global index unless this is a json object in which case it is
// hashed above in the call to hashJSON(). this will decrease disk
// usage by about half, posdb* files are pretty big.
//if ( cr->m_isCustomCrawl || ! cr->m_indexBody ) return (char *)1;
// hash json fields
if ( *ct == CT_JSON ) {
// this hashes both with and without the fieldname
hashJSONFields ( table );
// hash gblang:de
if ( ! hashLanguageString ( table ) ) return NULL;
goto skip;
}
// same for xml now, so we can search for field:value like w/ json
if ( *ct == CT_XML ) {
// this hashes both with and without the fieldname
hashXMLFields ( table );
// hash gblang:de
if ( ! hashLanguageString ( table ) ) return NULL;
goto skip;
}
// hash the body of the doc first so m_dist is 0 to match // hash the body of the doc first so m_dist is 0 to match
// the rainbow display of sections // the rainbow display of sections
@ -25971,6 +26068,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// somewhere. // somewhere.
if ( ! hashMetaSummary(table) ) return NULL; if ( ! hashMetaSummary(table) ) return NULL;
skip:
// this will only increment the scores of terms already in the table // this will only increment the scores of terms already in the table
// because we neighborhoods are not techincally in the document // because we neighborhoods are not techincally in the document
// necessarily and we do not want to ruin our precision // necessarily and we do not want to ruin our precision
@ -25986,6 +26085,9 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
if ( ! hashRSSInfo ( table ) ) return NULL; if ( ! hashRSSInfo ( table ) ) return NULL;
if ( ! hashPermalink ( table ) ) return NULL; if ( ! hashPermalink ( table ) ) return NULL;
// hash gblang:de last for parsing consistency
if ( ! hashLanguageString ( table ) ) return NULL;
// we set this now in hashWords3() // we set this now in hashWords3()
if ( m_doingSEO ) if ( m_doingSEO )
m_wordPosInfoBufValid = true; m_wordPosInfoBufValid = true;
@ -27299,6 +27401,9 @@ bool XmlDoc::hashUrl ( HashTableX *tt , bool isStatusDoc ) {
// . copied Url2.cpp into here basically, so we can now dump Url2.cpp // . copied Url2.cpp into here basically, so we can now dump Url2.cpp
bool XmlDoc::hashSections ( HashTableX *tt ) { bool XmlDoc::hashSections ( HashTableX *tt ) {
//if ( ! m_contentTypeValid ) { char *xx=NULL;*xx=0; }
//if ( m_contentType == CT_HTML ) return true;
setStatus ( "hashing sections" ); setStatus ( "hashing sections" );
if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; } if ( ! m_sectionsValid ) { char *xx=NULL;*xx=0; }
@ -28094,6 +28199,30 @@ bool XmlDoc::hashLanguage ( HashTableX *tt ) {
// try lang abbreviation // try lang abbreviation
sprintf(s , "%s ", getLangAbbr(langId) ); sprintf(s , "%s ", getLangAbbr(langId) );
// go back to broken way to try to fix parsing consistency bug
// by adding hashLanguageString() function below
//sprintf(s , "%s ", getLangAbbr(langId) );
if ( ! hashString ( s, slen, &hi ) ) return false;
return true;
}
bool XmlDoc::hashLanguageString ( HashTableX *tt ) {
setStatus ( "hashing language string" );
long langId = (long)*getLangId();
// update hash parms
HashInfo hi;
hi.m_tt = tt;
hi.m_hashGroup = HASHGROUP_INTAG;
hi.m_prefix = "gblang";
// try lang abbreviation
char s[32];
long slen = sprintf(s , "%s ", getLangAbbr(langId) );
// go back to broken way to try to fix parsing consistency bug
if ( ! hashString ( s, slen, &hi ) ) return false; if ( ! hashString ( s, slen, &hi ) ) return false;
return true; return true;
@ -29073,7 +29202,8 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
if ( ! jp || jp == (void *)-1) if ( ! jp || jp == (void *)-1)
return (Msg20Reply *)jp; return (Msg20Reply *)jp;
} }
if ( m_contentType == CT_HTML ) { if ( m_contentType == CT_HTML ||
m_contentType == CT_XML ) {
Xml *xml = getXml(); Xml *xml = getXml();
if ( ! xml || xml==(void *)-1) if ( ! xml || xml==(void *)-1)
return (Msg20Reply *)xml; return (Msg20Reply *)xml;
@ -29482,11 +29612,11 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
// returns values of specified meta tags // returns values of specified meta tags
if ( ! reply->ptr_dbuf && m_req->size_displayMetas > 1 ) { if ( ! reply->ptr_dbuf && m_req->size_displayMetas > 1 ) {
long dlen; char *d; long dsize; char *d;
d = getDescriptionBuf(m_req->ptr_displayMetas,&dlen); d = getDescriptionBuf(m_req->ptr_displayMetas,&dsize);
if ( ! d || d == (char *)-1 ) return (Msg20Reply *)d; if ( ! d || d == (char *)-1 ) return (Msg20Reply *)d;
reply->ptr_dbuf = d; reply->ptr_dbuf = d;
reply->size_dbuf = dlen + 1; reply->size_dbuf = dsize; // includes \0
} }
// breathe // breathe
@ -30370,9 +30500,9 @@ Matches *XmlDoc::getMatches () {
} }
// sender wants meta description, custom tags, etc. // sender wants meta description, custom tags, etc.
char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) { char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dsize ) {
// return the buffer if we got it // return the buffer if we got it
if ( m_dbufValid ) { *dlen = m_dbufLen; return m_dbuf; } if ( m_dbufValid ) { *dsize = m_dbufSize; return m_dbuf; }
Xml *xml = getXml(); Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml; if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
// now get the content of the requested display meta tags // now get the content of the requested display meta tags
@ -30416,6 +30546,14 @@ char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) {
gbstrlen(s) , // name len gbstrlen(s) , // name len
"name" , // http-equiv/name "name" , // http-equiv/name
false );// convert &#'s? false );// convert &#'s?
dptr[wlen] = '\0';
// test it out
if ( ! verifyUtf8 ( dptr ) ) {
log("xmldoc: invalid utf8 content for meta tag %s.",s);
continue;
}
// advance and NULL terminate // advance and NULL terminate
dptr += wlen; dptr += wlen;
*dptr++ = '\0'; *dptr++ = '\0';
@ -30425,8 +30563,9 @@ char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) {
"was encountered. Truncating.",dbufEnd-m_dbuf); "was encountered. Truncating.",dbufEnd-m_dbuf);
} }
// what is the size of the content of displayed meta tags? // what is the size of the content of displayed meta tags?
m_dbufLen = dptr - m_dbuf; m_dbufSize = dptr - m_dbuf;
m_dbufValid = true; m_dbufValid = true;
*dsize = m_dbufSize;
return m_dbuf; return m_dbuf;
} }
@ -30519,6 +30658,15 @@ Title *XmlDoc::getTitle ( ) {
Summary *XmlDoc::getSummary () { Summary *XmlDoc::getSummary () {
if ( m_summaryValid ) return &m_summary; if ( m_summaryValid ) return &m_summary;
// xml and json docs have empty summaries for now
uint8_t *ct = getContentType();
if ( ! ct || ct == (void *)-1 ) return (Summary *)ct;
if ( *ct == CT_JSON || *ct == CT_XML ) {
m_summaryValid = true;
return &m_summary;
}
// need a buncha crap // need a buncha crap
Words *ww = getWords(); Words *ww = getWords();
if ( ! ww || ww == (Words *)-1 ) return (Summary *)ww; if ( ! ww || ww == (Words *)-1 ) return (Summary *)ww;
@ -35813,7 +35961,7 @@ char **XmlDoc::getRootTitleBuf ( ) {
char *src = NULL; char *src = NULL;
long srcSize = 0; long srcSize = 0;
if ( ptr_rootTitleBuf ) { if ( ptr_rootTitleBuf || m_setFromTitleRec ) {
src = ptr_rootTitleBuf; src = ptr_rootTitleBuf;
srcSize = size_rootTitleBuf; srcSize = size_rootTitleBuf;
} }
@ -48352,9 +48500,9 @@ Json *XmlDoc::getParsedJson ( ) {
#include "Json.h" #include "Json.h"
char *XmlDoc::hashJSON ( HashTableX *table ) { char *XmlDoc::hashJSONFields ( HashTableX *table ) {
setStatus ( "hashing json" ); setStatus ( "hashing json fields" );
HashInfo hi; HashInfo hi;
hi.m_tt = table; hi.m_tt = table;
@ -48515,6 +48663,58 @@ char *XmlDoc::hashJSON ( HashTableX *table ) {
return (char *)0x01; return (char *)0x01;
} }
char *XmlDoc::hashXMLFields ( HashTableX *table ) {
setStatus ( "hashing xml fields" );
HashInfo hi;
hi.m_tt = table;
hi.m_desc = "xml object";
hi.m_hashGroup = HASHGROUP_BODY;
Xml *xml = getXml();
long n = xml->getNumNodes();
XmlNode *nodes = xml->getNodes ();
SafeBuf nameBuf;
// scan the xml nodes
for ( long i = 0 ; i < n ; i++ ) {
// breathe
QUICKPOLL(m_niceness);
// . skip it if it's a tag, not a text node
// . we just want the "text" nodes
if ( nodes[i].isTag() ) continue;
// assemble the full parent name
// like "tag1.tag2.tag3"
nameBuf.reset();
xml->getCompoundName ( i , &nameBuf );
// this is \0 terminated
char *tagName = nameBuf.getBufStart();
// get the utf8 text
char *val = nodes[i].m_node;
long vlen = nodes[i].m_nodeLen;
// index like "title:whatever"
if ( tagName && tagName[0] ) {
hi.m_prefix = tagName;
hashString ( val , vlen , &hi );
}
// hash without the field name as well
hi.m_prefix = NULL;
hashString ( val , vlen , &hi );
}
return (char *)0x01;
}
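
A sketch of the indexing shape hashXMLFields() produces: every XML text node is hashed twice, once with its dotted compound tag name as the field prefix (so field:value searches work for XML the way they already did for JSON) and once with no prefix for plain full-text matching. indexTerm() below just prints what would be handed to hashString().

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Stand-in for XmlDoc::hashString(): here we just print the term that
// would be added to the index, with or without a field prefix.
static void indexTerm(const std::string &prefix, const std::string &text) {
    if (prefix.empty()) printf("term: %s\n", text.c_str());
    else                printf("term: %s:%s\n", prefix.c_str(), text.c_str());
}

int main() {
    // (compound tag name, text node) pairs as getCompoundName() would
    // produce them for <product><title>Acme Anvil</title>...</product>
    std::vector<std::pair<std::string, std::string>> nodes = {
        { "product.title", "Acme Anvil" },
        { "product.price", "19.99"      },
    };
    for (const auto &n : nodes) {
        indexTerm(n.first, n.second);  // field:value form, e.g. product.title:Acme Anvil
        indexTerm("",      n.second);  // plain full-text form
    }
}
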
// if our url is that of a subdoc, then get the url of the parent doc // if our url is that of a subdoc, then get the url of the parent doc
// from which we were a subsection // from which we were a subsection
char *XmlDoc::getDiffbotParentUrl( char *myUrl ) { char *XmlDoc::getDiffbotParentUrl( char *myUrl ) {
@ -48561,6 +48761,9 @@ bool XmlDoc::storeFacetValues ( char *qs , SafeBuf *sb , FacetValHash_t fvh ) {
if ( m_contentType == CT_HTML ) if ( m_contentType == CT_HTML )
return storeFacetValuesHtml ( qs , sb , fvh ); return storeFacetValuesHtml ( qs , sb , fvh );
if ( m_contentType == CT_XML )
return storeFacetValuesXml ( qs , sb , fvh );
return true; return true;
} }
@ -48702,6 +48905,89 @@ bool XmlDoc::storeFacetValuesHtml(char *qs, SafeBuf *sb, FacetValHash_t fvh ) {
return true; return true;
} }
bool XmlDoc::storeFacetValuesXml(char *qs, SafeBuf *sb, FacetValHash_t fvh ) {
Xml *xml = getXml();
long qsLen = gbstrlen(qs);
bool isString = false;
if ( strncmp(qs-4,"str:",4) == 0 ) isString = true;
long i = 0;
bool uniqueField = false;
SafeBuf nameBuf;
// scan the tag nodes, matching each compound name against the field
for ( i = 0 ; i < xml->m_numNodes ; i++ ) {
// skip text nodes
if ( xml->m_nodes[i].m_nodeId == 0 ) continue;
// assemble the full parent name
// like "tag1.tag2.tag3"
nameBuf.reset();
xml->getCompoundName ( i , &nameBuf );
long nameLen = nameBuf.length();
char *s = nameBuf.getBufStart();
// . "s" now holds the compound tag name, e.g. "tag1.tag2.tag3"
// . compare it against the requested facet field "qs"
if ( nameLen != qsLen ) continue;
if ( strncasecmp ( s , qs , qsLen ) != 0 ) continue;
// got it...
// wtf?
if ( i + 1 >= xml->m_numNodes ) continue;
// point to the content! this is a text node?
// skip if not a text node, we don't return tag nodes i guess
if ( xml->m_nodes[i+1].m_nodeId ) continue;
char *content = xml->m_nodes[i+1].m_node;
long contentLen = xml->m_nodes[i+1].m_nodeLen;
// skip if empty
if ( ! content || contentLen <= 0 ) continue;
// skip common cases too, like a lone whitespace char
if ( contentLen == 1 && is_wspace_a(content[0]) ) continue;
// hash it to match it if caller specified a particular hash
// because they are coming from Msg40::lookUpFacets() function
// to convert the hashes to strings, like for rendering in
// the facets box to the left of the search results
FacetValHash_t val32 = hash32 ( content, contentLen);
if ( fvh && fvh != val32 ) continue;
// otherwise add facet FIELD to our buf
if ( ! sb->safeStrcpy(qs) ) return false;
if ( ! sb->pushChar('\0') ) return false;
// then add facet VALUE
if ( isString && !sb->safePrintf("%lu,",(unsigned long)val32))
return false;
if ( !sb->safeMemcpy(content,contentLen) ) return false;
if ( !sb->pushChar('\0') ) return false;
// if only one specified, we are done
if ( fvh ) return true;
if ( uniqueField ) return true;
}
return true;
}
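
A sketch of the buffer layout storeFacetValuesXml() builds for Msg40's facet lookup: for each tag whose compound name matches the requested field, the field name and the value are appended as NUL-terminated strings, with the 32-bit value hash prepended as "hash," for string facets. hash32() below is a placeholder, not Gigablast's real hash.

#include <cstdio>
#include <string>

// Placeholder value hash, only so the example runs on its own.
static unsigned long hash32(const std::string &s) {
    unsigned long h = 0;
    for (unsigned char c : s) h = h * 31 + c;
    return h;
}

// Mirror of the output layout built above: field\0[hash,]value\0 ...
static void appendFacet(std::string &out, const std::string &field,
                        const std::string &value, bool isString) {
    out += field;
    out += '\0';
    if (isString) out += std::to_string(hash32(value)) + ",";
    out += value;
    out += '\0';
}

int main() {
    std::string buf;
    appendFacet(buf, "product.price", "19.99", false);
    appendFacet(buf, "product.title", "Acme Anvil", true);
    printf("buffer holds %zu bytes of field/value pairs\n", buf.size());
}
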
bool XmlDoc::storeFacetValuesJSON (char *qs, SafeBuf *sb,FacetValHash_t fvh ) { bool XmlDoc::storeFacetValuesJSON (char *qs, SafeBuf *sb,FacetValHash_t fvh ) {
// use new json parser // use new json parser

View File

@ -802,6 +802,7 @@ class XmlDoc {
bool linksToGigablast ( ) ; bool linksToGigablast ( ) ;
bool searchboxToGigablast ( ) ; bool searchboxToGigablast ( ) ;
bool hashLanguage ( class HashTableX *table ) ; bool hashLanguage ( class HashTableX *table ) ;
bool hashLanguageString ( class HashTableX *table ) ;
bool hashCountry ( class HashTableX *table ) ; bool hashCountry ( class HashTableX *table ) ;
bool hashSiteNumInlinks ( class HashTableX *table ) ; bool hashSiteNumInlinks ( class HashTableX *table ) ;
bool hashCharset ( class HashTableX *table ) ; bool hashCharset ( class HashTableX *table ) ;
@ -917,6 +918,8 @@ class XmlDoc {
FacetValHash_t fvh ) ; FacetValHash_t fvh ) ;
bool storeFacetValuesHtml ( char *qs , class SafeBuf *sb , bool storeFacetValuesHtml ( char *qs , class SafeBuf *sb ,
FacetValHash_t fvh ) ; FacetValHash_t fvh ) ;
bool storeFacetValuesXml ( char *qs , class SafeBuf *sb ,
FacetValHash_t fvh ) ;
bool storeFacetValuesJSON ( char *qs , class SafeBuf *sb , bool storeFacetValuesJSON ( char *qs , class SafeBuf *sb ,
FacetValHash_t fvh ) ; FacetValHash_t fvh ) ;
@ -1695,7 +1698,8 @@ class XmlDoc {
//bool doesUrlMatchDiffbotProcessPattern() ; //bool doesUrlMatchDiffbotProcessPattern() ;
bool doesPageContentMatchDiffbotProcessPattern() ; bool doesPageContentMatchDiffbotProcessPattern() ;
long *getDiffbotTitleHashes ( long *numHashes ) ; long *getDiffbotTitleHashes ( long *numHashes ) ;
char *hashJSON ( HashTableX *table ); char *hashJSONFields ( HashTableX *table );
char *hashXMLFields ( HashTableX *table );
long *nukeJSONObjects ( long *newTitleHashes , long numNewHashes ) ; long *nukeJSONObjects ( long *newTitleHashes , long numNewHashes ) ;
long m_joc; long m_joc;
@ -2032,7 +2036,7 @@ class XmlDoc {
Query m_query; Query m_query;
Matches m_matches; Matches m_matches;
// meta description buf // meta description buf
long m_dbufLen; long m_dbufSize;
char m_dbuf[1024]; char m_dbuf[1024];
SafeBuf m_htb; SafeBuf m_htb;
Title m_title; Title m_title;

View File

@ -95,6 +95,7 @@ class XmlNode {
// . use for <a href> xml nodes only right now // . use for <a href> xml nodes only right now
// . used so XmlDoc.cpp::getContactUsLink() works better // . used so XmlDoc.cpp::getContactUsLink() works better
//long m_linkNum; //long m_linkNum;
class XmlNode *m_parent;
}; };
// . does "s" start a tag? (regular tag , back tag or comment tag) // . does "s" start a tag? (regular tag , back tag or comment tag)

View File

@ -1,5 +1,5 @@
gb (1.14-1) unstable; urgency=low gb (1.16-1) unstable; urgency=low
* More bug fixes. * More bug fixes.
-- mwells <gigablast@mail.com> Tue, 19 Sep 2014 21:38:35 -0700 -- mwells <gigablast@mail.com> Tue, 24 Sep 2014 21:38:35 -0700

View File

@ -2,17 +2,30 @@
unsigned long long g_hashtab[256][256] ; unsigned long long g_hashtab[256][256] ;
// . now we explicitly specify the zobrist table so we are compatible
// with cygwin and apple environments
// . no, let's just define the rand2() function to be compatible then
//#include "hashtab.cpp"
// . used for computing zobrist hash of a string up to 256 chars long // . used for computing zobrist hash of a string up to 256 chars long
// . first array component is the max length, 256, of the string // . first array component is the max length, 256, of the string
bool hashinit () { bool hashinit () {
static bool s_initialized = false; static bool s_initialized = false;
// bail if we already called this // bail if we already called this
if ( s_initialized ) return true; if ( s_initialized ) return true;
// show RAND_MAX // show RAND_MAX
//printf("RAND_MAX = %lu\n", RAND_MAX ); it's 0x7fffffff //printf("RAND_MAX = %lu\n", RAND_MAX ); it's 0x7fffffff
// seed with same value so we get same rand sequence for all // seed with same value so we get same rand sequence for all
srand ( 1945687 ); srand ( 1945687 );
for ( long i = 0 ; i < 256 ; i++ )
//if ( g_hashtab[0][0] != 6720717044602784129LL ) return false;
//s_initialized = true;
//return true;
//fprintf(stdout,"g_hashtab[256][256]={\n");
for ( long i = 0 ; i < 256 ; i++ ) {
//fprintf(stdout,"{");
for ( long j = 0 ; j < 256 ; j++ ) { for ( long j = 0 ; j < 256 ; j++ ) {
g_hashtab [i][j] = (unsigned long long)rand(); g_hashtab [i][j] = (unsigned long long)rand();
// the top bit never gets set, so fix // the top bit never gets set, so fix
@ -23,8 +36,17 @@ bool hashinit () {
// the top bit never gets set, so fix // the top bit never gets set, so fix
if ( rand() > (0x7fffffff / 2) ) if ( rand() > (0x7fffffff / 2) )
g_hashtab[i][j] |= 0x80000000; g_hashtab[i][j] |= 0x80000000;
// fixes for cygwin/apple
//fprintf(stdout,"%lluULL",g_hashtab[i][j]);
//if ( j+1<256 ) fprintf(stdout,",");
} }
//fprintf(stdout,"},\n");
}
//fprintf(stdout,"};\n");
//fflush ( stdout );
if ( g_hashtab[0][0] != 6720717044602784129LL ) return false; if ( g_hashtab[0][0] != 6720717044602784129LL ) return false;
s_initialized = true; s_initialized = true;
return true; return true;
} }
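
The hunk above keeps hashinit() building g_hashtab[256][256] from a fixed srand() seed, with the g_hashtab[0][0] check guarding against platforms (cygwin, OS X) whose rand() sequence differs. A simplified sketch of how such a table is typically used for Zobrist-style string hashing; the table construction below is a stand-in for the real init, and the XOR-by-position combination is an assumption since the combining code is not part of this hunk.

#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

static uint64_t tab[256][256];

// Simplified stand-in for hashinit(): fill the table from a fixed seed
// so every run (and every host) gets the same values.  The real code
// builds each 64-bit entry from rand() with top-bit fix-ups.
static void initTable() {
    srand(1945687);
    for (int i = 0; i < 256; i++)
        for (int j = 0; j < 256; j++) {
            uint64_t hi = (uint64_t)rand();
            uint64_t lo = (uint64_t)rand();
            tab[i][j] = (hi << 32) | lo;
        }
}

// Zobrist-style string hash: combine one table entry per (position, byte).
// The XOR combination is assumed here, not shown in the hunk above.
static uint64_t hashString(const char *s) {
    uint64_t h = 0;
    size_t len = strlen(s);
    for (size_t i = 0; i < len && i < 256; i++)
        h ^= tab[i][(unsigned char)s[i]];
    return h;
}

int main() {
    initTable();
    printf("%llu\n", (unsigned long long)hashString("gigablast"));
}
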

View File

@ -22,7 +22,7 @@ with the least amount of hardware possible. Gigablast provides large-scale,
</p> </p>
<br> <br>
<p> <p>
Fr more information, <a href=/contact.html>contact Gigablast</a>. For more information, <a href=/contact.html>contact Gigablast</a>.
</P> </P>
<br> <br>
<br> <br>

50
html/adv.html Normal file
View File

@ -0,0 +1,50 @@
<br><br><br>
<form method=GET action=/search name=f>
<table width=605 border=0 align=center cellpadding=5 cellspacing=3>
<tbody>
<tr align=left valign=middle><th colspan=3>Search for...</th></tr><tr align=left valign=middle><td><strong>all</strong> of these words</td><td><input type=text id=q name=plus size=40 /></td><td><div onclick=document.f.submit(); onmouseover="this.style.backgroundColor='lightgreen';this.style.color='black';" onmouseout="this.style.backgroundColor='green';this.style.color='white';" style=border-radius:28px;cursor:pointer;cursor:hand;border-color:white;border-style:solid;border-width:3px;padding:12px;width:20px;height:20px;display:inline-block;background-color:green;color:white;><b style=margin-left:-5px;font-size:18px;>GO</b></div></td></tr><tr align=left valign=middle><td>this <strong>exact phrase</strong></td><td colspan=2><input type=text name=quotea size=40 /></td></tr><tr align=left valign=middle><td>and this <strong>exact phrase</strong></td><td colspan=2><input type=text name=quoteb size=40 /></td></tr>
<tr align=left valign=middle><td><strong>any</strong> of these words</td><td colspan=2><input type=text name=any size=40 /></td></tr>
<tr align=left valign=middle><td><strong>none</strong> of these words</td><td colspan=2><input type=text name=minus size=40 /></td></tr>
<!--<tr align=left valign=middle><td>Family Filter</td><td colspan=2><input type=radio name=ff value=1/>yes&nbsp;&nbsp;&nbsp;<input type=radio name=ff value=0 checked/>no</td></tr>-->
<tr align=left valign=middle><td>In this language:</td><td colspan=2>
<select name=gblang style=width:415px;>
<option value=-1>Any</option>
<option value=0>Unknown</option>
<option value=1>English</option><option value=2>French</option><option value=3>Spanish</option><option value=4>Russian</option><option value=5>Turkish</option><option value=6>Japanese</option><option value=7>ChineseTrad</option><option value=8>ChineseSimp</option><option value=9>Korean</option><option value=10>German</option><option value=11>Dutch</option><option value=12>Italian</option><option value=13>Finnish</option><option value=14>Swedish</option><option value=15>Norwegian</option><option value=16>Portuguese</option><option value=17>Vietnamese</option><option value=18>Arabic</option><option value=19>Hebrew</option><option value=20>Indonesian</option><option value=21>Greek</option><option value=22>Thai</option><option value=23>Hindi</option><option value=24>Bengala</option><option value=25>Polish</option><option value=26>Tagalog</option></select></td></tr>
<tr align=left valign=middle><td>Pages that link to this URL</td><td colspan=2><input type=text name=link size=40 /></td></tr>
<tr><td>Search these collections</td><td><input type=text name=c size=40></td></tr>
<tr align=left valign=middle><td>Site Clustering</td><td colspan=2><input type=radio name=sc value=1/>yes&nbsp;&nbsp;&nbsp;<input type=radio name=sc value=0 checked/>no</td></tr>
<tr align=left valign=middle><td>Number of summary excerpts</td><td colspan=2><input type=radio name=ns value=0>0&nbsp;&nbsp;&nbsp;<input type=radio name=ns value=1>1&nbsp;&nbsp;&nbsp;<input type=radio name=ns value=2>2&nbsp;&nbsp;&nbsp;<input type=radio name=ns value=3 checked>3&nbsp;&nbsp;&nbsp;<input type=radio name=ns value=4>4&nbsp;&nbsp;&nbsp;<input type=radio name=ns value=5>5</td></tr>
<tr align=left valign=middle><td>Results per Page</td>
<td colspan=2><input type=radio name=n value=10 checked/>10&nbsp;&nbsp;<input type=radio name=n value=20 />20&nbsp;&nbsp;<input type=radio name=n value=30 />30&nbsp;&nbsp;<input type=radio name=n value=40 />40&nbsp;&nbsp;<input type=radio name=n value=50 />50&nbsp;&nbsp;<input type=radio name=n value=100 />100</td></tr>
<tr align=left valign=middle><td>Restrict to these Sites</td><td colspan=2><textarea rows=10 cols=56 name=sites></textarea></td></tr>
<tr><td></td><td><input type=submit></td></tr>
</tbody></table>
</form>
<br>
<br><br>

View File

@ -137,7 +137,7 @@ Good luck!
<td><b>HTTP API</b></td> <td><b>HTTP API</b></td>
<!-- gb install --> <!-- gb install -->
<td> <td>
<a href=/api2.html>here</a> <a href=/admin/api>here</a>
</td> </td>
<!-- solr install--> <!-- solr install-->
<td> <td>
@ -262,7 +262,7 @@ Many different packages quilted together. Apache, MySQL, Lucene, Tika, Zookeeper
<!--gigablast--> <!--gigablast-->
<td> <td>
<font color=green><b> <font color=green><b>
Use curl using args (including <i>delim</i>) listed <a href=/api2.html#/admin/inject>here</a> Use curl using args (including <i>delim</i>) listed <a href=/admin/api#/admin/inject>here</a>
</b></font> </b></font>
<br> <br>
</td> </td>
@ -282,7 +282,7 @@ unsupported
<!--gigablast--> <!--gigablast-->
<td> <td>
Use curl to post the content of the file with args listed Use curl to post the content of the file with args listed
<a href=/api2.html#/admin/inject>here</a> <a href=/admin/api#/admin/inject>here</a>
</td> </td>
<!--solr--> <!--solr-->
<td> <td>
@ -300,7 +300,7 @@ You can index individual local files as such:
<!--gigablast--> <!--gigablast-->
<td> <td>
Use curl to inject the url with args listed Use curl to inject the url with args listed
<a href=/api2.html#/admin/inject>here</a> <a href=/admin/api#/admin/inject>here</a>
</td> </td>
<!--solr--> <!--solr-->
@ -317,7 +317,7 @@ Use curl to inject the url with args listed
<!--gigablast--> <!--gigablast-->
<td> <td>
Use one curl command for each url, using the interface described Use one curl command for each url, using the interface described
<a href=/api2.html#/admin/inject>here</a></b> <a href=/admin/api#/admin/inject>here</a></b>
</td> </td>
<!--solr--> <!--solr-->
<td> <td>
@ -335,7 +335,7 @@ Use one curl command for each url, using the interface described
<!--gigablast--> <!--gigablast-->
<td> <td>
Use curl command to delete a url, using the interface described Use curl command to delete a url, using the interface described
<a href=/api2.html#/admin/inject>here</a></b> <a href=/admin/api#/admin/inject>here</a></b>
</td> </td>
<!--solr--> <!--solr-->
<td> <td>
@ -351,7 +351,7 @@ You can delete individual documents by specifying queries that match just those
<td><b>Getting Results via cmdline</b></td> <td><b>Getting Results via cmdline</b></td>
<td> <td>
Use curl command to do a search, using the interface described Use curl command to do a search, using the interface described
<a href=/api2.html#/search>here</a></b> <a href=/admin/api#/search>here</a></b>
</td> </td>
<td> <td>
??? ???
@ -882,3 +882,4 @@ and federated search across them.
</table> </table>
<br><br><br>

File diff suppressed because one or more lines are too long

BIN
html/rocket16.png Normal file

Binary file not shown.

Size: 784 B

13
html/searchbar.xml Normal file
View File

@ -0,0 +1,13 @@
<?xml version="1.0" encoding="UTF-8"?>
<OpenSearchDescription xmlns="http://a9.com/-/spec/opensearch/1.1/" xmlns:moz="http://www.mozilla.org/2006/browser/search/">
<ShortName>GigaBlast</ShortName>
<Description>The Search Engine</Description>
<Language>en-us</Language>
<OutputEncoding>UTF-8</OutputEncoding>
<InputEncoding>UTF-8</InputEncoding>
<Image width="16" height="16" type="image/png">http://www.gigablast.com/rocket16.png</Image>
<Url type="text/html" method="GET" template="http://www.gigablast.com/search?q={searchTerms}&amp;"></Url>
<Url type="application/xhtml+xml" indexOffset="0" template="http://www.gigablast.com/search?q={searchTerms}&amp;"></Url>
<moz:SearchForm>www.gigablast.com</moz:SearchForm>
</OpenSearchDescription>

View File

@ -1,7 +1,7 @@
<br><br><br> <br><br><br>
<h1>People that Use Gigablast</h1> <h1>Gigablast Open Source Users</h1>
<table cellpadding=10 style=max-width:500px;> <table cellpadding=10 style=max-width:500px;>
@ -55,4 +55,34 @@ search engine."
</table>
<br><br>
<h1>Gigablast Pre-Open-Source Users</h1>
<table cellpadding=10 style=max-width:500px;>
<tr><td>
<img width=80 height=120 src=/user1.jpeg>
</td><td>
Snap.com powered its web search technology by running Gigablast on over 100 servers. Snap was started by the founder of Overture/GoTo.com, Bill Gross. Before it closed, Snap helped pioneer the marketplace for ads that pop up when you mouse over a link.
</td></tr>
<tr><td>
<img width=80 height=120 src=/user1.jpeg>
</td><td>
GlobalSpec.com used Gigablast to index and search over millions of technical products to help grow it into one of the market leaders for online technical and industrial product information.
</td></tr>
<tr><td>
<img width=80 height=120 src=/user1.jpeg>
</td><td>
MetaLincs embedded Gigablast into its commercial solution for E-mail discovery before being acquired by Seagate, Inc.
</td></tr>
</table> </table>

View File

@ -2691,7 +2691,8 @@ int main2 ( int argc , char *argv[] ) {
// hash the term itself // hash the term itself
termId = hash64n(targ); termId = hash64n(targ);
// hash prefix with termhash // hash prefix with termhash
termId = hash64(termId,prefix64); if ( prefix64 )
termId = hash64(termId,prefix64);
termId &= TERMID_MASK; termId &= TERMID_MASK;
} }
else { else {
@ -5598,6 +5599,7 @@ bool registerMsgHandlers2(){
if(! g_udpServer.registerHandler(0x3f,handleRequest3f)) return false; if(! g_udpServer.registerHandler(0x3f,handleRequest3f)) return false;
if ( ! g_udpServer.registerHandler(0x25,handleRequest25)) return false; if ( ! g_udpServer.registerHandler(0x25,handleRequest25)) return false;
if ( ! g_udpServer.registerHandler(0x07,handleRequest7)) return false;
return true; return true;
@ -12886,6 +12888,7 @@ void dumpPosdb (char *coll,long startFileNum,long numFiles,bool includeTree,
if ( termId >= 0 ) { if ( termId >= 0 ) {
g_posdb.makeStartKey ( &startKey, termId ); g_posdb.makeStartKey ( &startKey, termId );
g_posdb.makeEndKey ( &endKey, termId ); g_posdb.makeEndKey ( &endKey, termId );
printf("termid=%llu\n",termId);
printf("startkey=%s\n",KEYSTR(&startKey,sizeof(POSDBKEY))); printf("startkey=%s\n",KEYSTR(&startKey,sizeof(POSDBKEY)));
printf("endkey=%s\n",KEYSTR(&endKey,sizeof(POSDBKEY))); printf("endkey=%s\n",KEYSTR(&endKey,sizeof(POSDBKEY)));
} }

BIN
pdftohtml

Binary file not shown.

720
qa.cpp
View File

@ -211,6 +211,11 @@ void processReply ( char *reply , long replyLen ) {
// # of collections in the admin page: ..."4 Collections" // # of collections in the admin page: ..."4 Collections"
markOut(content,"px;color:black;\"><center><nobr><b>"); markOut(content,"px;color:black;\"><center><nobr><b>");
markOut(content,"spider is done (");
markOut(content,"spider is paused (");
markOut(content,"spider is active (");
markOut(content,"spider queue empty (");
// make checksum. we ignore back to back spaces so this // make checksum. we ignore back to back spaces so this
// hash works for <docsInCollection>10 vs <docsInCollection>9 // hash works for <docsInCollection>10 vs <docsInCollection>9
long contentCRC = 0; long contentCRC = 0;
@ -502,8 +507,6 @@ static long *s_flags = NULL;
// //
bool qainject1 ( ) { bool qainject1 ( ) {
//if ( ! s_callback ) s_callback = qainject1;
// //
// delete the 'qatest123' collection // delete the 'qatest123' collection
// //
@ -520,7 +523,8 @@ bool qainject1 ( ) {
//static bool s_x2 = false; //static bool s_x2 = false;
if ( ! s_flags[1] ) { if ( ! s_flags[1] ) {
s_flags[1] = true; s_flags[1] = true;
if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1&"
"collectionips=127.0.0.1" ,
// checksum of reply expected // checksum of reply expected
238170006 ) ) 238170006 ) )
return false; return false;
@ -529,13 +533,12 @@ bool qainject1 ( ) {
// turn off images thumbnails // turn off images thumbnails
if ( ! s_flags[17] ) { if ( ! s_flags[17] ) {
s_flags[17] = true; s_flags[17] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0", if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
// checksum of reply expected // checksum of reply expected
238170006 ) ) 238170006 ) )
return false; return false;
} }
// this only loads once // this only loads once
loadUrls(); loadUrls();
long max = s_ubuf2.length()/(long)sizeof(char *); long max = s_ubuf2.length()/(long)sizeof(char *);
@ -605,6 +608,55 @@ bool qainject1 ( ) {
return false; return false;
} }
//
// adv.html test
//
// query for 'test' using adv.html advanced search interface
if ( ! s_flags[27] ) {
s_flags[27] = true;
if ( ! getUrl (
"/search?c=qatest123&qa=17&format=xml&"
"dr=1&pss=50&sc=1&hacr=1&quotea=web+site&"
"gblang=1&minus=transcripts&n=150",
123 ) )
return false;
}
// &sites= test
if ( ! s_flags[28] ) {
s_flags[28] = true;
if ( ! getUrl (
"/search?c=qatest123&qa=17&format=xml&q=web&"
"sortby=2&"
// html only:
"sw=20&"
"filetype=html&"
"ff=1&"
"facet=gbfacetint:gbhopcount&"
"sites=mindtools.com+www.redcross.org"
, 123 ) )
return false;
}
// html test of summary width
if ( ! s_flags[29] ) {
s_flags[29] = true;
if ( ! getUrl (
"/search?c=qatest123&qa=17&format=html&q=web&"
// html only:
"sw=20&tml=10&ns=1&smxcpl=30&qh=0&n=100&"
"dt=keywords+description&"
"facet=gbfacetint:gbspiderdate&"
, 123 ) )
return false;
}
// stop for now
//return true; //
// //
// eject/delete the urls // eject/delete the urls
// //
@ -682,7 +734,7 @@ bool qainject2 ( ) {
// turn off images thumbnails // turn off images thumbnails
if ( ! s_flags[17] ) { if ( ! s_flags[17] ) {
s_flags[17] = true; s_flags[17] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0", if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
// checksum of reply expected // checksum of reply expected
238170006 ) ) 238170006 ) )
return false; return false;
@ -758,7 +810,7 @@ bool qainject2 ( ) {
// //
// mdw: query reindex test // mdw: query DELETE test
// //
if ( ! s_flags[30] ) { if ( ! s_flags[30] ) {
s_flags[30] = true; s_flags[30] = true;
@ -824,6 +876,406 @@ bool qainject2 ( ) {
return true; return true;
} }
bool qaimport () {
//
// delete the 'qatest123' collection
//
//static bool s_x1 = false;
if ( ! s_flags[0] ) {
s_flags[0] = true;
if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
return false;
}
//
// add the 'qatest123' collection
//
//static bool s_x2 = false;
if ( ! s_flags[1] ) {
s_flags[1] = true;
if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
// checksum of reply expected
238170006 ) )
return false;
}
// turn spiders off so it doesn't spider while we are importing
if ( ! s_flags[18] ) {
s_flags[18] = true;
if ( ! getUrl ( "/admin/spider?cse=0&c=qatest123",
// checksum of reply expected
238170006 ) )
return false;
}
// set the import dir and # inject threads
if ( ! s_flags[17] ) {
s_flags[17] = true;
if ( ! getUrl ( "/admin/import?c=qatest123&importdir=%2Fhome%2Fmwells%2Ftesting%2Fimport%2F&numimportinjects=3&import=1&action=submit",
// checksum of reply expected
238170006 ) )
return false;
}
// wait for importloop to "kick in" so it can set cr->m_importState
if ( ! s_flags[3] ) {
wait(1.0);
s_flags[3] = true;
return false;
}
// import must be done!
if ( ! s_flags[19] ) {
CollectionRec *cr = g_collectiondb.getRec("qatest123");
// if still importing this will be non-null
if ( cr->m_importState ) {
wait(1.0);
return false;
}
// all done then
s_flags[19] = true;
}
// wait for absorption of index
if ( ! s_flags[28] ) {
wait(2.0);
s_flags[28] = true;
return false;
}
// test query
if ( ! s_flags[16] ) {
s_flags[16] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe"
"&dsrt=500",
702467314 ) )
return false;
}
// test site clustering
if ( ! s_flags[29] ) {
s_flags[29] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
"q=mediapost&dsrt=0&sc=1",
702467314 ) )
return false;
}
//static bool s_fee2 = false;
if ( ! s_flags[13] ) {
s_flags[13] = true;
log("qa: SUCCESSFULLY COMPLETED DATA "
"IMPORT TEST");
//if ( s_callback == qainject ) exit(0);
return true;
}
return true;
}
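
All of the qa*() routines above and below share the same resumable pattern: s_flags[] records which steps already ran, each step fires one getUrl() and returns false while the request is outstanding, and when the reply arrives the routine is re-entered and falls through to the next unfinished step. A tiny synchronous model of that control flow; fakeGetUrl() is a stand-in, though the paths mirror the qatest123 calls above.

#include <cstdio>

static long s_flags[30] = {0};   // which steps have already run

static bool fakeGetUrl(const char *path) {   // stand-in for getUrl()
    printf("GET %s\n", path);
    return true;   // the real call returns false and resumes via a callback
}

static bool qaSketch() {
    if (!s_flags[0]) { s_flags[0] = 1; if (!fakeGetUrl("/admin/delcoll?delcoll=qatest123")) return false; }
    if (!s_flags[1]) { s_flags[1] = 1; if (!fakeGetUrl("/admin/addcoll?addcoll=qatest123")) return false; }
    if (!s_flags[2]) { s_flags[2] = 1; if (!fakeGetUrl("/search?c=qatest123&q=test"))       return false; }
    return true;   // all steps done
}

int main() {
    while (!qaSketch()) { /* re-entered by the reply callback in the real code */ }
}
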
bool qainlinks() {
//
// delete the 'qatest123' collection
//
//static bool s_x1 = false;
if ( ! s_flags[0] ) {
s_flags[0] = true;
if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
return false;
}
//
// add the 'qatest123' collection
//
//static bool s_x2 = false;
if ( ! s_flags[1] ) {
s_flags[1] = true;
if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
// checksum of reply expected
238170006 ) )
return false;
}
// turn spiders off so it doesn't spider while we are importing
if ( ! s_flags[18] ) {
s_flags[18] = true;
if ( ! getUrl ( "/admin/spider?cse=0&c=qatest123",
// checksum of reply expected
238170006 ) )
return false;
}
// inject youtube
if ( ! s_flags[2] ) {
s_flags[2] = true;
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&"
"format=xml&u=www.youtube.com");
if ( ! getUrl ( sb.getBufStart() , 999 ) )
return false;
}
// test query
if ( ! s_flags[3] ) {
s_flags[3] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&q=youtube"
,702467314 ) )
return false;
}
// scrape inlinkers
if ( ! s_flags[4] ) {
s_flags[4] = true;
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&"
"format=xml&qts=link:www.youtube.com&n=100");
if ( ! getUrl ( sb.getBufStart() , 999 ) )
return false;
}
// inject better inlinkers
if ( ! s_flags[20] ) {
s_flags[20] = true;
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&"
"format=xml&"
"url=www.freebsd.org%%2Fcommunity.html");
if ( ! getUrl ( sb.getBufStart() , 999 ) )
return false;
}
// wait a second for linkdb absorption
if ( ! s_flags[5] ) {
wait(1.0);
s_flags[5] = true;
return false;
}
// RE-inject youtube
if ( ! s_flags[6] ) {
s_flags[6] = true;
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&"
"format=xml&u=www.youtube.com");
if ( ! getUrl ( sb.getBufStart() , 999 ) )
return false;
}
	// wait a couple of seconds for term freq stabilization
if ( ! s_flags[9] ) {
wait(2.0);
s_flags[9] = true;
return false;
}
// test query
if ( ! s_flags[7] ) {
s_flags[7] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&"
"format=xml&q=youtube"
// get scoring info
"&scores=1"
,702467314 ) )
return false;
}
//static bool s_fee2 = false;
if ( ! s_flags[13] ) {
s_flags[13] = true;
log("qa: SUCCESSFULLY COMPLETED INLINK TEST");
//if ( s_callback == qainject ) exit(0);
return true;
}
return true;
}
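// Sketch (illustrative only): the numeric second argument passed to
// getUrl() throughout these tests, e.g. 702467314 above, is an expected
// checksum of the reply, so any change in output shows up as a qa
// failure. Something like the rolling hash below is one way such a
// checksum could be computed; hashReply32() is a hypothetical name, not
// the function this framework actually uses.
/*
static long hashReply32 ( char *reply , long replyLen ) {
	unsigned long h = 0;
	for ( long i = 0 ; i < replyLen ; i++ )
		// simple 31x rolling hash over the raw reply bytes
		h = h * 31 + (unsigned char)reply[i];
	return (long)h;
}
*/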
// query reindex test
bool qareindex() {
//
// delete the 'qatest123' collection
//
//static bool s_x1 = false;
if ( ! s_flags[0] ) {
s_flags[0] = true;
if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
return false;
}
//
// add the 'qatest123' collection
//
//static bool s_x2 = false;
if ( ! s_flags[1] ) {
s_flags[1] = true;
if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
// checksum of reply expected
238170006 ) )
return false;
}
	// turn off image thumbnails; set max spiders to 1 for consistency
if ( ! s_flags[17] ) {
s_flags[17] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
// checksum of reply expected
238170006 ) )
return false;
}
// this only loads once
loadUrls();
long max = s_ubuf2.length()/(long)sizeof(char *);
//max = 1;
//
// inject urls, return false if not done yet
//
//static bool s_x4 = false;
if ( ! s_flags[2] ) {
		// TODO: try delimiter-based injection too
//static long s_ii = 0;
for ( ; s_flags[20] < max ; ) {
// inject using html api
SafeBuf sb;
sb.safePrintf("&c=qatest123&deleteurl=0&"
"format=xml&u=");
sb.urlEncode ( s_urlPtrs[s_flags[20]] );
// the content
sb.safePrintf("&hasmime=1");
// sanity
//if ( strstr(s_urlPtrs[s_flags[20]],"wdc.htm") )
// log("hey");
sb.safePrintf("&content=");
sb.urlEncode(s_contentPtrs[s_flags[20]] );
sb.nullTerm();
// pre-inc it in case getUrl() blocks
s_flags[20]++;//ii++;
if ( ! getUrl("/admin/inject",
0, // no idea what crc to expect
sb.getBufStart()) )
return false;
}
s_flags[2] = true;
}
// wait for absorption
if ( ! s_flags[3] ) {
wait(1.5);
s_flags[3] = true;
return false;
}
// query for 'test'
if ( ! s_flags[27] ) {
s_flags[27] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=17&format=xml&q=test&icc=1",
-1672870556 ) )
return false;
}
	// make the 2nd url filter, !isreindex, have 0 spiders so we do
	// not spider the links from the REINDEXED PAGES
if ( ! s_flags[4] ) {
s_flags[4] = true;
SafeBuf sb;
sb.safePrintf("&c=qatest123&"
// make it the custom filter
"ufp=custom&"
// zero spiders if not isreindex
"fe1=default&hspl1=0&hspl1=1&fsf1=1.000000&"
"mspr1=0&mspi1=0&xg1=1000&fsp1=45&"
);
if ( ! getUrl ( "/admin/filters",0,sb.getBufStart()) )
return false;
}
// do the query reindex on 'test'
if ( ! s_flags[16] ) {
s_flags[16] = true;
if ( ! getUrl ( "/admin/reindex?c=qatest123&qa=16&"
"format=xml&q=test"
, 702467314 ) )
return false;
}
checkagain2:
// wait until spider finishes. check the spider status page
// in json to see when completed
if ( ! s_flags[5] ) {
wait(3.0);
s_flags[5] = true;
return false;
}
// wait for all spiders to stop
if ( ! s_flags[15] ) {
s_flags[15] = true;
if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
return false;
}
//static bool s_k2 = false;
if ( ! s_flags[6] ) {
// ensure spiders are done.
// "Nothing currently available to spider"
if ( s_content&&!strstr(s_content,"Nothing currently avail")){
s_flags[5] = false;
s_flags[15] = false;
goto checkagain2;
}
s_flags[6] = true;
}
//
// query for 'test' again after the reindex
//
if ( ! s_flags[14] ) {
s_flags[14] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=14&format=xml&q=test&icc=1",
-1672870556 ) )
return false;
}
//static bool s_fee2 = false;
if ( ! s_flags[13] ) {
s_flags[13] = true;
log("qa: SUCCESSFULLY COMPLETED "
"QUERY REINDEX");
//if ( s_callback == qainject ) exit(0);
return true;
}
return true;
}
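// Sketch (commented out): the checkagain2 label above implements a
// wait-and-poll loop -- sleep, fetch /admin/status as json, and if the
// reply does not contain "Nothing currently available to spider" clear
// both flags and jump back. The same idea written as a stand-alone
// helper might look like this; pollSpiderIdle() and its flag-index
// parameters are hypothetical.
/*
static bool pollSpiderIdle ( long waitFlag , long statusFlag ) {
	// pause a few seconds between polls
	if ( ! s_flags[waitFlag] ) {
		wait(3.0);
		s_flags[waitFlag] = true;
		return false;
	}
	// fetch the spider status page in json
	if ( ! s_flags[statusFlag] ) {
		s_flags[statusFlag] = true;
		if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
			return false;
	}
	// still spidering? reset both flags so we poll again next pass
	if ( s_content && ! strstr ( s_content ,
				     "Nothing currently avail" ) ) {
		s_flags[waitFlag  ] = false;
		s_flags[statusFlag] = false;
		return false;
	}
	// spiders are idle
	return true;
}
*/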
/* /*
static char *s_urls1 = static char *s_urls1 =
" walmart.com" " walmart.com"
@ -954,9 +1406,10 @@ bool qaspider1 ( ) {
} }
// turn off images thumbnails // turn off images thumbnails
// set max spiders to 1 for consistency!
if ( ! s_flags[24] ) { if ( ! s_flags[24] ) {
s_flags[24] = true; s_flags[24] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0", if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
// checksum of reply expected // checksum of reply expected
238170006 ) ) 238170006 ) )
return false; return false;
@ -1220,7 +1673,7 @@ bool qaspider2 ( ) {
// turn off images thumbnails // turn off images thumbnails
if ( ! s_flags[24] ) { if ( ! s_flags[24] ) {
s_flags[24] = true; s_flags[24] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0", if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
// checksum of reply expected // checksum of reply expected
238170006 ) ) 238170006 ) )
return false; return false;
@ -1417,7 +1870,7 @@ bool qascrape ( ) {
// turn off images thumbnails // turn off images thumbnails
if ( ! s_flags[24] ) { if ( ! s_flags[24] ) {
s_flags[24] = true; s_flags[24] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0", if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
// checksum of reply expected // checksum of reply expected
238170006 ) ) 238170006 ) )
return false; return false;
@ -1536,7 +1989,7 @@ bool qajson ( ) {
// turn off images thumbnails // turn off images thumbnails
if ( ! s_flags[24] ) { if ( ! s_flags[24] ) {
s_flags[24] = true; s_flags[24] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0", if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
// checksum of reply expected // checksum of reply expected
238170006 ) ) 238170006 ) )
return false; return false;
@ -1716,6 +2169,225 @@ bool qajson ( ) {
return true; return true;
} }
static char *s_ubuf5 =
"http://www.thompsoncancer.com/News/RSSLocation2.ashx?sid=7 "
"http://www.jdlculaval.com/xmlrpc.php?rsd "
"http://pharmacept.com/feed/ "
"http://www.web-erfolg.net/feed/ "
"http://www.extremetriathlon.org/site/feed/ "
"http://www.pilatesplusdublin.ie/wp-includes/wlwmanifest.xml "
"http://www.youtube.com/oembed?url=http%3A//www.youtube.com/watch?v%3Dv0lZQVaXSyM&format=xml "
"http://www.ehow.com/feed/home/garden-lawn/lawn-mowers.rss "
"http://www.functionaltrainingpro.com/xmlrpc.php?rsd "
"http://mississippisociety.com/index.php/feed "
;
bool qaxml ( ) {
//
// delete the 'qatest123' collection
//
//static bool s_x1 = false;
if ( ! s_flags[0] ) {
s_flags[0] = true;
if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
return false;
}
//
// add the 'qatest123' collection
//
//static bool s_x2 = false;
if ( ! s_flags[1] ) {
s_flags[1] = true;
if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
// checksum of reply expected
238170006 ) )
return false;
}
	// turn off image thumbnails; set max spiders to 1 for consistency
if ( ! s_flags[24] ) {
s_flags[24] = true;
if ( ! getUrl ( "/admin/spider?c=qatest123&mit=0&mns=1",
// checksum of reply expected
238170006 ) )
return false;
}
// add the 50 urls
if ( ! s_flags[3] ) {
s_flags[3] = true;
SafeBuf sb;
sb.safePrintf("&c=qatest123"
"&format=json"
"&strip=1"
"&spiderlinks=0"
"&urls="//www.walmart.com+ibm.com"
);
sb.urlEncode ( s_ubuf5 );
// . now a list of websites we want to spider
// . the space is already encoded as +
if ( ! getUrl ( "/admin/addurl",0,sb.getBufStart()) )
return false;
}
//
// wait for spidering to stop
//
checkagain:
// wait until spider finishes. check the spider status page
// in json to see when completed
//static bool s_k1 = false;
if ( ! s_flags[5] ) {
// wait 5 seconds, call sleep timer... then call qatest()
//usleep(5000000); // 5 seconds
wait(3.0);
s_flags[5] = true;
return false;
}
if ( ! s_flags[15] ) {
s_flags[15] = true;
if ( ! getUrl ( "/admin/status?format=json&c=qatest123",0) )
return false;
}
//static bool s_k2 = false;
if ( ! s_flags[6] ) {
// ensure spiders are done.
// "Nothing currently available to spider"
if ( s_content&&!strstr(s_content,"Nothing currently avail")){
s_flags[5] = false;
s_flags[15] = false;
goto checkagain;
}
s_flags[6] = true;
}
if ( ! s_flags[7] ) {
s_flags[7] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
"q=type%3Axml+oembed.type%3Avideo",
-1310551262 ) )
return false;
}
if ( ! s_flags[8] ) {
s_flags[8] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
"q=video",
-1310551262 ) )
return false;
}
if ( ! s_flags[9] ) {
s_flags[9] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
"q=oembed.thumbnail_height%3A360",
-1310551262 ) )
return false;
}
if ( ! s_flags[10] ) {
s_flags[10] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
"q=gbminint%3Aoembed.thumbnail_height%3A380",
-1310551262 ) )
return false;
}
// other query tests...
if ( ! s_flags[12] ) {
s_flags[12] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
"q=gbmaxint%3Aoembed.thumbnail_height%3A380",
-1310551262 ) )
return false;
}
if ( ! s_flags[13] ) {
s_flags[13] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
"q=rss.channel.item.title%3Abests",
-1310551262 ) )
return false;
}
if ( ! s_flags[14] ) {
s_flags[14] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
"q=gbfacetstr%3Arss.channel.title",
-1310551262 ) )
return false;
}
/*
if ( ! s_flags[15] ) {
s_flags[15] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
"q=gbfieldmatch%3Astrings.key"
"%3A\"Maemo+Browser\"",
-1310551262 ) )
return false;
}
if ( ! s_flags[16] ) {
s_flags[16] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
"q=gbfieldmatch%3Astrings.key"
"%3A\"Google+Wireless+Transcoder\"",
-1310551262 ) )
return false;
}
// this should have no results, not capitalized
if ( ! s_flags[17] ) {
s_flags[17] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
"q=gbfieldmatch%3Astrings.key%3A\"samsung\"",
-1310551262 ) )
return false;
}
if ( ! s_flags[18] ) {
s_flags[18] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
"q=gbfieldmatch%3Astrings.key%3ASamsung",
-1310551262 ) )
return false;
}
if ( ! s_flags[18] ) {
s_flags[18] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=json&"
"q=gbfieldmatch%3Astrings.key%3A\"Samsung\"",
-1310551262 ) )
return false;
}
*/
//static bool s_fee2 = false;
if ( ! s_flags[20] ) {
s_flags[20] = true;
log("qa: SUCCESSFULLY COMPLETED "
"QA XML TEST");
return true;
}
return true;
}
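// Note (illustration only, not fetched content): the fielded queries
// above address tags by their dotted path in the crawled xml. For a feed
// shaped roughly like
//
//   <rss><channel><title>Example Feed</title>
//        <item><title>bests of 2014</title></item></channel></rss>
//
// the indexed fields would include rss.channel.title and
// rss.channel.item.title, which is what the gbfacetstr and
// rss.channel.item.title queries above exercise; the oembed.* queries
// likewise target tags inside the youtube oembed xml reply in s_ubuf5.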
/* /*
bool qaspider ( ) { bool qaspider ( ) {
@ -1739,7 +2411,8 @@ static QATest s_qatests[] = {
{qainject1, {qainject1,
"injectTest1", "injectTest1",
"Test injection api. Test injection of multiple urls with content. " "Test injection api. Test injection of multiple urls with content. "
"Test deletion of urls via inject api."}, "Test deletion of urls via inject api. Test most query api parms. "
"Test advanced search parms."},
{qainject2, {qainject2,
"injectTest2", "injectTest2",
@ -1760,9 +2433,26 @@ static QATest s_qatests[] = {
"Scrape and inject results from google and bing."}, "Scrape and inject results from google and bing."},
{qajson, {qajson,
"jsontest", "jsonTest",
"Add Url some JSON pages and test json-ish queries. Test facets over " "Add Url some JSON pages and test json-ish queries. Test facets over "
"json docs."} "json docs."},
{qaxml,
"xmlTest",
"Add Url some XML pages and test xml-ish queries. Test facets over "
"xml docs."},
{qaimport,
"importDataTest",
"Test data import functionality. Test site clustering."},
{qainlinks,
"inlinksTest",
"Test youtube inlinks. Test EDOCUNCHANGED iff just inlinks change."},
{qareindex,
"queryReindexTest",
"Test query reindex function. Ensure changed docs are updated."}
}; };
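// Sketch (assumption): each s_qatests[] entry added above supplies a
// callback, a short name and a one-line description. The real QATest
// class is defined earlier in this file; the member names below are
// guesses based on those three initializers, shown only to make the
// table format clear.
/*
class QATest {
public:
	bool (* m_func)();   // qa function to run, e.g. qaxml
	char *m_testName;    // short name used on the qa page, "xmlTest"
	char *m_testDesc;    // one-line description of what is tested
};
*/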
@ -2014,5 +2704,3 @@ bool sendPageQA ( TcpSocket *sock , HttpRequest *hr ) {
return true; return true;
} }