Merge branch 'testing' into diffbot-matt

Conflicts:
	Errno.cpp
	Errno.h
	Parms.h
mwells 2014-07-07 09:49:59 -07:00
commit 6434e5cc04
54 changed files with 2718 additions and 825 deletions


@ -1962,7 +1962,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
m_harvestLinks[n] = 1;
*/
m_regExs[n].set("isdocidbased");
m_regExs[n].set("isreindex");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
@ -2198,7 +2198,7 @@ bool CollectionRec::rebuildChineseRules ( ) {
long n = 0;
m_regExs[n].set("isdocidbased");
m_regExs[n].set("isreindex");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // 30 days default
m_maxSpidersPerRule [n] = 99; // max spiders
@ -3029,7 +3029,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
long i = 0;
// 1st one! for query reindex/ query delete
m_regExs[i].set("isdocidbased");
m_regExs[i].set("isreindex");
m_spiderIpMaxSpiders [i] = 10;
m_spiderPriorities [i] = 70;
i++;


@ -426,6 +426,9 @@ class CollectionRec {
long m_spiderRoundNum;
char m_makeImageThumbnails;
long m_thumbnailMaxWidthHeight ;
char m_indexSpiderReplies;
char m_indexBody;
@ -616,7 +619,6 @@ class CollectionRec {
long m_summaryMaxLen;
long m_summaryMaxNumLines;
long m_summaryMaxNumCharsPerLine;
long m_summaryDefaultNumLines;
char m_useNewSummaries;
char m_getDocIdScoringInfo;


@ -17781,6 +17781,65 @@ TimeZone tzs[] = {
// hash table of timezone information
static HashTableX s_tzt;
static long long h_mountain;
static long long h_eastern;
static long long h_central;
static long long h_pacific;
static long long h_time2;
static long long h_mdt;
static long long h_at2;
bool initTimeZoneTable ( ) {
// if already initialized return true
if ( s_tzt.m_numSlotsUsed ) return true;
// init static wids
h_mountain = hash64n("mountain");
h_eastern = hash64n("eastern");
h_central = hash64n("central");
h_pacific = hash64n("pacific");
h_time2 = hash64n("time");
h_mdt = hash64n("mdt");
h_at2 = hash64n("at");
// set up the time zone hashtable
if ( ! s_tzt.set( 8,4, 300,NULL,0,false,0,"tzts"))
return false;
// load time zone names and their modifiers into hashtable
for ( long i = 0 ; *tzs[i].m_name ; i++ ) {
char *t = tzs[i].m_name;
long tlen = gbstrlen(t);
// hash like Words.cpp computeWordIds
uint64_t h = hash64Lower_utf8( t , tlen );
// use the ptr as the value
if ( ! s_tzt.addKey ( &h, &tzs[i] ) )
return false;
}
return true;
}
// return what we have to add to UTC to get time in locale specified by "s"
// where "s" is like "PDT" "MST" "EST" etc. if unknown return 999999
long getTimeZone ( char *s ) {
if ( ! s ) return BADTIMEZONE;
char *send = s;
// point to end of the potential timezone
for ( ; *send && isalnum(*send) ; send++ );
// hash it
uint64_t h = hash64Lower_utf8( s , send -s );
// make sure table is ready
initTimeZoneTable();
// look it up
long slot = s_tzt.getSlot( &h );
if ( slot < 0 ) return 999999;
// did we find it in the table?
TimeZone *tzptr = (TimeZone *)s_tzt.getValueFromSlot ( slot );
// found it, compute the offset in seconds
long secs = tzptr->m_hourMod * 3600;
secs += tzptr->m_minMod * 60;
return secs;
}
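
For context, a minimal sketch of how the new getTimeZone() helper can be called; the token and log messages here are illustrative, not from this commit:

// look up a timezone suffix such as "PST" or "GMT"
long secs = getTimeZone ( "PST" );
if ( secs == BADTIMEZONE )
	log("date: unknown timezone name");
else
	// secs is what we add to UTC to get the time in that zone, per the
	// comment above, so for US zones it is negative
	log("date: timezone offset is %li seconds",secs);
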
// . returns how many words starting at i are in the time zone
// . 0 means not a timezone
long getTimeZoneWord ( long i ,
@ -17793,40 +17852,14 @@ long getTimeZoneWord ( long i ,
*tzptr = NULL;
// only init table once
bool s_init16 = false;
static long long h_mountain;
static long long h_eastern;
static long long h_central;
static long long h_pacific;
static long long h_time;
static long long h_mdt;
static long long h_at;
// init the hash table of month names
if ( ! s_init16 ) {
// init static wids
h_mountain = hash64n("mountain");
h_eastern = hash64n("eastern");
h_central = hash64n("central");
h_pacific = hash64n("pacific");
h_time = hash64n("time");
h_mdt = hash64n("mdt");
h_at = hash64n("at");
// set up the time zone hashtable
if ( ! s_tzt.set( 8,4, 300,NULL,0,false,niceness,"tzts"))
return -1;
// load time zone names and their modifiers into hashtable
for ( long i = 0 ; *tzs[i].m_name ; i++ ) {
char *t = tzs[i].m_name;
long tlen = gbstrlen(t);
// hash like Words.cpp computeWordIds
uint64_t h = hash64Lower_utf8( t , tlen );
// use the ptr as the value
if ( ! s_tzt.addKey ( &h, &tzs[i] ) )
return -1;
}
// on error we return -1 from here
if ( ! initTimeZoneTable() ) return -1;
s_init16 = true;
}
// this is too common of a word!
if ( wids[i] == h_at ) return 0;
if ( wids[i] == h_at2 ) return 0;
long slot = s_tzt.getSlot( &wids[i] );
// return this, assume just one word
@ -17834,7 +17867,7 @@ long getTimeZoneWord ( long i ,
// . "mountain time"
// . this removes the event title "M-F 8:30 AM-5:30 PM Mountain Time"
// from the event (horus) on http://www.sfreporter.com/contact_us/
if ( slot<0 && i+2<nw && wids[i+2] == h_time ) {
if ( slot<0 && i+2<nw && wids[i+2] == h_time2 ) {
if ( wids[i] == h_mountain ) {
slot = s_tzt.getSlot (&h_mdt);
tznw = 3;


@ -794,6 +794,7 @@ public:
bool m_isSiteRoot ;
};
// now time zones
struct TimeZone {
char m_name[16];
@ -803,6 +804,13 @@ struct TimeZone {
long m_modType;
};
#define BADTIMEZONE 999999
// "s" is the timezone, like "EDT" and we return # of secs to add to UTC
// to get the current time in that time zone.
// returns BADTIMEZONE if "s" is unknown timezone
long getTimeZone ( char *s ) ;
// . returns how many words starting at i are in the time zone
// . 0 means not a timezone
long getTimeZoneWord ( long i , long long *wids , long nw ,


@ -170,8 +170,21 @@ case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
case ENOTOKEN: return "Missing token";
case EBADIMG: return "Bad image";
case EREINDEXREDIR: return "Not parent url to reindex";
case EREINDEXREDIR: return "Not a reindexable doc";
case ETOOMANYPARENS: return "Too many nested parentheses in boolean query";
case EDIFFBOTUNABLETOAPPLYRULES: return "Diffbot unable to apply rules";
case EDIFFBOTCOULDNOTPARSE: return "Diffbot could not parse page";
case EDIFFBOTCOULDNOTDOWNLOAD: return "Diffbot could not download page";
case EDIFFBOTINVALIDAPI: return "Diffbot invalid API";
case EDIFFBOTVERSIONREQ: return "Diffbot version required";
case EDIFFBOTEMPTYCONTENT: return "Diffbot empty content";
case EDIFFBOTREQUESTTIMEDOUT: return "Diffbot request timed out";
case EDIFFBOTURLPROCESSERROR: return "Diffbot error processing url";
case EDIFFBOTTOKENEXPIRED: return "Diffbot token expired";
case EDIFFBOTUNKNOWNERROR: return "Diffbot unknown error";
case EMISSINGINPUT: return "Missing required input parms";
case EDMOZNOTREADY: return "Dmoz is not setup, follow instructions in "
"admin.html to setup";
case EPROXYSSLCONNECTFAILED: return "SSL tunnel through HTTP proxy failed";
}
// if the remote error bit is clear it must be a regular errno

Errno.h

@ -176,6 +176,20 @@ enum {
EBADIMG,
EREINDEXREDIR,
ETOOMANYPARENS,
EDIFFBOTUNABLETOAPPLYRULES,
EDIFFBOTCOULDNOTPARSE,
EDIFFBOTCOULDNOTDOWNLOAD,
EDIFFBOTINVALIDAPI,
EDIFFBOTVERSIONREQ,
EDIFFBOTEMPTYCONTENT,
EDIFFBOTREQUESTTIMEDOUT,
EDIFFBOTURLPROCESSERROR,
EDIFFBOTTOKENEXPIRED,
EDIFFBOTUNKNOWNERROR,
EMISSINGINPUT,
EDMOZNOTREADY,
EPROXYSSLCONNECTFAILED
};
#endif


@ -237,6 +237,8 @@ time_t atotime ( char *s ) {
return atotime3 ( s );
}
#include "Dates.h" // for getTimeZone()
// #1: Sun, 06 Nov 1994 08:49:37 GMT ;RFC 822, updated by RFC 1123
time_t atotime1 ( char *s ) {
// this time structure, once filled, will help yield a time_t
@ -258,8 +260,20 @@ time_t atotime1 ( char *s ) {
getTime ( s , &t.tm_sec , &t.tm_min , &t.tm_hour );
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
time_t local = mktime ( &t );
time_t global = timegm ( &t );
// skip HH:MM:SS
while ( ! isspace (*s) ) s++;
// skip spaces
while ( isspace (*s) ) s++;
// convert local time to "utc" or whatever timezone "s" points to,
// which is usually gmt or utc
long tzoff = getTimeZone ( s ) ;
if ( tzoff != BADTIMEZONE ) global += tzoff;
return global;
// now, convert to utc
//time_t utc = time(NULL);
// get time here locally
@ -268,7 +282,6 @@ time_t atotime1 ( char *s ) {
//long delta = here - utc;
// modify our time to make it into utc
//return local - delta;
return local;
}
// #2: Sunday, 06-Nov-94 08:49:37 GMT ;RFC 850,obsoleted by RFC1036
@ -293,7 +306,17 @@ time_t atotime2 ( char *s ) {
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
return mktime ( &t );
time_t global = timegm ( &t );
// skip HH:MM:SS
while ( ! isspace (*s) ) s++;
// skip spaces
while ( isspace (*s) ) s++;
// convert local time to "utc" or whatever timezone "s" points to,
// which is usually gmt or utc
long tzoff = getTimeZone ( s ) ;
if ( tzoff != BADTIMEZONE ) global += tzoff;
return global;
}
// #3: Sun Nov 6 08:49:37 1994 ;ANSI C's asctime() format
@ -319,7 +342,7 @@ time_t atotime3 ( char *s ) {
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
time_t tt = mktime ( &t );
time_t tt = timegm ( &t );
return tt;
}
@ -346,7 +369,17 @@ time_t atotime4 ( char *s ) {
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
return mktime ( &t );
time_t global = timegm ( &t );
// skip HH:MM:SS
while ( ! isspace (*s) ) s++;
// skip spaces
while ( isspace (*s) ) s++;
// convert local time to "utc" or whatever timezone "s" points to,
// which is usually gmt or utc
long tzoff = getTimeZone ( s ) ;
if ( tzoff != BADTIMEZONE ) global += tzoff;
return global;
}
// 2007-12-31
@ -387,7 +420,7 @@ time_t atotime5 ( char *s ) {
// unknown if we're in daylight savings time
t.tm_isdst = -1;
// translate using mktime
return mktime ( &t );
return timegm ( &t );
}
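
The mktime()-to-timegm() change above means the parsed struct tm is now interpreted as UTC regardless of the server's local timezone, and any trailing timezone name is applied afterwards via getTimeZone(). A minimal sketch of the shared pattern (the literal date values are illustrative):

struct tm t;
memset ( &t , 0 , sizeof(t) );
// pretend these fields were parsed from "Sun, 06 Nov 1994 08:49:37 PST"
t.tm_year = 94; t.tm_mon = 10; t.tm_mday = 6;
t.tm_hour = 8 ; t.tm_min = 49; t.tm_sec = 37;
// timegm() ignores the server's TZ and treats the fields as UTC
time_t global = timegm ( &t );
// then shift by the named timezone if it is in the table
long tzoff = getTimeZone ( "PST" );
if ( tzoff != BADTIMEZONE ) global += tzoff;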


@ -6,18 +6,63 @@
HttpRequest::HttpRequest () { m_cgiBuf = NULL; m_cgiBuf2 = NULL; reset(); }
HttpRequest::~HttpRequest() { reset(); }
char HttpRequest::getReplyFormat() {
if ( m_replyFormatValid ) return m_replyFormat;
char *fs = getString("format",NULL,NULL);
char fmt = FORMAT_HTML;
if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML;
m_replyFormat = fmt;
char *formatStr = getString("format");
char format = -1;//FORMAT_HTML;
// what format should search results be in? default is html
if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
if ( formatStr && strcmp(formatStr,"iframe")==0)
format=FORMAT_WIDGET_IFRAME;
if ( formatStr && strcmp(formatStr,"ajax")==0)
format=FORMAT_WIDGET_AJAX;
if ( formatStr && strcmp(formatStr,"append")==0)
format=FORMAT_WIDGET_APPEND;
// support old api &xml=1 to mean &format=1
if ( getLong("xml",0) ) {
format = FORMAT_XML;
}
// also support &json=1
if ( getLong("json",0) ) {
format = FORMAT_JSON;
}
if ( getLong("csv",0) ) {
format = FORMAT_CSV;
}
if ( getLong("iframe",0) ) {
format = FORMAT_WIDGET_IFRAME;
}
if ( getLong("ajax",0) ) {
format = FORMAT_WIDGET_AJAX;
}
if ( getLong("append",0) ) {
format = FORMAT_WIDGET_APPEND;
}
// default to html
if ( format == -1 )
format = FORMAT_HTML;
m_replyFormat = format;
m_replyFormatValid = true;
return m_replyFormat;
return format;
}
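
So getReplyFormat() now accepts the newer &format= parm as well as the legacy boolean parms, caches the answer in m_replyFormat, and falls back to HTML. Roughly (the query strings are illustrative):

// &format=json  -> FORMAT_JSON
// &json=1       -> FORMAT_JSON  (legacy api)
// &format=csv   -> FORMAT_CSV
// &iframe=1     -> FORMAT_WIDGET_IFRAME
// no format parm at all -> FORMAT_HTML
char format = hr.getReplyFormat();
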
void HttpRequest::reset() {
m_numFields = 0;
m_replyFormatValid = false;


@ -1562,6 +1562,77 @@ void cleanUp ( void *state , TcpSocket *s ) {
if ( s && s->m_state == f ) s->m_state = NULL;
}
bool HttpServer::sendSuccessReply ( TcpSocket *s , char format, char *addMsg) {
// get time in secs since epoch
time_t now ;
if ( isClockInSync() ) now = getTimeGlobal();
else now = getTimeLocal();
// . buffer for the MIME request and brief html err msg
// . NOTE: ctime appends a \n to the time, so we don't need to
char msg[1024];
SafeBuf sb(msg,1024,0,false);
char *tt = asctime(gmtime ( &now ));
tt [ gbstrlen(tt) - 1 ] = '\0';
char *ct = "text/html";
if ( format == FORMAT_XML ) ct = "text/xml";
if ( format == FORMAT_JSON ) ct = "application/json";
char cbuf[1024];
SafeBuf cb(cbuf,1024,0,false);
if ( format != FORMAT_XML && format != FORMAT_JSON )
cb.safePrintf("<html><b>Success</b></html>");
if ( format == FORMAT_XML ) {
cb.safePrintf("<response>\n"
"\t<statusCode>0</statusCode>\n"
"\t<statusMsg><![CDATA[Success]]>"
"</statusMsg>\n");
}
if ( format == FORMAT_JSON ) {
cb.safePrintf("{\"response\":{\n"
"\t\"statusCode\":0,\n"
"\t\"statusMsg\":\"Success\",\n" );
}
if ( addMsg )
cb.safeStrcpy(addMsg);
if ( format == FORMAT_XML ) {
cb.safePrintf("</response>\n");
}
if ( format == FORMAT_JSON ) {
// erase trailing ,\n
cb.m_length -= 2;
cb.safePrintf("\n"
"}\n"
"}\n");
}
sb.safePrintf(
"HTTP/1.0 200 (OK)\r\n"
"Content-Length: %li\r\n"
"Connection: Close\r\n"
"Content-Type: %s\r\n"
"Date: %s UTC\r\n\r\n"
, cb.length()
, ct
, tt );
sb.safeMemcpy ( &cb );
// use this new function that will compress the reply now if the
// request was a ZET instead of a GET
return sendReply2 ( msg , sb.length() , NULL , 0 , s );
}
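
As a hedged illustration of what the code above emits: called with FORMAT_JSON and no addMsg, the reply body is a bare success object (whitespace approximate, socket name illustrative).

// e.g. after an api request has been handled successfully:
return g_httpServer.sendSuccessReply ( sock , FORMAT_JSON );
// body sent:
// {"response":{
//      "statusCode":0,
//      "statusMsg":"Success"
// }
// }
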
// . send an error reply, like "HTTP/1.1 404 Not Found"
// . returns false if blocked, true otherwise
// . sets g_errno on error
@ -1578,9 +1649,16 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
time_t now ;//= getTimeGlobal();
if ( isClockInSync() ) now = getTimeGlobal();
else now = getTimeLocal();
// this kinda sucks that we have to do it twice...
HttpRequest hr;
hr.set ( s->m_readBuf , s->m_readOffset , s ) ;
char format = hr.getReplyFormat();
// . buffer for the MIME request and brief html err msg
// . NOTE: ctime appends a \n to the time, so we don't need to
char msg[1024];
SafeBuf sb(msg,1024,0,false);
// if it's a 404, redirect to home page
/*
if ( error == 404 )
@ -1595,26 +1673,61 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
*/
char *tt = asctime(gmtime ( &now ));
tt [ gbstrlen(tt) - 1 ] = '\0';
sprintf ( msg ,
"HTTP/1.0 %li (%s)\r\n"
"Content-Length: %li\r\n"
"Connection: Close\r\n"
"Date: %s UTC\r\n\r\n"
"<html><b>Error = %s</b></html>",
error ,
errmsg ,
(long)(gbstrlen("<html><b>Error = </b></html>")+
gbstrlen(errmsg)),
tt , // ctime ( &now ) ,
errmsg );
char *ct = "text/html";
if ( format == FORMAT_XML ) ct = "text/xml";
if ( format == FORMAT_JSON ) ct = "application/json";
SafeBuf xb;
if ( format != FORMAT_XML && format != FORMAT_JSON )
xb.safePrintf("<html><b>Error = %s</b></html>",errmsg );
if ( format == FORMAT_XML ) {
xb.safePrintf("<response>\n"
"\t<statusCode>%li</statusCode>\n"
"\t<statusMsg><![CDATA[", error );
xb.cdataEncode(errmsg );
xb.safePrintf("]]></statusMsg>\n"
"</response>\n");
}
if ( format == FORMAT_JSON ) {
xb.safePrintf("{\"response\":{\n"
"\t\"statusCode\":%li,\n"
"\t\"statusMsg\":\"", error );
xb.jsonEncode(errmsg );
xb.safePrintf("\"\n"
"}\n"
"}\n");
}
sb.safePrintf(
"HTTP/1.0 %li (%s)\r\n"
"Content-Length: %li\r\n"
"Connection: Close\r\n"
"Content-Type: %s\r\n"
"Date: %s UTC\r\n\r\n"
,
error ,
errmsg ,
xb.length(),
ct ,
tt ); // ctime ( &now ) ,
sb.safeMemcpy ( &xb );
// . move the reply to a send buffer
// . don't make sendBuf bigger than g_conf.m_httpMaxSendBufSize
long msgSize = gbstrlen ( msg );
//long msgSize = gbstrlen ( msg );
// record it
if ( bytesSent ) *bytesSent = msgSize;//sendBufSize;
if ( bytesSent ) *bytesSent = sb.length();//sendBufSize;
// use this new function that will compress the reply now if the
// request was a ZET instead of a GET
return sendReply2 ( msg , msgSize , NULL , 0 , s );
return sendReply2 ( msg , sb.length() , NULL , 0 , s );
/*
// . this returns false if blocked, true otherwise
@ -1640,6 +1753,11 @@ bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
//long rawFormat,
char format ,
int errnum, char *content) {
// just use this for now. it detects the format already...
return sendErrorReply ( s,error,errmsg,NULL);
/*
// clear g_errno so the send goes through
g_errno = 0;
// get time in secs since epoch
@ -1707,6 +1825,7 @@ bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
long msgSize = gbstrlen ( msg );
return sendReply2 ( msg , msgSize , NULL , 0 , s );
*/
/*
long sendBufSize = msgSize;


@ -135,6 +135,8 @@ class HttpServer {
// send an error reply, like "HTTP/1.1 404 Not Found"
bool sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
long *bytesSent = NULL );
// xml and json uses this
bool sendSuccessReply (TcpSocket *s , char format , char *addMsg=NULL);
// send a "prettier" error reply, formatted in XML if necessary
bool sendQueryErrorReply ( TcpSocket *s , long error , char *errmsg,
// FORMAT_HTML=0,FORMAT_XML,FORMAT_JSON


@ -91,6 +91,58 @@ void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
// best candidate, and just use that
if ( xd->m_isDiffbotJSONObject ) return;
//
// first add any open graph candidate.
// basically the page telling us the best image straight up.
//
long node2 = -1;
long startNode = 0;
// . field can be stuff like "summary","description","keywords",...
// . if "convertHtmlEntites" is true we change < to &lt; and > to &gt;
// . <meta property="og:image" content="http://example.com/rock2.jpg"/>
// . <meta property="og:image" content="http://example.com/rock3.jpg"/>
ogimgloop:
char ubuf[2000];
long ulen = xml->getMetaContent ( ubuf , // store the val here
1999 ,
"og:image",
8,
"property",
false, // convertHtmlEntities
startNode ,
&node2 ); // matchedNode
// update this in case goto ogimgloop is called
startNode = node2 + 1;
// see section below for explanation of what we are storing here...
if ( node2 >= 0 ) {
// save it
m_imageNodes[m_numImages] = node2;
Query q;
if ( ulen > MAX_URL_LEN ) goto ogimgloop;
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
iu.set ( pageUrl , ubuf , ulen );
// skip if invalid domain or TLD
if ( iu.getDomainLen() <= 0 ) goto ogimgloop;
// for looking it up on disk to see if unique or not
char buf[2000];
snprintf ( buf , 1999, "gbimage:%s",iu.getUrl());
// TODO: make sure this is a no-split termid storage thingy
// in Msg14.cpp
if ( ! q.set2 ( buf , langUnknown , false ) ) return;
// store the termid
m_termIds[m_numImages] = q.getTermId(0);
// advance the counter
m_numImages++;
// try to get more graph images if we have some room
if ( m_numImages + 2 < MAX_IMAGES ) goto ogimgloop;
}
//m_pageSite = pageSite;
// scan the words
long nw = words->getNumWords();
@ -530,7 +582,7 @@ bool Images::downloadImages () {
// get img tag node
node = m_imageNodes[m_j];
// get the url of the image
src = m_xml->getString(node,"src",&srcLen);
src = getImageUrl ( m_j , &srcLen );
// use "pageUrl" as the baseUrl
m_imageUrl.set ( m_pageUrl , src , srcLen );
}
@ -755,8 +807,7 @@ bool Images::makeThumb ( ) {
srcLen = gbstrlen(src);
}
else {
long node = m_imageNodes[m_j];
src = m_xml->getString(node,"src",&srcLen);
src = getImageUrl ( m_j , &srcLen );
}
// set it to the full url
Url iu;
@ -848,6 +899,16 @@ bool Images::makeThumb ( ) {
}
}
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
if ( ! cr ) { g_errno = ENOCOLLREC; return true; }
// save how big of thumbnails we should make. user can change
// this in the 'spider controls'
m_xysize = cr->m_thumbnailMaxWidthHeight ;
// make it 250 pixels if no decent value provided
if ( m_xysize <= 0 ) m_xysize = 250;
// and keep it sane
if ( m_xysize > 2048 ) m_xysize = 2048;
// update status
if ( m_xd ) m_xd->setStatus ( "making thumbnail" );
@ -897,16 +958,18 @@ void Images::thumbStart_r ( bool amThread ) {
long id = getpidtid();
// pass the input to the program through this file
// rather than a pipe, since popen() seems broken
// rather than a pipe, since popen() seems broken.
// m_dir ends in / so this should work.
char in[364];
snprintf ( in , 363,"%strash/in.%li", g_hostdb.m_dir, id );
snprintf ( in , 363,"%strashin.%li", g_hostdb.m_dir, id );
unlink ( in );
log( LOG_DEBUG, "image: thumbStart_r create in file." );
// collect the output from the filter from this file
// m_dir ends in / so this should work.
char out[364];
snprintf ( out , 363,"%strash/out.%li", g_hostdb.m_dir, id );
snprintf ( out , 363,"%strashout.%li", g_hostdb.m_dir, id );
unlink ( out );
log( LOG_DEBUG, "image: thumbStart_r create out file." );
@ -964,23 +1027,48 @@ void Images::thumbStart_r ( bool amThread ) {
break;
}
long xysize = 250;//100;
//long xysize = 250;//100;
// make thumbnail a little bigger for diffbot for widget
if ( m_xd->m_isDiffbotJSONObject ) xysize = 250;
//if ( m_xd->m_isDiffbotJSONObject ) xysize = 250;
// i hope 2500 is big enough!
char cmd[2501];
//sprintf( cmd, scmd, ext, in, out);
char *wdir = g_hostdb.m_dir;
// can be /dev/stderr or like /var/gigablast/data/log000 etc.
char *logFile = g_log.getFilename();
// wdir ends in / so this should work.
snprintf( cmd, 2500 ,
"LD_LIBRARY_PATH=%s %s/%stopnm %s | "
"LD_LIBRARY_PATH=%s %s/pnmscale -xysize %li %li - | "
"LD_LIBRARY_PATH=%s %s/ppmtojpeg - > %s"
, wdir , wdir , ext , in
, wdir , wdir , xysize , xysize
, wdir , wdir , out
"LD_LIBRARY_PATH=%s %s%stopnm %s 2>> %s | "
"LD_LIBRARY_PATH=%s %spnmscale -xysize %li %li - 2>> %s | "
// append all its stderr msgs to the log file
// so "jpegtopnm: WRITING PPM FILE" doesn't clog console
"LD_LIBRARY_PATH=%s %sppmtojpeg - > %s 2>> %s"
, wdir , wdir , ext , in , logFile
, wdir , wdir , m_xysize , m_xysize , logFile
, wdir , wdir , out , logFile
);
// if they already have netpbm package installed use that then
static bool s_checked = false;
static bool s_hasNetpbm = false;
if ( ! s_checked ) {
s_checked = true;
File f;
f.set("/usr/bin/pnmscale");
s_hasNetpbm = f.doesExist() ;
}
if ( s_hasNetpbm )
snprintf( cmd, 2500 ,
"%stopnm %s 2>> %s | "
"pnmscale -xysize %li %li - 2>> %s | "
"ppmtojpeg - > %s 2>> %s"
, ext , in , logFile
, m_xysize , m_xysize , logFile
, out , logFile
);
// Call clone function for the shell to execute command
// This call WILL BLOCK . timeout is 30 seconds.
@ -1211,10 +1299,11 @@ bool ThumbnailInfo::printThumbnailInHtml ( SafeBuf *sb ,
long newdx = (long)((float)m_dx * min);
long newdy = (long)((float)m_dy * min);
if ( printLink && format==FORMAT_HTML )
// might be FORMAT_AJAX!
if ( printLink && format !=FORMAT_XML && format != FORMAT_JSON )
sb->safePrintf("<a href=%s>", getUrl() );
if ( format == FORMAT_HTML )
if ( format !=FORMAT_XML && format != FORMAT_JSON )
sb->safePrintf("<img width=%li height=%li align=left "
"%s"
"src=\"data:image/"
@ -1225,20 +1314,44 @@ bool ThumbnailInfo::printThumbnailInHtml ( SafeBuf *sb ,
);
if ( format == FORMAT_XML )
sb->safePrintf("<imageBase64>");
sb->safePrintf("\t<imageBase64>");
if ( format == FORMAT_JSON )
sb->safePrintf("\t\"imageBase64\":\"");
// encode image in base 64
sb->base64Encode ( getData(), m_dataSize , 0 ); // 0 niceness
if ( format == FORMAT_HTML ) {
if ( format !=FORMAT_XML && format != FORMAT_JSON ) {
sb->safePrintf("\">");
if ( printLink ) sb->safePrintf ("</a>");
}
if ( format == FORMAT_XML )
sb->safePrintf("</imageBase64>");
sb->safePrintf("</imageBase64>\n");
if ( format == FORMAT_JSON )
sb->safePrintf("\",\n");
// widget needs to know the width of the thumb for formatting
// the text either on top of the thumb or to the right of it
if ( retNewdx ) *retNewdx = newdx;
return true;
}
char *Images::getImageUrl ( long j , long *urlLen ) {
long node = m_imageNodes[j];
long srcLen = 0;
char *src = m_xml->getString(node,"src",&srcLen);
// maybe it was an og:image meta tag
if ( ! src )
src = m_xml->getString(node,"content",&srcLen);
// wtf?
if ( ! src )
log("image: image bad/null src");
*urlLen = srcLen;
return src;
}


@ -119,6 +119,8 @@ class Images {
bool downloadImage();
bool makeThumb();
char *getImageUrl ( long j , long *urlLen ) ;
//bool gotImage ( );
void thumbStart_r ( bool amThread );
@ -131,6 +133,8 @@ class Images {
void *m_state ;
void (* m_callback)(void *state );
long m_xysize;
bool m_setCalled;
long m_errno;
long m_hadError;

Log.h

@ -143,6 +143,8 @@ class Log {
bool m_logTimestamps;
char *getFilename() { return m_filename; };
private:
bool dumpLog ( ); // make room for the new ones


@ -551,6 +551,7 @@ master-rpm:
# DEBIAN PACKAGE SECTION BEGIN
# need to do 'apt-get install dh-make'
# deb-master
master-deb:
git archive --format=tar --prefix=gb-1.0/ master > ../gb_1.0.orig.tar
rm -rf debian
@ -569,6 +570,7 @@ master-deb:
cp control.deb debian/control
# try to use our own rules so we can override dh_shlibdeps and others
cp gb.deb.rules debian/rules
cp changelog debian/changelog
# fix dh_shlibdeps from bitching about dependencies on shared libs
# YOU HAVE TO RUN THIS before you run 'make'
# export LD_LIBRARY_PATH=./debian/gb/var/gigablast/data0
@ -583,12 +585,12 @@ master-deb:
# upload rpm
scp gb*.rpm gk268:/w/html/
#deb-testing
testing-deb:
git archive --format=tar --prefix=gb-1.0/ testing > ../gb_1.0.orig.tar
git archive --format=tar --prefix=gb-1.1/ testing > ../gb_1.1.orig.tar
rm -rf debian
# change "-p gb_1.0" to "-p gb_1.1" to update version for example
dh_make -e gigablast@mail.com -p gb_1.0 -f ../gb_1.0.orig.tar
dh_make -e gigablast@mail.com -p gb_1.1 -f ../gb_1.1.orig.tar
# zero this out, it is just filled with the .txt files erroneously and it'll
# try to automatically install in /usr/docs/
rm debian/docs
@ -602,16 +604,24 @@ testing-deb:
cp control.deb debian/control
# try to use our own rules so we can override dh_shlibdeps and others
cp gb.deb.rules debian/rules
cp changelog debian/changelog
# make the pkg dependencies file ourselves since we overrode dh_shlibdeps
# with our own debian/rules file. see that file for more info.
# echo "shlibs:Depends=libc6 (>= 2.3)" > debian/gb.substvars
# echo "shlibs:Depends=netpbm (>= 0.0)" > debian/gb.substvars
# echo "misc:Depends=netpbm (>= 0.0)" > debian/gb.substvars
# fix dh_shlibdeps from bitching about dependencies on shared libs
# YOU HAVE TO RUN THIS before you run 'make'
# export LD_LIBRARY_PATH=./debian/gb/var/gigablast/data0
# build the package now
# build the package now. if we don't specify -ai386 -ti386 then some users
# get a wrong architecture msg and 'dpkg -i' fails
dpkg-buildpackage -nc -ai386 -ti386 -b -uc -rfakeroot
# dpkg-buildpackage -nc -b -uc -rfakeroot
# move to current dir
mv ../gb_*.deb .
install-pkgs-local:
sudo alien --to-rpm gb_1.0-1_i386.deb
sudo alien --to-rpm gb_1.1-1_i386.deb
# upload
scp gb*.deb gb*.rpm gk268:/w/html/


@ -1931,7 +1931,7 @@ bool getTestSpideredDate ( Url *u , long *origSpideredDate , char *testDir ) {
// hash the url into 64 bits
long long uh64 = hash64(u->getUrl(),u->getUrlLen());
// read the spider date file first
char fn[300];
char fn[2000];
File f;
// get the spider date then
sprintf(fn,"%s/%s/doc.%llu.spiderdate.txt",
@ -1964,6 +1964,10 @@ bool getTestSpideredDate ( Url *u , long *origSpideredDate , char *testDir ) {
}
bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) {
// ensure dir exists
::mkdir(testDir,S_IRWXU);
// set this
long long uh64 = hash64(u->getUrl(),u->getUrlLen());
// make that into a filename


@ -51,8 +51,11 @@ void handleRequest ( UdpSlot *slot , long netnice ) {
char *filename = g_hostdb.m_logFilename;
// running just ./gb will log to stderr...
if ( strcmp(filename ,"/dev/stderr") == 0 )
if ( strcmp(filename ,"/dev/stderr") == 0 ) {
g_errno = EBADFILE;
g_udpServer.sendErrorReply ( slot, g_errno );
return;
}
long fd = open ( filename , O_RDONLY,
S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );


@ -75,6 +75,7 @@ class Msg20Request {
char m_numSummaryLines ; // non-zero default
char m_expected ; // non-zero default
char m_allowPunctInPhrase ; // non-zero default
bool m_getHeaderTag ;
void *m_state ;
void *m_state2 ; // used by Msg25.cpp
long m_j ; // used by Msg25.cpp
@ -416,9 +417,11 @@ public:
// methods must be changed
// . also, all ptr_* should be char* and all size_* should be in bytes
char *ptr_tbuf ; // title buffer
char *ptr_htag ; // h1 tag buf
char *ptr_ubuf ; // url buffer
char *ptr_rubuf ; // redirect url buffer
char *ptr_sum ; // summary
char *ptr_displaySum ; // summary for displaying
char *ptr_dedupSum ; // summary for deduping
char *ptr_dbuf ; // display metas \0 separated
//char *ptr_sbuf ; // big sample buf for gigabits
char *ptr_gigabitSample ;
@ -512,9 +515,11 @@ public:
// . string sizes of the strings we store into m_buf[]
// . wordCountBuf is an exact word count 1-1 with each "range"
long size_tbuf ;
long size_htag ;
long size_ubuf ;
long size_rubuf ;
long size_sum ;
long size_displaySum ;
long size_dedupSum ;
long size_dbuf ;
//long size_sbuf ;
long size_gigabitSample ; // includes \0


@ -1330,31 +1330,52 @@ bool Msg40::launchMsg20s ( bool recalled ) {
// m_printi < m_msg3a.m_numDocIds checks that kinda expect
// us to get all summaries for every docid. but when we
// do federated search we can get a ton of docids.
if ( m_printi >= m_docsToGetVisible ) {
logf(LOG_DEBUG,"query: got %li >= %li "
"summaries. done. "
"waiting on remaining "
"%li to return."
, m_printi
, m_docsToGetVisible
, m_numRequests-m_numReplies);
// wait for all msg20 replies to come in
if ( m_numRequests != m_numReplies ) break;
// then let's hack fix this then so we can call
// printSearchResultsTail()
m_printi = m_msg3a.m_numDocIds;
// set these to max so they do not launch another
// summary request, just in case, below
m_numRequests = m_msg3a.m_numDocIds;
m_numReplies = m_msg3a.m_numDocIds;
break;
}
// if ( m_printi >= m_docsToGetVisible ) {
// logf(LOG_DEBUG,"query: got %li >= %li "
// "summaries. done. "
// "waiting on remaining "
// "%li to return."
// , m_printi
// , m_docsToGetVisible
// , m_numRequests-m_numReplies);
// // wait for all msg20 replies to come in
// if ( m_numRequests != m_numReplies ) break;
// // then let's hack fix this then so we can call
// // printSearchResultsTail()
// m_printi = m_msg3a.m_numDocIds;
// // set these to max so they do not launch another
// // summary request, just in case, below
// m_numRequests = m_msg3a.m_numDocIds;
// m_numReplies = m_msg3a.m_numDocIds;
// break;
// }
// do not double count!
//if ( i <= m_lastProcessedi ) continue;
// do not repeat for this i
m_lastProcessedi = i;
// if we have printed enough summaries then do not launch
// any more, wait for them to come back in.
/// this is causing problems because we have a bunch of
// m_printi < m_msg3a.m_numDocIds checks that kinda expect
// us to get all summaries for every docid. but when we
// do federated search we can get a ton of docids.
// if ( m_printi >= m_docsToGetVisible ) {
// logf(LOG_DEBUG,"query: got %li >= %li "
// "summaries. done. "
// "waiting on remaining "
// "%li to return."
// , m_printi
// , m_docsToGetVisible
// , m_numRequests-m_numReplies);
// m_numRequests++;
// m_numReplies++;
// continue;
// }
// start up a Msg20 to get the summary
Msg20 *m = NULL;
if ( m_si->m_streamResults ) {
@ -1492,6 +1513,12 @@ bool Msg40::launchMsg20s ( bool recalled ) {
req.m_bigSampleMaxLen = bigSampleMaxLen;
req.m_titleMaxLen = 256;
req.m_titleMaxLen = cr->m_titleMaxLen;
req.m_summaryMaxLen = cr->m_summaryMaxLen;
// a special undocumented thing for getting <h1> tag
req.m_getHeaderTag = m_si->m_hr.getLong("geth1tag",0);
//req.m_numSummaryLines = cr->m_summaryMaxNumLines;
// let "ns" parm override
req.m_numSummaryLines = m_si->m_numLinesInSummary;
if(m_si->m_isAdmin && m_si->m_format == FORMAT_HTML )
req.m_getGigabitVector = true;
else req.m_getGigabitVector = false;
@ -1909,7 +1936,9 @@ bool Msg40::gotSummary ( ) {
// . wrap it up with Next 10 etc.
// . this is in PageResults.cpp
if ( m_si && m_si->m_streamResults && ! m_printedTail &&
if ( m_si &&
m_si->m_streamResults &&
! m_printedTail &&
m_printi >= m_msg3a.m_numDocIds ) {
m_printedTail = true;
printSearchResultsTail ( st );
@ -1960,10 +1989,19 @@ bool Msg40::gotSummary ( ) {
if ( ! launchMsg20s ( true ) ) return false;
// it won't launch now if we are bottlenecked waiting for
// m_printi's summary to come in
if ( m_si->m_streamResults )
if ( m_si->m_streamResults ) {
// it won't launch any if we printed out enough as well
// and it printed "waiting on remaining 0 to return"
// and it printed "waiting on remaining 0 to return".
// we shouldn't be waiting for more to come in b/c
// we are in gotSummary() so one just came in
// freeing up a msg20 to launch another, so assume
// this means we are basically done. and it
// set m_numRequests=m_msg3a.m_numDocIds etc.
//if ( m_numRequests == m_msg3a.m_numDocIds )
// goto printTail;
// otherwise, keep chugging
goto complete;
}
// maybe some were cached?
//goto refilter;
// it returned true, so m_numRequests == m_numReplies and


@ -204,11 +204,13 @@ bool Msg5::getList ( char rdbId ,
m_rdbId = rdbId;
m_collnum = collnum;
CollectionRec *ttt = g_collectiondb.getRec ( m_collnum );
if ( ! ttt ) {
g_errno = ENOCOLLREC;
return true;
}
// why was this here? it was messing up the statsdb ("graph") link
// in the admin panel.
//CollectionRec *ttt = g_collectiondb.getRec ( m_collnum );
//if ( ! ttt ) {
// g_errno = ENOCOLLREC;
// return true;
//}
m_list = list;
//m_startKey = startKey;


@ -53,6 +53,29 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
if ( ! add && ! cast ) g_collectiondb.deleteRecs ( r ) ;
*/
char format = r->getReplyFormat();
if ( format == FORMAT_XML || format == FORMAT_JSON ) {
// no addcoll given?
long page = g_pages.getDynamicPageNumber ( r );
char *addcoll = r->getString("addcoll",NULL);
char *delcoll = r->getString("delcoll",NULL);
if ( ! addcoll ) addcoll = r->getString("addColl",NULL);
if ( ! delcoll ) delcoll = r->getString("delColl",NULL);
if ( page == PAGE_ADDCOLL && ! addcoll ) {
g_errno = EBADENGINEER;
char *msg = "no addcoll parm provided";
return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
}
if ( page == PAGE_DELCOLL && ! delcoll ) {
g_errno = EBADENGINEER;
char *msg = "no delcoll parm provided";
return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
}
return g_httpServer.sendSuccessReply(s,format);
}
char buf [ 64*1024 ];
SafeBuf p(buf, 64*1024);
// print standard header
@ -93,7 +116,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
p.safePrintf (
"<tr bgcolor=#%s>"
"<td><b>name of new collection to add</td>\n"
"<td><input type=text name=addColl size=30>"
"<td><input type=text name=addcoll size=30>"
"</td></tr>\n"
, LIGHT_BLUE
);
@ -142,7 +165,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
if ( ! cr ) continue;
p.safePrintf (
"<tr bgcolor=#%s><td>"
"<input type=checkbox name=delColl value=\"%s\"> "
"<input type=checkbox name=delcoll value=\"%s\"> "
"%s</td></tr>\n",
DARK_BLUE,
cr->m_coll,cr->m_coll);
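
With the block above, api clients hitting the addcoll/delcoll pages with &format=xml or &format=json get a machine-readable acknowledgment instead of the HTML form. A hedged example (the page path is an assumption):

// GET /admin/addcoll?addcoll=mynewcoll&format=json
//   -> the success reply from sendSuccessReply()
// GET /admin/addcoll?format=json            (addcoll parm missing)
//   -> error reply built by sendErrorReply(): "no addcoll parm provided"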


@ -117,6 +117,9 @@ bool sendPageAddUrl2 ( TcpSocket *s , HttpRequest *r ) {
// . use to manually update spider times for a url
// . however, will not remove old scheduled spider times
// . mdw: made force on the default
// . mdw: don't use this anymore, use url filters, it has
// a "isaddurl" directive you can use where you can set the
// respider frequency to basically 0 to simulate this parm.
//st1->m_forceRespider = r->getLong("force",1); // 0);
// if no url given, just print a blank page
@ -135,7 +138,10 @@ bool sendPageAddUrl2 ( TcpSocket *s , HttpRequest *r ) {
return sendReply ( st1 , true );
}
if ( spiderLinks )
sreq->m_avoidSpiderLinks = 0;
else
sreq->m_avoidSpiderLinks = 1;
// shortcut
Msg4 *m = &st1->m_msg4;


@ -63,7 +63,7 @@ public:
// hash of the subdomain or domain for this line in sitelist
long m_thingHash32;
// ptr to the line in CollectionRec::m_siteListBuf
char *m_patternStr;
long m_patternStrOff;
// offset of the url path in the pattern, 0 means none
short m_pathOff;
short m_pathLen;
@ -315,7 +315,10 @@ bool updateSiteListBuf ( collnum_t collnum ,
pd.m_thingHash32 = u.getHostHash32();
// . ptr to the line in CollectionRec::m_siteListBuf.
// . includes pointing to "exact:" too i guess and tag: later.
pd.m_patternStr = start;
// . store offset since CommandUpdateSiteList() passes us
// a temp buf that will be freed before copying the buf
// over to its permanent place at cr->m_siteListBuf
pd.m_patternStrOff = start - siteListArg;
// offset of the url path in the pattern, 0 means none
pd.m_pathOff = 0;
// scan url pattern, it should start at "s"
@ -432,30 +435,66 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
// we handle.
long slot = dt->getSlot ( &sreq->m_domHash32 );
char *buf = cr->m_siteListBuf.getBufStart();
// loop over all the patterns that contain this domain and see
// the first one we match, and if we match a negative one.
for ( ; slot >= 0 ; slot = dt->getNextSlot(slot,&sreq->m_domHash32)) {
// get pattern
PatternData *pd = (PatternData *)dt->getValueFromSlot ( slot );
// point to string
char *patternStr = buf + pd->m_patternStrOff;
// is it negative? return NULL if so so url will be ignored
//if ( pd->m_patternStr[0] == '-' )
//if ( patternStr[0] == '-' )
// return NULL;
// otherwise, it has a path. skip if we don't match path ptrn
if ( pd->m_pathOff ) {
if ( ! myPath ) myPath = sreq->getUrlPath();
if ( strncmp (myPath,
pd->m_patternStr + pd->m_pathOff,
patternStr + pd->m_pathOff,
pd->m_pathLen ) )
continue;
}
// for entries like http://domain.com/ we have to match
// protocol and url can NOT be like www.domain.com to match.
// this is really like a regex like ^http://xyz.com/poo/boo/
if ( (patternStr[0]=='h' ||
patternStr[0]=='H') &&
( patternStr[1]=='t' ||
patternStr[1]=='T' ) &&
( patternStr[2]=='t' ||
patternStr[2]=='T' ) &&
( patternStr[3]=='p' ||
patternStr[3]=='P' ) ) {
char *x = patternStr+4;
// is it https:// ?
if ( *x == 's' || *x == 'S' ) x++;
// watch out for subdomains like http.foo.com
if ( *x != ':' ) goto nomatch;
// ok, we have to substring match exactly. like
// ^http://xyssds.com/foobar/
char *a = patternStr;
char *b = sreq->m_url;
for ( ; ; a++, b++ ) {
// stop matching when pattern is exhausted
if ( is_wspace_a(*a) || ! *a )
return patternStr;
if ( *a != *b ) break;
}
// we failed to match "pd" so try next line
continue;
}
nomatch:
// was the line just a domain and not a subdomain?
if ( pd->m_thingHash32 == sreq->m_domHash32 )
// this will be false if negative pattern i guess
return pd->m_patternStr;
return patternStr;
// was it just a subdomain?
if ( pd->m_thingHash32 == sreq->m_hostHash32 )
// this will be false if negative pattern i guess
return pd->m_patternStr;
return patternStr;
}
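
In other words, a site-list line that begins with a protocol is now matched as an anchored prefix of the full url, so subdomain variants no longer match it. A hedged sketch of the behavior of getMatchingUrlPattern() (the urls are illustrative, mirroring the help text below):

// site-list line:  http://justdomain.com/foo/
//   http://justdomain.com/foo/bar.html  -> matches (anchored prefix)
//   http://www.justdomain.com/foo/bar   -> no match (host differs)
// lines without a protocol still match by domain/subdomain hash;
// a NULL return means no line matched
char *pattern = getMatchingUrlPattern ( sc , sreq );
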
@ -573,7 +612,25 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
"Spider the url "
"<i>http://www.goodstuff.com/</i> and spider "
"any links we harvest that start with "
"<i>http://www.goodstuff.com/</i>"
"<i>http://www.goodstuff.com/</i>. NOTE: if the url "
"www.goodstuff.com redirects to foo.goodstuff.com then "
"foo.goodstuff.com still gets spidered "
"because it is considered to be manually added, but "
"no other urls from foo.goodstuff.com will be spidered."
"</td>"
"</tr>"
// protocol and subdomain match
"<tr>"
"<td>http://justdomain.com/foo/</td>"
"<td>"
"Spider the url "
"<i>http://justdomain.com/foo/</i> and spider "
"any links we harvest that start with "
"<i>http://justdomain.com/foo/</i>. "
"Urls that start with "
"<i>http://<b>www.</b>justdomain.com/</i>, for example, "
"will NOT match this."
"</td>"
"</tr>"


@ -804,6 +804,14 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
if ( srep && srep->m_hadDiffbotError )
msg = "Diffbot processing error";
// indicate specific diffbot error if we have it
if ( srep &&
srep->m_hadDiffbotError &&
srep->m_errCode &&
// stick with "diffbot processing error" for these...
srep->m_errCode != EDIFFBOTINTERNALERROR )
msg = mstrerror(srep->m_errCode);
// matching url filter, print out the expression
long ufn ;
ufn = ::getUrlFilterNum(sreq,
@ -1868,6 +1876,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
// i guess bail if not there?
if ( ! cr ) {
log("crawlbot: missing coll rec for coll %s",collName);
char *msg = "invalid or missing collection rec";
return sendErrorReply2 (socket,fmt,msg);
}


@ -50,7 +50,11 @@ bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
// if /Top print the directory homepage
if ( catId == 1 || catId <= 0 ) {
// this is in PageRoot.cpp
printDirHomePage(sb,r);
if ( ! printDirHomePage(sb,r) )
// this will be an error if dmoz is not set up and
// an xml or json reply format was requested
return g_httpServer.sendErrorReply(s,500,
mstrerror(g_errno));
}
//
// try printing this shit out not as search results right now


@ -22,6 +22,7 @@ static bool processLoop ( void *state ) ;
class State2 {
public:
Msg22 m_msg22;
char m_format;
//TitleRec m_tr;
long m_niceness;
XmlDoc m_xd;
@ -76,7 +77,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
// get the collection rec
CollectionRec *cr = g_collectiondb.getRec ( coll );
if ( ! cr ) {
g_errno = ENOTFOUND;
g_errno = ENOCOLLREC;
log("query: Archived copy retrieval failed. "
"No collection record found for "
"collection \"%s\".",coll);
@ -103,6 +104,13 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
long long docId = r->getLongLong ( "d" , 0LL /*default*/ );
// get url
char *url = r->getString ( "u",NULL);
if ( docId == 0 && ! url ) {
g_errno = EMISSINGINPUT;
return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
}
// . should we do a sequential lookup?
// . we need to match summary here so we need to know this
//bool seq = r->getLong ( "seq" , false );
@ -153,6 +161,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
st->m_isBanned = false;
st->m_noArchive = false;
st->m_socket = s;
st->m_format = r->getReplyFormat();
// default to 0 niceness
st->m_niceness = 0;
st->m_r.copy ( r );
@ -212,7 +221,7 @@ bool sendErrorReply ( void *state , long err ) {
TcpSocket *s = st->m_socket;
char tmp [ 1024*32 ] ;
sprintf ( tmp , "<b>had server-side error: %s</b><br>",
sprintf ( tmp , "%s",
mstrerror(g_errno));
// nuke state2
mdelete ( st , sizeof(State2) , "PageGet1" );
@ -358,6 +367,9 @@ bool processLoop ( void *state ) {
//p += gbstrlen ( p );
}
char format = st->m_format;
if ( format == FORMAT_XML ) sb.reset();
if ( format == FORMAT_JSON ) sb.reset();
// for undoing the stuff below
long startLen2 = sb.length();//p;
@ -383,6 +395,19 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
printDisclaimer = false;
if ( format == FORMAT_XML ) printDisclaimer = false;
if ( format == FORMAT_JSON ) printDisclaimer = false;
char tbuf[100];
tbuf[0] = 0;
time_t lastSpiderDate = xd->m_spideredTime;
if ( printDisclaimer ||
format == FORMAT_XML ||
format == FORMAT_JSON ) {
struct tm *timeStruct = gmtime ( &lastSpiderDate );
strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
}
// We should always be displaying this disclaimer.
// - May eventually want to display this at a different location
@ -425,10 +450,10 @@ bool processLoop ( void *state ) {
//p += gbstrlen ( p );
// then the spider date in GMT
time_t lastSpiderDate = xd->m_spideredTime;
struct tm *timeStruct = gmtime ( &lastSpiderDate );
char tbuf[100];
strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
// time_t lastSpiderDate = xd->m_spideredTime;
// struct tm *timeStruct = gmtime ( &lastSpiderDate );
// char tbuf[100];
// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
//p += gbstrlen ( p );
sb.safeStrcpy(tbuf);
@ -562,6 +587,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
includeHeader = false;
if ( format == FORMAT_XML ) includeHeader = false;
if ( format == FORMAT_JSON ) includeHeader = false;
//mfree(uq, uqCapacity, "PageGet");
// undo the header writes if we should
if ( ! includeHeader ) {
@ -571,6 +599,35 @@ bool processLoop ( void *state ) {
else sb.m_length=startLen1;//p=start1;
}
//sb.safeStrcpy(tbuf);
if ( format == FORMAT_XML ) {
sb.safePrintf("<response>\n");
sb.safePrintf("<statusCode>0</statusCode>\n");
sb.safePrintf("<statusMsg>Success</statusMsg>\n");
sb.safePrintf("<url><![CDATA[");
sb.cdataEncode(xd->m_firstUrl.m_url);
sb.safePrintf("]]></url>\n");
sb.safePrintf("<docId>%llu</docId>\n",xd->m_docId);
sb.safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
lastSpiderDate);
sb.safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
}
if ( format == FORMAT_JSON ) {
sb.safePrintf("{\"response\":{\n");
sb.safePrintf("\t\"statusCode\":0,\n");
sb.safePrintf("\t\"statusMsg\":\"Success\",\n");
sb.safePrintf("\t\"url\":\"");
sb.jsonEncode(xd->m_firstUrl.m_url);
sb.safePrintf("\",\n");
sb.safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
sb.safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
sb.safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
}
// identify start of <title> tag we wrote out
char *sbstart = sb.getBufStart();
char *sbend = sb.getBufEnd();
@ -681,6 +738,10 @@ bool processLoop ( void *state ) {
if ( ctype == CT_TEXT ) pre = true ; // text/plain
if ( ctype == CT_DOC ) pre = true ; // filtered msword
if ( ctype == CT_PS ) pre = true ; // filtered postscript
if ( format == FORMAT_XML ) pre = false;
if ( format == FORMAT_JSON ) pre = false;
// if it is content-type text, add a <pre>
if ( pre ) {//p + 5 < bufEnd && pre ) {
sb.safePrintf("<pre>");
@ -706,10 +767,15 @@ bool processLoop ( void *state ) {
// do not do term highlighting if json
if ( xd->m_contentType == CT_JSON )
queryHighlighting = false;
SafeBuf tmp;
SafeBuf *xb = &sb;
if ( format == FORMAT_XML ) xb = &tmp;
if ( format == FORMAT_JSON ) xb = &tmp;
if ( ! queryHighlighting ) {
sb.safeMemcpy ( content , contentLen );
xb->safeMemcpy ( content , contentLen );
//p += contentLen ;
}
else {
@ -733,7 +799,7 @@ bool processLoop ( void *state ) {
Matches m;
m.setQuery ( &qq );
m.addMatches ( &ww );
hilen = hi.set ( &sb , // p , avail ,
hilen = hi.set ( xb , // p , avail ,
&ww , &m ,
false /*doStemming?*/ ,
st->m_clickAndScroll ,
@ -742,6 +808,21 @@ bool processLoop ( void *state ) {
log(LOG_DEBUG, "query: Done highlighting cached page content");
}
if ( format == FORMAT_XML ) {
sb.safePrintf("\t<content><![CDATA[");
sb.cdataEncode ( xb->getBufStart() );
sb.safePrintf("]]></content>\n");
sb.safePrintf("</response>\n");
}
if ( format == FORMAT_JSON ) {
sb.safePrintf("\t\"content\":\"\n");
sb.jsonEncode ( xb->getBufStart() );
sb.safePrintf("\"\n}\n}\n");
}
// if it is content-type text, add a </pre>
if ( pre ) { // p + 6 < bufEnd && pre ) {
sb.safeMemcpy ( "</pre>" , 6 );
@ -784,6 +865,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
contentType = "application/json";
if ( format == FORMAT_XML ) contentType = "text/xml";
if ( format == FORMAT_JSON ) contentType = "application/json";
// nuke state2
mdelete ( st , sizeof(State2) , "PageGet1" );
delete (st);
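
For reference, a hedged sketch of the cached-page reply this builds when &format=json is requested; the field values are illustrative and whitespace is approximate:

// {"response":{
//      "statusCode":0,
//      "statusMsg":"Success",
//      "url":"http://example.com/",
//      "docId":123456,
//      "cachedTimeUTC":784111777,
//      "cachedTimeStr":"Nov 06, 1994 UTC",
//      "content":"...the page content, json-encoded..."
// }
// }
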


@ -44,6 +44,15 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
mnew ( msg7, sizeof(Msg7) , "PageInject" );
char format = hr->getReplyFormat();
// no collection parm?
if ( format != FORMAT_HTML && ! hr->getString("c",NULL) ) {
g_errno = ENOCOLLREC;
char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(sock,g_errno,msg,NULL);
}
// set this. also sets gr->m_hr
GigablastRequest *gr = &msg7->m_gr;
// this will fill in GigablastRequest so all the parms we need are set
@ -78,6 +87,9 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
return sendReply ( msg7 );
}
// a scrape request?
if ( gr->m_queryToScrape && gr->m_queryToScrape[0] ) {
//char *uf="http://www.google.com/search?num=50&"
@ -117,7 +129,45 @@ bool sendReply ( void *state ) {
//long hostId = msg7->m_msg7.m_hostId;
long long docId = xd->m_docId;
long hostId = 0;//msg7->m_msg7.m_hostId;
// set g_errno to index code
if ( xd->m_indexCodeValid && xd->m_indexCode && ! g_errno )
g_errno = xd->m_indexCode;
char format = gr->m_hr.getReplyFormat();
// no url parm?
if ( ! g_errno && ! gr->m_url && format != FORMAT_HTML )
g_errno = EMISSINGINPUT;
if ( g_errno ) {
long save = g_errno;
mdelete ( msg7, sizeof(Msg7) , "PageInject" );
delete (msg7);
g_errno = save;
char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(sock,save,msg,NULL);
}
char abuf[32];
SafeBuf am(abuf,32,0,false);
// a success reply, include docid and url i guess
if ( format == FORMAT_XML ) {
am.safePrintf("\t<docId>%lli</docId>\n",xd->m_docId);
char *addMsg = am.getBufStart();
mdelete ( msg7, sizeof(Msg7) , "PageInject" );
delete (msg7);
return g_httpServer.sendSuccessReply(sock,format,addMsg);
}
if ( format == FORMAT_JSON ) {
am.safePrintf("\t\"docId\":%lli,\n",xd->m_docId);
char *addMsg = am.getBufStart();
mdelete ( msg7, sizeof(Msg7) , "PageInject" );
delete (msg7);
return g_httpServer.sendSuccessReply(sock,format,addMsg);
}
//
// debug
@ -159,11 +209,6 @@ bool sendReply ( void *state ) {
if ( url && gr->m_shortReply ) {
char buf[1024*32];
char *p = buf;
// set g_errno to index code
if ( xd->m_indexCodeValid &&
xd->m_indexCode &&
! g_errno )
g_errno = xd->m_indexCode;
// return docid and hostid
if ( ! g_errno ) p += sprintf ( p ,
"0,docId=%lli,hostId=%li," ,
@ -275,6 +320,12 @@ bool Msg7::inject ( void *state ,
return true;
}
if ( ! gr->m_url ) {
log("inject: no url provied to inject");
g_errno = EBADURL;
return true;
}
//char *coll = cr->m_coll;
m_state = state;


@ -257,6 +257,10 @@ bool Msg1c::reindexQuery ( char *query ,
//CollectionRec *cr = g_collectiondb.getRec ( collnum );
// sanity fix
if ( endNum - startNum > MAXDOCIDSTOCOMPUTE )
endNum = startNum + MAXDOCIDSTOCOMPUTE;
//CollectionRec *cr = g_collectiondb.getRec ( coll );
// reset again just in case
m_req.reset();


@ -149,6 +149,7 @@ bool sendReply ( State0 *st , char *reply ) {
mdelete(st, sizeof(State0), "PageResults2");
delete st;
/*
if ( format == FORMAT_XML ) {
SafeBuf sb;
sb.safePrintf("<?xml version=\"1.0\" "
@ -174,6 +175,7 @@ bool sendReply ( State0 *st , char *reply ) {
charset );
return true;
}
*/
long status = 500;
if (savedErr == ETOOMANYOPERANDS ||
@ -244,7 +246,7 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
//long xml = hr->getLong("xml",0);
// what format should search results be in? default is html
char format = getFormatFromRequest ( hr );
char format = hr->getReplyFormat();//getFormatFromRequest ( hr );
// get the dmoz catid if given
//long searchingDmoz = hr->getLong("dmoz",0);
@ -543,6 +545,10 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
if ( cr ) st->m_collnum = cr->m_collnum;
else st->m_collnum = -1;
// turn this on for json output, unless diffbot collection
if ( format == FORMAT_JSON && ! cr->m_isCustomCrawl )
st->m_header = 1;
// take this out here as well!
// limit here
// long maxpp = cr->m_maxSearchResultsPerQuery ;
@ -1009,6 +1015,11 @@ bool gotResults ( void *state ) {
// if already printed from Msg40.cpp, bail out now
if ( si->m_streamResults ) {
// this will be our final send
if ( st->m_socket->m_streamingMode ) {
log("res: socket still in streaming mode. wtf?");
st->m_socket->m_streamingMode = false;
}
log("msg40: done streaming. nuking state.");
mdelete(st, sizeof(State0), "PageResults2");
delete st;
@ -1019,12 +1030,12 @@ bool gotResults ( void *state ) {
//char *coll = si->m_coll2;
//long collLen = si->m_collLen2;
collnum_t collnum = si->m_firstCollnum;
//collnum_t collnum = si->m_firstCollnum;
// collection rec must still be there since SearchInput references
// into it, and it must be the SAME ptr too!
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr || cr != si->m_cr ) {
CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum );
if ( ! cr ) { // || cr != si->m_cr ) {
g_errno = ENOCOLLREC;
return sendReply(st,NULL);
}
@ -1705,12 +1716,6 @@ bool printSearchResultsHeader ( State0 *st ) {
(long)moreFollow);
}
if ( st->m_header && si->m_format == FORMAT_JSON ) {
sb->safePrintf("\"objects\":[\n");
return true;
}
// . did he get a spelling recommendation?
// . do not use htmlEncode() on this anymore since receiver
// of the XML feed usually does not want that.
@ -1720,6 +1725,27 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf ("]]></spell>\n");
}
if ( si->m_format == FORMAT_JSON && st->m_spell[0] ) {
sb->safePrintf ("\t\"spell\":\"");
sb->jsonEncode(st->m_spell);
sb->safePrintf ("\"\n,");
}
// for diffbot collections only...
if ( st->m_header &&
si->m_format == FORMAT_JSON &&
cr->m_isCustomCrawl ) {
sb->safePrintf("\"objects\":[\n");
return true;
}
if ( si->m_format == FORMAT_JSON &&
! cr->m_isCustomCrawl ) {
sb->safePrintf("\"results\":[\n");
return true;
}
// debug
if ( si->m_debug )
logf(LOG_DEBUG,"query: Displaying up to %li results.",
@ -2821,6 +2847,40 @@ static bool printDMOZCategoryUnderResult ( SafeBuf *sb ,
long catid ,
State0 *st ) {
char format = si->m_format;
if ( format == FORMAT_XML ) {
sb->safePrintf("\t\t<dmozCat>\n"
"\t\t\t<dmozCatId>%li</dmozCatId>\n"
"\t\t\t<dmozCatStr><![CDATA["
,catid);
// print the name of the dmoz category
char xbuf[256];
SafeBuf xb(xbuf,256,0,false);
g_categories->printPathFromId(&xb, catid, false,si->m_isRTL);
sb->cdataEncode(xb.getBufStart());
sb->safePrintf("]]></dmozCatStr>\n"
"\t\t</dmozCat>\n");
return true;
}
if ( format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"dmozCat\":{\n"
"\t\t\t\"dmozCatId\":%li,\n"
"\t\t\t\"dmozCatStr\":\""
,catid);
// print the name of the dmoz category
char xbuf[256];
SafeBuf xb(xbuf,256,0,false);
g_categories->printPathFromId(&xb, catid, false,si->m_isRTL);
sb->jsonEncode(xb.getBufStart());
sb->safePrintf("\"\n"
"\t\t},\n");
return true;
}
//uint8_t queryLanguage = langUnknown;
uint8_t queryLanguage = si->m_queryLangId;
// Don't print category if not in native language category
@ -3011,7 +3071,13 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
}
if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t<result>\n" );
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t<result>\n" );
if ( si->m_format == FORMAT_JSON ) {
if ( *numPrintedSoFar != 0 ) sb->safePrintf(",\n");
sb->safePrintf("\t{\n" );
}
Highlight hi;
@ -3112,7 +3178,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// if we have a thumbnail show it next to the search result,
// base64 encoded
if ( (si->m_format == FORMAT_HTML || si->m_format == FORMAT_XML ) &&
if ( //(si->m_format == FORMAT_HTML || si->m_format == FORMAT_XML ) &&
//! mr->ptr_imgUrl &&
mr->ptr_imgData ) {
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
@ -3128,9 +3194,25 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
si->m_format );
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<imageHeight>%li</imageHeight>\n",
ti->m_dx);
sb->safePrintf("\t\t<imageWidth>%li</imageWidth>\n",
ti->m_dy);
sb->safePrintf("\t\t<imageWidth>%li</imageWidth>\n",
ti->m_dx);
sb->safePrintf("\t\t<origImageHeight>%li"
"</origImageHeight>\n",
ti->m_origDY);
sb->safePrintf("\t\t<origImageWidth>%li"
"</origImageWidth>\n",
ti->m_origDX);
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"imageHeight\":%li,\n",
ti->m_dy);
sb->safePrintf("\t\t\"imageWidth\":%li,\n",
ti->m_dx);
sb->safePrintf("\t\t\"origImageHeight\":%li,\n",
ti->m_origDY);
sb->safePrintf("\t\t\"origImageWidth\":%li,\n",
ti->m_origDX);
}
}
@ -3357,7 +3439,6 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
}
long hlen;
//copy all summary and title excerpts for this result into here
//char tt[1024*32];
@ -3375,8 +3456,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
frontTag = "<font style=\"background-color:yellow\">" ;
}
long cols = 80;
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t\t<title><![CDATA[");
SafeBuf hb;
if ( str && strLen && si->m_doQueryHighlighting ) {
hlen = hi.set ( &hb,
@ -3393,29 +3473,55 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
backTag,
0,
0 ); // niceness
// reassign!
str = hb.getBufStart();
strLen = hb.getLength();
//if (!sb->utf8Encode2(tt, hlen)) return false;
if ( ! sb->brify ( hb.getBufStart(),
hb.getLength(),
0,
cols) ) return false;
// if ( si->m_format != FORMAT_JSON )
// if ( ! sb->brify ( hb.getBufStart(),
// hb.getLength(),
// 0,
// cols) ) return false;
}
else if ( str && strLen ) {
// . use "UNTITLED" if no title
// . msg20 should supply the dmoz title if it can
if ( strLen == 0 &&
si->m_format != FORMAT_XML &&
si->m_format != FORMAT_JSON ) {
str = "<i>UNTITLED</i>";
strLen = gbstrlen(str);
}
if ( str &&
strLen &&
( si->m_format == FORMAT_HTML ||
si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX )
) {
// determine if TiTle wraps, if it does add a <br> count for
// each wrap
//if (!sb->utf8Encode2(str , strLen )) return false;
if ( ! sb->brify ( str,strLen,0,cols) ) return false;
}
// . use "UNTITLED" if no title
// . msg20 should supply the dmoz title if it can
if ( strLen == 0 ) {
if(!sb->safePrintf("<i>UNTITLED</i>"))
return false;
}
// close up the title tag
if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></title>\n");
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<title><![CDATA[");
if ( str ) sb->cdataEncode(str);
sb->safePrintf("]]></title>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"title\":\"");
if ( str ) sb->jsonEncode(str);
sb->safePrintf("\",\n");
}
if ( si->m_format == FORMAT_HTML ) sb->safePrintf ("</a><br>\n" ) ;
if ( si->m_format == FORMAT_HTML )
sb->safePrintf ("</a><br>\n" ) ;
// close the title tag stuff
@ -3424,6 +3530,22 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("</b></a>\n");
//
// print <h1> tag contents. hack for client.
//
if ( mr->ptr_htag && mr->size_htag > 1 ) {
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<h1Tag><![CDATA[");
sb->cdataEncode(mr->ptr_htag);
sb->safePrintf("]]></h1Tag>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"h1Tag\":\"");
sb->jsonEncode(mr->ptr_htag);
sb->safePrintf("\",\n");
}
}
/////
//
@ -3440,6 +3562,8 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
"]]>"
"</contentType>\n",
cs);
else if ( si->m_format == FORMAT_JSON )
sb->safePrintf("\t\t\"contentType\":\"%s\",\n",cs);
else if ( si->m_format == FORMAT_HTML && ctype != CT_HTML ) {
sb->safePrintf(" <b><font style=color:white;"
"background-color:maroon;>");
@ -3460,13 +3584,18 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
// . then the summary
// . "s" is a string of null terminated strings
char *send;
//char *send;
// do the normal summary
str = mr->ptr_sum;
strLen = mr->size_sum-1;
str = mr->ptr_displaySum;
// sometimes the summary is longer than requested because for
// summary deduping purposes (see "pss" parm in Parms.cpp) we do not
// get it as short as requested. so use mr->size_displaySum here
// not mr->size_sum
strLen = mr->size_displaySum - 1;//-1;
// this includes the terminating \0 or \0\0 so back up
if ( strLen < 0 ) strLen = 0;
send = str + strLen;
//send = str + strLen;
// dmoz summary might override if we are showing a dmoz topic page
if ( dmozSummary ) {
@ -3474,8 +3603,6 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
strLen = gbstrlen(dmozSummary);
}
if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t<sum><![CDATA[");
bool printSummary = true;
// do not print summaries for widgets by default unless overridden
// with &summaries=1
@ -3485,13 +3612,25 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
hr->getLong("summaries",0) == 0 )
printSummary = false;
if ( printSummary )
if ( printSummary && si->m_format == FORMAT_HTML )
sb->brify ( str , strLen, 0 , cols ); // niceness = 0
// close xml tag
if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></sum>\n");
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<sum><![CDATA[");
sb->cdataEncode(str);
sb->safePrintf("]]></sum>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"sum\":\"");
sb->jsonEncode(str);
sb->safePrintf("\",\n");
}
// new line if not xml
else if ( strLen ) sb->safePrintf("<br>\n");
if ( si->m_format == FORMAT_HTML && strLen )
sb->safePrintf("<br>\n");
////////////
//
@ -3557,6 +3696,11 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
sb->safeMemcpy ( url , urlLen );
sb->safePrintf("]]></url>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"url\":\"");
sb->jsonEncode ( url , urlLen );
sb->safePrintf("\",\n");
}
// now the last spidered date of the document
@ -3617,6 +3761,49 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
datedbDate);
}
if ( si->m_format == FORMAT_JSON ) {
// doc size in Kilobytes
sb->safePrintf ( "\t\t\"size\":\"%4.0fk\",\n",
(float)mr->m_contentLen/1024.0);
// . docId for possible cached link
// . might have merged a bunch together
sb->safePrintf("\t\t\"docId\":%lli,\n",mr->m_docId );
// . show the site root
// . for homepages.com/users/fred/mypage.html this will be
// homepages.com/users/fred/
// . for www.xyz.edu/~foo/burp/ this will be
// www.xyz.edu/~foo/ etc.
long siteLen = 0;
char *site = NULL;
// seems like this isn't the way to do it, cuz Tagdb.cpp
// adds the "site" tag itself and we do not always have it
// in the XmlDoc::ptr_tagRec... so do it this way:
site = mr->ptr_site;
siteLen = mr->size_site-1;
//char *site=uu.getSite( &siteLen , si->m_coll, false, tagRec);
sb->safePrintf("\t\t\"site\":\"");
if ( site && siteLen > 0 ) sb->safeMemcpy ( site , siteLen );
sb->safePrintf("\",\n");
//long sh = hash32 ( site , siteLen );
//sb->safePrintf ("\t\t<siteHash32>%lu</siteHash32>\n",sh);
//long dh = uu.getDomainHash32 ();
//sb->safePrintf ("\t\t<domainHash32>%lu</domainHash32>\n",dh);
// spider date
sb->safePrintf ( "\t\t\"spidered\":%lu,\n",
mr->m_lastSpidered);
// backwards compatibility for buzz
sb->safePrintf ( "\t\t\"firstIndexedDateUTC\":%lu,\n"
, mr->m_firstIndexedDate);
sb->safePrintf( "\t\t\"contentHash32\":%lu,\n"
, mr->m_contentHash32);
// pub date
long datedbDate = mr->m_datedbDate;
// show the datedb date as "<pubDate>" for now
if ( datedbDate != -1 )
sb->safePrintf ( "\t\t\"pubdate\":%lu,\n",
datedbDate);
}
// . we also store the outlinks in a linkInfo structure
@ -3642,6 +3829,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
k->m_ip, // hostHash, but use ip for now
(long)k->m_firstIndexedDate ,
(long)k->m_datedbDate );
if ( si->m_format == FORMAT_XML ) {
// result
sb->safePrintf("\t\t<language><![CDATA[%s]]>"
@ -3654,6 +3842,16 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
"</charset>\n", charset);
}
if ( si->m_format == FORMAT_JSON ) {
// result
sb->safePrintf("\t\t\"language\":\"%s\",\n",
getLanguageString(mr->m_language));
char *charset = get_charset_str(mr->m_charset);
if(charset)
sb->safePrintf("\t\t\"charset\":\"%s\",\n",charset);
}
//
// end more xml stuff
//
@ -3797,10 +3995,10 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
,ix
);
// reindex
sb->safePrintf(" - <a style=color:red; href=\"/addurl?u=");
sb->safePrintf(" - <a style=color:red; href=\"/addurl?urls=");
sb->urlEncode ( url , gbstrlen(url) , false );
unsigned long long rand64 = gettimeofdayInMillisecondsLocal();
sb->safePrintf("&rand64=%llu&force=1\">respider</a>",rand64);
sb->safePrintf("&rand64=%llu\">respider</a>",rand64);
}
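For illustration only (the host path and parms are from the hunk above, but the target URL and timestamp below are made up), the respider link now emitted by this block looks like:

 - <a style=color:red; href="/addurl?urls=http%3A%2F%2Fwww.example.com%2F&rand64=1404512549123">respider</a>

i.e. the parm changed from u= to urls= and the old &force=1 is no longer appended.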
@ -4041,6 +4239,11 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( ! dp ) {
if ( si->m_format == FORMAT_XML )
sb->safePrintf ("\t</result>\n\n");
if ( si->m_format == FORMAT_JSON ) {
// remove last ,\n
sb->m_length -= 2;
sb->safePrintf ("\n\t}\n\n");
}
// wtf?
//char *xx=NULL;*xx=0;
// at least close up the table
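The sb->m_length -= 2 above is the usual trailing-comma fix for streamed JSON: every member is printed with a trailing ",\n" and the last one is trimmed before the object is closed. A minimal stand-alone sketch of the same idea, using std::string in place of SafeBuf (names here are illustrative, not from this commit):

#include <cstdio>
#include <string>

// drop the final ",\n" that the last member left behind, then close the object
static void closeJsonObject(std::string &buf) {
    if (buf.size() >= 2 && buf.compare(buf.size() - 2, 2, ",\n") == 0)
        buf.erase(buf.size() - 2);
    buf += "\n\t}\n";
}

int main() {
    std::string buf = "\t{\n";
    buf += "\t\t\"docId\":34111603247,\n";
    buf += "\t\t\"site\":\"www.doi.gov\",\n";
    closeJsonObject(buf);
    printf("%s", buf.c_str());
    return 0;
}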
@ -4126,7 +4329,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( minScore < 0.0 || totalPairScore < minScore )
minScore = totalPairScore;
// we need to set "ft" for xml stuff below
if ( si->m_format == FORMAT_XML ) continue;
if ( si->m_format != FORMAT_HTML ) continue;
//sb->safePrintf("<table border=1><tr><td><center><b>");
// print pair text
//long qtn1 = fps->m_qtermNum1;
@ -4209,7 +4412,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( minScore < 0.0 || totalSingleScore < minScore )
minScore = totalSingleScore;
// we need to set "ft" for xml stuff below
if ( si->m_format == FORMAT_XML ) continue;
if ( si->m_format != FORMAT_HTML ) continue;
//sb->safePrintf("<table border=1><tr><td><center><b>");
// print pair text
//long qtn = fss->m_qtermNum;

View File

@ -22,7 +22,7 @@
//char *printNumResultsDropDown ( char *p, long n, bool *printedDropDown);
bool printNumResultsDropDown ( SafeBuf& sb, long n, bool *printedDropDown);
//static char *printTopDirectory ( char *p, char *pend );
static bool printTopDirectory ( SafeBuf& sb );
static bool printTopDirectory ( SafeBuf& sb , char format );
// this prints the last five queries
//static long printLastQueries ( char *p , char *pend ) ;
@ -586,7 +586,7 @@ bool expandHtml ( SafeBuf& sb,
if ( head[i+1] == 't' ) {
i += 1;
//p = printTopDirectory ( p, pend );
printTopDirectory ( sb );
printTopDirectory ( sb , FORMAT_HTML );
continue;
}
@ -963,7 +963,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
"onLoad=\""
"var client = new XMLHttpRequest();\n"
"client.onreadystatechange = handler;\n"
"var url='/addurl?u="
"var url='/addurl?urls="
);
sb.urlEncode ( url );
// propagate "admin" if set
@ -1042,7 +1042,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
if ( ! coll )
coll = "";
sb.safePrintf("<input name=u type=text size=60 value=\"");
sb.safePrintf("<input name=urls type=text size=60 value=\"");
if ( url ) {
SafeBuf tmp;
tmp.safePrintf("%s",url);
@ -1092,7 +1092,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
//"alert('shit');"
"var client = new XMLHttpRequest();\n"
"client.onreadystatechange = handler;\n"
"var url='/addurl?u="
"var url='/addurl?urls="
, root );
sb.urlEncode ( url );
// propagate "admin" if set
@ -1128,6 +1128,11 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
char format = r->getReplyFormat();
if ( format != FORMAT_HTML )
return printTopDirectory ( sb , format );
sb.safePrintf("<html>\n");
sb.safePrintf("<head>\n");
//sb.safePrintf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">");
@ -1216,7 +1221,7 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("\n");
printTopDirectory ( sb );
printTopDirectory ( sb , FORMAT_HTML );
sb.safePrintf("<br><br>\n");
@ -1395,10 +1400,12 @@ long printLastQueries ( char *p , char *pend ) {
//char *printTopDirectory ( char *p, char *pend ) {
bool printTopDirectory ( SafeBuf& sb ) {
bool printTopDirectory ( SafeBuf& sb , char format ) {
long nr = g_catdb.getRdb()->getNumTotalRecs();
// if no recs in catdb, print instructions
if ( g_catdb.getRdb()->getNumTotalRecs() == 0 )
if ( nr == 0 && format == FORMAT_HTML)
return sb.safePrintf("<center>"
"<b>DMOZ functionality is not set up.</b>"
"<br>"
@ -1411,6 +1418,12 @@ bool printTopDirectory ( SafeBuf& sb ) {
"</b>"
"</center>");
// send back an xml/json error reply
if ( nr == 0 && format != FORMAT_HTML ) {
g_errno = EDMOZNOTREADY;
return false;
}
//char topList[4096];
//sprintf(topList,
return sb.safePrintf (
@ -1619,26 +1632,26 @@ static bool s_inprogress = false;
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
// . get fields from cgi field of the requested url
// . get the search query
long urlLen = 0;
char *url = r->getString ( "u" , &urlLen , NULL /*default*/);
char *url = hr->getString ( "urls" , &urlLen , NULL /*default*/);
// see if they provided a url of a file of urls if they did not
// provide a url to add directly
bool isAdmin = g_conf.isCollAdmin ( s , r );
bool isAdmin = g_conf.isCollAdmin ( sock , hr );
long ufuLen = 0;
char *ufu = NULL;
if ( isAdmin )
// get the url of a file of urls (ufu)
ufu = r->getString ( "ufu" , &ufuLen , NULL );
//if ( isAdmin )
// // get the url of a file of urls (ufu)
// ufu = hr->getString ( "ufu" , &ufuLen , NULL );
// can't be too long, that's obnoxious
if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) {
g_errno = EBUFTOOSMALL;
g_msg = " (error: url too long)";
return g_httpServer.sendErrorReply(s,500,"url too long");
return g_httpServer.sendErrorReply(sock,500,"url too long");
}
// get the collection
//long collLen = 0;
@ -1650,20 +1663,20 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
//}
// get collection rec
CollectionRec *cr = g_collectiondb.getRec ( r );
CollectionRec *cr = g_collectiondb.getRec ( hr );
// bitch if no collection rec found
if ( ! cr ) {
g_errno = ENOCOLLREC;
g_msg = " (error: no collection)";
return g_httpServer.sendErrorReply(s,500,"no coll rec");
return g_httpServer.sendErrorReply(sock,500,"no coll rec");
}
// . make sure the ip is not banned
// . we may also have an exclusive list of IPs for private collections
if ( ! cr->hasSearchPermission ( s ) ) {
if ( ! cr->hasSearchPermission ( sock ) ) {
g_errno = ENOPERM;
g_msg = " (error: permission denied)";
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno));
}
@ -1672,8 +1685,8 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
//
if ( ! url ) {
SafeBuf sb;
printAddUrlHomePage ( sb , NULL , r );
return g_httpServer.sendDynamicPage(s,
printAddUrlHomePage ( sb , NULL , hr );
return g_httpServer.sendDynamicPage(sock,
sb.getBufStart(),
sb.length(),
// 120 secs cachetime
@ -1686,19 +1699,19 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
200,
NULL, // cookie
"UTF-8",
r);
hr);
}
//
// run the ajax script on load to submit the url now
//
long id = r->getLong("id",0);
long id = hr->getLong("id",0);
// if we are not being called by the ajax loader, the put the
// ajax loader script into the html now
if ( id == 0 ) {
SafeBuf sb;
printAddUrlHomePage ( sb , url , r );
return g_httpServer.sendDynamicPage ( s,
printAddUrlHomePage ( sb , url , hr );
return g_httpServer.sendDynamicPage ( sock,
sb.getBufStart(),
sb.length(),
// don't cache any more
@ -1711,7 +1724,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
200,
NULL, // cookie
"UTF-8",
r);
hr);
}
//
@ -1742,7 +1755,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
if ( msg ) {
SafeBuf sb;
sb.safePrintf("%s",msg);
g_httpServer.sendDynamicPage (s,
g_httpServer.sendDynamicPage (sock,
sb.getBufStart(),
sb.length(),
3600,//-1, // cachetime
@ -1764,10 +1777,10 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
g_errno = ENOMEM;
log("PageAddUrl: new(%i): %s",
sizeof(State1i),mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); }
return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno)); }
mnew ( st1 , sizeof(State1i) , "PageAddUrl" );
// save socket and isAdmin
st1->m_socket = s;
st1->m_socket = sock;
st1->m_isAdmin = isAdmin;
/*
@ -1809,12 +1822,12 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
//unsigned long h = ipdom ( s->m_ip );
// . use top 2 bytes now, some isps have large blocks
// . if this causes problems, then they can do pay for inclusion
unsigned long h = iptop ( s->m_ip );
unsigned long h = iptop ( sock->m_ip );
long codeLen;
char* code = r->getString("code", &codeLen);
if(g_autoBan.hasCode(code, codeLen, s->m_ip)) {
char* code = hr->getString("code", &codeLen);
if(g_autoBan.hasCode(code, codeLen, sock->m_ip)) {
long uipLen = 0;
char* uip = r->getString("uip",&uipLen);
char* uip = hr->getString("uip",&uipLen);
long hip = 0;
//use the uip when we have a raw query to test if
//we can submit
@ -1824,18 +1837,18 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
}
}
st1->m_strip = r->getLong("strip",0);
st1->m_strip = hr->getLong("strip",0);
// . Remember, for cgi, if the box is not checked, then it is not
// reported in the request, so set default return value to 0
// . support both camel case and all lower-cases
st1->m_spiderLinks = r->getLong("spiderLinks",0);
st1->m_spiderLinks = r->getLong("spiderlinks",st1->m_spiderLinks);
st1->m_spiderLinks = hr->getLong("spiderLinks",0);
st1->m_spiderLinks = hr->getLong("spiderlinks",st1->m_spiderLinks);
// . should we force it into spiderdb even if already in there
// . use to manually update spider times for a url
// . however, will not remove old scheduled spider times
// . mdw: made force on by default
st1->m_forceRespider = r->getLong("force",1); // 0);
st1->m_forceRespider = hr->getLong("force",1); // 0);
long now = getTimeGlobal();
// . allow 1 submit every 1 hour
@ -1850,7 +1863,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
delete (st1);
// use cachetime of 3600 so it does not re-inject if you hit
// the back button!
g_httpServer.sendDynamicPage (s,
g_httpServer.sendDynamicPage (sock,
sb.getBufStart(),
sb.length(),
3600,//-1, // cachetime
@ -1878,6 +1891,17 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
*/
// set this. also sets gr->m_hr
GigablastRequest *gr = &st1->m_msg7.m_gr;
// this will fill in GigablastRequest so all the parms we need are set
g_parms.setGigablastRequest ( sock , hr , gr );
// this is really an injection, not add url, so make
// GigablastRequest::m_url point to Gigablast::m_urlsBuf because
// the PAGE_ADDURLS2 parms in Parms.cpp fill in the m_urlsBuf.
// HACK!
gr->m_url = gr->m_urlsBuf;
//
// inject using msg7
//

View File

@ -51,7 +51,7 @@ static void sendReply ( void *st ) ;
// . returns false if blocked, otherwise true
// . sets g_errno on error
bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {
char *cgi;
long cgiLen;
@ -201,6 +201,13 @@ void sendReply ( void *state ) {
strncpy( startTimeStr, ctime( &st->m_startDate ), 30 );
strncpy( endTimeStr, ctime( &st->m_endDate ), 30 );
buf.safePrintf(
"<b>Graph of various query performance statistics.</b>"
"<br>"
"<br>"
);
buf.safePrintf("<center>\n");
if ( ! g_conf.m_useStatsdb )
@ -208,6 +215,7 @@ void sendReply ( void *state ) {
"Turn on in the master controls.</b>"
"</font>\n" );
buf.safePrintf("<table %s>\n",TABLE_STYLE);
buf.safePrintf("<tr><td bgcolor=#%s>"

Pages.cpp (326 changed lines)
View File

@ -72,13 +72,16 @@ static WebPage s_pages[] = {
//{ PAGE_WIDGET , "widget" , 0 , "widget" , 0 , 0 ,
// "widget page",
// sendPageWidget, 0 ,NULL,NULL,PG_NOAPI},
// this is the public addurl, /addurl, if you are using the
// api use PAGE_ADDURL2 which is /admin/addurl. so we set PG_NOAPI here
{ PAGE_ADDURL , "addurl" , 0 , "add url" , 0 , 0 ,
"Page where you can add url for spidering",
sendPageAddUrl, 0 ,NULL,NULL,0},
sendPageAddUrl, 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_GET , "get" , 0 , "get" , 0 , 0 ,
//USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_CLIENT,
"gets cached url",
"gets cached web page",
sendPageGet , 0 ,NULL,NULL,0},
{ PAGE_LOGIN , "login" , 0 , "login" , 0 , 0 ,
//USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_SPAM | USER_CLIENT,
@ -99,15 +102,15 @@ static WebPage s_pages[] = {
// use post now for the "site list" which can be big
{ PAGE_BASIC_SETTINGS, "admin/settings", 0 , "settings",1, M_POST ,
"Basic settings page.", sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI},
"basic settings page", sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_BASIC_STATUS, "admin/status", 0 , "status",1, 0 ,
"Basic status page.", sendPageBasicStatus , 0 ,NULL,NULL,0},
"basic status page", sendPageBasicStatus , 0 ,NULL,NULL,0},
//{ PAGE_BASIC_DIFFBOT, "admin/diffbot", 0 , "diffbot",1, 0 ,
// "Basic diffbot page.", sendPageBasicDiffbot , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_BASIC_SECURITY, "admin/security", 0 , "security",1, 0 ,
"Basic security page.", sendPageGeneric , 0 ,NULL,NULL,0},
"basic security page", sendPageGeneric , 0 ,NULL,NULL,0},
{ PAGE_BASIC_SEARCH, "", 0 , "search",1, 0 ,
"Basic search page.", sendPageRoot , 0 ,NULL,NULL,PG_NOAPI},
"basic search page", sendPageRoot , 0 ,NULL,NULL,PG_NOAPI},
@ -115,7 +118,8 @@ static WebPage s_pages[] = {
//USER_MASTER | USER_PROXY ,
"master controls page",
sendPageGeneric , 0 ,NULL,NULL,0},
{ PAGE_SEARCH , "admin/search" , 0 , "search controls" , 1 , 1,
// use POST for html head/tail and page root html. might be large.
{ PAGE_SEARCH , "admin/search" , 0 , "search controls" ,1,M_POST,
//USER_ADMIN | USER_MASTER ,
"search controls page",
sendPageGeneric , 0 ,NULL,NULL,0},
@ -151,10 +155,11 @@ static WebPage s_pages[] = {
// { PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
// "what sites can be spidered",
// sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI}, // sendPageBasicSettings
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 ,M_POST,
//USER_ADMIN | USER_MASTER ,
"prioritize urls for spidering",
sendPageGeneric , 0 ,NULL,NULL,0},
// until we get this working, set PG_NOAPI
sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0,M_MULTI ,
//USER_ADMIN | USER_MASTER ,
"inject url in the index here",
@ -180,17 +185,17 @@ static WebPage s_pages[] = {
// master admin pages
{ PAGE_STATS , "admin/stats" , 0 , "stats" , 0 , 0 ,
//USER_MASTER | USER_PROXY ,
"statistics page",
"general statistics",
sendPageStats , 0 ,NULL,NULL,0},
{ PAGE_STATSDB , "admin/statsdb" , 0 , "graph" , 0 , 0 ,
{ PAGE_GRAPH , "admin/graph" , 0 , "graph" , 0 , 0 ,
//USER_MASTER ,
"statistics page",
sendPageStatsdb , 2 /*niceness*/ ,NULL,NULL,0},
"query stats graph page",
sendPageGraph , 2 /*niceness*/ ,NULL,NULL,0},
{ PAGE_PERF , "admin/perf" , 0 , "performance" , 0 , 0 ,
//USER_MASTER | USER_PROXY ,
"master performance page",
"function performance graph",
sendPagePerf , 0 ,NULL,NULL,0},
{ PAGE_SOCKETS , "admin/sockets" , 0 , "sockets" , 0 , 0 ,
@ -237,7 +242,7 @@ static WebPage s_pages[] = {
{ PAGE_API , "admin/api" , 0 , "api" , 0 , 0 ,
//USER_MASTER | USER_ADMIN ,
"api page",
sendPageAPI , 0 ,NULL,NULL,0},
sendPageAPI , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_RULES , "admin/siterules", 0 , "site rules", 1, M_POST,
//USER_ADMIN | USER_MASTER ,
"site rules page",
@ -258,7 +263,7 @@ static WebPage s_pages[] = {
{ PAGE_SPIDERDB , "admin/spiderdb" , 0 , "spider queue" , 0 , 0 ,
//USER_ADMIN | USER_MASTER ,
"spiderdb page",
"spider queue",
sendPageSpiderdb , 0 ,NULL,NULL,0},
//{ PAGE_PRIORITIES, "admin/priorities" , 0 , "priority controls",1,1,
// //USER_ADMIN | USER_MASTER ,
@ -293,7 +298,7 @@ static WebPage s_pages[] = {
sendPageParser , 2 ,NULL,NULL,PG_NOAPI},
{ PAGE_SITEDB , "admin/tagdb" , 0 , "tagdb" , 0 , M_POST,
//USER_MASTER | USER_ADMIN,
"tagdb page to add/remove/get tags",
"add/remove/get tags for sites/urls",
sendPageTagdb , 0 ,NULL,NULL,0},
{ PAGE_CATDB , "admin/catdb" , 0 , "catdb" , 0,M_POST,
//USER_MASTER | USER_ADMIN,
@ -518,6 +523,9 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
if ( ! publicPage && ! isAdmin )
return sendPageLogin ( s , r );
if ( page == PAGE_CRAWLBOT && ! isAdmin )
log("pages: accessing a crawlbot page without admin privs. "
"no parms can be changed.");
/*
// is request coming from a local ip?
@ -1088,9 +1096,17 @@ bool Pages::printAdminTop (SafeBuf *sb ,
if ( isBasic ) menu = "basic";
sb->safePrintf("<br>");
sb->safePrintf("<b><font color=gray size=+2>"
"%s &gt; %s &gt; %s</font></b>"
"%s &gt; %s &gt; %s "
"&nbsp; "
"</font>"
"</b>"
//"<a href=/%s?c=%s&showparms=1&format=xml>xml</a> "
//"<a href=/%s?c=%s&showparms=1&format=json>json</a> "
"<br><br>\n",
coll, menu, s_pages[page].m_name);
coll, menu, s_pages[page].m_name
//,s_pages[page].m_filename , coll
//,s_pages[page].m_filename , coll
);
@ -2479,7 +2495,10 @@ bool sendPageAPI ( TcpSocket *s , HttpRequest *r ) {
g_pages.printLogo ( &p , coll );
p.safePrintf("</td></tr></table><br><br>");
p.safePrintf("NOTE: All APIs support both the GET and POST methods. "
"If the size of your request is more than 2K you "
"should use POST.");
p.safePrintf("<br><br>");
p.safePrintf("<div style=padding-left:10%%>"
"<font size=+2><b>API by pages</b></font>"
@ -2592,8 +2611,17 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf("</a>");
// description of page
sb->safePrintf("<font size=-0> - %s</font><br>",
s_pages[PAGENUM].m_desc);
sb->safePrintf("<font size=-0> - %s "
" &nbsp; "
"[ <b>output response in</b> "
"<a href=/%s?showparms=1&format=xml>xml</a> "
"or <a href=/%s?showparms=1&format=json>json</a> "
"or <a href=/%s>html</a> ] "
"</font><br>",
s_pages[PAGENUM].m_desc,
pageStr,
pageStr,
pageStr);
sb->safePrintf("</div><br>");
// begin new list of centered tables
@ -2603,7 +2631,7 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf (
"<table style=max-width:80%%; %s>"
"<tr class=hdrow><td colspan=9>"
"<center><b>Parms</b></tr></tr>"
"<center><b>Input</b></tr></tr>"
"<tr bgcolor=#%s>"
"<td><b>#</b></td>"
"<td><b>parm</b></td>"
@ -2615,9 +2643,75 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
, TABLE_STYLE
, DARK_BLUE );
const char *blue = LIGHT_BLUE;
const char *blues[] = {DARK_BLUE,LIGHT_BLUE};
long count = 1;
//
// every page supports the:
// 1) &format=xml|html|json
// 2) &showparms=0|1
// 3) &c=<collectionName>
// parms. we support them in sendPageGeneric() for pages like
// /admin/master /admin/search /admin/spider so you can see
// the settings.
// put these in Parms.cpp, but use PF_DISPLAY flag so we ignore them
// in convertHttpRequestToParmList() and we do not show them on the
// page itself.
//
// page display/output parms
sb->safePrintf("<tr bgcolor=%s>"
"<td>%li</td>\n"
"<td><b>format</b></td>"
"<td>STRING</td>"
"<td>output format</td>"
"<td>html</td>"
"<td>Display output in this format.</td>"
"</tr>"
, blues[count%2]
, count
);
count++;
// for pages that have settings...
if ( PAGENUM == PAGE_MASTER ||
PAGENUM == PAGE_SEARCH ||
PAGENUM == PAGE_SPIDER ) {
sb->safePrintf("<tr bgcolor=%s>"
"<td>%li</td>\n"
"<td><b>showparms</b></td>"
"<td>BOOL (0 or 1)</td>"
"<td>show parms</td>"
"<td></td>"
"<td>Display the values of all settings.</td>"
"</tr>"
, blues[count%2]
, count
);
count++;
}
// . master controls are for all collections so no need for this
// . we already have this in the parms list for some pages so only
// show for selected pages here
// if ( PAGENUM != PAGE_MASTER ) {
// sb->safePrintf("<tr bgcolor=%s>"
// "<td>%li</td>\n"
// "<td><b>c</b></td>"
// "<td>STRING</td>"
// "<td>Collection</td>"
// "<td></td>"
// "<td>The name of the collection. "
// "<font color=green><b>REQUIRED</b></font>"
// "</td>"
// "</tr>"
// , blues[count%2]
// , count
// );
// count++;
// }
//char *lastPage = NULL;
//Parm *lastParm = NULL;
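As a usage note, the display parms documented above ride on any admin request; for example (host, port and collection name are made up, not from this commit):

http://127.0.0.1:8000/admin/master?format=json
http://127.0.0.1:8000/admin/search?c=main&showparms=1&format=xml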
@ -2643,10 +2737,6 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
if ( pageNum != PAGENUM ) continue;
if ( blue == (const char *)LIGHT_BLUE ) blue = DARK_BLUE;
else if(blue==(const char *)DARK_BLUE ) blue = LIGHT_BLUE;
SafeBuf tmp;
char diff = 0;
bool printVal = false;
@ -2664,7 +2754,7 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
if ( diff == 1 )
sb->safePrintf ( "<tr bgcolor=orange>");
else
sb->safePrintf ( "<tr bgcolor=#%s>",blue);
sb->safePrintf ( "<tr bgcolor=#%s>",blues[count%2]);
sb->safePrintf("<td>%li</td>",count++);
@ -2721,6 +2811,17 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
// end input parm table we started below
sb->safePrintf("</table><br>\n\n");
// do not print the tables below now,
// we provide output links for xml, json and html
sb->safePrintf("</center>");
if ( PAGENUM != PAGE_GET &&
PAGENUM != PAGE_RESULTS )
return true;
sb->safePrintf("<center>");
//
// done printing parm table
//
@ -2731,22 +2832,82 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf (
"<table style=max-width:80%%; %s>"
"<tr class=hdrow><td colspan=9>"
"<center><b>XML Output</b></tr></tr>"
"<tr><td>"
"<center><b>Example XML Output</b> "
"(&format=xml)</tr></tr>"
"<tr><td bgcolor=%s>"
, TABLE_STYLE
, LIGHT_BLUE
);
sb->safePrintf("<pre>\n");
char *desc = s_pages[PAGENUM].m_xmlOutputDesc;
if ( ! desc )
desc = "<response>\n"
"\t<status>N</status> "
"# 0 on success, otherwise an "
"error code\n"
"\t<statusMsg>S</statusMsg> "
"# \"Success\" on success, "
"otherwise the error message."
"</response>";
sb->htmlEncode ( desc);
// bool showParms = false;
// if ( PAGENUM == PAGE_MASTER ||
// PAGENUM == PAGE_SPIDER ||
// PAGENUM == PAGE_SEARCH
// )
// showParms = true;
sb->safePrintf("<pre style=max-width:500px;>\n");
char *get = "<html><title>Some web page title</title>"
"<head>My first web page</head></html>";
// example output in xml
if ( PAGENUM == PAGE_GET ) {
SafeBuf xb;
xb.safePrintf("<response>\n"
"\t<statusCode>0</statusCode>\n"
"\t<statusMsg>Success</statusMsg>\n"
"\t<url><![CDATA[http://www.doi.gov/]]></url>\n"
"\t<docId>34111603247</docId>\n"
"\t<cachedTimeUTC>1404512549</cachedTimeUTC>\n"
"\t<cachedTimeStr>Jul 04, 2014 UTC"
"</cachedTimeStr>\n"
"\t<content><![CDATA[");
xb.cdataEncode(get);
xb.safePrintf("]]></content>\n");
xb.safePrintf("</response>\n");
sb->htmlEncode ( xb.getBufStart() );
}
if ( PAGENUM == PAGE_RESULTS ) {
SafeBuf xb;
xb.safePrintf("<response>\n"
"\t<statusCode>0</statusCode>\n"
"\t<statusMsg>Success</statusMsg>\n"
"\t<currentTimeUTC>1404513734</currentTimeUTC>\n"
"\t<responseTimeMS>284</responseTimeMS>\n"
"\t<docsInCollection>226</docsInCollection>\n"
"\t<hits>193</hits>\n"
"\t<moreResultsFollow>1</moreResultsFollow>\n"
"\t<result>\n"
"\t\t<imageBase64>/9j/4AAQSkZJRgABAQAAAQABA..."
"</imageBase64>\n"
"\t\t<imageHeight>350</imageHeight>\n"
"\t\t<imageWidth>223</imageWidth>\n"
"\t\t<origImageHeight>470</origImageHeight>\n"
"\t\t<origImageWidth>300</origImageWidth>\n"
"\t\t<title><![CDATA[U.S....]]></title>\n"
"\t\t<sum>Department of the Interior protects "
"America's natural resources and</sum>\n"
"\t\t<url><![CDATA[www.doi.gov]]></url>\n"
"\t\t<size> 64k</size>\n"
"\t\t<docId>34111603247</docId>\n"
"\t\t<site>www.doi.gov</site>\n"
"\t\t<spidered>1404512549</spidered>\n"
"\t\t<firstIndexedDateUTC>1404512549"
"</firstIndexedDateUTC>\n"
"\t\t<contentHash32>2680492249</contentHash32>\n"
"\t\t<language>English</language>\n"
"\t</result>\n"
"</response>\n");
sb->htmlEncode ( xb.getBufStart() );
}
sb->safePrintf("</pre>");
sb->safePrintf ( "</td></tr></table><br>\n\n" );
@ -2756,23 +2917,74 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf (
"<table style=max-width:80%%; %s>"
"<tr class=hdrow><td colspan=9>"
"<center><b>JSON Output</b></tr></tr>"
"<tr><td>"
"<center><b>Example JSON Output</b> "
"(&format=json)</tr></tr>"
"<tr><td bgcolor=%s>"
, TABLE_STYLE
, LIGHT_BLUE
);
sb->safePrintf("<pre>\n");
desc = s_pages[PAGENUM].m_jsonOutputDesc;
if ( ! desc )
desc = "{ \"response:\"{\n"
"\t\"status\":N, "
"# 0 on success, otherwise an "
"error code\n"
"\t\"statusMsg\":\"xxx\" "
"# xxx is \"Success\" on success, "
"otherwise the error message.\n"
"\t}\n"
"}";
sb->htmlEncode ( desc);
// example output in xml
if ( PAGENUM == PAGE_GET ) {
sb->safePrintf(
"{ \"response\":{\n"
"\t\"statusCode\":0,\n"
"\t\"statusMsg\":\"Success\",\n"
"\t\"url\":\"http://www.doi.gov/\",\n"
"\t\"docId\":34111603247,\n"
"\t\"cachedTimeUTC\":1404512549,\n"
"\t\"cachedTimeStr\":\"Jul 04, 2014 UTC\",\n"
"\t\"content\":\"");
SafeBuf js;
js.jsonEncode(get);
sb->htmlEncode(js.getBufStart());
sb->safePrintf("\"\n"
"}\n"
"}\n");
}
if ( PAGENUM == PAGE_RESULTS ) {
sb->safePrintf(
"{ \"response\":{\n"
"\t\"statusCode\":0,\n"
"\t\"statusMsg\":\"Success\",\n"
"\t\"currentTimeUTC\":1404588231,\n"
"\t\"responseTimeMS\":312,\n"
"\t\"docsInCollection\":226,\n"
"\t\"hits\":193,\n"
"\t\"moreResultsFollow\":1,\n"
"\t\"results\":[\n"
"\t{\n"
"\t\t\"imageBase64\":\"/9j/4AAQSkZJR...\",\n"
"\t\t\"imageHeight\":350,\n"
"\t\t\"imageWidth\":223,\n"
"\t\t\"origImageHeight\":470,\n"
"\t\t\"origImageWidth\":300,\n"
"\t\t\"title\":\"U.S....\",\n"
"\t\t\"sum\":\"Department of the Interior "
"protects America's natural resources.\",\n"
"\t\t\"url\":\"www.doi.gov\",\n"
"\t\t\"size\":\" 64k\",\n"
"\t\t\"docId\":34111603247,\n"
"\t\t\"site\":\"www.doi.gov\",\n"
"\t\t\"spidered\":1404512549,\n"
"\t\t\"firstIndexedDateUTC\":1404512549,\n"
"\t\t\"contentHash32\":2680492249,\n"
"\t\t\"language\":\"English\"\n"
"\t}\n"
"\t,\n"
"\t...\n"
"]\n"
"}\n"
);
}
sb->safePrintf("</pre>");
sb->safePrintf ( "</td></tr></table><br>\n\n" );

View File

@ -85,7 +85,7 @@ bool sendPageAPI ( TcpSocket *s , HttpRequest *r );
bool sendPageWordVec ( TcpSocket *s , HttpRequest *r );
bool sendPageQualityAgent ( TcpSocket *s , HttpRequest *r );
bool sendPageThesaurus ( TcpSocket *s , HttpRequest *r );
bool sendPageStatsdb ( TcpSocket *s , HttpRequest *r );
bool sendPageGraph ( TcpSocket *s , HttpRequest *r );
// values for m_usePost:
#define M_GET 0x00
@ -110,8 +110,8 @@ class WebPage {
char *m_desc; // page description
bool (* m_function)(TcpSocket *s , HttpRequest *r);
long m_niceness;
char *m_xmlOutputDesc;
char *m_jsonOutputDesc;
char *m_reserved1;
char *m_reserved2;
char m_pgflags;
};
@ -340,7 +340,7 @@ enum {
PAGE_HOSTS ,
PAGE_STATS , // 10
PAGE_STATSDB ,
PAGE_GRAPH , // PAGE_STATSDB ,
PAGE_PERF ,
PAGE_SOCKETS ,

Parms.cpp (760 changed lines)

File diff suppressed because it is too large

View File

@ -152,7 +152,6 @@ class GigablastRequest {
char *m_urlsBuf;
char m_stripBox;
char m_harvestLinksBox;
char m_forceRespiderBox;
/////////////
//
@ -200,6 +199,9 @@ class GigablastRequest {
#define PF_REQUIRED 0x4000
#define PF_REBUILDPROXYTABLE 0x8000
#define PF_NOHTML 0x10000
class Parm {
public:
char *m_title; // displayed above m_desc on admin gui page
@ -317,7 +319,7 @@ class Parms {
long nc ,
long pd ,
bool isCrawlbot ,
bool isJSON,
char format, //bool isJSON,
TcpSocket *sock
);
@ -353,7 +355,7 @@ class Parms {
long pd ,
bool lastRow ,
bool isCrawlbot = false,
bool isJSON = false ) ;
char format = FORMAT_HTML);//bool isJSON = false ) ;
char *getTHIS ( HttpRequest *r , long page );

View File

@ -58,6 +58,13 @@ bool RdbDump::set ( //char *coll ,
// use 0 for collectionless
if ( rdb && rdb->m_isCollectionLess ) m_collnum = 0;
// are we like catdb/statsdb etc.?
m_doCollCheck = true;
if ( rdb && rdb->m_isCollectionLess ) m_doCollCheck = false;
// RdbMerge also calls us but rdb is always set to NULL and it was
// causing a merge on catdb (collectionless) to screw up
if ( ! rdb ) m_doCollCheck = false;
/*
if ( ! coll && g_catdb.getRdb() == rdb )
strcpy(m_coll, "catdb");
@ -1023,14 +1030,18 @@ void RdbDump::continueDumping() {
// if someone reset/deleted the collection we were dumping...
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) {
// . do not do this for statsdb/catdb which always use collnum of 0
// . RdbMerge also calls us but gives a NULL m_rdb so we can't
// set m_isCollectionless to false
if ( ! cr && m_doCollCheck ) {
g_errno = ENOCOLLREC;
// m_file is invalid if collrec got nuked because so did
// the Rdbbase which has the files
log("db: continue dumping lost collection");
}
// bitch about errors
else if (g_errno)log("db: Dump to %s had error writing: %s.",
if (g_errno)log("db: Dump to %s had error writing: %s.",
m_file->getFilename(),mstrerror(g_errno));
// go back now if we were NOT dumping a tree

View File

@ -183,6 +183,8 @@ class RdbDump {
//char m_coll [ MAX_COLL_LEN + 1 ];
collnum_t m_collnum;
bool m_doCollCheck;
bool m_tried;
bool m_isSuspended;

View File

@ -1212,6 +1212,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
long hkp = 0;
char useHalfKeys = false;
// these guys always use a collnum of 0
bool doCollRecCheck = true;
if ( !strcmp(m_dbname,"catdb") ) doCollRecCheck = false;
if ( !strcmp(m_dbname,"statsdb") ) doCollRecCheck = false;
if ( !strcmp(m_dbname,"indexdb") ) useHalfKeys = true;
if ( !strcmp(m_dbname,"datedb" ) ) useHalfKeys = true;
if ( !strcmp(m_dbname,"tfndb" ) ) useHalfKeys = true;
@ -1232,12 +1238,17 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
// for posdb
if ( m_ks == 18 &&(m_keys[i*m_ks] & 0x06) ) {
char *xx=NULL;*xx=0; }
// bad collnum?
collnum_t cn = m_collnums[i];
if ( m_rdbId>=0 && (cn >= g_collectiondb.m_numRecs || cn < 0) )
return log("db: bad collnum in tree");
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
return log("db: collnum is obsolete in tree");
if ( doCollRecCheck ) {
collnum_t cn = m_collnums[i];
if ( m_rdbId>=0 &&
(cn >= g_collectiondb.m_numRecs || cn < 0) )
return log("db: bad collnum in tree");
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
return log("db: collnum is obsolete in tree");
}
// if no left/right kid it MUST be -1
if ( m_left[i] < -1 )
return log(

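A condensed, stand-alone sketch of the per-record collnum check above, using a stand-in for the g_collectiondb lookup (catdb and statsdb always store collnum 0, so they skip the check entirely):

#include <cstdio>
#include <cstring>

// stand-in for the lookup done against g_collectiondb in checkTree2()
static bool collnumOk(int cn, const char *dbname,
                      int numRecs, const bool *recExists) {
    // collectionless rdbs keep everything under collnum 0
    if (!strcmp(dbname, "catdb") || !strcmp(dbname, "statsdb"))
        return true;
    if (cn < 0 || cn >= numRecs) return false;  // bad collnum in tree
    return recExists[cn];                       // false means obsolete collnum
}

int main() {
    bool recs[3] = { true, false, true };
    printf("%d\n", collnumOk(1, "posdb",   3, recs)); // 0: obsolete collnum
    printf("%d\n", collnumOk(5, "posdb",   3, recs)); // 0: out of range
    printf("%d\n", collnumOk(7, "statsdb", 3, recs)); // 1: check skipped
    return 0;
}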
View File

@ -392,6 +392,10 @@ long SafeBuf::saveToFile ( char *dir , char *filename ) {
return dumpToFile ( buf );
}
long SafeBuf::save ( char *fullFilename ) {
return dumpToFile ( fullFilename );
}
long SafeBuf::dumpToFile(char *filename ) {
retry22:
long fd = open ( filename , O_CREAT | O_WRONLY | O_TRUNC,
@ -2785,6 +2789,15 @@ bool SafeBuf::safeStrcpyPrettyJSON ( char *decodedJson ) {
}
*/
bool SafeBuf::jsonEncode ( char *src , long srcLen ) {
char c = src[srcLen];
src[srcLen] = 0;
bool status = jsonEncode ( src );
src[srcLen] = c;
return status;
}
// encode into json
bool SafeBuf::safeUtf8ToJSON ( char *utf8 ) {
if ( ! utf8 ) return true;

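A small stand-alone sketch (not the commit's code) of the length-bounded jsonEncode overload added above; the point to note is that the source buffer must be writable, because the overload briefly NUL-terminates it in place:

#include <cstdio>
#include <string>

// stand-in for SafeBuf::safeUtf8ToJSON(): escape quotes, backslashes, newlines
static void jsonEncode(std::string &out, const char *src) {
    for (; *src; src++) {
        switch (*src) {
        case '"':  out += "\\\""; break;
        case '\\': out += "\\\\"; break;
        case '\n': out += "\\n";  break;
        default:   out += *src;   break;
        }
    }
}

// mirrors the new overload: bound the input by temporarily NUL-terminating it
static void jsonEncode(std::string &out, char *src, long srcLen) {
    char saved = src[srcLen];
    src[srcLen] = '\0';
    jsonEncode(out, src);
    src[srcLen] = saved;
}

int main() {
    char buf[] = "say \"hi\"\nignored tail";
    std::string out;
    jsonEncode(out, buf, 9);        // encode only the first 9 bytes
    printf("%s\n", out.c_str());    // prints: say \"hi\"\n
    return 0;
}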
View File

@ -56,6 +56,7 @@ struct SafeBuf {
long saveToFile ( char *dir , char *filename ) ;
long dumpToFile(char *filename);
long save ( char *dir, char *fname){return saveToFile(dir,fname); };
long save ( char *fullFilename ) ;
long fillFromFile(char *filename);
long fillFromFile(char *dir,char *filename);
@ -107,6 +108,8 @@ struct SafeBuf {
bool safeStrcpy ( char *s ) ;
//bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
bool safeUtf8ToJSON ( char *utf8 ) ;
bool jsonEncode ( char *utf8 ) { return safeUtf8ToJSON(utf8); };
bool jsonEncode ( char *utf8 , long utf8Len );
bool csvEncode ( char *s , long len , long niceness = 0 );

View File

@ -12,7 +12,7 @@
#include "Timedb.h"
#include "PageResults.h"
char getFormatFromRequest ( class HttpRequest *hr ) ;
//char getFormatFromRequest ( class HttpRequest *hr ) ;
SearchInput::SearchInput() {
reset();
@ -257,7 +257,8 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
return false;
}
// add to our list
if (!m_collnumBuf.safeMemcpy(&cr->m_collnum,sizeof(collnum_t)))
if (!m_collnumBuf.safeMemcpy(&tmpcr->m_collnum,
sizeof(collnum_t)))
return false;
// restore the \0 character we wrote in there
*end = c;
@ -272,10 +273,10 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// use default collection if none provided
if ( ! p && ! token && m_collnumBuf.length() <= 0 ) {
// get default collection rec
CollectionRec *dr = g_collectiondb.getRec (coll);
cr = g_collectiondb.getRec (coll);
// add to our list
if ( dr &&
!m_collnumBuf.safeMemcpy(&dr->m_collnum,
if ( cr &&
!m_collnumBuf.safeMemcpy(&cr->m_collnum,
sizeof(collnum_t)))
return false;
}
@ -294,9 +295,9 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// must have had one
if ( ! cr ) {
log("si: collection does not exist");
g_errno = ENOCOLLREC;
return false;
log("si: si. collection does not exist");
//g_errno = ENOCOLLREC;
//return false;
}
// and set from the http request. will set m_coll, etc.
@ -310,7 +311,7 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
//////
// get the format. "xml" "html" "json" --> FORMAT_HTML, FORMAT_CSV ...
char tmpFormat = getFormatFromRequest ( &m_hr );
char tmpFormat = m_hr.getReplyFormat();//getFormatFromRequest ( &m_hr);
// now override automatic defaults for special cases
if ( tmpFormat != FORMAT_HTML ) {
m_familyFilter = 0;
@ -960,51 +961,6 @@ uint8_t SearchInput::detectQueryLanguage(void) {
}
*/
char getFormatFromRequest ( HttpRequest *r ) {
char *formatStr = r->getString("format");
//if ( ! formatStr ) return FORMAT_HTML;
char format = FORMAT_HTML;
// what format should search results be in? default is html
if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
if ( formatStr && strcmp(formatStr,"iframe")==0)
format=FORMAT_WIDGET_IFRAME;
if ( formatStr && strcmp(formatStr,"ajax")==0)
format=FORMAT_WIDGET_AJAX;
if ( formatStr && strcmp(formatStr,"append")==0)
format=FORMAT_WIDGET_APPEND;
// support old api &xml=1 to mean &format=1
if ( r->getLong("xml",0) ) {
format = FORMAT_XML;
}
// also support &json=1
if ( r->getLong("json",0) ) {
format = FORMAT_JSON;
}
if ( r->getLong("csv",0) ) {
format = FORMAT_CSV;
}
if ( r->getLong("iframe",0) ) {
format = FORMAT_WIDGET_IFRAME;
}
if ( r->getLong("ajax",0) ) {
format = FORMAT_WIDGET_AJAX;
}
if ( r->getLong("append",0) ) {
format = FORMAT_WIDGET_APPEND;
}
return format;
}
//char getFormatFromRequest ( HttpRequest *r ) {
//
//}

View File

@ -5207,6 +5207,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
// if spidering disabled then do not do this crap
if ( ! g_conf.m_spideringEnabled ) return;
//if ( ! g_conf.m_webSpideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// wait for clock to sync with host #0
if ( ! isClockInSync() ) {
@ -5517,6 +5519,8 @@ void SpiderLoop::spiderDoledUrls ( ) {
// must be spidering to dole out
if ( ! g_conf.m_spideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// if we don't have all the url counts from all hosts, then wait.
// one host is probably down and was never up to begin with
if ( ! s_countsAreValid ) return;
@ -6617,7 +6621,9 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
return true;
}
// turned off?
if ( ( (! g_conf.m_spideringEnabled
if ( ( (! g_conf.m_spideringEnabled ||
// or if trying to exit
g_process.m_mode == EXIT_MODE
) && // ! g_conf.m_webSpideringEnabled ) &&
! sreq->m_isInjecting ) ||
// repairing the collection's rdbs?
@ -8584,7 +8590,16 @@ bool sendPage ( State11 *st ) {
g_stats.m_allErrorsOld[i] == 0 &&
bucketsNew[i] == 0 && bucketsOld[i] == 0 ) continue;
sb.safePrintf (
"<tr bgcolor=#%s><td><b>%s</b></td>"
"<tr bgcolor=#%s>"
"<td><b><a href=/search?c=%s&q=gbstatusmsg%%3A"
"%%22"
,
LIGHT_BLUE , cr->m_coll );
sb.urlEncode(mstrerror(i));
sb.safePrintf ("%%22>"
"%s"
"</a>"
"</b></td>"
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
@ -8592,7 +8607,6 @@ bool sendPage ( State11 *st ) {
"<td>%li</td>"
"<td>%li</td>"
"</tr>\n" ,
LIGHT_BLUE,
mstrerror(i),
g_stats.m_allErrorsNew[i] +
g_stats.m_allErrorsOld[i],
@ -10259,6 +10273,14 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
errCode != EDNSDEAD &&
// assume diffbot is temporarily experiencing errs
errCode != EDIFFBOTINTERNALERROR &&
// if diffbot received empty content when d'lding
errCode != EDIFFBOTEMPTYCONTENT &&
// or diffbot tcp timed out when d'lding the url
errCode != EDIFFBOTREQUESTTIMEDOUT &&
// if diffbot closed the socket on us...
errCode != EDIFFBOTMIMEERROR &&
// or the diffbot reply itself was not 200 (OK)
errCode != EDIFFBOTBADHTTPSTATUS &&
// out of memory while crawling?
errCode != ENOMEM &&
errCode != ENETUNREACH &&
@ -10332,6 +10354,22 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}
if ( strncmp(p,"isreindex",9) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
//if ( (bool)sreq->m_urlIsDocId==val ) continue;
if ( (bool)sreq->m_isPageReindex==val ) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}
if ( strncmp(p,"iscontacty",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;

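The new isreindex token follows the same walk as the other url-filter keywords: test the token against the SpiderRequest, then jump past the next "&&" or accept the rule. A greatly simplified, single-rule sketch of that walk (the token names come from this file, everything else is illustrative):

#include <cstdio>
#include <cstring>

// returns true if every "&&"-joined token in "rule" matches the request
static bool ruleMatches(const char *rule, bool isPageReindex) {
    const char *p = rule;
    while (p && *p) {
        while (*p == ' ') p++;
        if (strncmp(p, "isreindex", 9) == 0 && !isPageReindex)
            return false;            // token failed, caller tries the next rule
        // other tokens ("iscontacty", "isdocidbased", ...) would be tested here
        p = strstr(p, "&&");         // skip to the next constraint
        if (p) p += 2;
    }
    return true;                     // every constraint held
}

int main() {
    printf("%d\n", ruleMatches("isreindex", true));   // 1
    printf("%d\n", ruleMatches("isreindex", false));  // 0
    return 0;
}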
View File

@ -58,6 +58,7 @@ bool Summary::set2 ( Xml *xml ,
bool doStemming ,
long maxSummaryLen ,
long maxNumLines ,
long numDisplayLines ,
long maxNumCharsPerLine ,
//long bigSampleRadius ,
//long bigSampleMaxLen ,
@ -81,6 +82,9 @@ bool Summary::set2 ( Xml *xml ,
// to see if it has all the query terms...
//if ( maxNumLines <= 0 ) return true;
m_numDisplayLines = numDisplayLines;
m_displayLen = 0;
//m_useDateLists = useDateLists;
//m_exclDateList = exclDateList;
//m_begPubDateList = begPubDateList;
@ -232,7 +236,12 @@ bool Summary::set2 ( Xml *xml ,
// highest scoring window around each term. And then find the highest
// of those over all the matching terms.
//
for ( long numFinal = 0; numFinal < maxNumLines; numFinal++ ){
long numFinal;
for ( numFinal = 0; numFinal < maxNumLines; numFinal++ ){
if ( numFinal == m_numDisplayLines )
m_displayLen = p - m_summary;
// reset these at the top of each loop
Match *maxm;
long long maxScore = 0;
@ -508,6 +517,9 @@ bool Summary::set2 ( Xml *xml ,
bb[j] |= D_USED;
}
if ( numFinal <= m_numDisplayLines )
m_displayLen = p - m_summary;
/*end = gettimeofdayInMilliseconds();
if ( end - start > 10 )
log ( LOG_WARN,"summary: took %llims to finish doing summary "
@ -530,18 +542,25 @@ bool Summary::set2 ( Xml *xml ,
m_summaryExcerptLen[0] = p - m_summary;
m_numExcerpts = 1;
}
// in this case we only have one summary line
if ( m_numDisplayLines > 0 )
m_displayLen = p - m_summary;
}
// If we still didn't find a summary, get the default summary
if ( p == m_summary )
if ( p == m_summary ) {
// then return the default summary
return getDefaultSummary ( xml,
words,
sections,
pos,
//bigSampleRadius,
maxSummaryLen );
bool status = getDefaultSummary ( xml,
words,
sections,
pos,
//bigSampleRadius,
maxSummaryLen );
if ( m_numDisplayLines > 0 )
m_displayLen = m_summaryLen;
return status;
}
// if we don't find a summary, there's no need to NULL terminate
if ( p != m_summary ) *p++ = '\0';
@ -954,6 +973,10 @@ bool Summary::getDefaultSummary ( Xml *xml,
m_summaryLen = xml->getMetaContent(p,maxSummaryLen,
"description",11);
if ( m_numDisplayLines > 0 )
m_displayLen = m_summaryLen;
if ( m_summaryLen > 0 ) {
m_summaryExcerptLen[0] = m_summaryLen;
m_numExcerpts = 1;
@ -1056,6 +1079,10 @@ bool Summary::getDefaultSummary ( Xml *xml,
*p++ = '\0';
// set length
m_summaryLen = p - m_summary;
if ( m_numDisplayLines > 0 )
m_displayLen = m_summaryLen;
if ( m_summaryLen > 50000 ) { char*xx=NULL;*xx=0; }
return true;
}

View File

@ -78,6 +78,7 @@ class Summary {
//long collLen ,
bool doStemming ,
long maxSummaryLen ,
long numDisplayLines ,
long maxNumLines ,
long maxNumCharsPerLine ,
//long bigSampleRadius ,
@ -237,6 +238,12 @@ class Summary {
//bool m_freeBuf;
//char m_localBuf[10032];
// if getting more lines for deduping than we need for displaying,
// how big is that part of the summary to display?
long m_numDisplayLines;
long m_displayLen;
long getSummaryDisplayLen() { return m_displayLen; }
long m_maxNumCharsPerLine;
long m_titleVersion;

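A stand-alone sketch of the display-length bookkeeping added above: the summary builder may generate extra excerpts purely for deduping (the "pss" parm), so it records how many bytes cover the first numDisplayLines excerpts and the UI shows only that prefix. Everything below is illustrative, not the project's Summary class:

#include <cstdio>
#include <string>
#include <vector>

struct Sum {
    std::string text;     // full summary, including dedup-only excerpts
    long displayLen = 0;  // how much of it to actually display
};

static Sum buildSummary(const std::vector<std::string> &excerpts,
                        long maxNumLines, long numDisplayLines) {
    Sum s;
    long n = 0;
    for (const std::string &e : excerpts) {
        if (n >= maxNumLines) break;
        if (n == numDisplayLines) s.displayLen = (long)s.text.size();
        s.text += e;
        n++;
    }
    // fewer excerpts than the display budget: show everything we built
    if (n <= numDisplayLines) s.displayLen = (long)s.text.size();
    return s;
}

int main() {
    Sum s = buildSummary({"one. ", "two. ", "three."}, 3, 2);
    printf("display: %.*s\n", (int)s.displayLen, s.text.c_str()); // one. two.
    printf("dedup:   %s\n", s.text.c_str());                      // all three
    return 0;
}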
View File

@ -136,7 +136,9 @@ bool TcpServer::init ( void (* requestHandler)(TcpSocket *s) ,
struct sockaddr_in name;
// parm
int options;
// if port is -1 don't set up a listening socket
// if port is -1 don't set up a listening socket, this is used
// for things like blaster that are clients only. or the qatest()
// function.
if ( m_port == -1 || m_port == 0 ) goto skipServer;
// . set up our connection listening socket
// . sets g_errno and returns -1 on error
@ -756,7 +758,7 @@ static long s_lastTime = 0;
TcpSocket *TcpServer::getNewSocket ( ) {
// . if outta sd's we close least used socket first
// . if they're all in use set g_errno and return NULL
if ( m_numIncomingUsed >= *m_maxSocketsPtr )
if ( m_maxSocketsPtr && m_numIncomingUsed >= *m_maxSocketsPtr )
if ( ! closeLeastUsed () ){
// note it in the log
long now = getTimeLocal();

View File

@ -1878,15 +1878,18 @@ bool Title::copyTitle ( Words *w , Pos *pos ,
// size of character in bytes, usually 1
char cs ;
// point to last punct char
char *lastp = NULL;
char *lastp = dst;//NULL;
// convert them always for now
bool convertHtmlEntities = true;
long charCount = 0;
// copy the node @p into "dst"
for ( ; src < srcEnd ; src += cs , dst += cs ) {
// get src size
cs = getUtf8CharSize ( src );
// break if we are full!
if ( dst + cs >= dstEnd ) break;
// or hit our max char limit
if ( charCount++ >= m_maxTitleChars ) break;
// remember last punct for cutting purposes
if ( ! is_alnum_utf8 ( src ) ) lastp = dst;
// encode it as an html entity if asked to

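A stand-alone sketch of the copy loop above: step through the source one whole UTF-8 character at a time and stop once maxTitleChars characters have been copied (the real loop also remembers the last punctuation byte, lastp, so a truncated title can be cut at a word boundary). The helper below is illustrative, not the project's getUtf8CharSize():

#include <cstdio>
#include <cstring>

// size in bytes of the UTF-8 character starting at *s
static int utf8CharSize(const char *s) {
    unsigned char c = (unsigned char)*s;
    if (c < 0x80)           return 1;
    if ((c & 0xE0) == 0xC0) return 2;
    if ((c & 0xF0) == 0xE0) return 3;
    return 4;
}

// copy at most maxChars characters of src into dst (capacity dstSize bytes)
static long copyTitle(char *dst, long dstSize, const char *src, long maxChars) {
    char *d = dst, *dend = dst + dstSize - 1;
    long charCount = 0;
    while (*src) {
        int cs = utf8CharSize(src);
        if (d + cs > dend) break;            // out of room in dst
        if (charCount++ >= maxChars) break;  // hit the character budget
        memcpy(d, src, cs);
        d += cs;
        src += cs;
    }
    *d = '\0';
    return d - dst;
}

int main() {
    char out[64];
    copyTitle(out, sizeof(out), "a long page title", 6);
    printf("%s\n", out);   // prints: a long
    return 0;
}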
View File

@ -890,9 +890,10 @@ char *XmlDoc::getTestDir ( ) {
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
// then return "test-spider" otherwise...
if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
return "test-spider";
return "qa";//"test-spider";
// ... default to "test-parser"
return "test-parser";
//return "test-parser";
return "qa";
/*
if ( getIsPageParser() )
return "test-page-parser";
@ -1969,6 +1970,8 @@ bool XmlDoc::injectDoc ( char *url ,
SpiderRequest sreq;
sreq.setFromInject ( cleanUrl );
if ( deleteUrl )
sreq.m_forceDelete = 1;
//static char s_dummy[3];
// sometimes the content is indeed NULL...
@ -2282,6 +2285,9 @@ bool XmlDoc::indexDoc ( ) {
//
////
SpiderReply *nsr = getFakeSpiderReply ( );
// this can be NULL and g_errno set to ENOCOLLREC or something
if ( ! nsr )
return true;
//SafeBuf metaList;
if ( ! m_metaList2.pushChar(RDB_SPIDERDB) )
@ -3229,6 +3235,10 @@ long *XmlDoc::getIndexCode2 ( ) {
if ( gr->getLong("deep",0) ) spamCheck = false;
// not for crawlbot
if ( cr->m_isCustomCrawl ) spamCheck = false;
// only html for now
if ( m_contentTypeValid && m_contentType != CT_HTML ) spamCheck =false;
// turn this off for now
spamCheck = false;
// otherwise, check the weights
if ( spamCheck ) {
char *ws = getWordSpamVec();
@ -3272,17 +3282,23 @@ long *XmlDoc::getIndexCode2 ( ) {
return &m_indexCode;
}
// if using diffbot and the diffbot reply had a time out error
// or otherwise... diffbot failure demands a re-try always i guess.
// put this above getSpiderPriority() call otherwise we end up in
// a recursive loop with getIndexCode() and getNewSpiderReply()
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return (long *)dbr;
if ( m_diffbotReplyValid && m_diffbotReplyError ) {
m_indexCode= m_diffbotReplyError;
m_indexCodeValid = true;
return &m_indexCode;
}
// . if using diffbot and the diffbot reply had a time out error
// or otherwise... diffbot failure demands a re-try always i guess.
// put this above getSpiderPriority() call otherwise we end up in
// a recursive loop with getIndexCode() and getNewSpiderReply()
// . NO, don't do this anymore, however, if there is a diffbot
// reply error then record it in the spider reply BUT only if it is
// a diffbot reply error that warrants a retry. for instance,
// EDIFFBOTCOULDNOTDOWNLOAD happens when diffbot got a 404 or 500
// error trying to download the page so it probably should not
// retry. but EDIFFBOTREQUESTTIMEDOUT should retry.
// SafeBuf *dbr = getDiffbotReply();
// if ( ! dbr || dbr == (void *)-1 ) return (long *)dbr;
// if ( m_diffbotReplyValid && m_diffbotReplyError ) {
// m_indexCode= m_diffbotReplyError;
// m_indexCodeValid = true;
// return &m_indexCode;
// }
// no error otherwise
m_indexCode = 0;
@ -9639,8 +9655,10 @@ Url **XmlDoc::getRedirUrl() {
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr;
}
// if we followed too many then bail
if ( ++m_numRedirects >= 4 ) {
// . if we followed too many then bail
// . www.motorolamobility.com www.outlook.com ... failed when we
// had >= 4 here
if ( ++m_numRedirects >= 5 ) {
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr;
}
@ -10702,6 +10720,8 @@ char *XmlDoc::getIsIndexed ( ) {
// note it
if ( ! m_calledMsg22e )
setStatus ( "checking titledb for old title rec");
else
setStatus ( "back from msg22e call");
// . consult the title rec tree!
// . "justCheckTfndb" is set to true here!
@ -13621,7 +13641,35 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
THIS->m_diffbotUrl.getBufStart(),
page
);
THIS->m_diffbotReplyError = EDIFFBOTINTERNALERROR;
// try to get the right error code
char *err = strstr(page,"\"error\":\"");
if ( err ) err += 9;
long code = EDIFFBOTUNKNOWNERROR;
if ( err && !strncmp(err,"Unable to apply rules",21))
code = EDIFFBOTUNABLETOAPPLYRULES;
// like .pdf pages get this error
if ( err && !strncmp(err,"Could not parse page",20))
code = EDIFFBOTCOULDNOTPARSE;
// if it is 404... 502, etc. any http status code
if ( err && !strncmp(err,"Could not download page",23))
code = EDIFFBOTCOULDNOTDOWNLOAD;
// custom api does not apply to the url
if ( err && !strncmp(err,"Invalid API",11))
code = EDIFFBOTINVALIDAPI;
if ( err && !strncmp(err,"Version required",16))
code = EDIFFBOTVERSIONREQ;
if ( err && !strncmp(err,"Empty content",13))
code = EDIFFBOTEMPTYCONTENT;
if ( err && !strncmp(err,"No content received",19))
code = EDIFFBOTEMPTYCONTENT;
if ( err && !strncmp(err,"Request timed",13))
code = EDIFFBOTREQUESTTIMEDOUT;
// error processing url
if ( err && !strncmp(err,"Error processing",16))
code = EDIFFBOTURLPROCESSERROR;
if ( err && !strncmp(err,"Your token has exp",18))
code = EDIFFBOTTOKENEXPIRED;
THIS->m_diffbotReplyError = code;
}
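The chain of prefix checks above could equally be table driven; a hedged sketch follows (the EDIFFBOT* names and message prefixes come from the code above, but the numeric values here are placeholders, not the project's real error codes):

#include <cstdio>
#include <cstring>

// placeholder values; the real codes live with the other EDIFFBOT* errors
enum { EDIFFBOTUNKNOWNERROR = 1, EDIFFBOTUNABLETOAPPLYRULES,
       EDIFFBOTCOULDNOTPARSE, EDIFFBOTCOULDNOTDOWNLOAD,
       EDIFFBOTREQUESTTIMEDOUT };

struct ErrMap { const char *prefix; long code; };
static const ErrMap s_diffbotErrs[] = {
    { "Unable to apply rules",   EDIFFBOTUNABLETOAPPLYRULES },
    { "Could not parse page",    EDIFFBOTCOULDNOTPARSE      },
    { "Could not download page", EDIFFBOTCOULDNOTDOWNLOAD   },
    { "Request timed",           EDIFFBOTREQUESTTIMEDOUT    },
};

// map the text after "error":" in a diffbot reply to an error code
static long diffbotErrCode(const char *err) {
    for (size_t i = 0; i < sizeof(s_diffbotErrs)/sizeof(s_diffbotErrs[0]); i++)
        if (!strncmp(err, s_diffbotErrs[i].prefix,
                     strlen(s_diffbotErrs[i].prefix)))
            return s_diffbotErrs[i].code;
    return EDIFFBOTUNKNOWNERROR;
}

int main() {
    printf("%ld\n", diffbotErrCode("Could not parse page (pdf)")); // 3
    return 0;
}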
// a hack for detecting if token is expired
if ( ! ttt && cr && strstr ( page , ":429}" ) ) {
@ -15183,6 +15231,7 @@ long long *XmlDoc::getDownloadEndTime ( ) {
if ( m_deleteFromIndex ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
return &m_downloadEndTime;
}
// if recycling content use its download end time
@ -15199,7 +15248,7 @@ long long *XmlDoc::getDownloadEndTime ( ) {
return &m_downloadEndTime;
}
}
// need a valid reply
char **reply = getHttpReply ();
if ( ! reply || reply == (void *)-1 ) return (long long *)reply;
@ -17021,7 +17070,8 @@ char **XmlDoc::getUtf8Content ( ) {
// it should be there if trying to delete as well!
m_deleteFromIndex ) {
log("xmldoc: null utf8 content for docid-based "
"titlerec lookup which was not found");
"titlerec (d=%lli) lookup which was not found",
m_docId);
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_utf8ContentValid = true;
@ -19804,7 +19854,9 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
if ( *p & 0x01 ) del = false;
else del = true;
// must always be negative if deleteing
if ( m_deleteFromIndex && ! del ) {
// spiderdb is exempt because we add a spiderreply that is
// positive and a spiderdoc
if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB) {
char *xx=NULL;*xx=0; }
// get the key size. a table lookup in Rdb.cpp.
long ks ;
@ -20485,7 +20537,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// if we are indexing a subdoc piece of a multidoc url
// then parentUrl should return non-NULL
char *parentUrl = getDiffbotParentUrl(od->m_firstUrl.m_url);
if ( ! parentUrl ) goto skip9;
if ( ! parentUrl && od->m_contentType != CT_STATUS )
goto skip9;
// in that case we need to reindex the parent url not the
// subdoc url, so make the spider reply gen quick
//SpiderReply *newsr = od->getFakeSpiderReply();
@ -20537,12 +20590,23 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// been fulfilled!
if( ! m_zbuf.safeMemcpy (&srep, srep.getRecSize()))
return NULL;
// complain
if ( ! cr->m_diffbotApiUrl.length()<1 && !cr->m_isCustomCrawl )
log("build: doing query reindex but diffbot api "
"url is not set in spider controls");
// but also store a new spider request for the parent url
SpiderRequest ksr;
long long pd;
// skip if doc is a spider status "document". their docids
// often get added during a query reindex but we should ignore
// them completely.
if ( od->m_contentType == CT_STATUS )
goto returnList;
//goto returnList;
// complain
if ( cr->m_diffbotApiUrl.length()<1 && !cr->m_isCustomCrawl )
log("build: doing query reindex but diffbot api "
"url is not set in spider controls");
// just copy original request
memcpy ( &ksr , &m_sreq , m_sreq.getRecSize() );
// do not spider links, it's a page reindex of a multidoc url
@ -20551,6 +20615,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
ksr.m_ignoreDocUnchangedError = 1;
// no longer docid based we set it to parentUrl
ksr.m_urlIsDocId = 0;
// but consider it a manual add. this should already be set.
ksr.m_isPageReindex = 1;
// but it is not docid based, so overwrite the docid
// in ksr.m_url with the parent multidoc url. it \0 terms it.
strcpy(ksr.m_url , parentUrl );//, MAX_URL_LEN-1);
@ -20558,7 +20624,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
//if ( ! od->m_firstIpValid ) { char *xx=NULL;*xx=0; }
// set the key, ksr.m_key. isDel = false
// fake docid
long long pd = g_titledb.getProbableDocId(parentUrl);
pd = g_titledb.getProbableDocId(parentUrl);
ksr.setKey ( m_sreq.m_firstIp, pd , false );
// store this
if ( ! m_zbuf.pushChar(RDB_SPIDERDB) )
@ -20566,6 +20632,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// then the request
if ( ! m_zbuf.safeMemcpy(&ksr,ksr.getRecSize() ) )
return NULL;
returnList:
// prevent cores in indexDoc()
m_indexCode = EREINDEXREDIR;
m_indexCodeValid = true;
@ -20960,7 +21027,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// if recycling json objects, leave them there!
if ( *recycle ) nukeJson = false;
// you have to be a diffbot crawl to do this
if ( ! cr->m_isCustomCrawl ) nukeJson = false;
// no, not if you have the diffbot api url set... so take this out
//if ( ! cr->m_isCustomCrawl ) nukeJson = false;
// do not remove old json objects if pageparser.cpp test
// because that can not change the index, etc.
if ( getIsPageParser() ) nukeJson = false;
@ -21818,7 +21886,12 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// but don't do this if it is pagereindex. why is pagereindex
// setting the injecting flag anyway?
long needSpiderdb3 = 0;
if ( m_sreqValid && m_sreq.m_isInjecting )//&&!m_sreq.m_isPageReindex)
if ( m_sreqValid &&
m_sreq.m_isInjecting &&
m_sreq.m_fakeFirstIp &&
! m_sreq.m_forceDelete &&
/// don't add requests like http://xyz.com/xxx-diffbotxyz0 though
! m_isDiffbotJSONObject )
needSpiderdb3 = m_sreq.getRecSize() + 1;
need += needSpiderdb3;
@ -22325,11 +22398,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// if we are injecting we must add the spider request
// we are injecting from so the url can be scheduled to be
// spidered again
if ( m_sreqValid &&
m_sreq.m_isInjecting &&
m_sreq.m_fakeFirstIp &&
/// don't add requests like http://xyz.com/xxx-diffbotxyz0 though
! m_isDiffbotJSONObject ) {
if ( needSpiderdb3 ) {
// note it
setStatus("adding spider request");
// checkpoint
@ -23308,6 +23377,10 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
else
m_srep.m_hadDiffbotError = false;
// if we only had an error code in the diffbot reply, record that
if ( ! m_indexCode && m_diffbotReplyError )
m_srep.m_errCode = m_diffbotReplyError;
// sanity. if being called directly from indexDoc() because of
// an error like out of memory, then we do not know if it is
// indexed or not or was indexed...
@ -25112,11 +25185,11 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
// hash gbimage: for permalinks only for Images.cpp
for ( long i = 0 ; i < m_images.m_numImages ; i++ ) {
// get the node number
long nn = m_images.m_imageNodes[i];
//long nn = m_images.m_imageNodes[i];
// get the url of the image
XmlNode *xn = m_xml.getNodePtr(nn);
//XmlNode *xn = m_xml.getNodePtr(nn);
long srcLen;
char *src = xn->getFieldValue("src",&srcLen);
char *src = m_images.getImageUrl(i,&srcLen);
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
@ -25488,6 +25561,17 @@ SafeBuf *XmlDoc::getSpiderReplyMetaList ( SpiderReply *reply ) {
return &m_spiderReplyMetaList;
}
// we double add regular html urls in a query reindex because the
// json url adds the parent, so the parent gets added twice sometimes,
// and for some reason it is adding a spider status doc the 2nd time
// so cut that out. this is kind of a hack because i'm not sure what's
// going on. but you can set a break point here and see what's up if
// you want.
if ( m_indexCodeValid && m_indexCode == EDOCFORCEDELETE ) {
m_spiderReplyMetaListValid = true;
return &m_spiderReplyMetaList;
}
// . fake this out so we do not core
// . hashWords3() uses it i guess
bool forcedLangId = false;
@ -28586,28 +28670,37 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
*/
// does they want a summary?
if ( m_req->m_numSummaryLines>0 && ! reply->ptr_sum ) {
char *sum = getHighlightedSummary();
if ( ! sum || sum == (void *)-1 ) return (Msg20Reply *)sum;
if ( m_req->m_numSummaryLines>0 && ! reply->ptr_displaySum ) {
char *hsum = getHighlightedSummary();
if ( ! hsum || hsum == (void *)-1 ) return (Msg20Reply *)hsum;
//Summary *s = getSummary();
//if ( ! s || s == (void *)-1 ) return (Msg20Reply *)s;
//long sumLen = m_finalSummaryBuf.length();
// is it size and not length?
long sumLen = 0;
long hsumLen = 0;
// seems like it can return 0x01 if none...
//if ( sum == (char *)0x01 ) sum = NULL;
// get len
if ( sum ) sumLen = gbstrlen(sum);
// must be \0 terminated
if ( sumLen > 0 && sum[sumLen] ) { char *xx=NULL;*xx=0; }
if ( hsum == (char *)0x01 ) hsum = NULL;
// get len. this is the HIGHLIGHTED summary so it is ok.
if ( hsum ) hsumLen = gbstrlen(hsum);
// must be \0 terminated. not any more, it can be a subset
// of a larger summary used for deduping
if ( hsumLen > 0 && hsum[hsumLen] ) { char *xx=NULL;*xx=0; }
// assume size is 0
long sumSize = 0;
//long sumSize = 0;
// include the \0 in size
if ( sum ) sumSize = sumLen + 1;
//if ( sum ) sumSize = sumLen + 1;
// do not get any more than "me" lines/excerpts of summary
//long max = m_req->m_numSummaryLines;
// grab stuff from it!
//reply->m_proximityScore = s->getProximityScore();
reply-> ptr_sum = sum;//s->getSummary();
reply->size_sum = sumSize;//s->getSummaryLen(max)+1;
reply-> ptr_displaySum = hsum;//s->getSummary();
reply->size_displaySum = hsumLen+1;//sumSize;//s->getSummaryLen
// this is unhighlighted for deduping, and it might be longer
// . seems like we are not using this for deduping but using
// the gigabit vector in Msg40.cpp, so take out for now
//reply-> ptr_dedupSum = s->m_summary;
//reply->size_dedupSum = s->m_summaryLen+1;
//if ( s->m_summaryLen == 0 ) reply->size_dedupSum = 0;
//reply->m_diversity = s->getDiversity();
}
@ -28675,6 +28768,15 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
}
}
// this is not documented because i don't think it will be popular
if ( m_req->m_getHeaderTag ) {
SafeBuf *htb = getHeaderTagBuf();
if ( ! htb || htb == (SafeBuf *)-1 ) return (Msg20Reply *)htb;
// it should be null terminated
reply->ptr_htag = htb->getBufStart();
reply->size_htag = htb->getLength() + 1;
}
// breathe
QUICKPOLL ( m_niceness );
@ -29674,6 +29776,38 @@ char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) {
return m_dbuf;
}
SafeBuf *XmlDoc::getHeaderTagBuf() {
if ( m_htbValid ) return &m_htb;
Sections *ss = getSections();
if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss;
// scan sections
Section *si = ss->m_rootSection;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
if ( si->m_tagId == TAG_H1 ) break;
}
// if no h1 tag then make buf empty
if ( ! si ) {
m_htb.nullTerm();
m_htbValid = true;
return &m_htb;
}
// otherwise, set it
char *a = m_words.m_words[si->m_firstWordPos];
char *b = m_words.m_words[si->m_lastWordPos] ;
b += m_words.m_wordLens[si->m_lastWordPos];
// copy it
m_htb.safeMemcpy ( a , b - a );
m_htb.nullTerm();
m_htbValid = true;
return &m_htb;
}
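For illustration only (not part of this commit), a caller-side sketch of how the new header tag could be consumed from a Msg20 reply; the function name is hypothetical and it assumes the request had m_req->m_getHeaderTag set so ptr_htag/size_htag were filled in above.
// hypothetical consumer-side sketch: print the first <h1> text
// returned in a Msg20 reply. ptr_htag is NULL-terminated when
// size_htag > 0 (see getMsg20Reply() above).
void printHeaderTag ( Msg20Reply *reply ) {
if ( ! reply->ptr_htag || reply->size_htag <= 1 ) return;
log("query: first h1 = %s", reply->ptr_htag );
}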
Title *XmlDoc::getTitle ( ) {
if ( m_titleValid ) return &m_title;
// need a buncha crap
@ -29775,6 +29909,10 @@ Summary *XmlDoc::getSummary () {
false , // doStemming
m_req->m_summaryMaxLen ,
numLines ,
// . displayLines, # lines we are displaying
// . Summary::getDisplayLen() will return the
// length of the summary to display
m_req->m_numSummaryLines ,
cr->m_summaryMaxNumCharsPerLine,
m_req->m_ratInSummary ,
getFirstUrl() ,
@ -29807,11 +29945,15 @@ char *XmlDoc::getHighlightedSummary ( ) {
// get the summary
char *sum = s->getSummary();
long sumLen = s->getSummaryLen();
//long sumLen = s->getSummaryLen();
long sumLen = s->getSummaryDisplayLen();
//sum[sumLen] = 0;
// assume no highlighting?
if ( ! m_req->m_highlightQueryTerms || sumLen == 0 ) {
m_finalSummaryBuf.safeMemcpy ( sum , sumLen + 1 );
m_finalSummaryBuf.safeMemcpy ( sum , sumLen );
m_finalSummaryBuf.nullTerm();
m_finalSummaryBufValid = true;
return m_finalSummaryBuf.getBufStart();
//char *fsum = m_finalSummaryBuf.getBufStart();

View File

@ -821,6 +821,7 @@ class XmlDoc {
Query *getQuery() ;
Matches *getMatches () ;
char *getDescriptionBuf ( char *displayMetas , long *dlen ) ;
SafeBuf *getHeaderTagBuf();
class Title *getTitle ();
class Summary *getSummary () ;
char *getHighlightedSummary ();
@ -1377,6 +1378,7 @@ class XmlDoc {
bool m_matchesValid;
bool m_dbufValid;
bool m_titleValid;
bool m_htbValid;
bool m_collnumValid;
//bool m_twidsValid;
bool m_termId32BufValid;
@ -2010,6 +2012,7 @@ class XmlDoc {
// meta description buf
long m_dbufLen;
char m_dbuf[1024];
SafeBuf m_htb;
Title m_title;
Summary m_summary;
char m_isCompromised;

View File

@ -257,6 +257,8 @@ long XmlNode::setCommentNode2 ( char *node ) {
// look for ending of ]> like for <![if gt IE 6]>
if ( node[i] !='>' ) continue;
if ( node[i-1] ==']' ) break;
// look for ending of --> like for <![endif]-->
if ( node[i-1] == '-' && node[i-2] == '-' ) break;
}
// skip i over the >, if any (could be end of doc)

6
changelog Normal file
View File

@ -0,0 +1,6 @@
gb (1.1-1) unstable; urgency=low
* Lots of bug fixes
* API updates.
-- mwells <gigablast@mail.com> Sat, 05 Jul 2014 18:38:35 -0700

View File

@ -26,6 +26,8 @@ bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
//long g_qbufNeedSave = false;
//SafeBuf g_qbuf;
bool g_recoveryMode;
#define RDFBUFFER_SIZE (1024*1024*10)
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE "content.rdf.u8"

View File

@ -32,6 +32,11 @@ override_dh_strip:
# debian/gb.substvars and makes dpkg -i bitch about dependencies not being met
override_dh_shlibdeps:
echo "skipping dh_shlibdeps call! MDW"
# adding the line below here does not seem to make dpkg prompt to
# install netpbm, rather just bitch about it and make it harder to install
# echo "building our own gb.substvars"
# echo "misc:Depends=netpbm (>= 0.0)" > debian/gb.substvars
# echo "misc:Depends=netpbm" > debian/gb.substvars
# override_dh_shlibdeps-indep:
# echo "shit"

View File

@ -835,7 +835,7 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
<table cellpadding=1 border=0 width=100% bgcolor=#0079ba>
<tr><td><center><b><font color=#ffffff size=+1>Building a DMOZ Based Directory</td></tr></table>
<br>
&lt;<i>Last Updated October 2013</i>&gt;
&lt;<i>Last Updated July 2014</i>&gt;
<br>
<br>
@ -849,9 +849,9 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
<br> $ wget http://rdf.dmoz.org/rdf/structure.rdf.u8.gz
<br> $ gunzip structure.rdf.u8.gz</b>
<br>
<li>Execute <i>dmozparse</i> in its directory with the <i>new</i> option to generate the catdb dat files.<br> <b>$ dmozparse new</b><br>
<li>Execute <i>dmozparse</i> in its directory with the <i>new</i> option to generate the catdb dat files.<br> <b>$ ./dmozparse new</b><br>
<li>Execute the installcat script command on host 0 to distribute the catdb files to all the hosts.<br>This just does an scp/rcp from host 0 to the other hosts listed in <a href=#hosts>hosts.conf</a>.<br> <b>$ gb installcat</b><br>
<li>Execute the installcat script command on host 0 to distribute the catdb files to all the hosts.<br>This just does an scp/rcp from host 0 to the other hosts listed in <a href=#hosts>hosts.conf</a>.<br> <b>$ ./gb installcat</b><br>
<li>Make sure all spiders are stopped and inactive.<br>
@ -865,7 +865,7 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
<li>Gigablast provides the unique ability to search the content of the pages in the DMOZ directory. But in order to search the pages in DMOZ we have to index them.
So execute <i>dmozparse</i> with the <i>urldump -s</i> option to create the html/gbdmoz.urls.txt.* files, which contain all the URLs in DMOZ (excluding URLs that contain a '#' fragment).
<br><b>$ dmozparse urldump -s</b>
<br><b>$ ./dmozparse urldump -s</b>
<br><li>Now tell Gigablast to index each URL listed in each gbdmoz.urls.txt.* file. Make sure you specify the collection you are using for DMOZ; the example below uses <i>main</i>. You can use the <a href=/addurl>add url</a> page to add the gbdmoz.urls.txt.* files, or you can use curl (or wget) like:
<br>

View File

@ -129,6 +129,9 @@
//#include "Facebook.h"
//#include "Accessdb.h"
// from qa.cpp
bool qatest ( ) ;
// call this to shut everything down
bool mainShutdown ( bool urgent ) ;
//bool mainShutdown2 ( bool urgent ) ;
@ -1453,6 +1456,70 @@ int main2 ( int argc , char *argv[] ) {
g_conf.m_save = false;
//
// run our smoketests
//
if ( strcmp ( cmd, "qa" ) == 0 ) {
// let's ensure our core file can dump
struct rlimit lim;
lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
if ( setrlimit(RLIMIT_CORE,&lim) )
log("qa::setrlimit: %s", mstrerror(errno) );
// 50MB
g_conf.m_maxMem = 50000000;
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("qa::hashinit failed" ); return 0; }
// init memory class after conf since it gets maxMem from Conf
if ( ! g_mem.init ( 200000000 ) ) {
log("qa::Mem init failed" ); return 0; }
if (!ucInit(g_hostdb.m_dir)) {
log("Unicode initialization failed!");
return 1;
}
g_conf.m_askRootNameservers = true;
//g_conf.m_dnsIps [0] = atoip ( "192.168.0.1", 11 );
//g_conf.m_dnsClientPort = 9909;
g_conf.m_dnsMaxCacheMem = 1024*10;
// hack http server port to -1 (none)
//g_conf.m_httpPort = 0;
g_conf.m_httpMaxSockets = 200;
//g_conf.m_httpMaxReadBufSize = 102*1024*1024;
g_conf.m_httpMaxSendBufSize = 16*1024;
// init the loop
if ( ! g_loop.init() ) {
log("qa::Loop init failed" ); return 0; }
// . then dns client
// . server should listen to a socket and register with g_loop
if ( ! g_dns.init(14834) ) {
log("qa::Dns client init failed" ); return 0; }
// . then webserver
// . server should listen to a socket and register with g_loop
// . use -1 for both http and https ports to mean do not
// listen on any ports. we are a client only.
if ( ! g_httpServer.init( -1 , -1 ) ) {
log("qa::HttpServer init failed" ); return 0; }
// set our new pid
g_mem.setPid();
g_threads.setPid();
g_log.setPid();
//
// begin the qa loop
//
qatest();
//
// wait for some i/o signals
//
if ( ! g_loop.runLoop() ) {
log("db: runLoop failed." );
return 1;
}
// no error, return 0
return 0;
}
// log the version
//log(LOG_INIT,"conf: Gigablast Server %s",GBVersion);
@ -5044,7 +5111,19 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
else if ( installFlag == ifk_installcat ) {
// . copy catdb files to all hosts
// don't copy to ourselves
if ( h2->m_hostId == 0 ) continue;
if ( h2->m_hostId == 0 ) {
sprintf(tmp,
"cp "
"content.rdf.u8 "
"structure.rdf.u8 "
"gbdmoz.structure.dat "
"gbdmoz.content.dat "
"%scatdb/",
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
continue;
}
sprintf(tmp,
"rcp "
"%scatdb/content.rdf.u8 "

488
qa.cpp
View File

@ -4,15 +4,18 @@
static long s_failures = 0;
bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
bool getUrl( char *path ,
void (* callback) (void *state, TcpSocket *sock) ,
char *post = NULL ) {
SafeBuf sb;
sb.safePrintf ( "http://%s:%li%s"
, iptoa(g_hostdb.m_myHost->m_ip)
, (long)g_hostdb.m_myHost->m_port
, (long)g_hostdb.m_myHost->m_httpPort
, path
);
Url u;
u.set ( sb.getBufStart() );
log("qa: getting %s",sb.getBufStart());
if ( ! g_httpServer.getDoc ( u.getUrl() ,
0 , // ip
0 , // offset
@ -25,7 +28,13 @@ bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
0, // proxyport
-1, // maxtextdoclen
-1, // maxotherdoclen
NULL ) ) // useragent
NULL , // useragent
"HTTP/1.0" , // protocol
true , // doPost
NULL , // cookie
NULL , // additionalHeader
NULL , // fullRequest
post ) )
return false;
// error?
log("qa: getUrl error: %s",mstrerror(g_errno));
@ -34,27 +43,90 @@ bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
bool qatest ( ) ;
void qatestWrapper ( void *state , TcpSocket *sock ) { qatest(); }
void markOut ( char *reply , char *needle ) {
// return false if blocked, true otherwise
bool addColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/addcoll?c=qatest123" , qatestWrapper );
if ( ! reply ) return;
char *s = strstr ( reply , needle );
if ( ! s ) return;
for ( ; *s && ! is_digit(*s); s++ );
// find end of digit stream
//char *end = s;
//while ( ; *end && is_digit(*s); end++ );
// just bury the digit stream now; zeroing it out did not give
// a consistent LENGTH if we had 10 hits vs 9, making the hash
// different
// space out digits
for ( ; *s && is_digit(*s); s++ ) *s = ' ';
}
// hash the reply, skipping repeated spaces so marked-out digit runs compare equal
long qa_hash32 ( char *s ) {
unsigned long h = 0;
long k = 0;
for ( long i = 0 ; s[i] ; i++ ) {
// skip back-to-back spaces; only the first space in a run is hashed
if ( s[i] == ' ' && i>0 && s[i-1]==' ' ) continue;
h ^= g_hashtab [(unsigned char)k] [(unsigned char)s[i]];
k++;
}
return h;
}
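A minimal sketch (not called anywhere, added here only for illustration) of why the digits are spaced out rather than zeroed: two replies that differ only in a count hash to the same checksum after markOut(), because qa_hash32() skips the extra back-to-back space. It assumes g_hashtab has been initialized via hashinit(), as the qa command in main.cpp does; the function name is hypothetical.
// illustration: replies differing only in a marked-out count
// produce the same qa checksum.
void qaChecksumSketch ( ) {
char a[] = "<hits>9</hits>";
char b[] = "<hits>10</hits>";
markOut ( a , "<hits>" ); // becomes "<hits> </hits>"
markOut ( b , "<hits>" ); // becomes "<hits>  </hits>"
if ( qa_hash32 ( a ) != qa_hash32 ( b ) )
log("qa: checksum sketch mismatch");
}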
long s_replyCRC = 0;
TcpSocket *s_sock = NULL;
void qatestWrapper ( void *state , TcpSocket *sock ) {
log("qa: got reply(%li)=%s",sock->m_readOffset,sock->m_readBuf);
// get mime
HttpMime mime;
mime.set ( sock->m_readBuf , sock->m_readOffset , NULL );
// only hash content since mime has a timestamp in it
char *content = mime.getContent();
long contentLen = mime.getContentLen();
if ( content[contentLen] ) { char *xx=NULL;*xx=0; }
char *reply = sock->m_readBuf;
// take out <currentTimeUTC> and <responseTimeMS>
markOut ( reply , "<currentTimeUTC>");
markOut ( reply , "<responseTimeMS>");
// until i figure this one out, take it out
markOut ( reply , "<docsInCollection>");
// until i figure this one out, take it out
markOut ( reply , "<hits>");
// make checksum. we ignore back to back spaces so this
// hash works for <docsInCollection>10 vs <docsInCollection>9
s_replyCRC = qa_hash32 ( content );
// this too is used for recording the reply into a file on disk
s_sock = sock;
// continue qa loop
qatest();
}
// first inject a set list of urls
static char **s_urlPtrs = NULL;
static long s_numUrls = 0;
static char **s_contentPtrs = NULL;
static SafeBuf s_ubuf1;
static SafeBuf s_ubuf2;
static SafeBuf s_cbuf2;
bool loadUrls ( ) {
static bool s_loaded = false;
if ( s_loaded ) return true;
s_loaded = true;
// use injectme3 file
s_ubuf1.load("./injectme3");
// scan for +++URL: xxxxx
@ -62,6 +134,8 @@ bool loadUrls ( ) {
for ( ; *s ; s++ ) {
if ( strncmp(s,"+++URL: ",8) ) continue;
// got one
// \0 term it for s_contentPtrs below
*s = '\0';
// find end of it
s += 8;
char *e = s;
@ -72,27 +146,16 @@ bool loadUrls ( ) {
s_ubuf2.pushLong((long)s);
// skip past that
s = e;
// point to content
s_cbuf2.pushLong((long)(s+1));
}
// make array of url ptrs
s_urlPtrs = (char **)s_ubuf2.getBufStart();
s_contentPtrs= (char **)s_cbuf2.getBufStart();
return true;
}
bool injectUrls ( ) {
loadUrls();
static long s_ii = 0;
for ( ; s_ii < s_numUrls ; ) {
// pre-inc it
s_ii++;
// inject using html api
SafeBuf sb;
sb.safePrintf("/admin/inject?c=qatest123&delete=0&u=");
sb.urlEncode ( s_urlPtrs[s_ii] );
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}
/*
static char *s_queries[] = {
"the",
"+the",
@ -106,116 +169,7 @@ static char *s_queries[] = {
"cat -dog",
"site:wisc.edu"
};
static long s_checksums[] = {
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
};
static long s_qi1 = 0;
void doneSearching1 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi1 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}
// ensure search results are consistent
bool searchTest1 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi1 < nq ; ) {
// pre-inc it
s_qi1++;
// inject using html api
SafeBuf sb;
// qa=1 tell gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
sb.urlEncode ( s_queries[s_qi1] );
return getUrl ( sb.getBufStart() , doneSearching1 );
}
return true;
}
static long s_qi2 = 0;
void doneSearching2 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi2 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}
// ensure search results are consistent
bool searchTest2 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi2 < nq ; ) {
// pre-inc it
s_qi2++;
// inject using html api
SafeBuf sb;
// qa=1 tell gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
sb.urlEncode ( s_queries[s_qi2] );
return getUrl ( sb.getBufStart() , doneSearching2 );
}
return true;
}
bool deleteUrls ( ) {
static long s_ii2 = 0;
for ( ; s_ii2 < s_numUrls ; ) {
// pre-inc it
s_ii2++;
// reject using html api
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&delete=1&u=");
sb.urlEncode ( s_urlPtrs[s_ii2] );
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}
*/
#include "Msg0.h"
static Msg0 s_msg0;
@ -371,67 +325,238 @@ bool checkSpidersDone ( ) {
return false;
}
bool delColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/delcoll?c=qatest123" , qatestWrapper );
//static long s_phase = -1;
void checkCRC ( long needCRC ) {
// and our current reply
SafeBuf fb2;
fb2.safeMemcpy(s_sock->m_readBuf,s_sock->m_readOffset);
fb2.nullTerm();
if ( s_replyCRC == needCRC ) {
// save reply if good
char fn3[1024];
sprintf(fn3,"%sqa/reply.%li",g_hostdb.m_dir,needCRC);
File ff; ff.set ( fn3 );
if ( ff.doesExist() ) return;
// if not there yet then save it
fb2.save(fn3);
return;
}
const char *emsg = "qa: bad replyCRC of %li should be %li "
"\n";//"phase=%li\n";
fprintf(stderr,emsg,s_replyCRC,needCRC);//,s_phase-1);
// get response on file
SafeBuf fb1;
char fn1[1024];
sprintf(fn1,"%sqa/reply.%li",g_hostdb.m_dir,needCRC);
fb1.load(fn1);
fb1.nullTerm();
// break up into lines
char fn2[1024];
sprintf(fn2,"/tmp/reply.%li",s_replyCRC);
fb2.save ( fn2 );
// do the diff between the two replies so we can see what changed
char cmd[1024];
sprintf(cmd,"diff %s %s",fn1,fn2);
fprintf(stderr,"%s\n",cmd);
system(cmd);
// if this is zero allow it to slide by. it is learning mode i guess.
// so we can learn what crc we need to use.
if ( needCRC == 0 ) return;
// otherwise, stop right there for debugging
exit(1);
}
#undef usleep
static long s_rdbId1 = 0;
static long s_rdbId2 = 0;
//static long s_rdbId3 = 0;
//
// the injection qa test suite
//
bool qainject () {
// . run a series of tests to ensure that gb is functioning properly
// . use s_urls[] array of urls for injecting and spider seeding
// . keep an archive copy of all webpages in the injectme3 file and
// in pagearchive1.txt file
// . while initially spidering store pages in pagearchive1.txt so we can
// replay later. store up to 100,000 pages in there.
bool qatest ( ) {
static bool s_x1 = false;
if ( ! s_x1 ) {
s_x1 = true;
return getUrl ( "/admin/delcoll?delcoll=qatest123" ,
qatestWrapper );
}
//
// add the 'qatest123' collection
if ( ! addColl () ) return false;
//
static bool s_x2 = false;
if ( ! s_x2 ) {
s_x2 = true;
if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
qatestWrapper ) )
return false;
}
//
// check addcoll reply
//
static bool s_x3 = false;
if ( ! s_x3 ) {
s_x3 = true;
checkCRC ( 238170006 );
}
//
// inject urls, return false if not done yet
if ( ! injectUrls ( ) ) return false;
//
static bool s_x4 = false;
if ( ! s_x4 ) {
// TODO: try delimiter based injection too
loadUrls();
static long s_ii = 0;
for ( ; s_ii < s_ubuf2.length()/(long)sizeof(char *) ; ) {
// inject using html api
SafeBuf sb;
sb.safePrintf("&c=qatest123&deleteurl=0&"
"format=xml&u=");
sb.urlEncode ( s_urlPtrs[s_ii] );
// the content
sb.safePrintf("&hasmime=1");
sb.safePrintf("&content=");
sb.urlEncode(s_contentPtrs[s_ii] );
sb.nullTerm();
// pre-inc it in case getUrl() blocks
s_ii++;
getUrl("/admin/inject",qatestWrapper,sb.getBufStart());
return false;
}
s_x4 = true;
}
// +the
static bool s_x5 = false;
if ( ! s_x5 ) {
usleep(500000);
s_x5 = true;
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
qatestWrapper );
return false;
}
static bool s_x6 = false;
if ( ! s_x6 ) { s_x6 = true ; checkCRC ( -1452050577 ); }
// sports news
static bool s_x7 = false;
if ( ! s_x7 ) {
s_x7 = true;
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports+news",
qatestWrapper );
return false;
}
static bool s_x8 = false;
if ( ! s_x8 ) { s_x8 = true; checkCRC ( -1586622518 ); }
//
// eject/delete the urls
//
static long s_ii2 = 0;
for ( ; s_ii2 < s_ubuf2.length()/(long)sizeof(char *) ; ) {
// reject using html api
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&deleteurl=1&"
"format=xml&u=");
sb.urlEncode ( s_urlPtrs[s_ii2] );
sb.nullTerm();
// pre-inc it in case getUrl() blocks
s_ii2++;
getUrl ( sb.getBufStart() , qatestWrapper );
return false;
}
//
// make sure no results left, +the
//
static bool s_x9 = false;
if ( ! s_x9 ) {
usleep(500000);
s_x9 = true;
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
qatestWrapper );
return false;
}
// seems to have <docsInCollection>2</>
static bool s_y1 = false;
if ( ! s_y1 ) { s_y1 = true; checkCRC ( -1672870556 ); }
//
// try delimiter based injecting
//
static bool s_y2 = false;
if ( ! s_y2 ) {
s_y2 = true;
SafeBuf sb;
// delim=+++URL:
sb.safePrintf("&c=qatest123&deleteurl=0&"
"delim=%%2B%%2B%%2BURL%%3A&format=xml&u=xyz.com&"
"hasmime=1&content=");
// use injectme3 file
SafeBuf ubuf;
ubuf.load("./injectme3");
sb.urlEncode(ubuf.getBufStart());
getUrl ( "/admin/inject",qatestWrapper,sb.getBufStart());
return false;
}
// check the reply, seems to have only a single docid in it...
static bool s_y3 = false;
if ( ! s_y3 ) { s_y3 = true; checkCRC ( -1970198487 ); }
// now query check
static bool s_y4 = false;
if ( ! s_y4 ) {
usleep(500000);
s_y4 = true;
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
qatestWrapper );
return false;
}
// check search results crc
static bool s_y5 = false;
if ( ! s_y5 ) { s_y5 = true; checkCRC ( -480078278 ); }
// test search results
if ( ! searchTest1 () ) return false;
// delete all urls cleanly now
if ( ! deleteUrls ( ) ) return false;
// now get rdblist for every rdb for this coll and make sure all zero!
if ( ! checkRdbLists ( &s_rdbId1 ) ) return false;
//if ( ! checkRdbLists ( &s_rdbId1 ) ) return false;
// dump, tight merge and ensure no data in our rdbs for this coll
if ( ! dumpTreesToDisk() ) return false;
//if ( ! dumpTreesToDisk() ) return false;
// wait for tight merge to complete
if ( ! waitForMergeToFinish() ) return false;
//if ( ! waitForMergeToFinish() ) return false;
// now get rdblist for every rdb for this coll and make sure all zero!
if ( ! checkRdbLists ( &s_rdbId2 ) ) return false;
//if ( ! checkRdbLists ( &s_rdbId2 ) ) return false;
// reset the collection so we can test spidering
if ( ! resetColl ( ) ) return false;
//if ( ! resetColl ( ) ) return false;
// add urls to seed spider with. make msg13.cpp recognize qatest123
// collection and return 404 on urls not in our official list so
// we can ensure search result consistency. msg13.cpp will initially
// store the pages in a file, like the first 1,000 or so pages.
if ( ! addUrlTest () ) return false;
//if ( ! addUrlTest () ) return false;
// wait for spidering to complete. sleep callback. # of spidered urls
// will be x, so we know when to stop
if ( ! checkSpidersDone() ) return false;
// . now search again on the large collection most likely
// . store search queries and checksum into queries2.txt
// . a 0 (or no) checksum means we should fill it in
if ( ! searchTest2 () ) return false;
//if ( ! checkSpidersDone() ) return false;
// try a query delete
//if ( ! queryDeleteTest() ) return false;
@ -440,7 +565,30 @@ bool qatest ( ) {
//if ( ! checkRdbLists ( &s_rdbId3 ) ) return false;
// delete the collection
if ( ! delColl() ) return false;
static bool s_fee = false;
if ( ! s_fee ) {
s_fee = true;
return getUrl ( "/admin/delcoll?delcoll=qatest123" ,
qatestWrapper );
}
static bool s_fee2 = false;
if ( ! s_fee2 ) {
s_fee2 = true;
fprintf(stderr,"\n\n\nSUCCESSFULLY COMPLETED QA TEST\n\n\n");
exit(0);
}
return true;
}
// . run a series of tests to ensure that gb is functioning properly
// . uses the ./qa subdirectory to hold archive pages, ips, spider dates to
// ensure consistency between tests for exact replays
bool qatest ( ) {
return qainject();
}