mirror of https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 04:07:13 +03:00

commit 6434e5cc04

Merge branch 'testing' into diffbot-matt

Conflicts:
	Errno.cpp
	Errno.h
	Parms.h
@@ -1962,7 +1962,7 @@ bool CollectionRec::rebuildUrlFilters2 ( ) {
	m_harvestLinks[n] = 1;
	*/
	m_regExs[n].set("isdocidbased");
	m_regExs[n].set("isreindex");
	m_harvestLinks [n] = 1;
	m_spiderFreqs [n] = 0; // 30 days default
	m_maxSpidersPerRule [n] = 99; // max spiders
@@ -2198,7 +2198,7 @@ bool CollectionRec::rebuildChineseRules ( ) {
	long n = 0;
	m_regExs[n].set("isdocidbased");
	m_regExs[n].set("isreindex");
	m_harvestLinks [n] = 1;
	m_spiderFreqs [n] = 0; // 30 days default
	m_maxSpidersPerRule [n] = 99; // max spiders
@@ -3029,7 +3029,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
	long i = 0;
	// 1st one! for query reindex/ query delete
	m_regExs[i].set("isdocidbased");
	m_regExs[i].set("isreindex");
	m_spiderIpMaxSpiders [i] = 10;
	m_spiderPriorities [i] = 70;
	i++;
@@ -426,6 +426,9 @@ class CollectionRec {
	long m_spiderRoundNum;
	char m_makeImageThumbnails;
	long m_thumbnailMaxWidthHeight ;
	char m_indexSpiderReplies;
	char m_indexBody;
@@ -616,7 +619,6 @@ class CollectionRec {
	long m_summaryMaxLen;
	long m_summaryMaxNumLines;
	long m_summaryMaxNumCharsPerLine;
	long m_summaryDefaultNumLines;
	char m_useNewSummaries;
	char m_getDocIdScoringInfo;
93 Dates.cpp
@@ -17781,6 +17781,65 @@ TimeZone tzs[] = {
// hash table of timezone information
static HashTableX s_tzt;
static long long h_mountain;
static long long h_eastern;
static long long h_central;
static long long h_pacific;
static long long h_time2;
static long long h_mdt;
static long long h_at2;
bool initTimeZoneTable ( ) {
	// if already initialized return true
	if ( s_tzt.m_numSlotsUsed ) return true;
	// init static wids
	h_mountain = hash64n("mountain");
	h_eastern = hash64n("eastern");
	h_central = hash64n("central");
	h_pacific = hash64n("pacific");
	h_time2 = hash64n("time");
	h_mdt = hash64n("mdt");
	h_at2 = hash64n("at");
	// set up the time zone hashtable
	if ( ! s_tzt.set( 8,4, 300,NULL,0,false,0,"tzts"))
		return false;
	// load time zone names and their modifiers into hashtable
	for ( long i = 0 ; *tzs[i].m_name ; i++ ) {
		char *t = tzs[i].m_name;
		long tlen = gbstrlen(t);
		// hash like Words.cpp computeWordIds
		uint64_t h = hash64Lower_utf8( t , tlen );
		// use the ptr as the value
		if ( ! s_tzt.addKey ( &h, &tzs[i] ) )
			return false;
	}
	return true;
}
// return what we have to add to UTC to get time in locale specified by "s"
// where "s" is like "PDT" "MST" "EST" etc. if unknown return 999999
long getTimeZone ( char *s ) {
	if ( ! s ) return BADTIMEZONE;
	char *send = s;
	// point to end of the potential timezone
	for ( ; *send && isalnum(*send) ; send++ );
	// hash it
	uint64_t h = hash64Lower_utf8( s , send -s );
	// make sure table is ready
	initTimeZoneTable();
	// look it up
	long slot = s_tzt.getSlot( &h );
	if ( slot < 0 ) return 999999;
	// did we find it in the table?
	TimeZone *tzptr = (TimeZone *)s_tzt.getValueFromSlot ( slot );
	// no error, return true
	long secs = tzptr->m_hourMod * 3600;
	secs += tzptr->m_minMod * 60;
	return secs;
}
// . returns how many words starting at i are in the time zone
// . 0 means not a timezone
long getTimeZoneWord ( long i ,
@@ -17793,40 +17852,14 @@ long getTimeZoneWord ( long i ,
	*tzptr = NULL;
	// only init table once
	bool s_init16 = false;
	static long long h_mountain;
	static long long h_eastern;
	static long long h_central;
	static long long h_pacific;
	static long long h_time;
	static long long h_mdt;
	static long long h_at;
	// init the hash table of month names
	if ( ! s_init16 ) {
		// init static wids
		h_mountain = hash64n("mountain");
		h_eastern = hash64n("eastern");
		h_central = hash64n("central");
		h_pacific = hash64n("pacific");
		h_time = hash64n("time");
		h_mdt = hash64n("mdt");
		h_at = hash64n("at");
		// set up the time zone hashtable
		if ( ! s_tzt.set( 8,4, 300,NULL,0,false,niceness,"tzts"))
			return -1;
		// load time zone names and their modifiers into hashtable
		for ( long i = 0 ; *tzs[i].m_name ; i++ ) {
			char *t = tzs[i].m_name;
			long tlen = gbstrlen(t);
			// hash like Words.cpp computeWordIds
			uint64_t h = hash64Lower_utf8( t , tlen );
			// use the ptr as the value
			if ( ! s_tzt.addKey ( &h, &tzs[i] ) )
				return -1;
		}
		// on error we return -1 from here
		if ( ! initTimeZoneTable() ) return -1;
		s_init16 = true;
	}
	// this is too common of a word!
	if ( wids[i] == h_at ) return 0;
	if ( wids[i] == h_at2 ) return 0;
	long slot = s_tzt.getSlot( &wids[i] );
	// return this, assume just one word
@@ -17834,7 +17867,7 @@ long getTimeZoneWord ( long i ,
	// . "mountain time"
	// . this removes the event title "M-F 8:30 AM-5:30 PM Mountain Time"
	//   from the event (horus) on http://www.sfreporter.com/contact_us/
	if ( slot<0 && i+2<nw && wids[i+2] == h_time ) {
	if ( slot<0 && i+2<nw && wids[i+2] == h_time2 ) {
		if ( wids[i] == h_mountain ) {
			slot = s_tzt.getSlot (&h_mdt);
			tznw = 3;
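A minimal sketch of how a caller uses the new getTimeZone() API from the hunk above (getTimeZone(), initTimeZoneTable() and BADTIMEZONE are from this commit; the wrapper function itself is hypothetical):

	// adjust a timestamp parsed from a date string by its zone suffix,
	// e.g. "s" points at "PDT ..." or "mountain time ...".
	// getTimeZone() lazily builds s_tzt via initTimeZoneTable() on first use.
	time_t applyZone ( time_t parsed , char *s ) {
		long tzoff = getTimeZone ( s );            // secs to add to UTC
		if ( tzoff == BADTIMEZONE ) return parsed; // unknown zone: leave as-is
		return parsed + tzoff; // same adjustment HttpMime.cpp makes below
	}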
8 Dates.h
@@ -794,6 +794,7 @@ public:
	bool m_isSiteRoot ;
};
// now time zones
struct TimeZone {
	char m_name[16];
@@ -803,6 +804,13 @@ struct TimeZone {
	long m_modType;
};
#define BADTIMEZONE 999999
// "s" is the timezone, like "EDT" and we return # of secs to add to UTC
// to get the current time in that time zone.
// returns BADTIMEZONE if "s" is unknown timezone
long getTimeZone ( char *s ) ;
// . returns how many words starting at i are in the time zone
// . 0 means not a timezone
long getTimeZoneWord ( long i , long long *wids , long nw ,
15 Errno.cpp
@@ -170,8 +170,21 @@ case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
case ENOTOKEN: return "Missing token";
case EBADIMG: return "Bad image";
case EREINDEXREDIR: return "Not parent url to reindex";
case EREINDEXREDIR: return "Not a reindexable doc";
case ETOOMANYPARENS: return "Too many nested parentheses in boolean query";
case EDIFFBOTUNABLETOAPPLYRULES: return "Diffbot unable to apply rules";
case EDIFFBOTCOULDNOTPARSE: return "Diffbot could not parse page";
case EDIFFBOTCOULDNOTDOWNLOAD: return "Diffbot could not download page";
case EDIFFBOTINVALIDAPI: return "Diffbot invalid API";
case EDIFFBOTVERSIONREQ: return "Diffbot version required";
case EDIFFBOTEMPTYCONTENT: return "Diffbot empty content";
case EDIFFBOTREQUESTTIMEDOUT: return "Diffbot request timed out";
case EDIFFBOTURLPROCESSERROR: return "Diffbot error processing url";
case EDIFFBOTTOKENEXPIRED: return "Diffbot token expired";
case EDIFFBOTUNKNOWNERROR: return "Diffbot unknown error";
case EMISSINGINPUT: return "Missing required input parms";
case EDMOZNOTREADY: return "Dmoz is not setup, follow instructions in "
	"admin.html to setup";
case EPROXYSSLCONNECTFAILED: return "SSL tunnel through HTTP proxy failed";
}
// if the remote error bit is clear it must be a regular errno
14 Errno.h
@@ -176,6 +176,20 @@ enum {
	EBADIMG,
	EREINDEXREDIR,
	ETOOMANYPARENS,
	EDIFFBOTUNABLETOAPPLYRULES,
	EDIFFBOTCOULDNOTPARSE,
	EDIFFBOTCOULDNOTDOWNLOAD,
	EDIFFBOTINVALIDAPI,
	EDIFFBOTVERSIONREQ,
	EDIFFBOTEMPTYCONTENT,
	EDIFFBOTREQUESTTIMEDOUT,
	EDIFFBOTURLPROCESSERROR,
	EDIFFBOTTOKENEXPIRED,
	EDIFFBOTUNKNOWNERROR,
	EMISSINGINPUT,
	EDMOZNOTREADY,
	EPROXYSSLCONNECTFAILED
};
#endif
45 HttpMime.cpp
@@ -237,6 +237,8 @@ time_t atotime ( char *s ) {
	return atotime3 ( s );
}
#include "Dates.h" // for getTimeZone()
// #1: Sun, 06 Nov 1994 08:49:37 GMT ;RFC 822, updated by RFC 1123
time_t atotime1 ( char *s ) {
	// this time structure, once filled, will help yield a time_t
@@ -258,8 +260,20 @@ time_t atotime1 ( char *s ) {
	getTime ( s , &t.tm_sec , &t.tm_min , &t.tm_hour );
	// unknown if we're in daylight savings time
	t.tm_isdst = -1;
	// translate using mktime
	time_t local = mktime ( &t );
	time_t global = timegm ( &t );
	// skip HH:MM:SS
	while ( ! isspace (*s) ) s++;
	// skip spaces
	while ( isspace (*s) ) s++;
	// convert local time to "utc" or whatever timezone "s" points to,
	// which is usually gmt or utc
	long tzoff = getTimeZone ( s ) ;
	if ( tzoff != BADTIMEZONE ) global += tzoff;
	return global;
	// now, convert to utc
	//time_t utc = time(NULL);
	// get time here locally
@@ -268,7 +282,6 @@ time_t atotime1 ( char *s ) {
	//long delta = here - utc;
	// modify our time to make it into utc
	//return local - delta;
	return local;
}
// #2: Sunday, 06-Nov-94 08:49:37 GMT ;RFC 850,obsoleted by RFC1036
@@ -293,7 +306,17 @@ time_t atotime2 ( char *s ) {
	// unknown if we're in daylight savings time
	t.tm_isdst = -1;
	// translate using mktime
	return mktime ( &t );
	time_t global = timegm ( &t );
	// skip HH:MM:SS
	while ( ! isspace (*s) ) s++;
	// skip spaces
	while ( isspace (*s) ) s++;
	// convert local time to "utc" or whatever timezone "s" points to,
	// which is usually gmt or utc
	long tzoff = getTimeZone ( s ) ;
	if ( tzoff != BADTIMEZONE ) global += tzoff;
	return global;
}
// #3: Sun Nov 6 08:49:37 1994 ;ANSI C's asctime() format
@@ -319,7 +342,7 @@ time_t atotime3 ( char *s ) {
	// unknown if we're in daylight savings time
	t.tm_isdst = -1;
	// translate using mktime
	time_t tt = mktime ( &t );
	time_t tt = timegm ( &t );
	return tt;
}
@@ -346,7 +369,17 @@ time_t atotime4 ( char *s ) {
	// unknown if we're in daylight savings time
	t.tm_isdst = -1;
	// translate using mktime
	return mktime ( &t );
	time_t global = timegm ( &t );
	// skip HH:MM:SS
	while ( ! isspace (*s) ) s++;
	// skip spaces
	while ( isspace (*s) ) s++;
	// convert local time to "utc" or whatever timezone "s" points to,
	// which is usually gmt or utc
	long tzoff = getTimeZone ( s ) ;
	if ( tzoff != BADTIMEZONE ) global += tzoff;
	return global;
}
// 2007-12-31
@@ -387,7 +420,7 @@ time_t atotime5 ( char *s ) {
	// unknown if we're in daylight savings time
	t.tm_isdst = -1;
	// translate using mktime
	return mktime ( &t );
	return timegm ( &t );
}
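The recurring edit in the atotime*() hunks above is mktime() → timegm(). The distinction, as a short sketch (general libc behavior, not code from this commit; timegm() is a glibc/BSD extension):

	struct tm t;
	memset ( &t , 0 , sizeof(t) );
	// ... fill t.tm_year , t.tm_mon , t.tm_mday , t.tm_hour , ... from the string
	t.tm_isdst = -1;
	time_t asLocal = mktime ( &t ); // interprets t in the server's local timezone
	time_t asUtc   = timegm ( &t ); // interprets t as UTC, independent of locale
	// the date string's own zone suffix ("GMT", "PDT", ...) is then applied
	// explicitly, instead of depending on whatever zone the server runs in:
	long tzoff = getTimeZone ( s );               // s points at the zone token
	if ( tzoff != BADTIMEZONE ) asUtc += tzoff;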
@@ -6,18 +6,63 @@
HttpRequest::HttpRequest () { m_cgiBuf = NULL; m_cgiBuf2 = NULL; reset(); }
HttpRequest::~HttpRequest() { reset(); }
char HttpRequest::getReplyFormat() {
	if ( m_replyFormatValid ) return m_replyFormat;
	char *fs = getString("format",NULL,NULL);
	char fmt = FORMAT_HTML;
	if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
	if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
	if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML;
	m_replyFormat = fmt;
	char *formatStr = getString("format");
	char format = -1;//FORMAT_HTML;
	// what format should search results be in? default is html
	if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
	if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
	if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
	if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
	if ( formatStr && strcmp(formatStr,"iframe")==0)
		format=FORMAT_WIDGET_IFRAME;
	if ( formatStr && strcmp(formatStr,"ajax")==0)
		format=FORMAT_WIDGET_AJAX;
	if ( formatStr && strcmp(formatStr,"append")==0)
		format=FORMAT_WIDGET_APPEND;
	// support old api &xml=1 to mean &format=1
	if ( getLong("xml",0) ) {
		format = FORMAT_XML;
	}
	// also support &json=1
	if ( getLong("json",0) ) {
		format = FORMAT_JSON;
	}
	if ( getLong("csv",0) ) {
		format = FORMAT_CSV;
	}
	if ( getLong("iframe",0) ) {
		format = FORMAT_WIDGET_IFRAME;
	}
	if ( getLong("ajax",0) ) {
		format = FORMAT_WIDGET_AJAX;
	}
	if ( getLong("append",0) ) {
		format = FORMAT_WIDGET_APPEND;
	}
	// default to html
	if ( format == -1 )
		format = FORMAT_HTML;
	m_replyFormat = format;
	m_replyFormatValid = true;
	return m_replyFormat;
	return format;
}
void HttpRequest::reset() {
	m_numFields = 0;
	m_replyFormatValid = false;
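How handlers consume the new accessor, condensed from HttpServer.cpp and PageGet.cpp later in this commit (a sketch of the usage pattern, not a new API):

	char format = r->getReplyFormat(); // parsed once, cached in m_replyFormat
	char *ct = "text/html";
	if ( format == FORMAT_XML  ) ct = "text/xml";
	if ( format == FORMAT_JSON ) ct = "application/json";
	// ... build the reply body in "format" and send it with Content-Type "ct"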
149 HttpServer.cpp
@@ -1562,6 +1562,77 @@ void cleanUp ( void *state , TcpSocket *s ) {
	if ( s && s->m_state == f ) s->m_state = NULL;
}
bool HttpServer::sendSuccessReply ( TcpSocket *s , char format, char *addMsg) {
	// get time in secs since epoch
	time_t now ;
	if ( isClockInSync() ) now = getTimeGlobal();
	else now = getTimeLocal();
	// . buffer for the MIME request and brief html err msg
	// . NOTE: ctime appends a \n to the time, so we don't need to
	char msg[1024];
	SafeBuf sb(msg,1024,0,false);
	char *tt = asctime(gmtime ( &now ));
	tt [ gbstrlen(tt) - 1 ] = '\0';
	char *ct = "text/html";
	if ( format == FORMAT_XML ) ct = "text/xml";
	if ( format == FORMAT_JSON ) ct = "application/json";
	char cbuf[1024];
	SafeBuf cb(cbuf,1024,0,false);
	if ( format != FORMAT_XML && format != FORMAT_JSON )
		cb.safePrintf("<html><b>Success</b></html>");
	if ( format == FORMAT_XML ) {
		cb.safePrintf("<response>\n"
			"\t<statusCode>0</statusCode>\n"
			"\t<statusMsg><![CDATA[Success]]>"
			"</statusMsg>\n");
	}
	if ( format == FORMAT_JSON ) {
		cb.safePrintf("{\"response\":{\n"
			"\t\"statusCode\":0,\n"
			"\t\"statusMsg\":\"Success\",\n" );
	}
	if ( addMsg )
		cb.safeStrcpy(addMsg);
	if ( format == FORMAT_XML ) {
		cb.safePrintf("</response>\n");
	}
	if ( format == FORMAT_JSON ) {
		// erase trailing ,\n
		cb.m_length -= 2;
		cb.safePrintf("\n"
			"}\n"
			"}\n");
	}
	sb.safePrintf(
		"HTTP/1.0 200 (OK)\r\n"
		"Content-Length: %li\r\n"
		"Connection: Close\r\n"
		"Content-Type: %s\r\n"
		"Date: %s UTC\r\n\r\n"
		, cb.length()
		, ct
		, tt );
	sb.safeMemcpy ( &cb );
	// use this new function that will compress the reply now if the
	// request was a ZET instead of a GET
	return sendReply2 ( msg , sb.length() , NULL , 0 , s );
}
// . send an error reply, like "HTTP/1.1 404 Not Found"
// . returns false if blocked, true otherwise
// . sets g_errno on error
@@ -1578,9 +1649,16 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
	time_t now ;//= getTimeGlobal();
	if ( isClockInSync() ) now = getTimeGlobal();
	else now = getTimeLocal();
	// this kinda sucks that we have to do it twice...
	HttpRequest hr;
	hr.set ( s->m_readBuf , s->m_readOffset , s ) ;
	char format = hr.getReplyFormat();
	// . buffer for the MIME request and brief html err msg
	// . NOTE: ctime appends a \n to the time, so we don't need to
	char msg[1024];
	SafeBuf sb(msg,1024,0,false);
	// if it's a 404, redirect to home page
	/*
	if ( error == 404 )
@@ -1595,26 +1673,61 @@ bool HttpServer::sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
	*/
	char *tt = asctime(gmtime ( &now ));
	tt [ gbstrlen(tt) - 1 ] = '\0';
	sprintf ( msg ,
		"HTTP/1.0 %li (%s)\r\n"
		"Content-Length: %li\r\n"
		"Connection: Close\r\n"
		"Date: %s UTC\r\n\r\n"
		"<html><b>Error = %s</b></html>",
		error ,
		errmsg ,
		(long)(gbstrlen("<html><b>Error = </b></html>")+
		gbstrlen(errmsg)),
		tt , // ctime ( &now ) ,
		errmsg );
	char *ct = "text/html";
	if ( format == FORMAT_XML ) ct = "text/xml";
	if ( format == FORMAT_JSON ) ct = "application/json";
	SafeBuf xb;
	if ( format != FORMAT_XML && format != FORMAT_JSON )
		xb.safePrintf("<html><b>Error = %s</b></html>",errmsg );
	if ( format == FORMAT_XML ) {
		xb.safePrintf("<response>\n"
			"\t<statusCode>%li</statusCode>\n"
			"\t<statusMsg><![CDATA[", error );
		xb.cdataEncode(errmsg );
		xb.safePrintf("]]></statusMsg>\n"
			"</response>\n");
	}
	if ( format == FORMAT_JSON ) {
		xb.safePrintf("{\"response\":{\n"
			"\t\"statusCode\":%li,\n"
			"\t\"statusMsg\":\"", error );
		xb.jsonEncode(errmsg );
		xb.safePrintf("\"\n"
			"}\n"
			"}\n");
	}
	sb.safePrintf(
		"HTTP/1.0 %li (%s)\r\n"
		"Content-Length: %li\r\n"
		"Connection: Close\r\n"
		"Content-Type: %s\r\n"
		"Date: %s UTC\r\n\r\n"
		,
		error ,
		errmsg ,
		xb.length(),
		ct ,
		tt ); // ctime ( &now ) ,
	sb.safeMemcpy ( &xb );
	// . move the reply to a send buffer
	// . don't make sendBuf bigger than g_conf.m_httpMaxSendBufSize
	long msgSize = gbstrlen ( msg );
	//long msgSize = gbstrlen ( msg );
	// record it
	if ( bytesSent ) *bytesSent = msgSize;//sendBufSize;
	if ( bytesSent ) *bytesSent = sb.length();//sendBufSize;
	// use this new function that will compress the reply now if the
	// request was a ZET instead of a GET
	return sendReply2 ( msg , msgSize , NULL , 0 , s );
	return sendReply2 ( msg , sb.length() , NULL , 0 , s );
	/*
	// . this returns false if blocked, true otherwise
@@ -1640,6 +1753,11 @@ bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
	//long rawFormat,
	char format ,
	int errnum, char *content) {
	// just use this for now. it detects the format already...
	return sendErrorReply ( s,error,errmsg,NULL);
	/*
	// clear g_errno so the send goes through
	g_errno = 0;
	// get time in secs since epoch
@@ -1707,6 +1825,7 @@ bool HttpServer::sendQueryErrorReply( TcpSocket *s , long error ,
	long msgSize = gbstrlen ( msg );
	return sendReply2 ( msg , msgSize , NULL , 0 , s );
	*/
	/*
	long sendBufSize = msgSize;
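Pieced together from the safePrintf() calls above, the non-HTML envelopes that sendSuccessReply() and sendErrorReply() emit look like this (addMsg, when supplied, is spliced in before the closing markup; values shown are placeholders):

	<response>
		<statusCode>0</statusCode>
		<statusMsg><![CDATA[Success]]></statusMsg>
	</response>

	{"response":{
		"statusCode":0,
		"statusMsg":"Success"
	}
	}

The error variants substitute the numeric HTTP error for 0 and the CDATA/JSON-encoded errmsg for Success.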
@@ -135,6 +135,8 @@ class HttpServer {
	// send an error reply, like "HTTP/1.1 404 Not Found"
	bool sendErrorReply ( TcpSocket *s , long error , char *errmsg ,
			long *bytesSent = NULL );
	// xml and json uses this
	bool sendSuccessReply (TcpSocket *s , char format , char *addMsg=NULL);
	// send a "prettier" error reply, formatted in XML if necessary
	bool sendQueryErrorReply ( TcpSocket *s , long error , char *errmsg,
			// FORMAT_HTML=0,FORMAT_XML,FORMAT_JSON
151 Images.cpp
@@ -91,6 +91,58 @@ void Images::setCandidates ( Url *pageUrl , Words *words , Xml *xml ,
	// best candidate, and just use that
	if ( xd->m_isDiffbotJSONObject ) return;
	//
	// first add any open graph candidate.
	// basically the page telling us the best image straight up.
	//
	long node2 = -1;
	long startNode = 0;
	// . field can be stuff like "summary","description","keywords",...
	// . if "convertHtmlEntites" is true we change < to &lt; and > to &gt;
	// . <meta property="og:image" content="http://example.com/rock2.jpg"/>
	// . <meta property="og:image" content="http://example.com/rock3.jpg"/>
ogimgloop:
	char ubuf[2000];
	long ulen = xml->getMetaContent ( ubuf , // store the val here
					1999 ,
					"og:image",
					8,
					"property",
					false, // convertHtmlEntities
					startNode ,
					&node2 ); // matchedNode
	// update this in case goto ogimgloop is called
	startNode = node2 + 1;
	// see section below for explanation of what we are storing here...
	if ( node2 >= 0 ) {
		// save it
		m_imageNodes[m_numImages] = node2;
		Query q;
		if ( ulen > MAX_URL_LEN ) goto ogimgloop;
		// set it to the full url
		Url iu;
		// use "pageUrl" as the baseUrl
		iu.set ( pageUrl , ubuf , ulen );
		// skip if invalid domain or TLD
		if ( iu.getDomainLen() <= 0 ) goto ogimgloop;
		// for looking it up on disk to see if unique or not
		char buf[2000];
		snprintf ( buf , 1999, "gbimage:%s",iu.getUrl());
		// TODO: make sure this is a no-split termid storage thingy
		//       in Msg14.cpp
		if ( ! q.set2 ( buf , langUnknown , false ) ) return;
		// store the termid
		m_termIds[m_numImages] = q.getTermId(0);
		// advance the counter
		m_numImages++;
		// try to get more graph images if we have some room
		if ( m_numImages + 2 < MAX_IMAGES ) goto ogimgloop;
	}
	//m_pageSite = pageSite;
	// scan the words
	long nw = words->getNumWords();
@@ -530,7 +582,7 @@ bool Images::downloadImages () {
	// get img tag node
	node = m_imageNodes[m_j];
	// get the url of the image
	src = m_xml->getString(node,"src",&srcLen);
	src = getImageUrl ( m_j , &srcLen );
	// use "pageUrl" as the baseUrl
	m_imageUrl.set ( m_pageUrl , src , srcLen );
}
@@ -755,8 +807,7 @@ bool Images::makeThumb ( ) {
	srcLen = gbstrlen(src);
}
else {
	long node = m_imageNodes[m_j];
	src = m_xml->getString(node,"src",&srcLen);
	src = getImageUrl ( m_j , &srcLen );
}
// set it to the full url
Url iu;
@@ -848,6 +899,16 @@ bool Images::makeThumb ( ) {
	}
}
CollectionRec *cr = g_collectiondb.getRec(m_collnum);
if ( ! cr ) { g_errno = ENOCOLLREC; return true; }
// save how big of thumbnails we should make. user can change
// this in the 'spider controls'
m_xysize = cr->m_thumbnailMaxWidthHeight ;
// make it 250 pixels if no decent value provided
if ( m_xysize <= 0 ) m_xysize = 250;
// and keep it sane
if ( m_xysize > 2048 ) m_xysize = 2048;
// update status
if ( m_xd ) m_xd->setStatus ( "making thumbnail" );
@@ -897,16 +958,18 @@ void Images::thumbStart_r ( bool amThread ) {
	long id = getpidtid();
	// pass the input to the program through this file
	// rather than a pipe, since popen() seems broken
	// rather than a pipe, since popen() seems broken.
	// m_dir ends in / so this should work.
	char in[364];
	snprintf ( in , 363,"%strash/in.%li", g_hostdb.m_dir, id );
	snprintf ( in , 363,"%strashin.%li", g_hostdb.m_dir, id );
	unlink ( in );
	log( LOG_DEBUG, "image: thumbStart_r create in file." );
	// collect the output from the filter from this file
	// m_dir ends in / so this should work.
	char out[364];
	snprintf ( out , 363,"%strash/out.%li", g_hostdb.m_dir, id );
	snprintf ( out , 363,"%strashout.%li", g_hostdb.m_dir, id );
	unlink ( out );
	log( LOG_DEBUG, "image: thumbStart_r create out file." );
@@ -964,23 +1027,48 @@ void Images::thumbStart_r ( bool amThread ) {
		break;
	}
	long xysize = 250;//100;
	//long xysize = 250;//100;
	// make thumbnail a little bigger for diffbot for widget
	if ( m_xd->m_isDiffbotJSONObject ) xysize = 250;
	//if ( m_xd->m_isDiffbotJSONObject ) xysize = 250;
	// i hope 2500 is big enough!
	char cmd[2501];
	//sprintf( cmd, scmd, ext, in, out);
	char *wdir = g_hostdb.m_dir;
	// can be /dev/stderr or like /var/gigablast/data/log000 etc.
	char *logFile = g_log.getFilename();
	// wdir ends in / so this should work.
	snprintf( cmd, 2500 ,
		"LD_LIBRARY_PATH=%s %s/%stopnm %s | "
		"LD_LIBRARY_PATH=%s %s/pnmscale -xysize %li %li - | "
		"LD_LIBRARY_PATH=%s %s/ppmtojpeg - > %s"
		, wdir , wdir , ext , in
		, wdir , wdir , xysize , xysize
		, wdir , wdir , out
		"LD_LIBRARY_PATH=%s %s%stopnm %s 2>> %s | "
		"LD_LIBRARY_PATH=%s %spnmscale -xysize %li %li - 2>> %s | "
		// put all its stderr msgs into /dev/null
		// so "jpegtopnm: WRITING PPM FILE" doesn't clog console
		"LD_LIBRARY_PATH=%s %sppmtojpeg - > %s 2>> %s"
		, wdir , wdir , ext , in , logFile
		, wdir , wdir , m_xysize , m_xysize , logFile
		, wdir , wdir , out , logFile
		);
	// if they already have netpbm package installed use that then
	static bool s_checked = false;
	static bool s_hasNetpbm = false;
	if ( ! s_checked ) {
		s_checked = true;
		File f;
		f.set("/usr/bin/pnmscale");
		s_hasNetpbm = f.doesExist() ;
	}
	if ( s_hasNetpbm )
		snprintf( cmd, 2500 ,
			"%stopnm %s 2>> %s | "
			"pnmscale -xysize %li %li - 2>> %s | "
			"ppmtojpeg - > %s 2>> %s"
			, ext , in , logFile
			, m_xysize , m_xysize , logFile
			, out , logFile
			);
	// Call clone function for the shell to execute command
	// This call WILL BLOCK . timeout is 30 seconds.
@@ -1211,10 +1299,11 @@ bool ThumbnailInfo::printThumbnailInHtml ( SafeBuf *sb ,
	long newdx = (long)((float)m_dx * min);
	long newdy = (long)((float)m_dy * min);
	if ( printLink && format==FORMAT_HTML )
	// might be FORMAT_AJAX!
	if ( printLink && format !=FORMAT_XML && format != FORMAT_JSON )
		sb->safePrintf("<a href=%s>", getUrl() );
	if ( format == FORMAT_HTML )
	if ( format !=FORMAT_XML && format != FORMAT_JSON )
		sb->safePrintf("<img width=%li height=%li align=left "
			"%s"
			"src=\"data:image/"
@@ -1225,20 +1314,44 @@ bool ThumbnailInfo::printThumbnailInHtml ( SafeBuf *sb ,
		);
	if ( format == FORMAT_XML )
		sb->safePrintf("<imageBase64>");
		sb->safePrintf("\t<imageBase64>");
	if ( format == FORMAT_JSON )
		sb->safePrintf("\t\"imageBase64\":\"");
	// encode image in base 64
	sb->base64Encode ( getData(), m_dataSize , 0 ); // 0 niceness
	if ( format == FORMAT_HTML ) {
	if ( format !=FORMAT_XML && format != FORMAT_JSON ) {
		sb->safePrintf("\">");
		if ( printLink ) sb->safePrintf ("</a>");
	}
	if ( format == FORMAT_XML )
		sb->safePrintf("</imageBase64>");
		sb->safePrintf("</imageBase64>\n");
	if ( format == FORMAT_JSON )
		sb->safePrintf("\",\n");
	// widget needs to know the width of the thumb for formatting
	// the text either on top of the thumb or to the right of it
	if ( retNewdx ) *retNewdx = newdx;
	return true;
}
char *Images::getImageUrl ( long j , long *urlLen ) {
	long node = m_imageNodes[j];
	long srcLen = 0;
	char *src = m_xml->getString(node,"src",&srcLen);
	// maybe it was an og:image meta tag
	if ( ! src )
		src = m_xml->getString(node,"content",&srcLen);
	// wtf?
	if ( ! src )
		log("image: image bad/null src");
	*urlLen = srcLen;
	return src;
}
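For reference, the shell pipeline the snprintf() above assembles, expanded with example values (assuming ext="jpeg", id=123, wdir="/var/gigablast/" and logFile="/var/gigablast/log000"; all four substitutions are illustrative only):

	LD_LIBRARY_PATH=/var/gigablast/ /var/gigablast/jpegtopnm /var/gigablast/trashin.123 2>> /var/gigablast/log000 |
	LD_LIBRARY_PATH=/var/gigablast/ /var/gigablast/pnmscale -xysize 250 250 - 2>> /var/gigablast/log000 |
	LD_LIBRARY_PATH=/var/gigablast/ /var/gigablast/ppmtojpeg - > /var/gigablast/trashout.123 2>> /var/gigablast/log000

When /usr/bin/pnmscale exists, the s_hasNetpbm branch drops the LD_LIBRARY_PATH and wdir prefixes and runs the system netpbm binaries instead.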
4 Images.h
@@ -119,6 +119,8 @@ class Images {
	bool downloadImage();
	bool makeThumb();
	char *getImageUrl ( long j , long *urlLen ) ;
	//bool gotImage ( );
	void thumbStart_r ( bool amThread );
@@ -131,6 +133,8 @@ class Images {
	void *m_state ;
	void (* m_callback)(void *state );
	long m_xysize;
	bool m_setCalled;
	long m_errno;
	long m_hadError;
2 Log.h
@@ -143,6 +143,8 @@ class Log {
	bool m_logTimestamps;
	char *getFilename() { return m_filename; };
private:
	bool dumpLog ( ); // make room for the new ones
20 Makefile
@@ -551,6 +551,7 @@ master-rpm:
# DEBIAN PACKAGE SECTION BEGIN
# need to do 'apt-get install dh-make'
# deb-master
master-deb:
	git archive --format=tar --prefix=gb-1.0/ master > ../gb_1.0.orig.tar
	rm -rf debian
@@ -569,6 +570,7 @@ master-deb:
	cp control.deb debian/control
	# try to use our own rules so we can override dh_shlibdeps and others
	cp gb.deb.rules debian/rules
	cp changelog debian/changelog
	# fix dh_shlibdeps from bitching about dependencies on shared libs
	# YOU HAVE TO RUN THIS before you run 'make'
	# export LD_LIBRARY_PATH=./debian/gb/var/gigablast/data0
@@ -583,12 +585,12 @@ master-deb:
	# upload rpm
	scp gb*.rpm gk268:/w/html/

#deb-testing
testing-deb:
	git archive --format=tar --prefix=gb-1.0/ testing > ../gb_1.0.orig.tar
	git archive --format=tar --prefix=gb-1.1/ testing > ../gb_1.1.orig.tar
	rm -rf debian
	# change "-p gb_1.0" to "-p gb_1.1" to update version for example
	dh_make -e gigablast@mail.com -p gb_1.0 -f ../gb_1.0.orig.tar
	dh_make -e gigablast@mail.com -p gb_1.1 -f ../gb_1.1.orig.tar
# zero this out, it is just filled with the .txt files erroneously and it'll
# try to automatically install in /usr/docs/
	rm debian/docs
@@ -602,16 +604,24 @@ testing-deb:
	cp control.deb debian/control
	# try to use our own rules so we can override dh_shlibdeps and others
	cp gb.deb.rules debian/rules
	cp changelog debian/changelog
	# make the pkg dependencies file ourselves since we overrode dh_shlibdeps
	# with our own debian/rules file. see that file for more info.
	# echo "shlibs:Depends=libc6 (>= 2.3)" > debian/gb.substvars
	# echo "shlibs:Depends=netpbm (>= 0.0)" > debian/gb.substvars
	# echo "misc:Depends=netpbm (>= 0.0)" > debian/gb.substvars
	# fix dh_shlibdeps from bitching about dependencies on shared libs
	# YOU HAVE TO RUN THIS before you run 'make'
	# export LD_LIBRARY_PATH=./debian/gb/var/gigablast/data0
	# build the package now
	# build the package now. if we don't specify -ai386 -ti386 then some users
	# get a wrong architecture msg and 'dpkg -i' fails
	dpkg-buildpackage -nc -ai386 -ti386 -b -uc -rfakeroot
	# dpkg-buildpackage -nc -b -uc -rfakeroot
	# move to current dir
	mv ../gb_*.deb .

install-pkgs-local:
	sudo alien --to-rpm gb_1.0-1_i386.deb
	sudo alien --to-rpm gb_1.1-1_i386.deb
	# upload
	scp gb*.deb gb*.rpm gk268:/w/html/
@@ -1931,7 +1931,7 @@ bool getTestSpideredDate ( Url *u , long *origSpideredDate , char *testDir ) {
	// hash the url into 64 bits
	long long uh64 = hash64(u->getUrl(),u->getUrlLen());
	// read the spider date file first
	char fn[300];
	char fn[2000];
	File f;
	// get the spider date then
	sprintf(fn,"%s/%s/doc.%llu.spiderdate.txt",
@@ -1964,6 +1964,10 @@ bool getTestSpideredDate ( Url *u , long *origSpideredDate , char *testDir ) {
}
bool addTestSpideredDate ( Url *u , long spideredTime , char *testDir ) {
	// ensure dir exists
	::mkdir(testDir,S_IRWXU);
	// set this
	long long uh64 = hash64(u->getUrl(),u->getUrlLen());
	// make that into a filename
@@ -51,8 +51,11 @@ void handleRequest ( UdpSlot *slot , long netnice ) {
	char *filename = g_hostdb.m_logFilename;
	// running just ./gb will log to stderr...
	if ( strcmp(filename ,"/dev/stderr") == 0 )
	if ( strcmp(filename ,"/dev/stderr") == 0 ) {
		g_errno = EBADFILE;
		g_udpServer.sendErrorReply ( slot, g_errno );
		return;
	}
	long fd = open ( filename , O_RDONLY,
		S_IRUSR |S_IWUSR |S_IRGRP |S_IWGRP| S_IROTH );
9 Msg20.h
@@ -75,6 +75,7 @@ class Msg20Request {
	char m_numSummaryLines ; // non-zero default
	char m_expected ; // non-zero default
	char m_allowPunctInPhrase ; // non-zero default
	bool m_getHeaderTag ;
	void *m_state ;
	void *m_state2 ; // used by Msg25.cpp
	long m_j ; // used by Msg25.cpp
@@ -416,9 +417,11 @@ public:
	// methods must be changed
	// . also, all ptr_* should be char* and all size_* should be in bytes
	char *ptr_tbuf ; // title buffer
	char *ptr_htag ; // h1 tag buf
	char *ptr_ubuf ; // url buffer
	char *ptr_rubuf ; // redirect url buffer
	char *ptr_sum ; // summary
	char *ptr_displaySum ; // summary for displaying
	char *ptr_dedupSum ; // summary for deduping
	char *ptr_dbuf ; // display metas \0 separated
	//char *ptr_sbuf ; // big sample buf for gigabits
	char *ptr_gigabitSample ;
@@ -512,9 +515,11 @@ public:
	// . string sizes of the strings we store into m_buf[]
	// . wordCountBuf is an exact word count 1-1 with each "range"
	long size_tbuf ;
	long size_htag ;
	long size_ubuf ;
	long size_rubuf ;
	long size_sum ;
	long size_displaySum ;
	long size_dedupSum ;
	long size_dbuf ;
	//long size_sbuf ;
	long size_gigabitSample ; // includes \0
82 Msg40.cpp
@@ -1330,31 +1330,52 @@ bool Msg40::launchMsg20s ( bool recalled ) {
	// m_printi < m_msg3a.m_numDocIds checks that kinda expect
	// us to get all summaries for every docid. but when we
	// do federated search we can get a ton of docids.
	if ( m_printi >= m_docsToGetVisible ) {
		logf(LOG_DEBUG,"query: got %li >= %li "
			"summaries. done. "
			"waiting on remaining "
			"%li to return."
			, m_printi
			, m_docsToGetVisible
			, m_numRequests-m_numReplies);
		// wait for all msg20 replies to come in
		if ( m_numRequests != m_numReplies ) break;
		// then let's hack fix this then so we can call
		// printSearchResultsTail()
		m_printi = m_msg3a.m_numDocIds;
		// set these to max so they do not launch another
		// summary request, just in case, below
		m_numRequests = m_msg3a.m_numDocIds;
		m_numReplies = m_msg3a.m_numDocIds;
		break;
	}
	// if ( m_printi >= m_docsToGetVisible ) {
	// 	logf(LOG_DEBUG,"query: got %li >= %li "
	// 		"summaries. done. "
	// 		"waiting on remaining "
	// 		"%li to return."
	// 		, m_printi
	// 		, m_docsToGetVisible
	// 		, m_numRequests-m_numReplies);
	// 	// wait for all msg20 replies to come in
	// 	if ( m_numRequests != m_numReplies ) break;
	// 	// then let's hack fix this then so we can call
	// 	// printSearchResultsTail()
	// 	m_printi = m_msg3a.m_numDocIds;
	// 	// set these to max so they do not launch another
	// 	// summary request, just in case, below
	// 	m_numRequests = m_msg3a.m_numDocIds;
	// 	m_numReplies = m_msg3a.m_numDocIds;
	// 	break;
	// }
	// do not double count!
	//if ( i <= m_lastProcessedi ) continue;
	// do not repeat for this i
	m_lastProcessedi = i;
	// if we have printed enough summaries then do not launch
	// any more, wait for them to come back in.
	// this is causing problems because we have a bunch of
	// m_printi < m_msg3a.m_numDocIds checks that kinda expect
	// us to get all summaries for every docid. but when we
	// do federated search we can get a ton of docids.
	// if ( m_printi >= m_docsToGetVisible ) {
	// 	logf(LOG_DEBUG,"query: got %li >= %li "
	// 		"summaries. done. "
	// 		"waiting on remaining "
	// 		"%li to return."
	// 		, m_printi
	// 		, m_docsToGetVisible
	// 		, m_numRequests-m_numReplies);
	// 	m_numRequests++;
	// 	m_numReplies++;
	// 	continue;
	// }
	// start up a Msg20 to get the summary
	Msg20 *m = NULL;
	if ( m_si->m_streamResults ) {
@@ -1492,6 +1513,12 @@ bool Msg40::launchMsg20s ( bool recalled ) {
	req.m_bigSampleMaxLen = bigSampleMaxLen;
	req.m_titleMaxLen = 256;
	req.m_titleMaxLen = cr->m_titleMaxLen;
	req.m_summaryMaxLen = cr->m_summaryMaxLen;
	// a special undocumented thing for getting <h1> tag
	req.m_getHeaderTag = m_si->m_hr.getLong("geth1tag",0);
	//req.m_numSummaryLines = cr->m_summaryMaxNumLines;
	// let "ns" parm override
	req.m_numSummaryLines = m_si->m_numLinesInSummary;
	if(m_si->m_isAdmin && m_si->m_format == FORMAT_HTML )
		req.m_getGigabitVector = true;
	else req.m_getGigabitVector = false;
@@ -1909,7 +1936,9 @@ bool Msg40::gotSummary ( ) {
	// . wrap it up with Next 10 etc.
	// . this is in PageResults.cpp
	if ( m_si && m_si->m_streamResults && ! m_printedTail &&
	if ( m_si &&
	     m_si->m_streamResults &&
	     ! m_printedTail &&
	     m_printi >= m_msg3a.m_numDocIds ) {
		m_printedTail = true;
		printSearchResultsTail ( st );
@@ -1960,10 +1989,19 @@ bool Msg40::gotSummary ( ) {
	if ( ! launchMsg20s ( true ) ) return false;
	// it won't launch now if we are bottlenecked waiting for
	// m_printi's summary to come in
	if ( m_si->m_streamResults )
	if ( m_si->m_streamResults ) {
		// it won't launch any if we printed out enough as well
		// and it printed "waiting on remaining 0 to return".
		// we shouldn't be waiting for more to come in b/c
		// we are in gotSummary() so one just came in
		// freeing up a msg20 to launch another, so assume
		// this means we are basically done. and it
		// set m_numRequests=m_msg3a.m_numDocIds etc.
		//if ( m_numRequests == m_msg3a.m_numDocIds )
		//	goto printTail;
		// otherwise, keep chugging
		goto complete;
	}
	// maybe some were cached?
	//goto refilter;
	// it returned true, so m_numRequests == m_numReplies and
12 Msg5.cpp
@@ -204,11 +204,13 @@ bool Msg5::getList ( char rdbId ,
	m_rdbId = rdbId;
	m_collnum = collnum;
	CollectionRec *ttt = g_collectiondb.getRec ( m_collnum );
	if ( ! ttt ) {
		g_errno = ENOCOLLREC;
		return true;
	}
	// why was this here? it was messing up the statsdb ("graph") link
	// in the admin panel.
	//CollectionRec *ttt = g_collectiondb.getRec ( m_collnum );
	//if ( ! ttt ) {
	//	g_errno = ENOCOLLREC;
	//	return true;
	//}
	m_list = list;
	//m_startKey = startKey;
@@ -53,6 +53,29 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
	if ( ! add && ! cast ) g_collectiondb.deleteRecs ( r ) ;
	*/
	char format = r->getReplyFormat();
	if ( format == FORMAT_XML || format == FORMAT_JSON ) {
		// no addcoll given?
		long page = g_pages.getDynamicPageNumber ( r );
		char *addcoll = r->getString("addcoll",NULL);
		char *delcoll = r->getString("delcoll",NULL);
		if ( ! addcoll ) addcoll = r->getString("addColl",NULL);
		if ( ! delcoll ) delcoll = r->getString("delColl",NULL);
		if ( page == PAGE_ADDCOLL && ! addcoll ) {
			g_errno = EBADENGINEER;
			char *msg = "no addcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		if ( page == PAGE_DELCOLL && ! delcoll ) {
			g_errno = EBADENGINEER;
			char *msg = "no delcoll parm provided";
			return g_httpServer.sendErrorReply(s,g_errno,msg,NULL);
		}
		return g_httpServer.sendSuccessReply(s,format);
	}
	char buf [ 64*1024 ];
	SafeBuf p(buf, 64*1024);
	// print standard header
@@ -93,7 +116,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
	p.safePrintf (
		"<tr bgcolor=#%s>"
		"<td><b>name of new collection to add</td>\n"
		"<td><input type=text name=addColl size=30>"
		"<td><input type=text name=addcoll size=30>"
		"</td></tr>\n"
		, LIGHT_BLUE
	);
@@ -142,7 +165,7 @@ bool sendPageAddDelColl ( TcpSocket *s , HttpRequest *r , bool add ) {
	if ( ! cr ) continue;
	p.safePrintf (
		"<tr bgcolor=#%s><td>"
		"<input type=checkbox name=delColl value=\"%s\"> "
		"<input type=checkbox name=delcoll value=\"%s\"> "
		"%s</td></tr>\n",
		DARK_BLUE,
		cr->m_coll,cr->m_coll);
@@ -117,6 +117,9 @@ bool sendPageAddUrl2 ( TcpSocket *s , HttpRequest *r ) {
	// . use to manually update spider times for a url
	// . however, will not remove old scheduled spider times
	// . mdw: made force on the default
	// . mdw: don't use this anymore, use url filters, it has
	//   a "isaddurl" directive you can use where you can set the
	//   respider frequency to basically 0 to simulate this parm.
	//st1->m_forceRespider = r->getLong("force",1); // 0);
	// if no url given, just print a blank page
@@ -135,7 +138,10 @@ bool sendPageAddUrl2 ( TcpSocket *s , HttpRequest *r ) {
		return sendReply ( st1 , true );
	}
	if ( spiderLinks )
		sreq->m_avoidSpiderLinks = 0;
	else
		sreq->m_avoidSpiderLinks = 1;
	// shortcut
	Msg4 *m = &st1->m_msg4;
@@ -63,7 +63,7 @@ public:
	// hash of the subdomain or domain for this line in sitelist
	long m_thingHash32;
	// ptr to the line in CollectionRec::m_siteListBuf
	char *m_patternStr;
	long m_patternStrOff;
	// offset of the url path in the pattern, 0 means none
	short m_pathOff;
	short m_pathLen;
@@ -315,7 +315,10 @@ bool updateSiteListBuf ( collnum_t collnum ,
	pd.m_thingHash32 = u.getHostHash32();
	// . ptr to the line in CollectionRec::m_siteListBuf.
	// . includes pointing to "exact:" too i guess and tag: later.
	pd.m_patternStr = start;
	// . store offset since CommandUpdateSiteList() passes us
	//   a temp buf that will be freed before copying the buf
	//   over to its permanent place at cr->m_siteListBuf
	pd.m_patternStrOff = start - siteListArg;
	// offset of the url path in the pattern, 0 means none
	pd.m_pathOff = 0;
	// scan url pattern, it should start at "s"
@@ -432,30 +435,66 @@ char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
	// we handle.
	long slot = dt->getSlot ( &sreq->m_domHash32 );
	char *buf = cr->m_siteListBuf.getBufStart();
	// loop over all the patterns that contain this domain and see
	// the first one we match, and if we match a negative one.
	for ( ; slot >= 0 ; slot = dt->getNextSlot(slot,&sreq->m_domHash32)) {
		// get pattern
		PatternData *pd = (PatternData *)dt->getValueFromSlot ( slot );
		// point to string
		char *patternStr = buf + pd->m_patternStrOff;
		// is it negative? return NULL if so so url will be ignored
		//if ( pd->m_patternStr[0] == '-' )
		//if ( patternStr[0] == '-' )
		//	return NULL;
		// otherwise, it has a path. skip if we don't match path ptrn
		if ( pd->m_pathOff ) {
			if ( ! myPath ) myPath = sreq->getUrlPath();
			if ( strncmp (myPath,
				pd->m_patternStr + pd->m_pathOff,
				patternStr + pd->m_pathOff,
				pd->m_pathLen ) )
				continue;
		}
		// for entries like http://domain.com/ we have to match
		// protocol and url can NOT be like www.domain.com to match.
		// this is really like a regex like ^http://xyz.com/poo/boo/
		if ( (patternStr[0]=='h' ||
		      patternStr[0]=='H') &&
		     ( patternStr[1]=='t' ||
		       patternStr[1]=='T' ) &&
		     ( patternStr[2]=='t' ||
		       patternStr[2]=='T' ) &&
		     ( patternStr[3]=='p' ||
		       patternStr[3]=='P' ) ) {
			char *x = patternStr+4;
			// is it https:// ?
			if ( *x == 's' || *x == 'S' ) x++;
			// watch out for subdomains like http.foo.com
			if ( *x != ':' ) goto nomatch;
			// ok, we have to substring match exactly. like
			// ^http://xyssds.com/foobar/
			char *a = patternStr;
			char *b = sreq->m_url;
			for ( ; ; a++, b++ ) {
				// stop matching when pattern is exhausted
				if ( is_wspace_a(*a) || ! *a )
					return patternStr;
				if ( *a != *b ) break;
			}
			// we failed to match "pd" so try next line
			continue;
		}
	nomatch:
		// was the line just a domain and not a subdomain?
		if ( pd->m_thingHash32 == sreq->m_domHash32 )
			// this will be false if negative pattern i guess
			return pd->m_patternStr;
			return patternStr;
		// was it just a subdomain?
		if ( pd->m_thingHash32 == sreq->m_hostHash32 )
			// this will be false if negative pattern i guess
			return pd->m_patternStr;
			return patternStr;
	}
@@ -573,7 +612,25 @@ bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
	"Spider the url "
	"<i>http://www.goodstuff.com/</i> and spider "
	"any links we harvest that start with "
	"<i>http://www.goodstuff.com/</i>"
	"<i>http://www.goodstuff.com/</i>. NOTE: if the url "
	"www.goodstuff.com redirects to foo.goodstuff.com then "
	"foo.goodstuff.com still gets spidered "
	"because it is considered to be manually added, but "
	"no other urls from foo.goodstuff.com will be spidered."
	"</td>"
	"</tr>"

	// protocol and subdomain match
	"<tr>"
	"<td>http://justdomain.com/foo/</td>"
	"<td>"
	"Spider the url "
	"<i>http://justdomain.com/foo/</i> and spider "
	"any links we harvest that start with "
	"<i>http://justdomain.com/foo/</i>. "
	"Urls that start with "
	"<i>http://<b>www.</b>justdomain.com/</i>, for example, "
	"will NOT match this."
	"</td>"
	"</tr>"
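A worked example of the prefix rule getMatchingUrlPattern() implements above (URLs are illustrative; the www. case is stated in the help text itself, and the https case follows from the byte-for-byte compare):

	site list pattern:  http://justdomain.com/foo/
	matches:            http://justdomain.com/foo/bar.html    (byte-for-byte prefix)
	no match:           http://www.justdomain.com/foo/x.html  (extra "www." subdomain)
	no match:           https://justdomain.com/foo/bar.html   (protocol bytes differ)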
@@ -804,6 +804,14 @@ void StateCD::printSpiderdbList ( RdbList *list,SafeBuf *sb,char **lastKeyPtr){
	if ( srep && srep->m_hadDiffbotError )
		msg = "Diffbot processing error";
	// indicate specific diffbot error if we have it
	if ( srep &&
	     srep->m_hadDiffbotError &&
	     srep->m_errCode &&
	     // stick with "diffbot processing error" for these...
	     srep->m_errCode != EDIFFBOTINTERNALERROR )
		msg = mstrerror(srep->m_errCode);
	// matching url filter, print out the expression
	long ufn ;
	ufn = ::getUrlFilterNum(sreq,
@@ -1868,6 +1876,7 @@ bool sendPageCrawlbot ( TcpSocket *socket , HttpRequest *hr ) {
	// i guess bail if not there?
	if ( ! cr ) {
		log("crawlbot: missing coll rec for coll %s",collName);
		char *msg = "invalid or missing collection rec";
		return sendErrorReply2 (socket,fmt,msg);
	}
@@ -50,7 +50,11 @@ bool sendPageDirectory ( TcpSocket *s , HttpRequest *r ) {
	// if /Top print the directory homepage
	if ( catId == 1 || catId <= 0 ) {
		// this is in PageRoot.cpp
		printDirHomePage(sb,r);
		if ( ! printDirHomePage(sb,r) )
			// this will be an error if dmoz is not set up
			// and an xml or json reply format was requested
			return g_httpServer.sendErrorReply(s,500,
						mstrerror(g_errno));
	}
	//
	// try printing this shit out not as search results right now
100 PageGet.cpp
@ -22,6 +22,7 @@ static bool processLoop ( void *state ) ;
|
||||
class State2 {
|
||||
public:
|
||||
Msg22 m_msg22;
|
||||
char m_format;
|
||||
//TitleRec m_tr;
|
||||
long m_niceness;
|
||||
XmlDoc m_xd;
|
||||
@ -76,7 +77,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
// get the collection rec
|
||||
CollectionRec *cr = g_collectiondb.getRec ( coll );
|
||||
if ( ! cr ) {
|
||||
g_errno = ENOTFOUND;
|
||||
g_errno = ENOCOLLREC;
|
||||
log("query: Archived copy retrieval failed. "
|
||||
"No collection record found for "
|
||||
"collection \"%s\".",coll);
|
||||
@ -103,6 +104,13 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
long long docId = r->getLongLong ( "d" , 0LL /*default*/ );
|
||||
// get url
|
||||
char *url = r->getString ( "u",NULL);
|
||||
|
||||
if ( docId == 0 && ! url ) {
|
||||
g_errno = EMISSINGINPUT;
|
||||
return g_httpServer.sendErrorReply (s,500 ,mstrerror(g_errno));
|
||||
}
|
||||
|
||||
|
||||
// . should we do a sequential lookup?
|
||||
// . we need to match summary here so we need to know this
|
||||
//bool seq = r->getLong ( "seq" , false );
|
||||
@ -153,6 +161,7 @@ bool sendPageGet ( TcpSocket *s , HttpRequest *r ) {
|
||||
st->m_isBanned = false;
|
||||
st->m_noArchive = false;
|
||||
st->m_socket = s;
|
||||
st->m_format = r->getReplyFormat();
|
||||
// default to 0 niceness
|
||||
st->m_niceness = 0;
|
||||
st->m_r.copy ( r );
|
||||
@ -212,7 +221,7 @@ bool sendErrorReply ( void *state , long err ) {
|
||||
TcpSocket *s = st->m_socket;
|
||||
|
||||
char tmp [ 1024*32 ] ;
|
||||
sprintf ( tmp , "<b>had server-side error: %s</b><br>",
|
||||
sprintf ( tmp , "%s",
|
||||
mstrerror(g_errno));
|
||||
// nuke state2
|
||||
mdelete ( st , sizeof(State2) , "PageGet1" );
|
||||
@ -358,6 +367,9 @@ bool processLoop ( void *state ) {
|
||||
//p += gbstrlen ( p );
|
||||
}
|
||||
|
||||
char format = st->m_format;
|
||||
if ( format == FORMAT_XML ) sb.reset();
|
||||
if ( format == FORMAT_JSON ) sb.reset();
|
||||
|
||||
// for undoing the stuff below
|
||||
long startLen2 = sb.length();//p;
|
||||
@ -383,6 +395,19 @@ bool processLoop ( void *state ) {
|
||||
if ( xd->m_contentType == CT_JSON )
|
||||
printDisclaimer = false;
|
||||
|
||||
if ( format == FORMAT_XML ) printDisclaimer = false;
|
||||
if ( format == FORMAT_JSON ) printDisclaimer = false;
|
||||
|
||||
char tbuf[100];
|
||||
tbuf[0] = 0;
|
||||
time_t lastSpiderDate = xd->m_spideredTime;
|
||||
|
||||
if ( printDisclaimer ||
|
||||
format == FORMAT_XML ||
|
||||
format == FORMAT_JSON ) {
|
||||
struct tm *timeStruct = gmtime ( &lastSpiderDate );
|
||||
strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
|
||||
}
|
||||
|
||||
// We should always be displaying this disclaimer.
|
||||
// - May eventually want to display this at a different location
|
||||
@ -425,10 +450,10 @@ bool processLoop ( void *state ) {
|
||||
//p += gbstrlen ( p );
|
||||
|
||||
// then the spider date in GMT
|
||||
time_t lastSpiderDate = xd->m_spideredTime;
|
||||
struct tm *timeStruct = gmtime ( &lastSpiderDate );
|
||||
char tbuf[100];
|
||||
strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
|
||||
// time_t lastSpiderDate = xd->m_spideredTime;
|
||||
// struct tm *timeStruct = gmtime ( &lastSpiderDate );
// char tbuf[100];
// strftime ( tbuf, 100,"%b %d, %Y UTC", timeStruct);
//p += gbstrlen ( p );
sb.safeStrcpy(tbuf);

@ -562,6 +587,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
includeHeader = false;

if ( format == FORMAT_XML ) includeHeader = false;
if ( format == FORMAT_JSON ) includeHeader = false;

//mfree(uq, uqCapacity, "PageGet");
// undo the header writes if we should
if ( ! includeHeader ) {
@ -571,6 +599,35 @@ bool processLoop ( void *state ) {
else sb.m_length=startLen1;//p=start1;
}

//sb.safeStrcpy(tbuf);

if ( format == FORMAT_XML ) {
sb.safePrintf("<response>\n");
sb.safePrintf("<statusCode>0</statusCode>\n");
sb.safePrintf("<statusMsg>Success</statusMsg>\n");
sb.safePrintf("<url><![CDATA[");
sb.cdataEncode(xd->m_firstUrl.m_url);
sb.safePrintf("]]></url>\n");
sb.safePrintf("<docId>%llu</docId>\n",xd->m_docId);
sb.safePrintf("\t<cachedTimeUTC>%lu</cachedTimeUTC>\n",
lastSpiderDate);
sb.safePrintf("\t<cachedTimeStr>%s</cachedTimeStr>\n",tbuf);
}

if ( format == FORMAT_JSON ) {
sb.safePrintf("{\"response\":{\n");
sb.safePrintf("\t\"statusCode\":0,\n");
sb.safePrintf("\t\"statusMsg\":\"Success\",\n");
sb.safePrintf("\t\"url\":\"");
sb.jsonEncode(xd->m_firstUrl.m_url);
sb.safePrintf("\",\n");
sb.safePrintf("\t\"docId\":%llu,\n",xd->m_docId);
sb.safePrintf("\t\"cachedTimeUTC\":%lu,\n",lastSpiderDate);
sb.safePrintf("\t\"cachedTimeStr\":\"%s\",\n",tbuf);
}

// identify start of <title> tag we wrote out
char *sbstart = sb.getBufStart();
char *sbend = sb.getBufEnd();
@ -681,6 +738,10 @@ bool processLoop ( void *state ) {
if ( ctype == CT_TEXT ) pre = true ; // text/plain
if ( ctype == CT_DOC ) pre = true ; // filtered msword
if ( ctype == CT_PS ) pre = true ; // filtered postscript

if ( format == FORMAT_XML ) pre = false;
if ( format == FORMAT_JSON ) pre = false;

// if it is content-type text, add a <pre>
if ( pre ) {//p + 5 < bufEnd && pre ) {
sb.safePrintf("<pre>");
@ -706,10 +767,15 @@ bool processLoop ( void *state ) {
// do not do term highlighting if json
if ( xd->m_contentType == CT_JSON )
queryHighlighting = false;

SafeBuf tmp;
SafeBuf *xb = &sb;
if ( format == FORMAT_XML ) xb = &tmp;
if ( format == FORMAT_JSON ) xb = &tmp;

if ( ! queryHighlighting ) {
sb.safeMemcpy ( content , contentLen );
xb->safeMemcpy ( content , contentLen );
//p += contentLen ;
}
else {
@ -733,7 +799,7 @@ bool processLoop ( void *state ) {
Matches m;
m.setQuery ( &qq );
m.addMatches ( &ww );
hilen = hi.set ( &sb , // p , avail ,
hilen = hi.set ( xb , // p , avail ,
&ww , &m ,
false /*doStemming?*/ ,
st->m_clickAndScroll ,
@ -742,6 +808,21 @@ bool processLoop ( void *state ) {
log(LOG_DEBUG, "query: Done highlighting cached page content");
}

if ( format == FORMAT_XML ) {
sb.safePrintf("\t<content><![CDATA[");
sb.cdataEncode ( xb->getBufStart() );
sb.safePrintf("]]></content>\n");
sb.safePrintf("</response>\n");
}

if ( format == FORMAT_JSON ) {
sb.safePrintf("\t\"content\":\"\n");
sb.jsonEncode ( xb->getBufStart() );
sb.safePrintf("\"\n}\n}\n");
}

// if it is content-type text, add a </pre>
if ( pre ) { // p + 6 < bufEnd && pre ) {
sb.safeMemcpy ( "</pre>" , 6 );
@ -784,6 +865,9 @@ bool processLoop ( void *state ) {
if ( xd->m_contentType == CT_JSON )
contentType = "application/json";

if ( format == FORMAT_XML ) contentType = "text/xml";
if ( format == FORMAT_JSON ) contentType = "application/json";

// nuke state2
mdelete ( st , sizeof(State2) , "PageGet1" );
delete (st);
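[Editor's note: the processLoop() hunks above all follow one pattern: point a scratch SafeBuf (xb) at the raw page content, then entity-encode that scratch buffer once into the XML or JSON envelope. A minimal self-contained sketch of the same idea, using std::string in place of SafeBuf; cdataEncode() here is a hypothetical stand-in for the real helper:]

#include <string>
#include <iostream>

// stand-in for SafeBuf::cdataEncode(): escape "]]>" so the payload
// can live safely inside a CDATA section
static std::string cdataEncode(const std::string &s) {
    std::string out;
    for (size_t i = 0; i < s.size(); i++) {
        if (s.compare(i, 3, "]]>") == 0) { out += "]]]]><![CDATA[>"; i += 2; }
        else out += s[i];
    }
    return out;
}

int main() {
    // render the page content into a scratch buffer first...
    std::string xb = "<html>hello ]]> world</html>";
    // ...then encode it into the response envelope in one place
    std::string sb = "<response>\n\t<content><![CDATA["
                   + cdataEncode(xb) + "]]></content>\n</response>\n";
    std::cout << sb;
}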
@ -44,6 +44,15 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
mnew ( msg7, sizeof(Msg7) , "PageInject" );

char format = hr->getReplyFormat();

// no url parm?
if ( format != FORMAT_HTML && ! hr->getString("c",NULL) ) {
g_errno = ENOCOLLREC;
char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(sock,g_errno,msg,NULL);
}

// set this. also sets gr->m_hr
GigablastRequest *gr = &msg7->m_gr;
// this will fill in GigablastRequest so all the parms we need are set
@ -78,6 +87,9 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
return sendReply ( msg7 );
}

// a scrape request?
if ( gr->m_queryToScrape && gr->m_queryToScrape[0] ) {
//char *uf="http://www.google.com/search?num=50&"
@ -117,7 +129,45 @@ bool sendReply ( void *state ) {
//long hostId = msg7->m_msg7.m_hostId;
long long docId = xd->m_docId;
long hostId = 0;//msg7->m_msg7.m_hostId;

// set g_errno to index code
if ( xd->m_indexCodeValid && xd->m_indexCode && ! g_errno )
g_errno = xd->m_indexCode;

char format = gr->m_hr.getReplyFormat();

// no url parm?
if ( ! g_errno && ! gr->m_url && format != FORMAT_HTML )
g_errno = EMISSINGINPUT;

if ( g_errno ) {
long save = g_errno;
mdelete ( msg7, sizeof(Msg7) , "PageInject" );
delete (msg7);
g_errno = save;
char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(sock,save,msg,NULL);
}
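[Editor's note: the error path above stashes g_errno before freeing the request state, because cleanup can clobber the global error code. A generic save-free-restore sketch of that pattern (all names hypothetical):]

#include <cstdio>
#include <cstdlib>

static int g_errno = 0;  // stand-in for the project's global error code

static void freeState(void *st) {
    free(st);
    g_errno = 0;  // pretend cleanup stomped the error code
}

static bool sendErrorReply(void *state) {
    int save = g_errno;   // 1. stash the error code
    freeState(state);     // 2. cleanup may overwrite g_errno
    g_errno = save;       // 3. restore before reporting
    fprintf(stderr, "error %d\n", g_errno);
    return true;
}

int main() {
    void *st = malloc(16);
    g_errno = 22;         // simulate a failure
    return sendErrorReply(st) ? 0 : 1;
}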
char abuf[32];
SafeBuf am(abuf,32,0,false);

// a success reply, include docid and url i guess
if ( format == FORMAT_XML ) {
am.safePrintf("\t<docId>%lli</docId>\n",xd->m_docId);
char *addMsg = am.getBufStart();
mdelete ( msg7, sizeof(Msg7) , "PageInject" );
delete (msg7);
return g_httpServer.sendSuccessReply(sock,format,addMsg);
}

if ( format == FORMAT_JSON ) {
am.safePrintf("\t\"docId\":%lli,\n",xd->m_docId);
char *addMsg = am.getBufStart();
mdelete ( msg7, sizeof(Msg7) , "PageInject" );
delete (msg7);
return g_httpServer.sendSuccessReply(sock,format,addMsg);
}

//
// debug
@ -159,11 +209,6 @@ bool sendReply ( void *state ) {
if ( url && gr->m_shortReply ) {
char buf[1024*32];
char *p = buf;
// set g_errno to index code
if ( xd->m_indexCodeValid &&
xd->m_indexCode &&
! g_errno )
g_errno = xd->m_indexCode;
// return docid and hostid
if ( ! g_errno ) p += sprintf ( p ,
"0,docId=%lli,hostId=%li," ,
@ -275,6 +320,12 @@ bool Msg7::inject ( void *state ,
return true;
}

if ( ! gr->m_url ) {
log("inject: no url provided to inject");
g_errno = EBADURL;
return true;
}

//char *coll = cr->m_coll;

m_state = state;

@ -257,6 +257,10 @@ bool Msg1c::reindexQuery ( char *query ,

//CollectionRec *cr = g_collectiondb.getRec ( collnum );

// sanity fix
if ( endNum - startNum > MAXDOCIDSTOCOMPUTE )
endNum = startNum + MAXDOCIDSTOCOMPUTE;

//CollectionRec *cr = g_collectiondb.getRec ( coll );
// reset again just in case
m_req.reset();

291
PageResults.cpp
@ -149,6 +149,7 @@ bool sendReply ( State0 *st , char *reply ) {
mdelete(st, sizeof(State0), "PageResults2");
delete st;

/*
if ( format == FORMAT_XML ) {
SafeBuf sb;
sb.safePrintf("<?xml version=\"1.0\" "
@ -174,6 +175,7 @@ bool sendReply ( State0 *st , char *reply ) {
charset );
return true;
}
*/

long status = 500;
if (savedErr == ETOOMANYOPERANDS ||
@ -244,7 +246,7 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
//long xml = hr->getLong("xml",0);

// what format should search results be in? default is html
char format = getFormatFromRequest ( hr );
char format = hr->getReplyFormat();//getFormatFromRequest ( hr );

// get the dmoz catid if given
//long searchingDmoz = hr->getLong("dmoz",0);
@ -543,6 +545,10 @@ bool sendPageResults ( TcpSocket *s , HttpRequest *hr ) {
if ( cr ) st->m_collnum = cr->m_collnum;
else st->m_collnum = -1;

// turn this on for json output, unless diffbot collection
if ( format == FORMAT_JSON && ! cr->m_isCustomCrawl )
st->m_header = 1;

// take this out here as well!
// limit here
// long maxpp = cr->m_maxSearchResultsPerQuery ;
@ -1009,6 +1015,11 @@ bool gotResults ( void *state ) {

// if already printed from Msg40.cpp, bail out now
if ( si->m_streamResults ) {
// this will be our final send
if ( st->m_socket->m_streamingMode ) {
log("res: socket still in streaming mode. wtf?");
st->m_socket->m_streamingMode = false;
}
log("msg40: done streaming. nuking state.");
mdelete(st, sizeof(State0), "PageResults2");
delete st;
@ -1019,12 +1030,12 @@ bool gotResults ( void *state ) {
//char *coll = si->m_coll2;
//long collLen = si->m_collLen2;

collnum_t collnum = si->m_firstCollnum;
//collnum_t collnum = si->m_firstCollnum;

// collection rec must still be there since SearchInput references
// into it, and it must be the SAME ptr too!
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr || cr != si->m_cr ) {
CollectionRec *cr = si->m_cr;//g_collectiondb.getRec ( collnum );
if ( ! cr ) { // || cr != si->m_cr ) {
g_errno = ENOCOLLREC;
return sendReply(st,NULL);
}
@ -1705,12 +1716,6 @@ bool printSearchResultsHeader ( State0 *st ) {
(long)moreFollow);
}

if ( st->m_header && si->m_format == FORMAT_JSON ) {
sb->safePrintf("\"objects\":[\n");
return true;
}

// . did he get a spelling recommendation?
// . do not use htmlEncode() on this anymore since receiver
// of the XML feed usually does not want that.
@ -1720,6 +1725,27 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf ("]]></spell>\n");
}

if ( si->m_format == FORMAT_JSON && st->m_spell[0] ) {
sb->safePrintf ("\t\"spell\":\"");
sb->jsonEncode(st->m_spell);
sb->safePrintf ("\"\n,");
}

// for diffbot collections only...
if ( st->m_header &&
si->m_format == FORMAT_JSON &&
cr->m_isCustomCrawl ) {
sb->safePrintf("\"objects\":[\n");
return true;
}

if ( si->m_format == FORMAT_JSON &&
! cr->m_isCustomCrawl ) {
sb->safePrintf("\"results\":[\n");
return true;
}

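[Editor's note: taken together, the header logic above opens the JSON result array under "objects" for diffbot (custom-crawl) collections and under "results" for everyone else. A condensed sketch of that branch, with a plain bool standing in for the CollectionRec flag:]

#include <iostream>
#include <string>

// diffbot collections historically used "objects"; normal ones "results"
static std::string openResultArray(bool isCustomCrawl) {
    return isCustomCrawl ? "\"objects\":[\n" : "\"results\":[\n";
}

int main() {
    std::cout << openResultArray(true);   // "objects":[
    std::cout << openResultArray(false);  // "results":[
}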
// debug
if ( si->m_debug )
logf(LOG_DEBUG,"query: Displaying up to %li results.",
@ -2821,6 +2847,40 @@ static bool printDMOZCategoryUnderResult ( SafeBuf *sb ,
long catid ,
State0 *st ) {

char format = si->m_format;

if ( format == FORMAT_XML ) {
sb->safePrintf("\t\t<dmozCat>\n"
"\t\t\t<dmozCatId>%li</dmozCatId>\n"
"\t\t\t<dmozCatStr><![CDATA["
,catid);
// print the name of the dmoz category
char xbuf[256];
SafeBuf xb(xbuf,256,0,false);
g_categories->printPathFromId(&xb, catid, false,si->m_isRTL);
sb->cdataEncode(xb.getBufStart());
sb->safePrintf("]]></dmozCatStr>\n"
"\t\t</dmozCat>\n");
return true;
}

if ( format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"dmozCat\":{\n"
"\t\t\t\"dmozCatId\":%li,\n"
"\t\t\t\"dmozCatStr\":\""
,catid);
// print the name of the dmoz category
char xbuf[256];
SafeBuf xb(xbuf,256,0,false);
g_categories->printPathFromId(&xb, catid, false,si->m_isRTL);
sb->jsonEncode(xb.getBufStart());
sb->safePrintf("\"\n"
"\t\t},\n");

return true;
}

//uint8_t queryLanguage = langUnknown;
uint8_t queryLanguage = si->m_queryLangId;
// Don't print category if not in native language category
@ -3011,7 +3071,13 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
}

if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t<result>\n" );
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t<result>\n" );

if ( si->m_format == FORMAT_JSON ) {
if ( *numPrintedSoFar != 0 ) sb->safePrintf(",\n");
sb->safePrintf("\t{\n" );
}

Highlight hi;

@ -3112,7 +3178,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {

// if we have a thumbnail show it next to the search result,
// base64 encoded
if ( (si->m_format == FORMAT_HTML || si->m_format == FORMAT_XML ) &&
if ( //(si->m_format == FORMAT_HTML || si->m_format == FORMAT_XML ) &&
//! mr->ptr_imgUrl &&
mr->ptr_imgData ) {
ThumbnailArray *ta = (ThumbnailArray *)mr->ptr_imgData;
@ -3128,9 +3194,25 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
si->m_format );
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<imageHeight>%li</imageHeight>\n",
ti->m_dx);
sb->safePrintf("\t\t<imageWidth>%li</imageWidth>\n",
ti->m_dy);
sb->safePrintf("\t\t<imageWidth>%li</imageWidth>\n",
ti->m_dx);
sb->safePrintf("\t\t<origImageHeight>%li"
"</origImageHeight>\n",
ti->m_origDY);
sb->safePrintf("\t\t<origImageWidth>%li"
"</origImageWidth>\n",
ti->m_origDX);
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"imageHeight\":%li,\n",
ti->m_dy);
sb->safePrintf("\t\t\"imageWidth\":%li,\n",
ti->m_dx);
sb->safePrintf("\t\t\"origImageHeight\":%li,\n",
ti->m_origDY);
sb->safePrintf("\t\t\"origImageWidth\":%li,\n",
ti->m_origDX);
}
}

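[Editor's note: the XML branch above appears to pair the thumbnail dimensions with m_dx/m_dy crossed, while the JSON branch pairs imageHeight with m_dy and imageWidth with m_dx. Assuming m_dx is the width and m_dy the height (the diff alone does not confirm this), a consistent emitter would look like the sketch below:]

#include <cstdio>

struct ThumbnailInfo { long m_dx, m_dy; };  // assumed: dx = width, dy = height

static void printDims(const ThumbnailInfo *ti) {
    // keep XML and JSON agreeing on which field is which
    printf("\t\t<imageHeight>%ld</imageHeight>\n", ti->m_dy);
    printf("\t\t<imageWidth>%ld</imageWidth>\n",  ti->m_dx);
}

int main() {
    ThumbnailInfo ti = { 223, 350 };
    printDims(&ti);
}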
@ -3357,7 +3439,6 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
}

long hlen;
//copy all summary and title excerpts for this result into here
//char tt[1024*32];
@ -3375,8 +3456,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
frontTag = "<font style=\"background-color:yellow\">" ;
}
long cols = 80;
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t\t<title><![CDATA[");

SafeBuf hb;
if ( str && strLen && si->m_doQueryHighlighting ) {
hlen = hi.set ( &hb,
@ -3393,29 +3473,55 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
backTag,
0,
0 ); // niceness
// reassign!
str = hb.getBufStart();
strLen = hb.getLength();
//if (!sb->utf8Encode2(tt, hlen)) return false;
if ( ! sb->brify ( hb.getBufStart(),
hb.getLength(),
0,
cols) ) return false;
// if ( si->m_format != FORMAT_JSON )
// if ( ! sb->brify ( hb.getBufStart(),
// hb.getLength(),
// 0,
// cols) ) return false;
}
else if ( str && strLen ) {

// . use "UNTITLED" if no title
// . msg20 should supply the dmoz title if it can
if ( strLen == 0 &&
si->m_format != FORMAT_XML &&
si->m_format != FORMAT_JSON ) {
str = "<i>UNTITLED</i>";
strLen = gbstrlen(str);
}

if ( str &&
strLen &&
( si->m_format == FORMAT_HTML ||
si->m_format == FORMAT_WIDGET_IFRAME ||
si->m_format == FORMAT_WIDGET_APPEND ||
si->m_format == FORMAT_WIDGET_AJAX )
) {
// determine if TiTle wraps, if it does add a <br> count for
// each wrap
//if (!sb->utf8Encode2(str , strLen )) return false;
if ( ! sb->brify ( str,strLen,0,cols) ) return false;
}
// . use "UNTITLED" if no title
// . msg20 should supply the dmoz title if it can
if ( strLen == 0 ) {
if(!sb->safePrintf("<i>UNTITLED</i>"))
return false;
}

// close up the title tag
if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></title>\n");
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<title><![CDATA[");
if ( str ) sb->cdataEncode(str);
sb->safePrintf("]]></title>\n");
}

if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"title\":\"");
if ( str ) sb->jsonEncode(str);
sb->safePrintf("\",\n");
}

if ( si->m_format == FORMAT_HTML ) sb->safePrintf ("</a><br>\n" ) ;
if ( si->m_format == FORMAT_HTML )
sb->safePrintf ("</a><br>\n" ) ;

// close the title tag stuff
@ -3424,6 +3530,22 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
si->m_format == FORMAT_WIDGET_AJAX )
sb->safePrintf("</b></a>\n");

//
// print <h1> tag contents. hack for client.
//
if ( mr->ptr_htag && mr->size_htag > 1 ) {
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<h1Tag><![CDATA[");
sb->cdataEncode(mr->ptr_htag);
sb->safePrintf("]]></h1Tag>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"h1Tag\":\"");
sb->jsonEncode(mr->ptr_htag);
sb->safePrintf("\",\n");
}
}

/////
//
@ -3440,6 +3562,8 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
"]]>"
"</contentType>\n",
cs);
else if ( si->m_format == FORMAT_JSON )
sb->safePrintf("\t\t\"contentType\":\"%s\",\n",cs);
else if ( si->m_format == FORMAT_HTML && ctype != CT_HTML ) {
sb->safePrintf(" <b><font style=color:white;"
"background-color:maroon;>");
@ -3460,13 +3584,18 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {

// . then the summary
// . "s" is a string of null terminated strings
char *send;
//char *send;
// do the normal summary
str = mr->ptr_sum;
strLen = mr->size_sum-1;
str = mr->ptr_displaySum;
// sometimes the summary is longer than requested because for
// summary deduping purposes (see "pss" parm in Parms.cpp) we do not
// get it as short as request. so use mr->m_sumPrintSize here
// not mr->size_sum
strLen = mr->size_displaySum - 1;//-1;

// this includes the terminating \0 or \0\0 so back up
if ( strLen < 0 ) strLen = 0;
send = str + strLen;
//send = str + strLen;

// dmoz summary might override if we are showing a dmoz topic page
if ( dmozSummary ) {
@ -3474,8 +3603,6 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
strLen = gbstrlen(dmozSummary);
}

if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t<sum><![CDATA[");

bool printSummary = true;
// do not print summaries for widgets by default unless overridden
// with &summary=1
@ -3485,13 +3612,25 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
hr->getLong("summaries",0) == 0 )
printSummary = false;

if ( printSummary )
if ( printSummary && si->m_format == FORMAT_HTML )
sb->brify ( str , strLen, 0 , cols ); // niceness = 0

// close xml tag
if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></sum>\n");
if ( si->m_format == FORMAT_XML ) {
sb->safePrintf("\t\t<sum><![CDATA[");
sb->cdataEncode(str);
sb->safePrintf("]]></sum>\n");
}

if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"sum\":\"");
sb->jsonEncode(str);
sb->safePrintf("\",\n");
}

// new line if not xml
else if ( strLen ) sb->safePrintf("<br>\n");
if ( si->m_format == FORMAT_HTML && strLen )
sb->safePrintf("<br>\n");

////////////
//
@ -3557,6 +3696,11 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
sb->safeMemcpy ( url , urlLen );
sb->safePrintf("]]></url>\n");
}
if ( si->m_format == FORMAT_JSON ) {
sb->safePrintf("\t\t\"url\":\"");
sb->jsonEncode ( url , urlLen );
sb->safePrintf("\",\n");
}

// now the last spidered date of the document
@ -3617,6 +3761,49 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
datedbDate);
}

if ( si->m_format == FORMAT_JSON ) {
// doc size in Kilobytes
sb->safePrintf ( "\t\t\"size\":\"%4.0fk\",\n",
(float)mr->m_contentLen/1024.0);
// . docId for possible cached link
// . might have merged a bunch together
sb->safePrintf("\t\t\"docId\":%lli,\n",mr->m_docId );
// . show the site root
// . for hompages.com/users/fred/mypage.html this will be
// homepages.com/users/fred/
// . for www.xyz.edu/~foo/burp/ this will be
// www.xyz.edu/~foo/ etc.
long siteLen = 0;
char *site = NULL;
// seems like this isn't the way to do it, cuz Tagdb.cpp
// adds the "site" tag itself and we do not always have it
// in the XmlDoc::ptr_tagRec... so do it this way:
site = mr->ptr_site;
siteLen = mr->size_site-1;
//char *site=uu.getSite( &siteLen , si->m_coll, false, tagRec);
sb->safePrintf("\t\t\"site\":\"");
if ( site && siteLen > 0 ) sb->safeMemcpy ( site , siteLen );
sb->safePrintf("\",\n");
//long sh = hash32 ( site , siteLen );
//sb->safePrintf ("\t\t<siteHash32>%lu</siteHash32>\n",sh);
//long dh = uu.getDomainHash32 ();
//sb->safePrintf ("\t\t<domainHash32>%lu</domainHash32>\n",dh);
// spider date
sb->safePrintf ( "\t\t\"spidered\":%lu,\n",
mr->m_lastSpidered);
// backwards compatibility for buzz
sb->safePrintf ( "\t\t\"firstIndexedDateUTC\":%lu,\n"
, mr->m_firstIndexedDate);
sb->safePrintf( "\t\t\"contentHash32\":%lu,\n"
, mr->m_contentHash32);
// pub date
long datedbDate = mr->m_datedbDate;
// show the datedb date as "<pubDate>" for now
if ( datedbDate != -1 )
sb->safePrintf ( "\t\t\"pubdate\":%lu,\n",
datedbDate);
}

// . we also store the outlinks in a linkInfo structure
@ -3642,6 +3829,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
k->m_ip, // hostHash, but use ip for now
(long)k->m_firstIndexedDate ,
(long)k->m_datedbDate );

if ( si->m_format == FORMAT_XML ) {
// result
sb->safePrintf("\t\t<language><![CDATA[%s]]>"
@ -3654,6 +3842,16 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
"</charset>\n", charset);
}

if ( si->m_format == FORMAT_JSON ) {
// result
sb->safePrintf("\t\t\"language\":\"%s\",\n",
getLanguageString(mr->m_language));

char *charset = get_charset_str(mr->m_charset);
if(charset)
sb->safePrintf("\t\t\"charset\":\"%s\",\n",charset);
}

//
// end more xml stuff
//
@ -3797,10 +3995,10 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
,ix
);
// reindex
sb->safePrintf(" - <a style=color:red; href=\"/addurl?u=");
sb->safePrintf(" - <a style=color:red; href=\"/addurl?urls=");
sb->urlEncode ( url , gbstrlen(url) , false );
unsigned long long rand64 = gettimeofdayInMillisecondsLocal();
sb->safePrintf("&rand64=%llu&force=1\">respider</a>",rand64);
sb->safePrintf("&rand64=%llu\">respider</a>",rand64);
}

@ -4041,6 +4239,11 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( ! dp ) {
if ( si->m_format == FORMAT_XML )
sb->safePrintf ("\t</result>\n\n");
if ( si->m_format == FORMAT_JSON ) {
// remove last ,\n
sb->m_length -= 2;
sb->safePrintf ("\n\t}\n\n");
}
// wtf?
//char *xx=NULL;*xx=0;
// at least close up the table
@ -4126,7 +4329,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( minScore < 0.0 || totalPairScore < minScore )
minScore = totalPairScore;
// we need to set "ft" for xml stuff below
if ( si->m_format == FORMAT_XML ) continue;
if ( si->m_format != FORMAT_HTML ) continue;
//sb->safePrintf("<table border=1><tr><td><center><b>");
// print pair text
//long qtn1 = fps->m_qtermNum1;
@ -4209,7 +4412,7 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( minScore < 0.0 || totalSingleScore < minScore )
minScore = totalSingleScore;
// we need to set "ft" for xml stuff below
if ( si->m_format == FORMAT_XML ) continue;
if ( si->m_format != FORMAT_HTML ) continue;
//sb->safePrintf("<table border=1><tr><td><center><b>");
// print pair text
//long qtn = fss->m_qtermNum;

100
PageRoot.cpp
@ -22,7 +22,7 @@
//char *printNumResultsDropDown ( char *p, long n, bool *printedDropDown);
bool printNumResultsDropDown ( SafeBuf& sb, long n, bool *printedDropDown);
//static char *printTopDirectory ( char *p, char *pend );
static bool printTopDirectory ( SafeBuf& sb );
static bool printTopDirectory ( SafeBuf& sb , char format );

// this prints the last five queries
//static long printLastQueries ( char *p , char *pend ) ;
@ -586,7 +586,7 @@ bool expandHtml ( SafeBuf& sb,
if ( head[i+1] == 't' ) {
i += 1;
//p = printTopDirectory ( p, pend );
printTopDirectory ( sb );
printTopDirectory ( sb , FORMAT_HTML );
continue;
}

@ -963,7 +963,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
"onLoad=\""
"var client = new XMLHttpRequest();\n"
"client.onreadystatechange = handler;\n"
"var url='/addurl?u="
"var url='/addurl?urls="
);
sb.urlEncode ( url );
// propagate "admin" if set
@ -1042,7 +1042,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
if ( ! coll )
coll = "";

sb.safePrintf("<input name=u type=text size=60 value=\"");
sb.safePrintf("<input name=urls type=text size=60 value=\"");
if ( url ) {
SafeBuf tmp;
tmp.safePrintf("%s",url);
@ -1092,7 +1092,7 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {
//"alert('shit');"
"var client = new XMLHttpRequest();\n"
"client.onreadystatechange = handler;\n"
"var url='/addurl?u="
"var url='/addurl?urls="
, root );
sb.urlEncode ( url );
// propagate "admin" if set
@ -1128,6 +1128,11 @@ bool printAddUrlHomePage ( SafeBuf &sb , char *url , HttpRequest *r ) {

bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {

char format = r->getReplyFormat();
if ( format != FORMAT_HTML )
return printTopDirectory ( sb , format );

sb.safePrintf("<html>\n");
sb.safePrintf("<head>\n");
//sb.safePrintf("<meta http-equiv=\"Content-Type\" content=\"text/html; charset=iso-8859-1\">");
@ -1216,7 +1221,7 @@ bool printDirHomePage ( SafeBuf &sb , HttpRequest *r ) {
sb.safePrintf("\n");

printTopDirectory ( sb );
printTopDirectory ( sb , FORMAT_HTML );

sb.safePrintf("<br><br>\n");

@ -1395,10 +1400,12 @@ long printLastQueries ( char *p , char *pend ) {

//char *printTopDirectory ( char *p, char *pend ) {
bool printTopDirectory ( SafeBuf& sb ) {
bool printTopDirectory ( SafeBuf& sb , char format ) {

long nr = g_catdb.getRdb()->getNumTotalRecs();

// if no recs in catdb, print instructions
if ( g_catdb.getRdb()->getNumTotalRecs() == 0 )
if ( nr == 0 && format == FORMAT_HTML)
return sb.safePrintf("<center>"
"<b>DMOZ functionality is not set up.</b>"
"<br>"
@ -1411,6 +1418,12 @@ bool printTopDirectory ( SafeBuf& sb ) {
"</b>"
"</center>");

// send back an xml/json error reply
if ( nr == 0 && format != FORMAT_HTML ) {
g_errno = EDMOZNOTREADY;
return false;
}

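[Editor's note: printTopDirectory() now degrades by output format: HTML callers get human-readable setup instructions when catdb is empty, while XML/JSON callers get g_errno set to EDMOZNOTREADY and a false return so a structured error can be emitted. The shape of that guard, reduced to a self-contained sketch (the FORMAT_* values, g_errno, and EDMOZNOTREADY below are stand-ins, not the project's actual values):]

#include <cstdio>

enum { FORMAT_HTML, FORMAT_XML, FORMAT_JSON };
static int g_errno = 0;
#define EDMOZNOTREADY 1000  // hypothetical error code value

static bool printTopDirectory(long numRecs, int format) {
    if (numRecs == 0 && format == FORMAT_HTML) {
        // HTML gets human-readable instructions
        printf("<center><b>DMOZ functionality is not set up.</b></center>\n");
        return true;
    }
    if (numRecs == 0) {
        // xml/json: signal a structured error to the caller
        g_errno = EDMOZNOTREADY;
        return false;
    }
    printf("...directory listing...\n");
    return true;
}

int main() {
    printTopDirectory(0, FORMAT_JSON);
    printf("errno=%d\n", g_errno);
}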
//char topList[4096];
//sprintf(topList,
return sb.safePrintf (
@ -1619,26 +1632,26 @@ static bool s_inprogress = false;

// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
bool sendPageAddUrl ( TcpSocket *sock , HttpRequest *hr ) {
// . get fields from cgi field of the requested url
// . get the search query
long urlLen = 0;
char *url = r->getString ( "u" , &urlLen , NULL /*default*/);
char *url = hr->getString ( "urls" , &urlLen , NULL /*default*/);

// see if they provided a url of a file of urls if they did not
// provide a url to add directly
bool isAdmin = g_conf.isCollAdmin ( s , r );
bool isAdmin = g_conf.isCollAdmin ( sock , hr );
long ufuLen = 0;
char *ufu = NULL;
if ( isAdmin )
// get the url of a file of urls (ufu)
ufu = r->getString ( "ufu" , &ufuLen , NULL );
//if ( isAdmin )
// // get the url of a file of urls (ufu)
// ufu = hr->getString ( "ufu" , &ufuLen , NULL );

// can't be too long, that's obnoxious
if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) {
g_errno = EBUFTOOSMALL;
g_msg = " (error: url too long)";
return g_httpServer.sendErrorReply(s,500,"url too long");
return g_httpServer.sendErrorReply(sock,500,"url too long");
}
// get the collection
//long collLen = 0;
@ -1650,20 +1663,20 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
//}
// get collection rec

CollectionRec *cr = g_collectiondb.getRec ( r );
CollectionRec *cr = g_collectiondb.getRec ( hr );

// bitch if no collection rec found
if ( ! cr ) {
g_errno = ENOCOLLREC;
g_msg = " (error: no collection)";
return g_httpServer.sendErrorReply(s,500,"no coll rec");
return g_httpServer.sendErrorReply(sock,500,"no coll rec");
}
// . make sure the ip is not banned
// . we may also have an exclusive list of IPs for private collections
if ( ! cr->hasSearchPermission ( s ) ) {
if ( ! cr->hasSearchPermission ( sock ) ) {
g_errno = ENOPERM;
g_msg = " (error: permission denied)";
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno));
}

@ -1672,8 +1685,8 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
//
if ( ! url ) {
SafeBuf sb;
printAddUrlHomePage ( sb , NULL , r );
return g_httpServer.sendDynamicPage(s,
printAddUrlHomePage ( sb , NULL , hr );
return g_httpServer.sendDynamicPage(sock,
sb.getBufStart(),
sb.length(),
// 120 secs cachetime
@ -1686,19 +1699,19 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
200,
NULL, // cookie
"UTF-8",
r);
hr);
}

//
// run the ajax script on load to submit the url now
//
long id = r->getLong("id",0);
long id = hr->getLong("id",0);
// if we are not being called by the ajax loader, the put the
// ajax loader script into the html now
if ( id == 0 ) {
SafeBuf sb;
printAddUrlHomePage ( sb , url , r );
return g_httpServer.sendDynamicPage ( s,
printAddUrlHomePage ( sb , url , hr );
return g_httpServer.sendDynamicPage ( sock,
sb.getBufStart(),
sb.length(),
// don't cache any more
@ -1711,7 +1724,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
200,
NULL, // cookie
"UTF-8",
r);
hr);
}

//
@ -1742,7 +1755,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
if ( msg ) {
SafeBuf sb;
sb.safePrintf("%s",msg);
g_httpServer.sendDynamicPage (s,
g_httpServer.sendDynamicPage (sock,
sb.getBufStart(),
sb.length(),
3600,//-1, // cachetime
@ -1764,10 +1777,10 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
g_errno = ENOMEM;
log("PageAddUrl: new(%i): %s",
sizeof(State1i),mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); }
return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno)); }
mnew ( st1 , sizeof(State1i) , "PageAddUrl" );
// save socket and isAdmin
st1->m_socket = s;
st1->m_socket = sock;
st1->m_isAdmin = isAdmin;

/*
@ -1809,12 +1822,12 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
//unsigned long h = ipdom ( s->m_ip );
// . use top 2 bytes now, some isps have large blocks
// . if this causes problems, then they can do pay for inclusion
unsigned long h = iptop ( s->m_ip );
unsigned long h = iptop ( sock->m_ip );
long codeLen;
char* code = r->getString("code", &codeLen);
if(g_autoBan.hasCode(code, codeLen, s->m_ip)) {
char* code = hr->getString("code", &codeLen);
if(g_autoBan.hasCode(code, codeLen, sock->m_ip)) {
long uipLen = 0;
char* uip = r->getString("uip",&uipLen);
char* uip = hr->getString("uip",&uipLen);
long hip = 0;
//use the uip when we have a raw query to test if
//we can submit
@ -1824,18 +1837,18 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
}
}

st1->m_strip = r->getLong("strip",0);
st1->m_strip = hr->getLong("strip",0);
// . Remember, for cgi, if the box is not checked, then it is not
// reported in the request, so set default return value to 0
// . support both camel case and all lower-cases
st1->m_spiderLinks = r->getLong("spiderLinks",0);
st1->m_spiderLinks = r->getLong("spiderlinks",st1->m_spiderLinks);
st1->m_spiderLinks = hr->getLong("spiderLinks",0);
st1->m_spiderLinks = hr->getLong("spiderlinks",st1->m_spiderLinks);

// . should we force it into spiderdb even if already in there
// . use to manually update spider times for a url
// . however, will not remove old scheduled spider times
// . mdw: made force on the default
st1->m_forceRespider = r->getLong("force",1); // 0);
st1->m_forceRespider = hr->getLong("force",1); // 0);

long now = getTimeGlobal();
// . allow 1 submit every 1 hour
@ -1850,7 +1863,7 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
delete (st1);
// use cachetime of 3600 so it does not re-inject if you hit
// the back button!
g_httpServer.sendDynamicPage (s,
g_httpServer.sendDynamicPage (sock,
sb.getBufStart(),
sb.length(),
3600,//-1, // cachetime
@ -1878,6 +1891,17 @@ bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
*/

// set this. also sets gr->m_hr
GigablastRequest *gr = &st1->m_msg7.m_gr;
// this will fill in GigablastRequest so all the parms we need are set
g_parms.setGigablastRequest ( sock , hr , gr );

// this is really an injection, not add url, so make
// GigablastRequest::m_url point to Gigablast::m_urlsBuf because
// the PAGE_ADDURLS2 parms in Parms.cpp fill in the m_urlsBuf.
// HACK!
gr->m_url = gr->m_urlsBuf;

//
// inject using msg7
//

@ -51,7 +51,7 @@ static void sendReply ( void *st ) ;

// . returns false if blocked, otherwise true
// . sets g_errno on error
bool sendPageStatsdb ( TcpSocket *s, HttpRequest *r ) {
bool sendPageGraph ( TcpSocket *s, HttpRequest *r ) {

char *cgi;
long cgiLen;
@ -201,6 +201,13 @@ void sendReply ( void *state ) {
strncpy( startTimeStr, ctime( &st->m_startDate ), 30 );
strncpy( endTimeStr, ctime( &st->m_endDate ), 30 );

buf.safePrintf(
"<b>Graph of various query performance statistics.</b>"
"<br>"
"<br>"
);

buf.safePrintf("<center>\n");

if ( ! g_conf.m_useStatsdb )
@ -208,6 +215,7 @@ void sendReply ( void *state ) {
"Turn on in the master controls.</b>"
"</font>\n" );

buf.safePrintf("<table %s>\n",TABLE_STYLE);

buf.safePrintf("<tr><td bgcolor=#%s>"

326
Pages.cpp
@ -72,13 +72,16 @@ static WebPage s_pages[] = {
//{ PAGE_WIDGET , "widget" , 0 , "widget" , 0 , 0 ,
// "widget page",
// sendPageWidget, 0 ,NULL,NULL,PG_NOAPI},

// this is the public addurl, /addurl, if you are using the
// api use PAGE_ADDURL2 which is /admin/addurl. so we set PG_NOAPI here
{ PAGE_ADDURL , "addurl" , 0 , "add url" , 0 , 0 ,
"Page where you can add url for spidering",
sendPageAddUrl, 0 ,NULL,NULL,0},
sendPageAddUrl, 0 ,NULL,NULL,PG_NOAPI},

{ PAGE_GET , "get" , 0 , "get" , 0 , 0 ,
//USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_CLIENT,
"gets cached url",
"gets cached web page",
sendPageGet , 0 ,NULL,NULL,0},
{ PAGE_LOGIN , "login" , 0 , "login" , 0 , 0 ,
//USER_PUBLIC | USER_MASTER | USER_ADMIN | USER_SPAM | USER_CLIENT,
@ -99,15 +102,15 @@ static WebPage s_pages[] = {

// use post now for the "site list" which can be big
{ PAGE_BASIC_SETTINGS, "admin/settings", 0 , "settings",1, M_POST ,
"Basic settings page.", sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI},
"basic settings page", sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_BASIC_STATUS, "admin/status", 0 , "status",1, 0 ,
"Basic status page.", sendPageBasicStatus , 0 ,NULL,NULL,0},
"basic status page", sendPageBasicStatus , 0 ,NULL,NULL,0},
//{ PAGE_BASIC_DIFFBOT, "admin/diffbot", 0 , "diffbot",1, 0 ,
// "Basic diffbot page.", sendPageBasicDiffbot , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_BASIC_SECURITY, "admin/security", 0 , "security",1, 0 ,
"Basic security page.", sendPageGeneric , 0 ,NULL,NULL,0},
"basic security page", sendPageGeneric , 0 ,NULL,NULL,0},
{ PAGE_BASIC_SEARCH, "", 0 , "search",1, 0 ,
"Basic search page.", sendPageRoot , 0 ,NULL,NULL,PG_NOAPI},
"basic search page", sendPageRoot , 0 ,NULL,NULL,PG_NOAPI},

@ -115,7 +118,8 @@ static WebPage s_pages[] = {
//USER_MASTER | USER_PROXY ,
"master controls page",
sendPageGeneric , 0 ,NULL,NULL,0},
{ PAGE_SEARCH , "admin/search" , 0 , "search controls" , 1 , 1,
// use POST for html head/tail and page root html. might be large.
{ PAGE_SEARCH , "admin/search" , 0 , "search controls" ,1,M_POST,
//USER_ADMIN | USER_MASTER ,
"search controls page",
sendPageGeneric , 0 ,NULL,NULL,0},
@ -151,10 +155,11 @@ static WebPage s_pages[] = {
// { PAGE_SITES , "admin/sites", 0 , "site list" , 1 , 1,
// "what sites can be spidered",
// sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI}, // sendPageBasicSettings
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 , 1,
{ PAGE_FILTERS , "admin/filters", 0 , "url filters" , 1 ,M_POST,
//USER_ADMIN | USER_MASTER ,
"prioritize urls for spidering",
sendPageGeneric , 0 ,NULL,NULL,0},
// until we get this working, set PG_NOAPI
sendPageGeneric , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_INJECT , "admin/inject" , 0 , "inject url" , 0,M_MULTI ,
//USER_ADMIN | USER_MASTER ,
"inject url in the index here",
@ -180,17 +185,17 @@ static WebPage s_pages[] = {
// master admin pages
{ PAGE_STATS , "admin/stats" , 0 , "stats" , 0 , 0 ,
//USER_MASTER | USER_PROXY ,
"statistics page",
"general statistics",
sendPageStats , 0 ,NULL,NULL,0},

{ PAGE_STATSDB , "admin/statsdb" , 0 , "graph" , 0 , 0 ,
{ PAGE_GRAPH , "admin/graph" , 0 , "graph" , 0 , 0 ,
//USER_MASTER ,
"statistics page",
sendPageStatsdb , 2 /*niceness*/ ,NULL,NULL,0},
"query stats graph page",
sendPageGraph , 2 /*niceness*/ ,NULL,NULL,0},

{ PAGE_PERF , "admin/perf" , 0 , "performance" , 0 , 0 ,
//USER_MASTER | USER_PROXY ,
"master performance page",
"function performance graph",
sendPagePerf , 0 ,NULL,NULL,0},

{ PAGE_SOCKETS , "admin/sockets" , 0 , "sockets" , 0 , 0 ,
@ -237,7 +242,7 @@ static WebPage s_pages[] = {
{ PAGE_API , "admin/api" , 0 , "api" , 0 , 0 ,
//USER_MASTER | USER_ADMIN ,
"api page",
sendPageAPI , 0 ,NULL,NULL,0},
sendPageAPI , 0 ,NULL,NULL,PG_NOAPI},
{ PAGE_RULES , "admin/siterules", 0 , "site rules", 1, M_POST,
//USER_ADMIN | USER_MASTER ,
"site rules page",
@ -258,7 +263,7 @@ static WebPage s_pages[] = {

{ PAGE_SPIDERDB , "admin/spiderdb" , 0 , "spider queue" , 0 , 0 ,
//USER_ADMIN | USER_MASTER ,
"spiderdb page",
"spider queue",
sendPageSpiderdb , 0 ,NULL,NULL,0},
//{ PAGE_PRIORITIES, "admin/priorities" , 0 , "priority controls",1,1,
// //USER_ADMIN | USER_MASTER ,
@ -293,7 +298,7 @@ static WebPage s_pages[] = {
sendPageParser , 2 ,NULL,NULL,PG_NOAPI},
{ PAGE_SITEDB , "admin/tagdb" , 0 , "tagdb" , 0 , M_POST,
//USER_MASTER | USER_ADMIN,
"tagdb page to add/remove/get tags",
"add/remove/get tags for sites/urls",
sendPageTagdb , 0 ,NULL,NULL,0},
{ PAGE_CATDB , "admin/catdb" , 0 , "catdb" , 0,M_POST,
//USER_MASTER | USER_ADMIN,
@ -518,6 +523,9 @@ bool Pages::sendDynamicReply ( TcpSocket *s , HttpRequest *r , long page ) {
if ( ! publicPage && ! isAdmin )
return sendPageLogin ( s , r );

if ( page == PAGE_CRAWLBOT && ! isAdmin )
log("pages: accessing a crawlbot page without admin privs. "
"no parms can be changed.");

/*
// is request coming from a local ip?
@ -1088,9 +1096,17 @@ bool Pages::printAdminTop (SafeBuf *sb ,
if ( isBasic ) menu = "basic";
sb->safePrintf("<br>");
sb->safePrintf("<b><font color=gray size=+2>"
"%s > %s > %s</font></b>"
"%s > %s > %s "
" "
"</font>"
"</b>"
//"<a href=/%s?c=%s&showparms=1&format=xml>xml</a> "
//"<a href=/%s?c=%s&showparms=1&format=json>json</a> "
"<br><br>\n",
coll, menu, s_pages[page].m_name);
coll, menu, s_pages[page].m_name
//,s_pages[page].m_filename , coll
//,s_pages[page].m_filename , coll
);

@ -2479,7 +2495,10 @@ bool sendPageAPI ( TcpSocket *s , HttpRequest *r ) {
g_pages.printLogo ( &p , coll );
p.safePrintf("</td></tr></table><br><br>");

p.safePrintf("NOTE: All APIs support both GET and POST method. "
"If the size of your request is more than 2K you "
"should use POST.");
p.safePrintf("<br><br>");

p.safePrintf("<div style=padding-left:10%%>"
"<font size=+2><b>API by pages</b></font>"
@ -2592,8 +2611,17 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf("</a>");

// description of page
sb->safePrintf("<font size=-0> - %s</font><br>",
s_pages[PAGENUM].m_desc);
sb->safePrintf("<font size=-0> - %s "
" "
"[ <b>output response in</b> "
"<a href=/%s?showparms=1&format=xml>xml</a> "
"or <a href=/%s?showparms=1&format=json>json</a> "
"or <a href=/%s>html</a> ] "
"</font><br>",
s_pages[PAGENUM].m_desc,
pageStr,
pageStr,
pageStr);
sb->safePrintf("</div><br>");

// begin new list of centered tables
@ -2603,7 +2631,7 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf (
"<table style=max-width:80%%; %s>"
"<tr class=hdrow><td colspan=9>"
"<center><b>Parms</b></tr></tr>"
"<center><b>Input</b></tr></tr>"
"<tr bgcolor=#%s>"
"<td><b>#</b></td>"
"<td><b>parm</b></td>"
@ -2615,9 +2643,75 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
, TABLE_STYLE
, DARK_BLUE );

const char *blue = LIGHT_BLUE;
const char *blues[] = {DARK_BLUE,LIGHT_BLUE};
long count = 1;

//
// every page supports the:
// 1) &format=xml|html|json
// 2) &showparms=0|1
// 3) &c=<collectionName>
// parms. we support them in sendPageGeneric() for pages like
// /admin/master /admin/search /admin/spider so you can see
// the settings.
// put these in Parms.cpp, but use PF_DISPLAY flag so we ignore them
// in convertHttpRequestToParmList() and we do not show them on the
// page itself.
//

// page display/output parms
sb->safePrintf("<tr bgcolor=%s>"
"<td>%li</td>\n"
"<td><b>format</b></td>"
"<td>STRING</td>"
"<td>output format</td>"
"<td>html</td>"
"<td>Display output in this format.</td>"
"</tr>"
, blues[count%2]
, count
);
count++;

// for pages that have settings...
if ( PAGENUM == PAGE_MASTER ||
PAGENUM == PAGE_SEARCH ||
PAGENUM == PAGE_SPIDER ) {
sb->safePrintf("<tr bgcolor=%s>"
"<td>%li</td>\n"
"<td><b>showparms</b></td>"
"<td>BOOL (0 or 1)</td>"
"<td>show parms</td>"
"<td></td>"
"<td>Display the values of all settings.</td>"
"</tr>"
, blues[count%2]
, count
);
count++;
}

// . master controls are for all collections so no need for this
// . we already have this in the parms list for some pages so only
// show for selected pages here
// if ( PAGENUM != PAGE_MASTER ) {
// sb->safePrintf("<tr bgcolor=%s>"
// "<td>%li</td>\n"
// "<td><b>c</b></td>"
// "<td>STRING</td>"
// "<td>Collection</td>"
// "<td></td>"
// "<td>The name of the collection. "
// "<font color=green><b>REQUIRED</b></font>"
// "</td>"
// "</tr>"
// , blues[count%2]
// , count
// );
// count++;
// }

//char *lastPage = NULL;
//Parm *lastParm = NULL;

@ -2643,10 +2737,6 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {

if ( pageNum != PAGENUM ) continue;

if ( blue == (const char *)LIGHT_BLUE ) blue = DARK_BLUE;
else if(blue==(const char *)DARK_BLUE ) blue = LIGHT_BLUE;

SafeBuf tmp;
char diff = 0;
bool printVal = false;
@ -2664,7 +2754,7 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
if ( diff == 1 )
sb->safePrintf ( "<tr bgcolor=orange>");
else
sb->safePrintf ( "<tr bgcolor=#%s>",blue);
sb->safePrintf ( "<tr bgcolor=#%s>",blues[count%2]);

sb->safePrintf("<td>%li</td>",count++);

@ -2721,6 +2811,17 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
// end input parm table we started below
sb->safePrintf("</table><br>\n\n");

// do not print the tables below now,
// we provide output links for xml, json and html
sb->safePrintf("</center>");

if ( PAGENUM != PAGE_GET &&
PAGENUM != PAGE_RESULTS )
return true;

sb->safePrintf("<center>");

//
// done printing parm table
//
@ -2731,22 +2832,82 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf (
"<table style=max-width:80%%; %s>"
"<tr class=hdrow><td colspan=9>"
"<center><b>XML Output</b></tr></tr>"
"<tr><td>"
"<center><b>Example XML Output</b> "
"(&format=xml)</tr></tr>"
"<tr><td bgcolor=%s>"
, TABLE_STYLE
, LIGHT_BLUE
);
sb->safePrintf("<pre>\n");
char *desc = s_pages[PAGENUM].m_xmlOutputDesc;
if ( ! desc )
desc = "<response>\n"
"\t<status>N</status> "
"# 0 on success, otherwise an "
"error code\n"
"\t<statusMsg>S</statusMsg> "
"# \"Success\" on success, "
"otherwise the error message."
"</response>";
sb->htmlEncode ( desc);

// bool showParms = false;
// if ( PAGENUM == PAGE_MASTER ||
// PAGENUM == PAGE_SPIDER ||
// PAGENUM == PAGE_SEARCH
// )
// showParms = true;

sb->safePrintf("<pre style=max-width:500px;>\n");

char *get = "<html><title>Some web page title</title>"
"<head>My first web page</head></html>";

// example output in xml
if ( PAGENUM == PAGE_GET ) {
SafeBuf xb;
xb.safePrintf("<response>\n"
"\t<statusCode>0</statusCode>\n"
"\t<statusMsg>Success</statusMsg>\n"
"\t<url><![CDATA[http://www.doi.gov/]]></url>\n"
"\t<docId>34111603247</docId>\n"
"\t<cachedTimeUTC>1404512549</cachedTimeUTC>\n"
"\t<cachedTimeStr>Jul 04, 2014 UTC"
"</cachedTimeStr>\n"
"\t<content><![CDATA[");
xb.cdataEncode(get);
xb.safePrintf("]]></content>\n");
xb.safePrintf("</response>\n");
sb->htmlEncode ( xb.getBufStart() );
}

if ( PAGENUM == PAGE_RESULTS ) {
SafeBuf xb;
xb.safePrintf("<response>\n"
"\t<statusCode>0</statusCode>\n"
"\t<statusMsg>Success</statusMsg>\n"
"\t<currentTimeUTC>1404513734</currentTimeUTC>\n"
"\t<responseTimeMS>284</responseTimeMS>\n"
"\t<docsInCollection>226</docsInCollection>\n"
"\t<hits>193</hits>\n"
"\t<moreResultsFollow>1</moreResultsFollow>\n"
"\t<result>\n"
"\t\t<imageBase64>/9j/4AAQSkZJRgABAQAAAQABA..."
"</imageBase64>\n"
"\t\t<imageHeight>350</imageHeight>\n"
"\t\t<imageWidth>223</imageWidth>\n"
"\t\t<origImageHeight>470</origImageHeight>\n"
"\t\t<origImageWidth>300</origImageWidth>\n"
"\t\t<title><![CDATA[U.S....]]></title>\n"
"\t\t<sum>Department of the Interior protects "
"America's natural resources and</sum>\n"
"\t\t<url><![CDATA[www.doi.gov]]></url>\n"
"\t\t<size> 64k</size>\n"
"\t\t<docId>34111603247</docId>\n"
"\t\t<site>www.doi.gov</site>\n"
"\t\t<spidered>1404512549</spidered>\n"
"\t\t<firstIndexedDateUTC>1404512549"
"</firstIndexedDateUTC>\n"
"\t\t<contentHash32>2680492249</contentHash32>\n"
"\t\t<language>English</language>\n"
"\t</result>\n"
"</response>\n");
sb->htmlEncode ( xb.getBufStart() );
}

sb->safePrintf("</pre>");
sb->safePrintf ( "</td></tr></table><br>\n\n" );

@ -2756,23 +2917,74 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
sb->safePrintf (
"<table style=max-width:80%%; %s>"
"<tr class=hdrow><td colspan=9>"
"<center><b>JSON Output</b></tr></tr>"
"<tr><td>"
"<center><b>Example JSON Output</b> "
"(&format=json)</tr></tr>"
"<tr><td bgcolor=%s>"
, TABLE_STYLE
, LIGHT_BLUE
);
sb->safePrintf("<pre>\n");
desc = s_pages[PAGENUM].m_jsonOutputDesc;
if ( ! desc )
desc = "{ \"response:\"{\n"
"\t\"status\":N, "
"# 0 on success, otherwise an "
"error code\n"
"\t\"statusMsg\":\"xxx\" "
"# xxx is \"Success\" on success, "
"otherwise the error message.\n"
"\t}\n"
"}";
sb->htmlEncode ( desc);

// example output in json
if ( PAGENUM == PAGE_GET ) {
sb->safePrintf(
"{ \"response:\"{\n"
"\t\"statusCode\":0,\n"
"\t\"statusMsg\":\"Success\",\n"
"\t\"url\":\"http://www.doi.gov/\",\n"
"\t\"docId\":34111603247,\n"
"\t\"cachedTimeUTC\":1404512549,\n"
"\t\"cachedTimeStr\":\"Jul 04, 2014 UTC\",\n"
"\t\"content\":\"");
SafeBuf js;
js.jsonEncode(get);
sb->htmlEncode(js.getBufStart());
sb->safePrintf("\"\n"
"}\n"
"}\n");
}

if ( PAGENUM == PAGE_RESULTS ) {
sb->safePrintf(
"{ \"response:\"{\n"
"\t\"statusCode\":0,\n"
"\t\"statusMsg\":\"Success\",\n"
"\t\"currentTimeUTC\":1404588231,\n"
"\t\"responseTimeMS\":312,\n"
"\t\"docsInCollection\":226,\n"
"\t\"hits\":193,\n"
"\t\"moreResultsFollow\":1,\n"
"\t\"results\":[\n"
"\t{\n"
"\t\t\"imageBase64\":\"/9j/4AAQSkZJR...\",\n"
"\t\t\"imageHeight\":223,\n"
"\t\t\"imageWidth\":350,\n"
"\t\t\"origImageHeight\":300,\n"
"\t\t\"origImageWidth\":470,\n"
"\t\t\"title\":\"U.S....\",\n"
"\t\t\"sum\":\"Department of the Interior "
"protects America's natural resources.\",\n"
"\t\t\"url\":\"www.doi.gov\",\n"
"\t\t\"size\":\" 64k\",\n"
"\t\t\"docId\":34111603247,\n"
"\t\t\"site\":\"www.doi.gov\",\n"
"\t\t\"spidered\":1404512549,\n"
"\t\t\"firstIndexedDateUTC\":1404512549,\n"
"\t\t\"contentHash32\":2680492249,\n"
"\t\t\"language\":\"English\"\n"
"\t}\n"
"\t,\n"
"\t...\n"
"]\n"
"}\n"
);
}

sb->safePrintf("</pre>");
sb->safePrintf ( "</td></tr></table><br>\n\n" );

8
Pages.h
@ -85,7 +85,7 @@ bool sendPageAPI ( TcpSocket *s , HttpRequest *r );
bool sendPageWordVec ( TcpSocket *s , HttpRequest *r );
bool sendPageQualityAgent ( TcpSocket *s , HttpRequest *r );
bool sendPageThesaurus ( TcpSocket *s , HttpRequest *r );
bool sendPageStatsdb ( TcpSocket *s , HttpRequest *r );
bool sendPageGraph ( TcpSocket *s , HttpRequest *r );

// values for m_usePost:
#define M_GET 0x00
@ -110,8 +110,8 @@ class WebPage {
char *m_desc; // page description
bool (* m_function)(TcpSocket *s , HttpRequest *r);
long m_niceness;
char *m_xmlOutputDesc;
char *m_jsonOutputDesc;
char *m_reserved1;
char *m_reserved2;
char m_pgflags;
};

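[Editor's note: with m_xmlOutputDesc/m_jsonOutputDesc renamed to m_reserved1/m_reserved2 above, the per-page output descriptions apparently move out of the static table and printApiForPage() generates the examples instead. A stripped-down C++11 sketch of such a static page table; the entries and types here are simplified illustrations, not the project's real table:]

#include <cstdio>

struct WebPage {
    const char *m_filename;
    const char *m_desc;
    const char *m_reserved1;  // was m_xmlOutputDesc
    const char *m_reserved2;  // was m_jsonOutputDesc
};

static WebPage s_pages[] = {
    { "get",    "gets cached web page", nullptr, nullptr },
    { "search", "basic search page",    nullptr, nullptr },
};

int main() {
    for (const WebPage &p : s_pages)
        printf("/%s - %s\n", p.m_filename, p.m_desc);
}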
@ -340,7 +340,7 @@ enum {

PAGE_HOSTS ,
PAGE_STATS , // 10
PAGE_STATSDB ,
PAGE_GRAPH , // PAGE_STATSDB ,
PAGE_PERF ,
PAGE_SOCKETS ,

8
Parms.h
@ -152,7 +152,6 @@ class GigablastRequest {
char *m_urlsBuf;
char m_stripBox;
char m_harvestLinksBox;
char m_forceRespiderBox;

/////////////
//
@ -200,6 +199,9 @@ class GigablastRequest {
#define PF_REQUIRED 0x4000
#define PF_REBUILDPROXYTABLE 0x8000

#define PF_NOHTML 0x10000

class Parm {
public:
char *m_title; // displayed above m_desc on admin gui page
@ -317,7 +319,7 @@ class Parms {
long nc ,
long pd ,
bool isCrawlbot ,
bool isJSON,
char format, //bool isJSON,
TcpSocket *sock
);

@ -353,7 +355,7 @@ class Parms {
long pd ,
bool lastRow ,
bool isCrawlbot = false,
bool isJSON = false ) ;
char format = FORMAT_HTML);//bool isJSON = false ) ;

char *getTHIS ( HttpRequest *r , long page );

15
RdbDump.cpp
@ -58,6 +58,13 @@ bool RdbDump::set ( //char *coll ,
// use 0 for collectionless
if ( rdb && rdb->m_isCollectionLess ) m_collnum = 0;

// are we like catdb/statsdb etc.?
m_doCollCheck = true;
if ( rdb && rdb->m_isCollectionLess ) m_doCollCheck = false;
// RdbMerge also calls us but rdb is always set to NULL and it was
// causing a merge on catdb (collectionless) to screw up
if ( ! rdb ) m_doCollCheck = false;

/*
if ( ! coll && g_catdb.getRdb() == rdb )
strcpy(m_coll, "catdb");
@ -1023,14 +1030,18 @@ void RdbDump::continueDumping() {

// if someone reset/deleted the collection we were dumping...
CollectionRec *cr = g_collectiondb.getRec ( m_collnum );
if ( ! cr ) {
// . do not do this for statsdb/catdb which always use collnum of 0
// . RdbMerge also calls us but gives a NULL m_rdb so we can't
// set m_isCollectionless to false
if ( ! cr && m_doCollCheck ) {
g_errno = ENOCOLLREC;
// m_file is invalid if collrec got nuked because so did
// the Rdbbase which has the files
log("db: continue dumping lost collection");
}

// bitch about errors
else if (g_errno)log("db: Dump to %s had error writing: %s.",
if (g_errno)log("db: Dump to %s had error writing: %s.",
m_file->getFilename(),mstrerror(g_errno));

// go back now if we were NOT dumping a tree
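[Editor's note: the new m_doCollCheck flag lets continueDumping() skip the collection-record lookup both for collectionless rdbs (catdb/statsdb, which always use collnum 0) and for RdbMerge, which passes a NULL rdb. The decision logic reduced to a self-contained sketch (types simplified):]

#include <cstdio>

struct Rdb { bool m_isCollectionLess; };

// decide once, at set() time, whether a missing CollectionRec is an error
static bool computeDoCollCheck(const Rdb *rdb) {
    if (!rdb) return false;                    // RdbMerge passes NULL
    if (rdb->m_isCollectionLess) return false; // catdb/statsdb use collnum 0
    return true;
}

int main() {
    Rdb catdb = { true };
    printf("check for catdb: %d\n", computeDoCollCheck(&catdb));  // 0
    printf("check for NULL:  %d\n", computeDoCollCheck(nullptr)); // 0
}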
@ -183,6 +183,8 @@ class RdbDump {
//char m_coll [ MAX_COLL_LEN + 1 ];
collnum_t m_collnum;

bool m_doCollCheck;

bool m_tried;

bool m_isSuspended;

21
RdbTree.cpp
@ -1212,6 +1212,12 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
long hkp = 0;
char useHalfKeys = false;

// these guys always use a collnum of 0
bool doCollRecCheck = true;
|
||||
if ( !strcmp(m_dbname,"catdb") ) doCollRecCheck = false;
|
||||
if ( !strcmp(m_dbname,"statsdb") ) doCollRecCheck = false;
|
||||
|
||||
|
||||
if ( !strcmp(m_dbname,"indexdb") ) useHalfKeys = true;
|
||||
if ( !strcmp(m_dbname,"datedb" ) ) useHalfKeys = true;
|
||||
if ( !strcmp(m_dbname,"tfndb" ) ) useHalfKeys = true;
|
||||
@ -1232,12 +1238,17 @@ bool RdbTree::checkTree2 ( bool printMsgs , bool doChainTest ) {
|
||||
// for posdb
|
||||
if ( m_ks == 18 &&(m_keys[i*m_ks] & 0x06) ) {
|
||||
char *xx=NULL;*xx=0; }
|
||||
|
||||
// bad collnum?
|
||||
collnum_t cn = m_collnums[i];
|
||||
if ( m_rdbId>=0 && (cn >= g_collectiondb.m_numRecs || cn < 0) )
|
||||
return log("db: bad collnum in tree");
|
||||
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
|
||||
return log("db: collnum is obsolete in tree");
|
||||
if ( doCollRecCheck ) {
|
||||
collnum_t cn = m_collnums[i];
|
||||
if ( m_rdbId>=0 &&
|
||||
(cn >= g_collectiondb.m_numRecs || cn < 0) )
|
||||
return log("db: bad collnum in tree");
|
||||
if ( m_rdbId>=0 && ! g_collectiondb.m_recs[cn] )
|
||||
return log("db: collnum is obsolete in tree");
|
||||
}
|
||||
|
||||
// if no left/right kid it MUST be -1
|
||||
if ( m_left[i] < -1 )
|
||||
return log(
|
||||
|
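Note: both hunks above special-case the collectionless rdbs (catdb, statsdb), which always store their records under collnum 0, so a missing CollectionRec must not be treated as an error for them. A minimal standalone sketch of the guard, with illustrative names that are not part of this commit:

#include <cstdio>
#include <cstring>

// Collectionless rdbs (catdb, statsdb) always use collnum 0, so looking
// up their CollectionRec is meaningless; only per-collection rdbs should
// fail when the rec is gone.
static bool needsCollRecCheck(const char *dbname) {
	if ( !strcmp(dbname,"catdb")   ) return false;
	if ( !strcmp(dbname,"statsdb") ) return false;
	return true;
}

int main() {
	printf("posdb:   %d\n", needsCollRecCheck("posdb"));   // 1 -> check
	printf("catdb:   %d\n", needsCollRecCheck("catdb"));   // 0 -> skip
	printf("statsdb: %d\n", needsCollRecCheck("statsdb")); // 0 -> skip
	return 0;
}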

13
SafeBuf.cpp
@ -392,6 +392,10 @@ long SafeBuf::saveToFile ( char *dir , char *filename ) {
return dumpToFile ( buf );
}

long SafeBuf::save ( char *fullFilename ) {
return dumpToFile ( fullFilename );
}

long SafeBuf::dumpToFile(char *filename ) {
retry22:
long fd = open ( filename , O_CREAT | O_WRONLY | O_TRUNC,
@ -2785,6 +2789,15 @@ bool SafeBuf::safeStrcpyPrettyJSON ( char *decodedJson ) {
}
*/

bool SafeBuf::jsonEncode ( char *src , long srcLen ) {
char c = src[srcLen];
src[srcLen] = 0;
bool status = jsonEncode ( src );
src[srcLen] = c;
return status;
}

// encode into json
bool SafeBuf::safeUtf8ToJSON ( char *utf8 ) {

if ( ! utf8 ) return true;
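Note: the new length-bounded jsonEncode() overload uses the common temporarily-NUL-terminate idiom: save the byte just past the span, write '\0', call the C-string routine, then restore the byte. A minimal standalone sketch of the same idiom (processSpan and processCString are illustrative names, not from this commit):

#include <cstdio>
#include <cstring>

// Stand-in for any routine that expects a NUL-terminated string.
static long processCString(char *s) { return (long)strlen(s); }

// Apply a C-string routine to a length-bounded span by temporarily
// writing a NUL terminator and restoring the saved byte afterwards.
// "src" must be writable and src[srcLen] must be a readable byte.
static long processSpan(char *src, long srcLen) {
	char saved = src[srcLen]; // remember the byte we are about to clobber
	src[srcLen] = '\0';       // make the span look like a C string
	long result = processCString(src);
	src[srcLen] = saved;      // restore the original byte
	return result;
}

int main() {
	char buf[] = "hello world";
	printf("%ld\n", processSpan(buf, 5)); // prints 5 ("hello")
	return 0;
}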
@ -56,6 +56,7 @@ struct SafeBuf {
long saveToFile ( char *dir , char *filename ) ;
long dumpToFile(char *filename);
long save ( char *dir, char *fname){return saveToFile(dir,fname); };
long save ( char *fullFilename ) ;

long fillFromFile(char *filename);
long fillFromFile(char *dir,char *filename);
@ -107,6 +108,8 @@ struct SafeBuf {
bool safeStrcpy ( char *s ) ;
//bool safeStrcpyPrettyJSON ( char *decodedJson ) ;
bool safeUtf8ToJSON ( char *utf8 ) ;
bool jsonEncode ( char *utf8 ) { return safeUtf8ToJSON(utf8); };
bool jsonEncode ( char *utf8 , long utf8Len );

bool csvEncode ( char *s , long len , long niceness = 0 );
@ -12,7 +12,7 @@
#include "Timedb.h"
#include "PageResults.h"

char getFormatFromRequest ( class HttpRequest *hr ) ;
//char getFormatFromRequest ( class HttpRequest *hr ) ;

SearchInput::SearchInput() {
reset();
@ -257,7 +257,8 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
return false;
}
// add to our list
if (!m_collnumBuf.safeMemcpy(&cr->m_collnum,sizeof(collnum_t)))
if (!m_collnumBuf.safeMemcpy(&tmpcr->m_collnum,
sizeof(collnum_t)))
return false;
// restore the \0 character we wrote in there
*end = c;
@ -272,10 +273,10 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
// use default collection if none provided
if ( ! p && ! token && m_collnumBuf.length() <= 0 ) {
// get default collection rec
CollectionRec *dr = g_collectiondb.getRec (coll);
cr = g_collectiondb.getRec (coll);
// add to our list
if ( dr &&
!m_collnumBuf.safeMemcpy(&dr->m_collnum,
if ( cr &&
!m_collnumBuf.safeMemcpy(&cr->m_collnum,
sizeof(collnum_t)))
return false;
}
@ -294,9 +295,9 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {

// must have had one
if ( ! cr ) {
log("si: collection does not exist");
g_errno = ENOCOLLREC;
return false;
log("si: si. collection does not exist");
//g_errno = ENOCOLLREC;
//return false;
}

// and set from the http request. will set m_coll, etc.
@ -310,7 +311,7 @@ bool SearchInput::set ( TcpSocket *sock , HttpRequest *r ) { //, Query *q ) {
//////

// get the format. "xml" "html" "json" --> FORMAT_HTML, FORMAT_CSV ...
char tmpFormat = getFormatFromRequest ( &m_hr );
char tmpFormat = m_hr.getReplyFormat();//getFormatFromRequest ( &m_hr);
// now override automatic defaults for special cases
if ( tmpFormat != FORMAT_HTML ) {
m_familyFilter = 0;
@ -960,51 +961,6 @@ uint8_t SearchInput::detectQueryLanguage(void) {
}
*/

char getFormatFromRequest ( HttpRequest *r ) {

char *formatStr = r->getString("format");

//if ( ! formatStr ) return FORMAT_HTML;

char format = FORMAT_HTML;

// what format should search results be in? default is html
if ( formatStr && strcmp(formatStr,"html") == 0 ) format = FORMAT_HTML;
if ( formatStr && strcmp(formatStr,"json") == 0 ) format = FORMAT_JSON;
if ( formatStr && strcmp(formatStr,"xml") == 0 ) format = FORMAT_XML;
if ( formatStr && strcmp(formatStr,"csv") == 0 ) format = FORMAT_CSV;
if ( formatStr && strcmp(formatStr,"iframe")==0)
format=FORMAT_WIDGET_IFRAME;
if ( formatStr && strcmp(formatStr,"ajax")==0)
format=FORMAT_WIDGET_AJAX;
if ( formatStr && strcmp(formatStr,"append")==0)
format=FORMAT_WIDGET_APPEND;

// support old api &xml=1 to mean &format=xml
if ( r->getLong("xml",0) ) {
format = FORMAT_XML;
}

// also support &json=1
if ( r->getLong("json",0) ) {
format = FORMAT_JSON;
}

if ( r->getLong("csv",0) ) {
format = FORMAT_CSV;
}

if ( r->getLong("iframe",0) ) {
format = FORMAT_WIDGET_IFRAME;
}

if ( r->getLong("ajax",0) ) {
format = FORMAT_WIDGET_AJAX;
}

if ( r->getLong("append",0) ) {
format = FORMAT_WIDGET_APPEND;
}

return format;
}
//char getFormatFromRequest ( HttpRequest *r ) {
//
//}

44
Spider.cpp
@ -5207,6 +5207,8 @@ void doneSleepingWrapperSL ( int fd , void *state ) {
// if spidering disabled then do not do this crap
if ( ! g_conf.m_spideringEnabled ) return;
//if ( ! g_conf.m_webSpideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;

// wait for clock to sync with host #0
if ( ! isClockInSync() ) {
@ -5517,6 +5519,8 @@ void SpiderLoop::spiderDoledUrls ( ) {

// must be spidering to dole out
if ( ! g_conf.m_spideringEnabled ) return;
// or if trying to exit
if ( g_process.m_mode == EXIT_MODE ) return;
// if we don't have all the url counts from all hosts, then wait.
// one host is probably down and was never up to begin with
if ( ! s_countsAreValid ) return;
@ -6617,7 +6621,9 @@ bool SpiderLoop::spiderUrl9 ( SpiderRequest *sreq ,
return true;
}
// turned off?
if ( ( (! g_conf.m_spideringEnabled
if ( ( (! g_conf.m_spideringEnabled ||
// or if trying to exit
g_process.m_mode == EXIT_MODE
) && // ! g_conf.m_webSpideringEnabled ) &&
! sreq->m_isInjecting ) ||
// repairing the collection's rdbs?
@ -8584,7 +8590,16 @@ bool sendPage ( State11 *st ) {
g_stats.m_allErrorsOld[i] == 0 &&
bucketsNew[i] == 0 && bucketsOld[i] == 0 ) continue;
sb.safePrintf (
"<tr bgcolor=#%s><td><b>%s</b></td>"
"<tr bgcolor=#%s>"
"<td><b><a href=/search?c=%s&q=gbstatusmsg%%3A"
"%%22"
,
LIGHT_BLUE , cr->m_coll );
sb.urlEncode(mstrerror(i));
sb.safePrintf ("%%22>"
"%s"
"</a>"
"</b></td>"
"<td>%lli</td>"
"<td>%lli</td>"
"<td>%lli</td>"
@ -8592,7 +8607,6 @@ bool sendPage ( State11 *st ) {
"<td>%li</td>"
"<td>%li</td>"
"</tr>\n" ,
LIGHT_BLUE,
mstrerror(i),
g_stats.m_allErrorsNew[i] +
g_stats.m_allErrorsOld[i],
@ -10259,6 +10273,14 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
errCode != EDNSDEAD &&
// assume diffbot is temporarily experiencing errs
errCode != EDIFFBOTINTERNALERROR &&
// if diffbot received empty content when d'lding
errCode != EDIFFBOTEMPTYCONTENT &&
// or diffbot tcp timed out when d'lding the url
errCode != EDIFFBOTREQUESTTIMEDOUT &&
// if diffbot closed the socket on us...
errCode != EDIFFBOTMIMEERROR &&
// or the diffbot reply itself was not 200 (OK)
errCode != EDIFFBOTBADHTTPSTATUS &&
// out of memory while crawling?
errCode != ENOMEM &&
errCode != ENETUNREACH &&
@ -10332,6 +10354,22 @@ long getUrlFilterNum2 ( SpiderRequest *sreq ,
goto checkNextRule;
}

if ( strncmp(p,"isreindex",9) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
// if no match continue
//if ( (bool)sreq->m_urlIsDocId==val ) continue;
if ( (bool)sreq->m_isPageReindex==val ) continue;
// skip
p += 10;
// skip to next constraint
p = strstr(p, "&&");
// all done?
if ( ! p ) return i;
p += 2;
goto checkNextRule;
}

if ( strncmp(p,"iscontacty",10) == 0 ) {
// skip for msg20
if ( isForMsg20 ) continue;
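Note: the new "isreindex" clause above follows the url-filter convention of "&&"-chained constraints: each clause either matches and the scan advances past the next "&&", or the rule is skipped. A simplified standalone sketch of that scan, with the negation/val handling of the real code omitted and names invented for the demo:

#include <cstdio>
#include <cstring>

// Evaluate a rule of "&&"-separated clauses; every clause must match.
// Only "isreindex" is recognized here; the real code handles many more.
static bool ruleMatches(const char *rule, bool isReindex) {
	const char *p = rule;
	while ( p ) {
		if ( !strncmp(p,"isreindex",9) && !isReindex )
			return false;         // clause failed, rule rejected
		p = strstr(p,"&&");           // advance to next constraint
		if ( p ) p += 2;
	}
	return true;                          // every clause matched
}

int main() {
	printf("%d\n", ruleMatches("isreindex", true));  // 1
	printf("%d\n", ruleMatches("isreindex", false)); // 0
	return 0;
}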

43
Summary.cpp
@ -58,6 +58,7 @@ bool Summary::set2 ( Xml *xml ,
bool doStemming ,
long maxSummaryLen ,
long maxNumLines ,
long numDisplayLines ,
long maxNumCharsPerLine ,
//long bigSampleRadius ,
//long bigSampleMaxLen ,
@ -81,6 +82,9 @@ bool Summary::set2 ( Xml *xml ,
// to see if it has all the query terms...
//if ( maxNumLines <= 0 ) return true;

m_numDisplayLines = numDisplayLines;
m_displayLen = 0;

//m_useDateLists = useDateLists;
//m_exclDateList = exclDateList;
//m_begPubDateList = begPubDateList;
@ -232,7 +236,12 @@ bool Summary::set2 ( Xml *xml ,
// highest scoring window around each term. And then find the highest
// of those over all the matching terms.
//
for ( long numFinal = 0; numFinal < maxNumLines; numFinal++ ){
long numFinal;
for ( numFinal = 0; numFinal < maxNumLines; numFinal++ ){

if ( numFinal == m_numDisplayLines )
m_displayLen = p - m_summary;

// reset these at the top of each loop
Match *maxm;
long long maxScore = 0;
@ -508,6 +517,9 @@ bool Summary::set2 ( Xml *xml ,
bb[j] |= D_USED;
}

if ( numFinal <= m_numDisplayLines )
m_displayLen = p - m_summary;

/*end = gettimeofdayInMilliseconds();
if ( end - start > 10 )
log ( LOG_WARN,"summary: took %llims to finish doing summary "
@ -530,18 +542,25 @@ bool Summary::set2 ( Xml *xml ,
m_summaryExcerptLen[0] = p - m_summary;
m_numExcerpts = 1;
}
// in this case we only have one summary line
if ( m_numDisplayLines > 0 )
m_displayLen = p - m_summary;
}


// If we still didn't find a summary, get the default summary
if ( p == m_summary )
if ( p == m_summary ) {
// then return the default summary
return getDefaultSummary ( xml,
words,
sections,
pos,
//bigSampleRadius,
maxSummaryLen );
bool status = getDefaultSummary ( xml,
words,
sections,
pos,
//bigSampleRadius,
maxSummaryLen );
if ( m_numDisplayLines > 0 )
m_displayLen = m_summaryLen;
return status;
}

// if we don't find a summary, there's no need to NULL terminate
if ( p != m_summary ) *p++ = '\0';
@ -954,6 +973,10 @@ bool Summary::getDefaultSummary ( Xml *xml,
m_summaryLen = xml->getMetaContent(p,maxSummaryLen,
"description",11);


if ( m_numDisplayLines > 0 )
m_displayLen = m_summaryLen;

if ( m_summaryLen > 0 ) {
m_summaryExcerptLen[0] = m_summaryLen;
m_numExcerpts = 1;
@ -1056,6 +1079,10 @@ bool Summary::getDefaultSummary ( Xml *xml,
*p++ = '\0';
// set length
m_summaryLen = p - m_summary;

if ( m_numDisplayLines > 0 )
m_displayLen = m_summaryLen;

if ( m_summaryLen > 50000 ) { char*xx=NULL;*xx=0; }
return true;
}

@ -78,6 +78,7 @@ class Summary {
//long collLen ,
bool doStemming ,
long maxSummaryLen ,
long numDisplayLines ,
long maxNumLines ,
long maxNumCharsPerLine ,
//long bigSampleRadius ,
@ -237,6 +238,12 @@ class Summary {
//bool m_freeBuf;
//char m_localBuf[10032];

// if getting more lines for deduping than we need for displaying,
// how big is that part of the summary to display?
long m_numDisplayLines;
long m_displayLen;
long getSummaryDisplayLen() { return m_displayLen; }

long m_maxNumCharsPerLine;

long m_titleVersion;
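Note: the m_numDisplayLines/m_displayLen pair added above lets the summary be built with more excerpt lines than get displayed (the extra lines feed deduping) while recording the byte length of just the display portion. A hedged standalone sketch of that bookkeeping, with canned excerpts standing in for the real scored windows:

#include <cstdio>
#include <cstring>

int main() {
	const char *excerpts[] = { "First line. ", "Second line. ",
	                           "Third line. ", "Fourth line. " };
	const long maxLines = 4, displayLines = 2;
	char summary[256]; char *p = summary;
	long displayLen = 0;
	for ( long n = 0 ; n < maxLines ; n++ ) {
		// remember the cutoff once we have the display lines
		if ( n == displayLines ) displayLen = p - summary;
		size_t len = strlen(excerpts[n]);
		memcpy(p, excerpts[n], len); p += len;
	}
	// if we never reached the cutoff, the whole summary is displayed
	if ( displayLines >= maxLines ) displayLen = p - summary;
	*p = '\0';
	printf("full=%ld display=%ld -> \"%.*s\"\n",
	       (long)(p - summary), displayLen, (int)displayLen, summary);
	return 0;
}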
@ -136,7 +136,9 @@ bool TcpServer::init ( void (* requestHandler)(TcpSocket *s) ,
struct sockaddr_in name;
// parm
int options;
// if port is -1 don't set up a listening socket
// if port is -1 don't set up a listening socket, this is used
// for things like blaster that are clients only. or the qatest()
// function.
if ( m_port == -1 || m_port == 0 ) goto skipServer;
// . set up our connection listening socket
// . sets g_errno and returns -1 on error
@ -756,7 +758,7 @@ static long s_lastTime = 0;
TcpSocket *TcpServer::getNewSocket ( ) {
// . if outta sd's we close least used socket first
// . if they're all in use set g_errno and return NULL
if ( m_numIncomingUsed >= *m_maxSocketsPtr )
if ( m_maxSocketsPtr && m_numIncomingUsed >= *m_maxSocketsPtr )
if ( ! closeLeastUsed () ){
// note it in the log
long now = getTimeLocal();
@ -1878,15 +1878,18 @@ bool Title::copyTitle ( Words *w , Pos *pos ,
// size of character in bytes, usually 1
char cs ;
// point to last punct char
char *lastp = NULL;
char *lastp = dst;//NULL;
// convert them always for now
bool convertHtmlEntities = true;
long charCount = 0;
// copy the node @p into "dst"
for ( ; src < srcEnd ; src += cs , dst += cs ) {
// get src size
cs = getUtf8CharSize ( src );
// break if we are full!
if ( dst + cs >= dstEnd ) break;
// or hit our max char limit
if ( charCount++ >= m_maxTitleChars ) break;
// remember last punct for cutting purposes
if ( ! is_alnum_utf8 ( src ) ) lastp = dst;
// encode it as an html entity if asked to
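Note: the copy loop above steps through the source a whole UTF-8 character at a time via getUtf8CharSize(), which is what makes the new m_maxTitleChars limit a character count rather than a byte count. A standalone sketch of that stepping (utf8CharSize here decodes the size from the lead byte; it is illustrative, not the codebase's implementation):

#include <cstdio>

// Size in bytes (1-4) of the UTF-8 character starting at *p.
static int utf8CharSize(const char *p) {
	unsigned char c = (unsigned char)*p;
	if ( c < 0x80 )           return 1;
	if ( (c & 0xE0) == 0xC0 ) return 2;
	if ( (c & 0xF0) == 0xE0 ) return 3;
	if ( (c & 0xF8) == 0xF0 ) return 4;
	return 1; // treat an invalid lead byte as a single byte
}

int main() {
	const char *s = "a\xC3\xA9b"; // 'a', e-acute, 'b'
	for ( const char *p = s ; *p ; p += utf8CharSize(p) )
		printf("char of %d byte(s)\n", utf8CharSize(p));
	return 0;
}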

242
XmlDoc.cpp
@ -890,9 +890,10 @@ char *XmlDoc::getTestDir ( ) {
// if Test.cpp explicitly set SpiderRequest::m_useTestSpiderDir bit
// then return "test-spider" otherwise...
if ( m_sreqValid && m_sreq.m_useTestSpiderDir )
return "test-spider";
return "qa";//"test-spider";
// ... default to "test-parser"
return "test-parser";
//return "test-parser";
return "qa";
/*
if ( getIsPageParser() )
return "test-page-parser";
@ -1969,6 +1970,8 @@ bool XmlDoc::injectDoc ( char *url ,
SpiderRequest sreq;
sreq.setFromInject ( cleanUrl );

if ( deleteUrl )
sreq.m_forceDelete = 1;

//static char s_dummy[3];
// sometimes the content is indeed NULL...
@ -2282,6 +2285,9 @@ bool XmlDoc::indexDoc ( ) {
//
////
SpiderReply *nsr = getFakeSpiderReply ( );
// this can be NULL and g_errno set to ENOCOLLREC or something
if ( ! nsr )
return true;

//SafeBuf metaList;
if ( ! m_metaList2.pushChar(RDB_SPIDERDB) )
@ -3229,6 +3235,10 @@ long *XmlDoc::getIndexCode2 ( ) {
if ( gr->getLong("deep",0) ) spamCheck = false;
// not for crawlbot
if ( cr->m_isCustomCrawl ) spamCheck = false;
// only html for now
if ( m_contentTypeValid && m_contentType != CT_HTML ) spamCheck =false;
// turn this off for now
spamCheck = false;
// otherwise, check the weights
if ( spamCheck ) {
char *ws = getWordSpamVec();
@ -3272,17 +3282,23 @@ long *XmlDoc::getIndexCode2 ( ) {
return &m_indexCode;
}

// if using diffbot and the diffbot reply had a time out error
// or otherwise... diffbot failure demands a re-try always i guess.
// put this above getSpiderPriority() call otherwise we end up in
// a recursive loop with getIndexCode() and getNewSpiderReply()
SafeBuf *dbr = getDiffbotReply();
if ( ! dbr || dbr == (void *)-1 ) return (long *)dbr;
if ( m_diffbotReplyValid && m_diffbotReplyError ) {
m_indexCode= m_diffbotReplyError;
m_indexCodeValid = true;
return &m_indexCode;
}
// . if using diffbot and the diffbot reply had a time out error
// or otherwise... diffbot failure demands a re-try always i guess.
// put this above getSpiderPriority() call otherwise we end up in
// a recursive loop with getIndexCode() and getNewSpiderReply()
// . NO, don't do this anymore, however, if there is a diffbot
// reply error then record it in the spider reply BUT only if it is
// a diffbot reply error that warrants a retry. for instance,
// EDIFFBOTCOULDNOTDOWNLOAD happens when diffbot got a 404 or 500
// error trying to download the page so it probably should not
// retry. but EDIFFBOTREQUESTTIMEDOUT should retry.
// SafeBuf *dbr = getDiffbotReply();
// if ( ! dbr || dbr == (void *)-1 ) return (long *)dbr;
// if ( m_diffbotReplyValid && m_diffbotReplyError ) {
// m_indexCode= m_diffbotReplyError;
// m_indexCodeValid = true;
// return &m_indexCode;
// }

// no error otherwise
m_indexCode = 0;
@ -9639,8 +9655,10 @@ Url **XmlDoc::getRedirUrl() {
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr;
}
// if we followed too many then bail
if ( ++m_numRedirects >= 4 ) {
// . if we followed too many then bail
// . www.motorolamobility.com www.outlook.com ... failed when we
// had >= 4 here
if ( ++m_numRedirects >= 5 ) {
if ( ! keep ) m_redirError = EDOCTOOMANYREDIRECTS;
return &m_redirUrlPtr;
}
@ -10702,6 +10720,8 @@ char *XmlDoc::getIsIndexed ( ) {
// note it
if ( ! m_calledMsg22e )
setStatus ( "checking titledb for old title rec");
else
setStatus ( "back from msg22e call");

// . consult the title rec tree!
// . "justCheckTfndb" is set to true here!
@ -13621,7 +13641,35 @@ void gotDiffbotReplyWrapper ( void *state , TcpSocket *s ) {
THIS->m_diffbotUrl.getBufStart(),
page
);
THIS->m_diffbotReplyError = EDIFFBOTINTERNALERROR;
// try to get the right error code
char *err = strstr(page,"\"error\":\"");
if ( err ) err += 9;
long code = EDIFFBOTUNKNOWNERROR;
if ( err && !strncmp(err,"Unable to apply rules",21))
code = EDIFFBOTUNABLETOAPPLYRULES;
// like .pdf pages get this error
if ( err && !strncmp(err,"Could not parse page",20))
code = EDIFFBOTCOULDNOTPARSE;
// if it is 404... 502, etc. any http status code
if ( err && !strncmp(err,"Could not download page",23))
code = EDIFFBOTCOULDNOTDOWNLOAD;
// custom api does not apply to the url
if ( err && !strncmp(err,"Invalid API",11))
code = EDIFFBOTINVALIDAPI;
if ( err && !strncmp(err,"Version required",16))
code = EDIFFBOTVERSIONREQ;
if ( err && !strncmp(err,"Empty content",13))
code = EDIFFBOTEMPTYCONTENT;
if ( err && !strncmp(err,"No content received",19))
code = EDIFFBOTEMPTYCONTENT;
if ( err && !strncmp(err,"Request timed",13))
code = EDIFFBOTREQUESTTIMEDOUT;
// error processing url
if ( err && !strncmp(err,"Error processing",16))
code = EDIFFBOTURLPROCESSERROR;
if ( err && !strncmp(err,"Your token has exp",18))
code = EDIFFBOTTOKENEXPIRED;
THIS->m_diffbotReplyError = code;
}
// a hack for detecting if token is expired
if ( ! ttt && cr && strstr ( page , ":429}" ) ) {
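Note: the strncmp chain above maps Diffbot's error strings onto local error codes. The same mapping can be expressed table-driven, which keeps the prefixes and codes in one place; a hedged standalone sketch, with the error codes reduced to plain ints for the demo:

#include <cstdio>
#include <cstring>

// First matching prefix wins; unknown errors fall through to a default.
struct ErrMap { const char *prefix; int code; };

static const ErrMap s_errMap[] = {
	{ "Unable to apply rules",   1 }, // cf. EDIFFBOTUNABLETOAPPLYRULES
	{ "Could not parse page",    2 }, // cf. EDIFFBOTCOULDNOTPARSE
	{ "Could not download page", 3 }, // cf. EDIFFBOTCOULDNOTDOWNLOAD
	{ "Request timed",           4 }, // cf. EDIFFBOTREQUESTTIMEDOUT
};

static int mapDiffbotError(const char *err, int defaultCode) {
	for ( size_t i = 0 ; i < sizeof(s_errMap)/sizeof(s_errMap[0]) ; i++ )
		if ( !strncmp(err, s_errMap[i].prefix,
		              strlen(s_errMap[i].prefix)) )
			return s_errMap[i].code;
	return defaultCode;
}

int main() {
	printf("%d\n", mapDiffbotError("Request timed out", 0)); // 4
	printf("%d\n", mapDiffbotError("Something new", 0));     // 0
	return 0;
}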
@ -15183,6 +15231,7 @@ long long *XmlDoc::getDownloadEndTime ( ) {
if ( m_deleteFromIndex ) {
m_downloadEndTime = 0;
m_downloadEndTimeValid = true;
return &m_downloadEndTime;
}

// if recycling content use its download end time
@ -15199,7 +15248,7 @@ long long *XmlDoc::getDownloadEndTime ( ) {
return &m_downloadEndTime;
}
}


// need a valid reply
char **reply = getHttpReply ();
if ( ! reply || reply == (void *)-1 ) return (long long *)reply;
@ -17021,7 +17070,8 @@ char **XmlDoc::getUtf8Content ( ) {
// it should be there if trying to delete as well!
m_deleteFromIndex ) {
log("xmldoc: null utf8 content for docid-based "
"titlerec lookup which was not found");
"titlerec (d=%lli) lookup which was not found",
m_docId);
ptr_utf8Content = NULL;
size_utf8Content = 0;
m_utf8ContentValid = true;
@ -19804,7 +19854,9 @@ bool XmlDoc::verifyMetaList ( char *p , char *pend , bool forDelete ) {
if ( *p & 0x01 ) del = false;
else del = true;
// must always be negative if deleting
if ( m_deleteFromIndex && ! del ) {
// spiderdb is exempt because we add a spiderreply that is
// positive and a spiderdoc
if ( m_deleteFromIndex && ! del && rdbId != RDB_SPIDERDB) {
char *xx=NULL;*xx=0; }
// get the key size. a table lookup in Rdb.cpp.
long ks ;
@ -20485,7 +20537,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// if we are indexing a subdoc piece of a multidoc url
// then parentUrl should return non-NULL
char *parentUrl = getDiffbotParentUrl(od->m_firstUrl.m_url);
if ( ! parentUrl ) goto skip9;
if ( ! parentUrl && od->m_contentType != CT_STATUS )
goto skip9;
// in that case we need to reindex the parent url not the
// subdoc url, so make the spider reply gen quick
//SpiderReply *newsr = od->getFakeSpiderReply();
@ -20537,12 +20590,23 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// been fulfilled!
if( ! m_zbuf.safeMemcpy (&srep, srep.getRecSize()))
return NULL;
// complain
if ( ! cr->m_diffbotApiUrl.length()<1 && !cr->m_isCustomCrawl )
log("build: doing query reindex but diffbot api "
"url is not set in spider controls");

// but also store a new spider request for the parent url
SpiderRequest ksr;
long long pd;

// skip if doc is a spider status "document". their docids
// often get added during a query reindex but we should ignore
// them completely.
if ( od->m_contentType == CT_STATUS )
goto returnList;

//goto returnList;

// complain
if ( cr->m_diffbotApiUrl.length()<1 && !cr->m_isCustomCrawl )
log("build: doing query reindex but diffbot api "
"url is not set in spider controls");
// just copy original request
memcpy ( &ksr , &m_sreq , m_sreq.getRecSize() );
// do not spider links, it's a page reindex of a multidoc url
@ -20551,6 +20615,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
ksr.m_ignoreDocUnchangedError = 1;
// no longer docid based we set it to parentUrl
ksr.m_urlIsDocId = 0;
// but consider it a manual add. this should already be set.
ksr.m_isPageReindex = 1;
// but it is not docid based, so overwrite the docid
// in ksr.m_url with the parent multidoc url. it \0 terms it.
strcpy(ksr.m_url , parentUrl );//, MAX_URL_LEN-1);
@ -20558,7 +20624,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
//if ( ! od->m_firstIpValid ) { char *xx=NULL;*xx=0; }
// set the key, ksr.m_key. isDel = false
// fake docid
long long pd = g_titledb.getProbableDocId(parentUrl);
pd = g_titledb.getProbableDocId(parentUrl);
ksr.setKey ( m_sreq.m_firstIp, pd , false );
// store this
if ( ! m_zbuf.pushChar(RDB_SPIDERDB) )
@ -20566,6 +20632,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// then the request
if ( ! m_zbuf.safeMemcpy(&ksr,ksr.getRecSize() ) )
return NULL;
returnList:
// prevent cores in indexDoc()
m_indexCode = EREINDEXREDIR;
m_indexCodeValid = true;
@ -20960,7 +21027,8 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// if recycling json objects, leave them there!
if ( *recycle ) nukeJson = false;
// you have to be a diffbot crawl to do this
if ( ! cr->m_isCustomCrawl ) nukeJson = false;
// no, not if you have the diffbot api url set... so take this out
//if ( ! cr->m_isCustomCrawl ) nukeJson = false;
// do not remove old json objects if pageparser.cpp test
// because that can not change the index, etc.
if ( getIsPageParser() ) nukeJson = false;
@ -21818,7 +21886,12 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// but don't do this if it is pagereindex. why is pagereindex
// setting the injecting flag anyway?
long needSpiderdb3 = 0;
if ( m_sreqValid && m_sreq.m_isInjecting )//&&!m_sreq.m_isPageReindex)
if ( m_sreqValid &&
m_sreq.m_isInjecting &&
m_sreq.m_fakeFirstIp &&
! m_sreq.m_forceDelete &&
/// don't add requests like http://xyz.com/xxx-diffbotxyz0 though
! m_isDiffbotJSONObject )
needSpiderdb3 = m_sreq.getRecSize() + 1;
need += needSpiderdb3;
@ -22325,11 +22398,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// if we are injecting we must add the spider request
// we are injecting from so the url can be scheduled to be
// spidered again
if ( m_sreqValid &&
m_sreq.m_isInjecting &&
m_sreq.m_fakeFirstIp &&
/// don't add requests like http://xyz.com/xxx-diffbotxyz0 though
! m_isDiffbotJSONObject ) {
if ( needSpiderdb3 ) {
// note it
setStatus("adding spider request");
// checkpoint
@ -23308,6 +23377,10 @@ SpiderReply *XmlDoc::getNewSpiderReply ( ) {
else
m_srep.m_hadDiffbotError = false;

// if we only had an error code in the diffbot reply, record that
if ( ! m_indexCode && m_diffbotReplyError )
m_srep.m_errCode = m_diffbotReplyError;

// sanity. if being called directly from indexDoc() because of
// an error like out of memory, then we do not know if it is
// indexed or not or was indexed...
@ -25112,11 +25185,11 @@ bool XmlDoc::hashNoSplit ( HashTableX *tt ) {
// hash gbimage: for permalinks only for Images.cpp
for ( long i = 0 ; i < m_images.m_numImages ; i++ ) {
// get the node number
long nn = m_images.m_imageNodes[i];
//long nn = m_images.m_imageNodes[i];
// get the url of the image
XmlNode *xn = m_xml.getNodePtr(nn);
//XmlNode *xn = m_xml.getNodePtr(nn);
long srcLen;
char *src = xn->getFieldValue("src",&srcLen);
char *src = m_images.getImageUrl(i,&srcLen);
// set it to the full url
Url iu;
// use "pageUrl" as the baseUrl
@ -25488,6 +25561,17 @@ SafeBuf *XmlDoc::getSpiderReplyMetaList ( SpiderReply *reply ) {
return &m_spiderReplyMetaList;
}

// we double add regular html urls in a query reindex because the
// json url adds the parent, so the parent gets added twice sometimes,
// and for some reason it is adding a spider status doc the 2nd time
// so cut that out. this is kinda a hack b/c i'm not sure what's
// going on. but you can set a break point here and see what's up if
// you want.
if ( m_indexCodeValid && m_indexCode == EDOCFORCEDELETE ) {
m_spiderReplyMetaListValid = true;
return &m_spiderReplyMetaList;
}

// . fake this out so we do not core
// . hashWords3() uses it i guess
bool forcedLangId = false;
@ -28586,28 +28670,37 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
*/

// do they want a summary?
if ( m_req->m_numSummaryLines>0 && ! reply->ptr_sum ) {
char *sum = getHighlightedSummary();
if ( ! sum || sum == (void *)-1 ) return (Msg20Reply *)sum;
if ( m_req->m_numSummaryLines>0 && ! reply->ptr_displaySum ) {
char *hsum = getHighlightedSummary();
if ( ! hsum || hsum == (void *)-1 ) return (Msg20Reply *)hsum;
//Summary *s = getSummary();
//if ( ! s || s == (void *)-1 ) return (Msg20Reply *)s;
//long sumLen = m_finalSummaryBuf.length();
// is it size and not length?
long sumLen = 0;
long hsumLen = 0;
// seems like it can return 0x01 if none...
//if ( sum == (char *)0x01 ) sum = NULL;
// get len
if ( sum ) sumLen = gbstrlen(sum);
// must be \0 terminated
if ( sumLen > 0 && sum[sumLen] ) { char *xx=NULL;*xx=0; }
if ( hsum == (char *)0x01 ) hsum = NULL;
// get len. this is the HIGHLIGHTED summary so it is ok.
if ( hsum ) hsumLen = gbstrlen(hsum);
// must be \0 terminated. not any more, it can be a subset
// of a larger summary used for deduping
if ( hsumLen > 0 && hsum[hsumLen] ) { char *xx=NULL;*xx=0; }
// assume size is 0
long sumSize = 0;
//long sumSize = 0;
// include the \0 in size
if ( sum ) sumSize = sumLen + 1;
//if ( sum ) sumSize = sumLen + 1;
// do not get any more than "me" lines/excerpts of summary
//long max = m_req->m_numSummaryLines;
// grab stuff from it!
//reply->m_proximityScore = s->getProximityScore();
reply-> ptr_sum = sum;//s->getSummary();
reply->size_sum = sumSize;//s->getSummaryLen(max)+1;
reply-> ptr_displaySum = hsum;//s->getSummary();
reply->size_displaySum = hsumLen+1;//sumSize;//s->getSummaryLen
// this is unhighlighted for deduping, and it might be longer
// . seems like we are not using this for deduping but using
// the gigabit vector in Msg40.cpp, so take out for now
//reply-> ptr_dedupSum = s->m_summary;
//reply->size_dedupSum = s->m_summaryLen+1;
//if ( s->m_summaryLen == 0 ) reply->size_dedupSum = 0;
//reply->m_diversity = s->getDiversity();
}

@ -28675,6 +28768,15 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
}
}

// this is not documented because i don't think it will be popular
if ( m_req->m_getHeaderTag ) {
SafeBuf *htb = getHeaderTagBuf();
if ( ! htb || htb == (SafeBuf *)-1 ) return (Msg20Reply *)htb;
// it should be null terminated
reply->ptr_htag = htb->getBufStart();
reply->size_htag = htb->getLength() + 1;
}

// breathe
QUICKPOLL ( m_niceness );

@ -29674,6 +29776,38 @@ char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) {
return m_dbuf;
}

SafeBuf *XmlDoc::getHeaderTagBuf() {
if ( m_htbValid ) return &m_htb;

Sections *ss = getSections();
if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss;

// scan sections
Section *si = ss->m_rootSection;
for ( ; si ; si = si->m_next ) {
// breathe
QUICKPOLL(m_niceness);
if ( si->m_tagId == TAG_H1 ) break;
}
// if no h1 tag then make buf empty
if ( ! si ) {
m_htb.nullTerm();
m_htbValid = true;
return &m_htb;
}
// otherwise, set it
char *a = m_words.m_words[si->m_firstWordPos];
char *b = m_words.m_words[si->m_lastWordPos] ;
b += m_words.m_wordLens[si->m_lastWordPos];

// copy it
m_htb.safeMemcpy ( a , b - a );
m_htb.nullTerm();
m_htbValid = true;
return &m_htb;
}


Title *XmlDoc::getTitle ( ) {
if ( m_titleValid ) return &m_title;
// need a buncha crap
@ -29775,6 +29909,10 @@ Summary *XmlDoc::getSummary () {
false , // doStemming
m_req->m_summaryMaxLen ,
numLines ,
// . displayLines, # lines we are displaying
// . Summary::getDisplayLen() will return the
// length of the summary to display
m_req->m_numSummaryLines ,
cr->m_summaryMaxNumCharsPerLine,
m_req->m_ratInSummary ,
getFirstUrl() ,
@ -29807,11 +29945,15 @@ char *XmlDoc::getHighlightedSummary ( ) {

// get the summary
char *sum = s->getSummary();
long sumLen = s->getSummaryLen();
//long sumLen = s->getSummaryLen();
long sumLen = s->getSummaryDisplayLen();

//sum[sumLen] = 0;

// assume no highlighting?
if ( ! m_req->m_highlightQueryTerms || sumLen == 0 ) {
m_finalSummaryBuf.safeMemcpy ( sum , sumLen + 1 );
m_finalSummaryBuf.safeMemcpy ( sum , sumLen );
m_finalSummaryBuf.nullTerm();
m_finalSummaryBufValid = true;
return m_finalSummaryBuf.getBufStart();
//char *fsum = m_finalSummaryBuf.getBufStart();
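Note: getHeaderTagBuf() above follows the codebase's usual cached-getter shape: an m_*Valid flag guards a cached member, NULL signals an error, and a -1 cast signals "would block". A minimal standalone sketch of the cached part of the pattern (toy class, not from this commit):

#include <cstdio>

// XmlDoc-style cached getter: compute once, then serve the cached value.
// In the real code NULL means error and (T *)-1 means "blocked, callback
// will re-enter later"; this toy only shows the caching.
class Doc {
public:
	Doc() : m_valid(false), m_value(0) {}
	long *getValue() {
		if ( m_valid ) return &m_value; // cached
		m_value = 42;                   // pretend this was expensive
		m_valid = true;
		return &m_value;
	}
private:
	bool m_valid;
	long m_value;
};

int main() {
	Doc d;
	printf("%ld\n", *d.getValue()); // computes, then caches
	printf("%ld\n", *d.getValue()); // served from cache
	return 0;
}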

3
XmlDoc.h
@ -821,6 +821,7 @@ class XmlDoc {
Query *getQuery() ;
Matches *getMatches () ;
char *getDescriptionBuf ( char *displayMetas , long *dlen ) ;
SafeBuf *getHeaderTagBuf();
class Title *getTitle ();
class Summary *getSummary () ;
char *getHighlightedSummary ();
@ -1377,6 +1378,7 @@ class XmlDoc {
bool m_matchesValid;
bool m_dbufValid;
bool m_titleValid;
bool m_htbValid;
bool m_collnumValid;
//bool m_twidsValid;
bool m_termId32BufValid;
@ -2010,6 +2012,7 @@ class XmlDoc {
// meta description buf
long m_dbufLen;
char m_dbuf[1024];
SafeBuf m_htb;
Title m_title;
Summary m_summary;
char m_isCompromised;
@ -257,6 +257,8 @@ long XmlNode::setCommentNode2 ( char *node ) {
// look for ending of ]> like for <![if gt IE 6]>
if ( node[i] !='>' ) continue;
if ( node[i-1] ==']' ) break;
// look for ending of --> like for <![endif]-->
if ( node[i-1] == '-' && node[i-2] == '-' ) break;
}

// skip i over the >, if any (could be end of doc)

6
changelog
Normal file
@ -0,0 +1,6 @@
gb (1.1-1) unstable; urgency=low

* Lots of bug fixes
* API updates.

-- mwells <gigablast@mail.com> Sat, 05 Jul 2014 18:38:35 -0700
@ -26,6 +26,8 @@ bool sendPageSEO(TcpSocket *s, HttpRequest *hr) {return true;}
//long g_qbufNeedSave = false;
//SafeBuf g_qbuf;

bool g_recoveryMode;

#define RDFBUFFER_SIZE (1024*1024*10)
#define RDFSTRUCTURE_FILE "structure.rdf.u8"
#define RDFCONTENT_FILE "content.rdf.u8"
@ -32,6 +32,11 @@ override_dh_strip:
# debian/gb.substvars and makes dpkg -i bitch about dependencies not being met
override_dh_shlibdeps:
echo "skipping dh_shlibdeps call! MDW"
# adding the line below here does not seem to make dpkg prompt to
# install netpbm, rather just bitch about it and make it harder to install
# echo "building our own gb.substvars"
# echo "misc:Depends=netpbm (>= 0.0)" > debian/gb.substvars
# echo "misc:Depends=netpbm" > debian/gb.substvars

# override_dh_shlibdeps-indep:
# echo "shit"
@ -835,7 +835,7 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
<table cellpadding=1 border=0 width=100% bgcolor=#0079ba>
<tr><td><center><b><font color=#ffffff size=+1>Building a DMOZ Based Directory</td></tr></table>
<br>
<<i>Last Updated October 2013</i>>
<<i>Last Updated July 2014</i>>
<br>
<br>

@ -849,9 +849,9 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
<br> $ wget http://rdf.dmoz.org/rdf/structure.rdf.u8.gz
<br> $ gunzip structure.rdf.u8.gz</b>
<br>
<li>Execute <i>dmozparse</i> in its directory with the <i>new</i> option to generate the catdb dat files.<br> <b>$ dmozparse new</b><br>
<li>Execute <i>dmozparse</i> in its directory with the <i>new</i> option to generate the catdb dat files.<br> <b>$ ./dmozparse new</b><br>

<li>Execute the installcat script command on host 0 to distribute the catdb files to all the hosts.<br>This just does an scp/rcp from host 0 to the other hosts listed in <a href=#hosts>hosts.conf</a>.<br> <b>$ gb installcat</b><br>
<li>Execute the installcat script command on host 0 to distribute the catdb files to all the hosts.<br>This just does an scp/rcp from host 0 to the other hosts listed in <a href=#hosts>hosts.conf</a>.<br> <b>$ ./gb installcat</b><br>

<li>Make sure all spiders are stopped and inactive.<br>

@ -865,7 +865,7 @@ Now if you are <a href=#input>interfacing to Gigablast</a> from another program
<li>Gigablast provides the unique ability to search the content of the pages in the DMOZ directory. But in order to search the pages in DMOZ we have to index them.
So execute <i>dmozparse</i> with the <i>urldump -s</i> option to create the html/gbdmoz.urls.txt.* files which contain all the URLs in DMOZ. (Excluding URLs that contained hashtags, '#'.)

<br><b>$ dmozparse urldump -s</b>
<br><b>$ ./dmozparse urldump -s</b>

<br><li>Now tell Gigablast to index each URL listed in each gbdmoz.urls.txt.* file. Make sure you specify the collection you are using for DMOZ, in the example below it uses <i>main</i>. You can use the <a href=/addurl>add url</a> page to add the gbdmoz.urls.txt.* files or you can use curl (or wget) like:
<br>

81
main.cpp
@ -129,6 +129,9 @@
//#include "Facebook.h"
//#include "Accessdb.h"

// from qa.cpp
bool qatest ( ) ;

// call this to shut everything down
bool mainShutdown ( bool urgent ) ;
//bool mainShutdown2 ( bool urgent ) ;
@ -1453,6 +1456,70 @@ int main2 ( int argc , char *argv[] ) {
g_conf.m_save = false;


//
// run our smoketests
//
if ( strcmp ( cmd, "qa" ) == 0 ) {
// let's ensure our core file can dump
struct rlimit lim;
lim.rlim_cur = lim.rlim_max = RLIM_INFINITY;
if ( setrlimit(RLIMIT_CORE,&lim) )
log("qa::setrlimit: %s", mstrerror(errno) );
// 50MB
g_conf.m_maxMem = 50000000;
// init our table for doing zobrist hashing
if ( ! hashinit() ) {
log("qa::hashinit failed" ); return 0; }
// init memory class after conf since it gets maxMem from Conf
if ( ! g_mem.init ( 200000000 ) ) {
log("qa::Mem init failed" ); return 0; }
if (!ucInit(g_hostdb.m_dir)) {
log("Unicode initialization failed!");
return 1;
}
g_conf.m_askRootNameservers = true;
//g_conf.m_dnsIps [0] = atoip ( "192.168.0.1", 11 );
//g_conf.m_dnsClientPort = 9909;
g_conf.m_dnsMaxCacheMem = 1024*10;
// hack http server port to -1 (none)
//g_conf.m_httpPort = 0;
g_conf.m_httpMaxSockets = 200;
//g_conf.m_httpMaxReadBufSize = 102*1024*1024;
g_conf.m_httpMaxSendBufSize = 16*1024;
// init the loop
if ( ! g_loop.init() ) {
log("qa::Loop init failed" ); return 0; }
// . then dns client
// . server should listen to a socket and register with g_loop
if ( ! g_dns.init(14834) ) {
log("qa::Dns client init failed" ); return 0; }
// . then webserver
// . server should listen to a socket and register with g_loop
// . use -1 for both http and https ports to mean do not
// listen on any ports. we are a client only.
if ( ! g_httpServer.init( -1 , -1 ) ) {
log("qa::HttpServer init failed" ); return 0; }
// set our new pid
g_mem.setPid();
g_threads.setPid();
g_log.setPid();
//
// begin the qa loop
//
qatest();
//
// wait for some i/o signals
//
if ( ! g_loop.runLoop() ) {
log("db: runLoop failed." );
return 1;
}
// no error, return 0
return 0;
}



// log the version
//log(LOG_INIT,"conf: Gigablast Server %s",GBVersion);

@ -5044,7 +5111,19 @@ int install ( install_flag_konst_t installFlag , long hostId , char *dir ,
else if ( installFlag == ifk_installcat ) {
// . copy catdb files to all hosts
// don't copy to ourselves
if ( h2->m_hostId == 0 ) continue;
if ( h2->m_hostId == 0 ) {
sprintf(tmp,
"cp "
"content.rdf.u8 "
"structure.rdf.u8 "
"gbdmoz.structure.dat "
"gbdmoz.content.dat "
"%scatdb/",
h2->m_dir);
log(LOG_INIT,"admin: %s", tmp);
system ( tmp );
continue;
}
sprintf(tmp,
"rcp "
"%scatdb/content.rdf.u8 "

488
qa.cpp
@ -4,15 +4,18 @@

static long s_failures = 0;

bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
bool getUrl( char *path ,
void (* callback) (void *state, TcpSocket *sock) ,
char *post = NULL ) {
SafeBuf sb;
sb.safePrintf ( "http://%s:%li%s"
, iptoa(g_hostdb.m_myHost->m_ip)
, (long)g_hostdb.m_myHost->m_port
, (long)g_hostdb.m_myHost->m_httpPort
, path
);
Url u;
u.set ( sb.getBufStart() );
log("qa: getting %s",sb.getBufStart());
if ( ! g_httpServer.getDoc ( u.getUrl() ,
0 , // ip
0 , // offset
@ -25,7 +28,13 @@ bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {
0, // proxyport
-1, // maxtextdoclen
-1, // maxotherdoclen
NULL ) ) // useragent
NULL , // useragent
"HTTP/1.0" , // protocol
true , // doPost
NULL , // cookie
NULL , // additionalHeader
NULL , // fullRequest
post ) )
return false;
// error?
log("qa: getUrl error: %s",mstrerror(g_errno));
@ -34,27 +43,90 @@ bool getUrl( char *path , void (* callback) (void *state, TcpSocket *sock) ) {

bool qatest ( ) ;

void qatestWrapper ( void *state , TcpSocket *sock ) { qatest(); }
void markOut ( char *reply , char *needle ) {

// return false if blocked, true otherwise
bool addColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/addcoll?c=qatest123" , qatestWrapper );
if ( ! reply ) return;

char *s = strstr ( reply , needle );
if ( ! s ) return;

for ( ; *s && ! is_digit(*s); s++ );

// find end of digit stream
//char *end = s;
//while ( ; *end && is_digit(*s); end++ );
// just bury the digit stream now, zeroing out was not
// a consistent LENGTH if we had 10 hits vs 9... making the hash
// different

// space out digits
for ( ; *s && is_digit(*s); s++ ) *s = ' ';
}

// do not hash
long qa_hash32 ( char *s ) {
unsigned long h = 0;
long k = 0;
for ( long i = 0 ; s[i] ; i++ ) {
// skip if not first space and back to back spaces
if ( s[i] == ' ' &&i>0 && s[i-1]==' ') continue;
h ^= g_hashtab [(unsigned char)k] [(unsigned char)s[i]];
k++;
}
return h;
}

long s_replyCRC = 0;
TcpSocket *s_sock = NULL;

void qatestWrapper ( void *state , TcpSocket *sock ) {
log("qa: got reply(%li)=%s",sock->m_readOffset,sock->m_readBuf);

// get mime
HttpMime mime;
mime.set ( sock->m_readBuf , sock->m_readOffset , NULL );
// only hash content since mime has a timestamp in it
char *content = mime.getContent();
long contentLen = mime.getContentLen();
if ( content[contentLen] ) { char *xx=NULL;*xx=0; }

char *reply = sock->m_readBuf;

// take out <responseTimeMS>
markOut ( reply , "<currentTimeUTC>");

markOut ( reply , "<responseTimeMS>");

// until i figure this one out, take it out
markOut ( reply , "<docsInCollection>");

// until i figure this one out, take it out
markOut ( reply , "<hits>");

// make checksum. we ignore back to back spaces so this
// hash works for <docsInCollection>10 vs <docsInCollection>9
s_replyCRC = qa_hash32 ( content );

// this too is used for recording the reply into a file on disk
s_sock = sock;

// continue qa loop
qatest();

}

// first inject a set list of urls
static char **s_urlPtrs = NULL;
static long s_numUrls = 0;
static char **s_contentPtrs = NULL;
static SafeBuf s_ubuf1;
static SafeBuf s_ubuf2;
static SafeBuf s_cbuf2;


bool loadUrls ( ) {
static bool s_loaded = false;
if ( s_loaded ) return true;
s_loaded = true;
// use injectme3 file
s_ubuf1.load("./injectme3");
// scan for +++URL: xxxxx
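Note: markOut() above blanks the digit run that follows a needle like "<responseTimeMS>", and qa_hash32() skips back-to-back spaces, so replies that differ only in variable-width numbers checksum identically. A standalone sketch of the blanking step (blankDigitsAfter is an invented name for the demo):

#include <cctype>
#include <cstdio>
#include <cstring>

// Blank the digits that follow "needle" in "reply": variable-width
// numbers become runs of spaces, which a space-collapsing checksum
// then treats as equal.
static void blankDigitsAfter(char *reply, const char *needle) {
	char *s = strstr(reply, needle);
	if ( !s ) return;
	while ( *s && !isdigit((unsigned char)*s) ) s++; // find the digits
	while ( *s &&  isdigit((unsigned char)*s) ) *s++ = ' ';
}

int main() {
	char a[] = "<hits>9</hits>";
	char b[] = "<hits>10</hits>";
	blankDigitsAfter(a, "<hits>");
	blankDigitsAfter(b, "<hits>");
	// a space-collapsing hash like qa_hash32() now sees a and b as equal
	printf("[%s] [%s]\n", a, b);
	return 0;
}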
@ -62,6 +134,8 @@ bool loadUrls ( ) {
for ( ; *s ; s++ ) {
if ( strncmp(s,"+++URL: ",8) ) continue;
// got one
// \0 term it for s_contentPtrs below
*s = '\0';
// find end of it
s += 8;
char *e = s;
@ -72,27 +146,16 @@ bool loadUrls ( ) {
s_ubuf2.pushLong((long)s);
// skip past that
s = e;
// point to content
s_cbuf2.pushLong((long)(s+1));
}
// make array of url ptrs
s_urlPtrs = (char **)s_ubuf2.getBufStart();
s_contentPtrs= (char **)s_cbuf2.getBufStart();
return true;
}

bool injectUrls ( ) {
loadUrls();
static long s_ii = 0;
for ( ; s_ii < s_numUrls ; ) {
// pre-inc it
s_ii++;
// inject using html api
SafeBuf sb;
sb.safePrintf("/admin/inject?c=qatest123&delete=0&u=");
sb.urlEncode ( s_urlPtrs[s_ii] );
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}

/*
static char *s_queries[] = {
"the",
"+the",
@ -106,116 +169,7 @@ static char *s_queries[] = {
"cat -dog",
"site:wisc.edu"
};

static long s_checksums[] = {
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0
};

static long s_qi1 = 0;

void doneSearching1 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi1 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}

// ensure search results are consistent
bool searchTest1 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi1 < nq ; ) {
// pre-inc it
s_qi1++;
// inject using html api
SafeBuf sb;
// qa=1 tell gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
sb.urlEncode ( s_queries[s_qi1] );
return getUrl ( sb.getBufStart() , doneSearching1 );
}
return true;
}

static long s_qi2 = 0;

void doneSearching2 ( void *state , TcpSocket *sock ) {
//loadQueries1();
long ii = s_qi2 - 1;
// get checksum of it
HttpMime hm;
hm.set ( sock->m_readBuf , sock->m_readOffset , NULL );
char *page = sock->m_readBuf + hm.getMimeLen() ;
// we will need to ignore fields like the latency etc.
// perhaps pass that in as a cgi parm. &qa=1
long crc = hash32n ( page );
if ( crc != s_checksums[ii] ) {
log("qatest: query '%s' checksum %lu != %lu",
s_queries[ii],
s_checksums[ii],
crc);
s_failures++;
}
// resume the qa loop
qatest();
}

// ensure search results are consistent
bool searchTest2 () {
long nq = sizeof(s_queries)/sizeof(char *);
for ( ; s_qi2 < nq ; ) {
// pre-inc it
s_qi2++;
// inject using html api
SafeBuf sb;
// qa=1 tell gb to exclude "variable" or "random" things
// from the serps so we can checksum it consistently
sb.safePrintf ( "/search?c=qatest123&qa=1&q=" );
sb.urlEncode ( s_queries[s_qi2] );
return getUrl ( sb.getBufStart() , doneSearching2 );
}
return true;
}

bool deleteUrls ( ) {
static long s_ii2 = 0;
for ( ; s_ii2 < s_numUrls ; ) {
// pre-inc it
s_ii2++;
// reject using html api
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&delete=1&u=");
sb.urlEncode ( s_urlPtrs[s_ii2] );
return getUrl ( sb.getBufStart() , qatestWrapper );
}
return true;
}
*/

#include "Msg0.h"
static Msg0 s_msg0;
@ -371,67 +325,238 @@ bool checkSpidersDone ( ) {
return false;
}

bool delColl ( ) {
static bool s_flag = false;
if ( s_flag ) return true;
s_flag = true;
return getUrl ( "/admin/delcoll?c=qatest123" , qatestWrapper );
//static long s_phase = -1;

void checkCRC ( long needCRC ) {

// and our current reply
SafeBuf fb2;
fb2.safeMemcpy(s_sock->m_readBuf,s_sock->m_readOffset);
fb2.nullTerm();

if ( s_replyCRC == needCRC ) {
// save reply if good
char fn3[1024];
sprintf(fn3,"%sqa/reply.%li",g_hostdb.m_dir,needCRC);
File ff; ff.set ( fn3 );
if ( ff.doesExist() ) return;
// if not there yet then save it
fb2.save(fn3);
return;
}

const char *emsg = "qa: bad replyCRC of %li should be %li "
"\n";//"phase=%li\n";
fprintf(stderr,emsg,s_replyCRC,needCRC);//,s_phase-1);
// get response on file
SafeBuf fb1;
char fn1[1024];
sprintf(fn1,"%sqa/reply.%li",g_hostdb.m_dir,needCRC);
fb1.load(fn1);
fb1.nullTerm();
// break up into lines
char fn2[1024];
sprintf(fn2,"/tmp/reply.%li",s_replyCRC);
fb2.save ( fn2 );

// do the diff between the two replies so we can see what changed
char cmd[1024];
sprintf(cmd,"diff %s %s",fn1,fn2);
fprintf(stderr,"%s\n",cmd);
system(cmd);
// if this is zero allow it to slide by. it is learning mode i guess.
// so we can learn what crc we need to use.
if ( needCRC == 0 ) return;
// otherwise, stop right there for debugging
exit(1);
}
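Note: checkCRC() above implements a golden-file scheme: the first accepted reply for a checksum is saved under qa/reply.<crc>, later mismatches are diffed against it, and a checksum argument of 0 acts as learning mode. A hedged standalone sketch of just the decision logic (names invented for the demo):

#include <cstdio>

// 0 means "learning mode": record the observed checksum instead of
// failing; otherwise a mismatch would trigger the diff-and-exit path.
static bool checkAgainstGolden(long got, long want) {
	if ( want == 0 ) {
		printf("learn: observed crc %ld\n", got);
		return true;
	}
	if ( got == want ) return true; // matches the golden reply
	fprintf(stderr, "qa: bad crc %ld, want %ld\n", got, want);
	return false;                   // caller would diff and exit(1)
}

int main() {
	checkAgainstGolden(1234, 0);    // learning: prints the observed crc
	checkAgainstGolden(1234, 1234); // pass
	checkAgainstGolden(1234, 5678); // fail path
	return 0;
}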
#undef usleep
|
||||
|
||||
static long s_rdbId1 = 0;
|
||||
static long s_rdbId2 = 0;
|
||||
//static long s_rdbId3 = 0;
|
||||
//
|
||||
// the injection qa test suite
|
||||
//
|
||||
bool qainject () {
|
||||
|
||||
// . run a series of tests to ensure that gb is functioning properly
|
||||
// . use s_urls[] array of urls for injecting and spider seeding
|
||||
// . contain an archive copy of all webpages in the injectme3 file and
|
||||
// in pagearchive1.txt file
|
||||
// . while initially spidering store pages in pagearchive1.txt so we can
|
||||
// replay later. store up to 100,000 pages in there.
|
||||
bool qatest ( ) {
|
||||
static bool s_x1 = false;
|
||||
if ( ! s_x1 ) {
|
||||
s_x1 = true;
|
||||
return getUrl ( "/admin/delcoll?delcoll=qatest123" ,
|
||||
qatestWrapper );
|
||||
}
|
||||
|
||||
//
|
||||
// add the 'qatest123' collection
|
||||
if ( ! addColl () ) return false;
|
||||
//
|
||||
static bool s_x2 = false;
|
||||
if ( ! s_x2 ) {
|
||||
s_x2 = true;
|
||||
if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
|
||||
qatestWrapper ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
//
|
||||
// check addcoll reply
|
||||
//
|
||||
static bool s_x3 = false;
|
||||
if ( ! s_x3 ) {
|
||||
s_x3 = true;
|
||||
checkCRC ( 238170006 );
|
||||
}
|
||||
|
||||
//
|
||||
// inject urls, return false if not done yet
|
||||
if ( ! injectUrls ( ) ) return false;
|
||||
//
|
||||
static bool s_x4 = false;
|
||||
if ( ! s_x4 ) {
|
||||
// TODO: try delimeter based injection too
|
||||
loadUrls();
|
||||
static long s_ii = 0;
|
||||
for ( ; s_ii < s_ubuf2.length()/(long)sizeof(char *) ; ) {
|
||||
// inject using html api
|
||||
SafeBuf sb;
|
||||
sb.safePrintf("&c=qatest123&deleteurl=0&"
|
||||
"format=xml&u=");
|
||||
sb.urlEncode ( s_urlPtrs[s_ii] );
|
||||
// the content
|
||||
sb.safePrintf("&hasmime=1");
|
||||
sb.safePrintf("&content=");
|
||||
sb.urlEncode(s_contentPtrs[s_ii] );
|
||||
sb.nullTerm();
|
||||
// pre-inc it in case getUrl() blocks
|
||||
s_ii++;
|
||||
getUrl("/admin/inject",qatestWrapper,sb.getBufStart());
|
||||
return false;
|
||||
}
|
||||
s_x4 = true;
|
||||
}
|
||||
|
||||
// +the
|
||||
static bool s_x5 = false;
|
||||
if ( ! s_x5 ) {
|
||||
usleep(500000);
|
||||
s_x5 = true;
|
||||
getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
|
||||
qatestWrapper );
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool s_x6 = false;
|
||||
if ( ! s_x6 ) { s_x6 = true ; checkCRC ( -1452050577 ); }


	// query for sports news
	static bool s_x7 = false;
	if ( ! s_x7 ) {
		s_x7 = true;
		getUrl ( "/search?c=qatest123&qa=1&format=xml&q=sports+news",
			 qatestWrapper );
		return false;
	}

	static bool s_x8 = false;
	if ( ! s_x8 ) { s_x8 = true; checkCRC ( -1586622518 ); }

	//
	// eject/delete the urls
	//
	static long s_ii2 = 0;
	for ( ; s_ii2 < s_ubuf2.length()/(long)sizeof(char *) ; ) {
		// delete using the html api (deleteurl=1)
		SafeBuf sb;
		sb.safePrintf( "/admin/inject?c=qatest123&deleteurl=1&"
			       "format=xml&u=");
		sb.urlEncode ( s_urlPtrs[s_ii2] );
		sb.nullTerm();
		// pre-increment s_ii2 in case getUrl() blocks
		s_ii2++;
		getUrl ( sb.getBufStart() , qatestWrapper );
		return false;
	}

	//
	// make sure no results are left for +the
	//
	static bool s_x9 = false;
	if ( ! s_x9 ) {
		usleep(500000);
		s_x9 = true;
		getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
			 qatestWrapper );
		return false;
	}

	// seems to have <docsInCollection>2</>
	static bool s_y1 = false;
	if ( ! s_y1 ) { s_y1 = true; checkCRC ( -1672870556 ); }

	//
	// try delimiter-based injecting
	//
	static bool s_y2 = false;
	if ( ! s_y2 ) {
		s_y2 = true;
		SafeBuf sb;
		// delim=+++URL:
		sb.safePrintf("&c=qatest123&deleteurl=0&"
			      "delim=%%2B%%2B%%2BURL%%3A&format=xml&u=xyz.com&"
			      "hasmime=1&content=");
		// use the injectme3 file as the multi-doc payload
		SafeBuf ubuf;
		ubuf.load("./injectme3");
		sb.urlEncode(ubuf.getBufStart());
		getUrl ( "/admin/inject",qatestWrapper,sb.getBufStart());
		return false;
	}
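
	// delim=%2B%2B%2BURL%3A url-decodes to "+++URL:". The server-side
	// injector presumably splits the uploaded content on that marker
	// and injects each piece as its own document, with the text after
	// the marker naming the url. So injectme3 would look roughly like
	// this (illustrative):
	//
	//	+++URL:http://xyz.com/page1
	//	HTTP/1.0 200 OK
	//	...
	//	<html>first doc</html>
	//	+++URL:http://xyz.com/page2
	//	...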

	// check the reply; it seems to have only a single docid in it...
	static bool s_y3 = false;
	if ( ! s_y3 ) { s_y3 = true; checkCRC ( -1970198487 ); }

	// now query to check what got injected
	static bool s_y4 = false;
	if ( ! s_y4 ) {
		usleep(500000);
		s_y4 = true;
		getUrl ( "/search?c=qatest123&qa=1&format=xml&q=%2Bthe",
			 qatestWrapper );
		return false;
	}

	// check the search results crc
	static bool s_y5 = false;
	if ( ! s_y5 ) { s_y5 = true; checkCRC ( -480078278 ); }


	// test search results
	if ( ! searchTest1 () ) return false;

	// delete all urls cleanly now
	if ( ! deleteUrls ( ) ) return false;

	// now get the rdblist for every rdb for this coll and make sure
	// they are all empty!
	if ( ! checkRdbLists ( &s_rdbId1 ) ) return false;

	// dump, tight merge and ensure no data in our rdbs for this coll
	if ( ! dumpTreesToDisk() ) return false;

	// wait for the tight merge to complete
	if ( ! waitForMergeToFinish() ) return false;

	// now get the rdblist for every rdb for this coll again and make
	// sure they are still all empty!
	if ( ! checkRdbLists ( &s_rdbId2 ) ) return false;

	// reset the collection so we can test spidering
	if ( ! resetColl ( ) ) return false;

	// add urls to seed the spider with. make msg13.cpp recognize the
	// qatest123 collection and return 404 on urls not in our official
	// list so we can ensure search result consistency. msg13.cpp will
	// initially store the pages in a file, like the first 1,000 or so
	// pages.
	if ( ! addUrlTest () ) return false;
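
	// sketch of the intended msg13.cpp hook described above (all
	// names here are illustrative, not the real Msg13.cpp code):
	//
	//	if ( isQATestColl ( coll ) ) {
	//		// replay from the archive for consistent results
	//		if ( ! inOfficialUrlList ( url ) )
	//			return reply404 ( );
	//		if ( cachedInQaFile ( url ) )
	//			return replyFromQaFile ( url );
	//		// first ~1,000 pages get archived for later replays
	//		storeInQaFile ( url , page );
	//	}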

	// wait for spidering to complete via a sleep callback. the # of
	// spidered urls will be x, so we know when to stop
	if ( ! checkSpidersDone() ) return false;

	// . now search again, this time on the larger spidered collection
	// . store search queries and checksums into queries2.txt
	// . a 0 (or no) checksum means we should fill it in
	if ( ! searchTest2 () ) return false;
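
	// assumed layout of queries2.txt, one query per line with the
	// checksum of its reply; a 0 asks the test to record the value
	// from the current run (illustrative):
	//
	//	sports news	-1586622518
	//	%2Bthe		0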

	// try a query delete
	//if ( ! queryDeleteTest() ) return false;

@ -440,7 +565,30 @@ bool qatest ( ) {
	//if ( ! checkRdbLists ( &s_rdbId3 ) ) return false;

	// delete the collection
	if ( ! delColl() ) return false;

	static bool s_fee = false;
	if ( ! s_fee ) {
		s_fee = true;
		return getUrl ( "/admin/delcoll?delcoll=qatest123" ,
				qatestWrapper );
	}

	static bool s_fee2 = false;
	if ( ! s_fee2 ) {
		s_fee2 = true;
		fprintf(stderr,"\n\n\nSUCCESSFULLY COMPLETED QA TEST\n\n\n");
		exit(0);
	}

	return true;
}
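
// A self-contained sketch of how a reply checksum like the constants
// above could be computed: hash the reply bytes while skipping digits,
// so volatile numeric fields (timestamps, docids, query times) do not
// perturb the value. This is an assumed illustration, not the actual
// checkCRC() implementation.
static long computeReplyCRC ( char *reply , long replyLen ) {
	long crc = 0;
	for ( long i = 0 ; i < replyLen ; i++ ) {
		// skip digits; they tend to hold run-specific values
		if ( reply[i] >= '0' && reply[i] <= '9' ) continue;
		// simple 31x rolling hash over the remaining bytes
		crc = crc * 31 + (unsigned char)reply[i];
	}
	return crc;
}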

// . run a series of tests to ensure that gb is functioning properly
// . uses the ./qa subdirectory to hold archived pages, ips and spider
//   dates to ensure consistency between tests for exact replays
bool qatest ( ) {

	return qainject();

}