checkpoint #2

This commit is contained in:
Matt 2015-05-03 17:51:47 -07:00
parent 0df4abc759
commit b39a065259
5 changed files with 185 additions and 158 deletions

View File

@ -13,10 +13,6 @@
// from XmlDoc.cpp
bool isRobotsTxtFile ( char *url , int32_t urlLen ) ;
//
// HTML INJECTION PAGE CODE
//
static bool sendHttpReply ( void *state );
static void sendHttpReplyWrapper ( void *state ) ;
@ -26,7 +22,7 @@ static void sendHttpReplyWrapper ( void *state ) ;
// but if we call serialize() then it makes new ones into its own blob.
// so we gotta know our first and last ptr_* pointers for serialize/deserialize().
// kinda like how search input works
bool setInjectionRequestFromParms ( TcpSocket *sock ,
void setInjectionRequestFromParms ( TcpSocket *sock ,
HttpRequest *hr ,
CollectionRec *cr ,
InjectionRequest *ir ) {
@ -40,61 +36,61 @@ bool setInjectionRequestFromParms ( TcpSocket *sock ,
// scan the parms
for ( int i = 0 ; i < numParms ; i++ ) {
Parm *m = &m_parms[i];
if ( m->m_objType != OBJ_INJECTION ) continue;
if ( m->m_obj != OBJ_IR ) continue;
// get it
if ( m->m_type == TYPE_STRING ) {
char *str = hr->getString(m->m_parmName,m->m_parmDef);
if ( m->m_type == TYPE_CHARPTR ||
m->m_type == TYPE_FILEUPLOADBUTTON ) {
char *str = hr->getString(m->m_cgi,m->m_def);
// serialize it as a string
char **ptrPtr = m->m_off + (char *)ir;
char **ptrPtr = &((char *)ir + m->m_off);
// store the ptr pointing into hr buf for now
*ptrPtr = str;
// how many strings are we past ptr_url?
int32_t count = ptrPtr - &ir->ptr_url;
// and length. include \0
int32_t *sizePtr = &ir->size_url + count;
*sizePtr = gbstrlen(str) + 1;
if ( str ) *sizePtr = gbstrlen(str) + 1;
else *sizePtr = 0;
continue;
}
// numbers are easy
if ( m->m_type == TYPE_INT ) {
int32_t *ii = (int32_t *)(m->m_off + (char *)ir);
*ii = hr->getLong(m->m_parmName,m->m_parmDef );
else if ( m->m_type == TYPE_LONG ) {
int32_t *ii = (int32_t *)((char *)ir + m->m_off);
int32_t def = atoll(m->m_def);
*ii = hr->getLong(m->m_cgi,def);
}
if ( m->m_type == TYPE_CHAR ) {
char *ii = (char *)(m->m_off + (char *)ir);
*ii = (char)hr->getLong(m->m_parmName,m->m_parmDef );
}
if ( m->m_type == TYPE_FLOAT ) {
float *ii = (float *)(m->m_off + (char *)ir);
*ii = hr->getFloat(m->m_parmName,m->m_parmDef );
else if ( m->m_type == TYPE_CHECKBOX ||
m->m_type == TYPE_BOOL ) {
char *ii = (char *)((char *)ir + m->m_off);
int32_t def = atoll(m->m_def);
*ii = (char)hr->getLong(m->m_cgi,def);
}
// if unsupported let developer know
else { char *xx=NULL;*xx=0; }
}
// if content is "" make it NULL so XmlDoc will download it
// if user really wants empty content they can put a space in there
// TODO: update help then...
if ( ir->m_content && ! ir->m_content[0] )
ir->m_content = NULL;
if ( ir->ptr_content && ! ir->ptr_content[0] )
ir->ptr_content = NULL;
if ( ir->m_contentFile && ! ir->m_contentFile[0] )
ir->m_contentFile = NULL;
if ( ir->ptr_contentFile && ! ir->ptr_contentFile[0] )
ir->ptr_contentFile = NULL;
if ( ir->m_contentDelim && ! ir->m_contentDelim[0] )
ir->m_contentDelim = NULL;
if ( ir->ptr_contentDelim && ! ir->ptr_contentDelim[0] )
ir->ptr_contentDelim = NULL;
if ( ir->ptr_queryToScrape && ! ir->ptr_queryToScrape[0] )
ir->ptr_queryToScrape = NULL;
if ( ir->m_url && ! ir->m_url[0] )
ir->m_url = NULL;
if ( ir->ptr_url && ! ir->ptr_url[0] )
ir->ptr_url = NULL;
// if we had a delimiter but not content, zero it out...
char *content = ir->m_content;
if ( ! content ) content = ir->m_contentFile;
if ( ! content ) ir->m_contentDelim = NULL;
return true;
if ( ! ir->ptr_content && ! ir->ptr_contentFile )
ir->ptr_contentDelim = NULL;
}
// void doneLocalInjectWrapper ( void *state ) {
@ -131,13 +127,31 @@ Host *getHostToHandleInjection ( char *url ) {
// . "sir" is the serialized injectionrequest
// . this is called from the http interface, as well as from
// XmlDoc::indexWarcOrArc() to inject individual recs/docs from the warc/arc
bool sendInjectionRequestToHost ( InjectionRequest *sir ,
nt32_t sirSize ,
void *state ,
void (* callback)(void *) ) {
// . returns false and sets g_errno on error, true on success
bool Msg7::sendInjectionRequestToHost ( InjectionRequest *ir ,
void *state ,
void (* callback)(void *) ) {
// ensure it is our own
if ( &m_injectionRequest != ir ) { char *xx=NULL;*xx=0; }
int32_t sirSize = 0;
char *sir = serializeMsg2 ( ir ,
sizeof(InjectionRequest),
&ir->ptr_url,
&ir->size_url ,
&sirSize );
// oom?
if ( ! sir )
return log("inject: failed to serialize request");
// save it for freeing later
m_sir = sir;
m_sirSize = sirSize;
// forward it to another shard?
Host *host = getHostToHandleInjection ( sir->ptr_url );
Host *host = getHostToHandleInjection ( ir->ptr_url );
// . ok, forward it to another host now
// . and call gotForwardedReplyWrapper when reply comes in
@ -245,24 +259,24 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
}
// if no url do not inject
if ( ! ir->m_url ) {
if ( ! ir->ptr_url ) {
log("inject: no url provied to inject");
g_errno = EBADURL;
return sendHttpReply ( msg7 );
}
InjectionRequest *ir = &m_ir;
InjectionRequest *ir = &msg7->m_injectionRequest;
m_state = state;
m_callback = callback;
// this will be NULL if the "content" was empty or not given
char *content = ir->m_content;
char *content = ir->ptr_content;
// . try the uploaded file if nothing in the text area
// . this will be NULL if the "content" was empty or not given
if ( ! content ) content = ir->m_contentFile;
if ( ! content ) content = ir->ptr_contentFile;
// forward it to another shard?
//Host *host = getHostToHandleInjection ( ir->ptr_url );
@ -281,20 +295,8 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
// return g_httpServer.sendErrorReply(sock,g_errno,msg,NULL);
// }
int32_t sirSize = 0;
InjectionRequest *sir = ir->serializeMsg ( &sirSize );
if ( ! sir ) {
// oom?
log("inject: error serializing injection request",
mstrerror(g_errno));
return sendHttpReply ( msg7 );
}
// when we receive the udp reply then send back the http reply
if ( ! sendInjectionRequestToHost ( sir ,
sirSize ,
msg7 ,
sendHttpReplyWrapper ) )
if ( ! msg7->sendInjectionRequestToHost (ir, msg7 , sendHttpReplyWrapper ) )
return false;
// error?
@ -328,7 +330,7 @@ bool sendHttpReply ( void *state ) {
char format = msg7->m_format;
// no url parm?
if ( ! g_errno && ! ir->m_url && format != FORMAT_HTML )
if ( ! g_errno && ! ir->ptr_url && format != FORMAT_HTML )
g_errno = EMISSINGINPUT;
if ( g_errno && g_errno != EDOCUNCHANGED ) {
@ -434,7 +436,7 @@ bool sendHttpReply ( void *state ) {
// end debug
//
char *url = ir->m_url;
char *url = ir->ptr_url;
// . if we're talking w/ a robot he doesn't care about this crap
// . send him back the error code (0 means success)
@ -466,11 +468,11 @@ bool sendHttpReply ( void *state ) {
sb.safePrintf ( "<center>Error injecting url: <b>%s[%i]</b>"
"</center>",
mstrerror(g_errno) , g_errno);
else if ( (ir->m_url&&ir->m_url[0]) ||
(ir->m_queryToScrape&&ir->m_queryToScrape[0]) )
else if ( (ir->ptr_url && ir->ptr_url[0]) ||
(ir->ptr_queryToScrape&&ir->ptr_queryToScrape[0]) )
sb.safePrintf ( "<center><b>Sucessfully injected %s"
"</center><br>"
, ir->m_url
, ir->ptr_url
//, xd->m_firstUrl.m_url
);
@ -580,6 +582,8 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
// Constructor: put the object into a known-empty state.
// The pointer members are NULLed *before* calling reset() because reset()
// tests m_sir before freeing it (and presumably m_xd as well -- its guard
// is not fully visible here, confirm), so they must not hold garbage.
Msg7::Msg7 () {
m_xd = NULL;
// serialized injection request saved by sendInjectionRequestToHost();
// nothing has been allocated yet
m_sir = NULL;
// not currently busy with an outstanding injection
m_inUse = false;
reset();
}
@ -593,6 +597,7 @@ Msg7::~Msg7 () {
void Msg7::reset() {
m_round = 0;
//if ( m_inUse ) { char *xx=NULL;*xx=0; }
//m_firstTime = true;
//m_fixMe = false;
//m_injectCount = 0;
@ -604,6 +609,10 @@ void Msg7::reset() {
delete (m_xd);
m_xd = NULL;
}
if ( m_sir ) {
mfree ( m_sir , m_sirSize , "m7ir" );
m_sir = NULL;
}
}
// when XmlDoc::inject() completes it calls this
@ -853,7 +862,7 @@ bool Msg7::scrapeQuery ( ) {
GigablastRequest *ir = &m_ir;
// error?
char *qts = ir->m_queryToScrape;
char *qts = ir->ptr_queryToScrape;
if ( ! qts ) { char *xx=NULL;*xx=0; }
if ( gbstrlen(qts) > 500 ) {
@ -907,8 +916,8 @@ bool Msg7::scrapeQuery ( ) {
// parent docid is 0
sreq.setKey(firstIp,0LL,false);
char *coll2 = ir->m_coll;
CollectionRec *cr = g_collectiondb.getRec ( coll2 );
//char *coll2 = ir->m_coll;
CollectionRec *cr = g_collectiondb.getRec ( ir->m_collnum );//coll2 );
// forceDEl = false, niceness = 0
m_xd.set4 ( &sreq , NULL , cr->m_coll , NULL , 0 );

View File

@ -62,6 +62,7 @@ class Msg7 {
public:
//GigablastRequest m_gr;
InjectionRequest m_injectionRequest;
//SafeBuf m_injectUrlBuf;
//bool m_firstTime;
@ -71,6 +72,9 @@ public:
//int32_t m_injectCount;
//bool m_isDoneInjecting;
char *m_sir;
int32_t m_sirSize;
bool m_needsSet;
XmlDoc *m_xd;
TcpSocket *m_socket;
@ -79,6 +83,9 @@ public:
char m_useAhrefs;
HashTableX m_linkDedupTable;
// referenced by InjectionRequest::ptr_content
SafeBuf m_contentBuf;
SafeBuf m_sbuf; // for holding entire titlerec for importing
void *m_state;
@ -94,7 +101,9 @@ public:
//void constructor();
Msg7 ();
~Msg7 ();
//bool m_inUse;
bool m_inUse;
class XmlDoc *m_stashxd;
void reset();

104
Parms.cpp
View File

@ -32,7 +32,7 @@
#include "Test.h"
#include "Rebalance.h"
#include "SpiderProxy.h" // buildProxyTable()
#include "PageInject.h"
#include "PageInject.h" // InjectionRequest
// width of input box in characters for url filter expression
#define REGEX_TXT_MAX 80
@ -4886,6 +4886,8 @@ void Parms::init ( ) {
GigablastRequest gr;
InjectionRequest ir;
/*
m->m_title = "delete collection";
m->m_desc = "A collection name to delete. You can specify multiple "
@ -14896,45 +14898,45 @@ void Parms::init ( ) {
//m->m_cgi2 = "u";
//m->m_cgi3 = "seed"; // pagerawlbot
//m->m_cgi4 = "injecturl";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API | PF_REQUIRED;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_url - (char *)&gr;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
// alias #1
m->m_title = "url";
m->m_cgi = "u";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_url - (char *)&gr;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
// alias #2
m->m_title = "url";
m->m_cgi = "seed";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_HIDDEN | PF_DIFFBOT;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_url - (char *)&gr;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
// alias #3
m->m_title = "url";
m->m_cgi = "injecturl";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_url - (char *)&gr;
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
m++;
@ -14943,24 +14945,24 @@ void Parms::init ( ) {
"and inject their links. You are not required to supply "
"the <i>url</i> parm if you supply this parm.";
m->m_cgi = "qts";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_queryToScrape - (char *)&gr;
m->m_off = (char *)&ir.ptr_queryToScrape - (char *)&ir;
m++;
m->m_title = "inject links";
m->m_desc = "Should we inject the links found in the injected "
"content as well?";
m->m_cgi = "injectlinks";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_injectLinks - (char *)&gr;
m->m_off = (char *)&ir.m_injectLinks - (char *)&ir;
m++;
@ -14968,47 +14970,47 @@ void Parms::init ( ) {
m->m_desc = "Add the outlinks of the injected content into spiderdb "
"for spidering?";
m->m_cgi = "spiderlinks";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
// leave off because could start spidering whole web unintentionally
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_spiderLinks - (char *)&gr;
m->m_off = (char *)&ir.m_spiderLinks - (char *)&ir;
m++;
m->m_title = "int16_t reply";
m->m_desc = "Should the injection response be int16_t and simple?";
m->m_title = "short reply";
m->m_desc = "Should the injection response be short and simple?";
m->m_cgi = "quick";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_shortReply - (char *)&gr;
m->m_off = (char *)&ir.m_shortReply - (char *)&ir;
m++;
m->m_title = "only inject content if new";
m->m_desc = "If the specified url is already in the index then "
"skip the injection.";
m->m_cgi = "newonly";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_newOnly - (char *)&gr;
m->m_off = (char *)&ir.m_newOnly - (char *)&ir;
m++;
m->m_title = "delete from index";
m->m_desc = "Delete the specified url from the index.";
m->m_cgi = "deleteurl";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_deleteUrl - (char *)&gr;
m->m_off = (char *)&ir.m_deleteUrl - (char *)&ir;
m++;
m->m_title = "recycle content";
@ -15016,68 +15018,68 @@ void Parms::init ( ) {
"re-download the content, just use the content that was "
"stored in the cache from last time.";
m->m_cgi = "recycle";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_recycle - (char *)&gr;
m->m_off = (char *)&ir.m_recycle - (char *)&ir;
m++;
m->m_title = "dedup url";
m->m_desc = "Do not index the url if there is already another "
"url in the index with the same content.";
m->m_cgi = "dedup";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_dedup - (char *)&gr;
m->m_off = (char *)&ir.m_dedup - (char *)&ir;
m++;
m->m_title = "do consistency checking";
m->m_desc = "Turn this on for debugging.";
m->m_cgi = "consist";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_doConsistencyTesting - (char *)&gr;
m->m_off = (char *)&ir.m_doConsistencyTesting - (char *)&ir;
m++;
m->m_title = "hop count";
m->m_desc = "Use this hop count when injecting the page.";
m->m_cgi = "hopcount";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_hopCount - (char *)&gr;
m->m_off = (char *)&ir.m_hopCount - (char *)&ir;
m++;
m->m_title = "last spider time";
m->m_desc = "Override last time spidered";
m->m_cgi = "lastspidered";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_lastSpidered - (char *)&gr;
m->m_off = (char *)&ir.m_lastSpidered - (char *)&ir;
m++;
m->m_title = "first indexed";
m->m_desc = "Override first indexed time";
m->m_cgi = "firstindexed";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_firstIndexed - (char *)&gr;
m->m_off = (char *)&ir.m_firstIndexed - (char *)&ir;
m++;
@ -15085,12 +15087,12 @@ void Parms::init ( ) {
m->m_desc = "If the content of the url is provided below, does "
"it begin with an HTTP mime header?";
m->m_cgi = "hasmime";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHECKBOX;
m->m_def = "0";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_hasMime - (char *)&gr;
m->m_off = (char *)&ir.m_hasMime - (char *)&ir;
m++;
m->m_title = "content delimeter";
@ -15104,12 +15106,12 @@ void Parms::init ( ) {
"injected url. Otherwise it will append numbers to the "
"url you provide above.";
m->m_cgi = "delim";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_contentDelim - (char *)&gr;
m->m_off = (char *)&ir.ptr_contentDelim - (char *)&ir;
m++;
@ -15120,12 +15122,12 @@ void Parms::init ( ) {
"Possible values: <b>text/html text/plain text/xml "
"application/json</b>";
m->m_cgi = "contenttype";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR; //text/html application/json application/xml
m->m_def = "text/html";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_contentTypeStr - (char *)&gr;
m->m_off = (char *)&ir.ptr_contentTypeStr - (char *)&ir;
m++;
m->m_title = "content charset";
@ -15135,24 +15137,24 @@ void Parms::init ( ) {
"which is 106. "
"See iana_charset.h for the numeric values.";
m->m_cgi = "charset";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_LONG;
m->m_def = "106";
m->m_flags = PF_API;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_charset - (char *)&gr;
m->m_off = (char *)&ir.m_charset - (char *)&ir;
m++;
m->m_title = "upload content file";
m->m_desc = "Instead of specifying the content to be injected in "
"the text box below, upload this file for it.";
m->m_cgi = "file";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_FILEUPLOADBUTTON;
m->m_def = NULL;
m->m_flags = PF_NOAPI;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_contentFile - (char *)&gr;
m->m_off = (char *)&ir.ptr_contentFile - (char *)&ir;
m++;
m->m_title = "content";
@ -15166,35 +15168,35 @@ void Parms::init ( ) {
"inject empty content, otherwise the content will "
"be downloaded from the url.";
m->m_cgi = "content";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API|PF_TEXTAREA;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_content - (char *)&gr;
m->m_off = (char *)&ir.ptr_content - (char *)&ir;
m++;
m->m_title = "get sectiondb voting info";
m->m_desc = "Return section information of injected content for "
"the injected subdomain. ";
m->m_cgi = "sections";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_BOOL;
m->m_def = "0";
m->m_flags = PF_API|PF_NOHTML;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_getSections - (char *)&gr;
m->m_off = (char *)&ir.m_getSections - (char *)&ir;
m++;
m->m_title = "diffbot reply";
m->m_desc = "Used exclusively by diffbot. Do not use.";
m->m_cgi = "diffbotreply";
m->m_obj = OBJ_GBREQUEST;
m->m_obj = OBJ_IR;
m->m_type = TYPE_CHARPTR;
m->m_def = NULL;
m->m_flags = PF_API|PF_TEXTAREA|PF_NOHTML; // do not show in our api
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_diffbotReply - (char *)&gr;
m->m_off = (char *)&ir.ptr_diffbotReply - (char *)&ir;
m++;

View File

@ -39,6 +39,7 @@ enum {
OBJ_COLL ,
OBJ_SI , // SearchInput class
OBJ_GBREQUEST , // for GigablastRequest class of parms
OBJ_IR , // InjectionRequest class from PageInject.h
OBJ_NONE
};

View File

@ -3238,8 +3238,10 @@ bool XmlDoc::indexContainerDoc ( ) {
void doneInjectingArchiveRec ( void *state ) {
XmlDoc *THIS = (XmlDoc *)state;
THIS->m_numInjectionsOut--;
Msg7 *THIS = (Msg7 *)state;
XmlDoc *xd = THIS->m_stashxd;
THIS->m_inUse = false;
xd->m_numInjectionsOut--;
log("build: archive: injection thread returned. %"INT32" out now.",
THIS->m_numInjectionsOut);
THIS->m_masterLoop ( THIS );
@ -3681,48 +3683,47 @@ bool XmlDoc::indexWarcOrArc ( char ctype ) {
// inject input parms:
GigablastRequest *gr = &msg7->m_gr;
// init the input parms
//memset ( gr , 0 , sizeof(GigablastRequest) );
InjectionRequest *ir = &msg7->m_injectionRequest;
// reset it
gr->m_hopCount = *hc + 1;
ir->m_hopCount = *hc + 1;
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
gr->m_collnum = m_collnum;
ir->m_collnum = m_collnum;
// will this work on a content-delimited doc?
gr->m_deleteUrl = m_deleteFromIndex;
ir->m_deleteUrl = m_deleteFromIndex;
// each subdoc will have a mime since it is a warc
gr->m_hasMime = true;
ir->m_hasMime = true;
// it has a mime so we shouldn't need to set this
gr->m_contentTypeStr = NULL;
ir->ptr_contentTypeStr = NULL;
// we are injecting a single page, not a container file
gr->m_contentDelim = NULL;
ir->ptr_contentDelim = NULL;
// miscellaneous. faster than memsetting the whole gr class (32k)
gr->m_getSections = 0;
gr->m_gotSections = 0;
gr->m_queryToScrape = NULL;
gr->m_contentFile = NULL;
gr->m_diffbotReply = NULL;
gr->m_spiderLinks = false;
gr->m_injectLinks = false;
gr->m_shortReply = false;
gr->m_newOnly = false;
gr->m_recycle = false;
gr->m_dedup = true;
gr->m_doConsistencyTesting = false;
gr->m_charset = 0;
ir->m_getSections = 0;
ir->m_gotSections = 0;
ir->m_spiderLinks = false;
ir->m_injectLinks = false;
ir->m_shortReply = false;
ir->m_newOnly = false;
ir->m_recycle = false;
ir->m_dedup = true;
ir->m_doConsistencyTesting = false;
ir->m_charset = 0;
ir->ptr_queryToScrape = NULL;
ir->ptr_contentFile = NULL;
ir->ptr_diffbotReply = NULL;
//
// set 'timestamp' for injection
//
gr->m_firstIndexed = recTime;
gr->m_lastSpidered = recTime;
ir->m_firstIndexed = recTime;
ir->m_lastSpidered = recTime;
//
// set 'ip' for injection
//
gr->m_injectDocIp = 0;
ir->m_injectDocIp = 0;
// get the record IP address from the warc header if it is there
if ( recIp ) {
// get end of ip
@ -3731,21 +3732,21 @@ bool XmlDoc::indexWarcOrArc ( char ctype ) {
while ( *ipEnd && ! is_wspace_a(*ipEnd) ) ipEnd++;
// we now have the ip address for doing ip: searches
// this func is in ip.h
gr->m_injectDocIp = atoip ( recIp, ipEnd-recIp );
ir->m_injectDocIp = atoip ( recIp, ipEnd-recIp );
}
// we end up repopulating m_fileBuf to read the next warc sometimes
// so do not destroy the content we are injecting from the original
// m_fileBuf. so we have to copy it.
gr->m_contentBuf.reset();
gr->m_contentBuf.reserve ( httpReplySize + 1 );
gr->m_contentBuf.safeMemcpy ( httpReply , httpReplySize );
gr->m_contentBuf.nullTerm();
msg7->m_contentBuf.reset();
msg7->m_contentBuf.reserve ( httpReplySize + 1 );
msg7->m_contentBuf.safeMemcpy ( httpReply , httpReplySize );
msg7->m_contentBuf.nullTerm();
//
// set 'content' for injection
//
gr->m_content = gr->m_contentBuf.getBufStart();
ir->ptr_content = msg7->m_contentBuf.getBufStart();
// null term it and hope it doesn't hurt anything!!!!!
//httpReply [ httpReplySize ] = '\0';
@ -3754,25 +3755,30 @@ bool XmlDoc::indexWarcOrArc ( char ctype ) {
// set the rest of the injection parms
gr->m_hopCount = -1;
gr->m_diffbotReply = 0;
gr->m_newOnly = 0;
ir->m_hopCount = -1;
ir->m_newOnly = 0;
// all warc records have the http mime
gr->m_hasMime = true;
gr->m_url = recUrl;
ir->m_hasMime = true;
ir->ptr_url = recUrl;
// load balance over the shards
gr->m_forwardRequest = 1;
// stash this
m_msg7->m_stashxd = this;
QUICKPOLL ( m_niceness );
// log it
log("build: archive: injecting archive url %s",recUrl);
QUICKPOLL ( m_niceness );
if ( ! msg7->inject2 ( this , doneInjectingArchiveRec ) )
if (msg7->sendInjectionRequestToHost(ir,msg7,doneInjectingArchiveRec)){
m_numInjectionsOut++;
else
log("build: index archive: msg7: %s",mstrerror(g_errno));
msg7->m_inUse = true;
goto loop;
}
log("build: index archive: msg7 inject: %s",
mstrerror(g_errno));
goto loop;
}