mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
checkpoint #2
This commit is contained in:
parent
0df4abc759
commit
b39a065259
135
PageInject.cpp
135
PageInject.cpp
@ -13,10 +13,6 @@
|
||||
// from XmlDoc.cpp
|
||||
bool isRobotsTxtFile ( char *url , int32_t urlLen ) ;
|
||||
|
||||
//
|
||||
// HTML INJECITON PAGE CODE
|
||||
//
|
||||
|
||||
static bool sendHttpReply ( void *state );
|
||||
static void sendHttpReplyWrapper ( void *state ) ;
|
||||
|
||||
@ -26,7 +22,7 @@ static void sendHttpReplyWrapper ( void *state ) ;
|
||||
// but if we call serialize() then it makes news ones into its own blob.
|
||||
// so we gotta know our first and last ptr_* pointers for serialize/deseria().
|
||||
// kinda like how search input works
|
||||
bool setInjectionRequestFromParms ( TcpSocket *sock ,
|
||||
void setInjectionRequestFromParms ( TcpSocket *sock ,
|
||||
HttpRequest *hr ,
|
||||
CollectionRec *cr ,
|
||||
InjectionRequest *ir ) {
|
||||
@ -40,61 +36,61 @@ bool setInjectionRequestFromParms ( TcpSocket *sock ,
|
||||
// scan the parms
|
||||
for ( int i = 0 ; i < numParms ; i++ ) {
|
||||
Parm *m = &m_parms[i];
|
||||
if ( m->m_objType != OBJ_INJECTION ) continue;
|
||||
if ( m->m_obj != OBJ_IR ) continue;
|
||||
// get it
|
||||
if ( m->m_type == TYPE_STRING ) {
|
||||
char *str = hr->getString(m->m_parmName,m->m_parmDef);
|
||||
if ( m->m_type == TYPE_CHARPTR ||
|
||||
m->m_type == TYPE_FILEUPLOADBUTTON ) {
|
||||
char *str = hr->getString(m->m_cgi,m->m_def);
|
||||
// serialize it as a string
|
||||
char **ptrPtr = m->m_off + (char *)ir;
|
||||
char **ptrPtr = &((char *)ir + m->m_off);
|
||||
// store the ptr pointing into hr buf for now
|
||||
*ptrPtr = str;
|
||||
// how many strings are we past ptr_url?
|
||||
int32_t count = ptrPtr - &ir->ptr_url;
|
||||
// and length. include \0
|
||||
int32_t *sizePtr = &ir->size_url + count;
|
||||
*sizePtr = gbstrlen(str) + 1;
|
||||
if ( str ) *sizePtr = gbstrlen(str) + 1;
|
||||
else *sizePtr = 0;
|
||||
continue;
|
||||
}
|
||||
// numbers are easy
|
||||
if ( m->m_type == TYPE_INT ) {
|
||||
int32_t *ii = (int32_t *)(m->m_off + (char *)ir);
|
||||
*ii = hr->getLong(m->m_parmName,m->m_parmDef );
|
||||
else if ( m->m_type == TYPE_LONG ) {
|
||||
int32_t *ii = (int32_t *)((char *)ir + m->m_off);
|
||||
int32_t def = atoll(m->m_def);
|
||||
*ii = hr->getLong(m->m_cgi,def);
|
||||
}
|
||||
if ( m->m_type == TYPE_CHAR ) {
|
||||
char *ii = (char *)(m->m_off + (char *)ir);
|
||||
*ii = (char)hr->getLong(m->m_parmName,m->m_parmDef );
|
||||
}
|
||||
if ( m->m_type == TYPE_FLOAT ) {
|
||||
float *ii = (float *)(m->m_off + (char *)ir);
|
||||
*ii = hr->getFloat(m->m_parmName,m->m_parmDef );
|
||||
else if ( m->m_type == TYPE_CHECKBOX ||
|
||||
m->m_type == TYPE_BOOL ) {
|
||||
char *ii = (char *)((char *)ir + m->m_off);
|
||||
int32_t def = atoll(m->m_def);
|
||||
*ii = (char)hr->getLong(m->m_cgi,def);
|
||||
}
|
||||
// if unsupported let developer know
|
||||
else { char *xx=NULL;*xx=0; }
|
||||
}
|
||||
|
||||
|
||||
// if content is "" make it NULL so XmlDoc will download it
|
||||
// if user really wants empty content they can put a space in there
|
||||
// TODO: update help then...
|
||||
if ( ir->m_content && ! ir->m_content[0] )
|
||||
ir->m_content = NULL;
|
||||
if ( ir->ptr_content && ! ir->ptr_content[0] )
|
||||
ir->ptr_content = NULL;
|
||||
|
||||
if ( ir->m_contentFile && ! ir->m_contentFile[0] )
|
||||
ir->m_contentFile = NULL;
|
||||
if ( ir->ptr_contentFile && ! ir->ptr_contentFile[0] )
|
||||
ir->ptr_contentFile = NULL;
|
||||
|
||||
if ( ir->m_contentDelim && ! ir->m_contentDelim[0] )
|
||||
ir->m_contentDelim = NULL;
|
||||
if ( ir->ptr_contentDelim && ! ir->ptr_contentDelim[0] )
|
||||
ir->ptr_contentDelim = NULL;
|
||||
|
||||
if ( ir->ptr_queryToScrape && ! ir->ptr_queryToScrape[0] )
|
||||
ir->ptr_queryToScrape = NULL;
|
||||
|
||||
if ( ir->m_url && ! ir->m_url[0] )
|
||||
ir->m_url = NULL;
|
||||
if ( ir->ptr_url && ! ir->ptr_url[0] )
|
||||
ir->ptr_url = NULL;
|
||||
|
||||
// if we had a delimeter but not content, zero it out...
|
||||
char *content = ir->m_content;
|
||||
if ( ! content ) content = ir->m_contentFile;
|
||||
if ( ! content ) ir->m_contentDelim = NULL;
|
||||
|
||||
return true;
|
||||
if ( ! ir->ptr_content && ! ir->ptr_contentFile )
|
||||
ir->ptr_contentDelim = NULL;
|
||||
}
|
||||
|
||||
// void doneLocalInjectWrapper ( void *state ) {
|
||||
@ -131,13 +127,31 @@ Host *getHostToHandleInjection ( char *url ) {
|
||||
// . "sir" is the serialized injectionrequest
|
||||
// . this is called from the http interface, as well as from
|
||||
// XmlDoc::indexWarcOrArc() to inject individual recs/docs from the warc/arc
|
||||
bool sendInjectionRequestToHost ( InjectionRequest *sir ,
|
||||
nt32_t sirSize ,
|
||||
void *state ,
|
||||
void (* callback)(void *) ) {
|
||||
// . returns false and sets g_errno on error, true on success
|
||||
bool Msg7::sendInjectionRequestToHost ( InjectionRequest *ir ,
|
||||
void *state ,
|
||||
void (* callback)(void *) ) {
|
||||
|
||||
// ensure it is our own
|
||||
if ( &m_injectionRequest != ir ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
int32_t sirSize = 0;
|
||||
char *sir = serializeMsg2 ( ir ,
|
||||
sizeof(InjectionRequest),
|
||||
&ir->ptr_url,
|
||||
&ir->size_url ,
|
||||
&sirSize );
|
||||
// oom?
|
||||
if ( ! sir )
|
||||
return log("inject: failed to serialize request");
|
||||
|
||||
|
||||
// save it for freeing later
|
||||
m_sir = sir;
|
||||
m_sirSize = sirSize;
|
||||
|
||||
// forward it to another shard?
|
||||
Host *host = getHostToHandleInjection ( sir->ptr_url );
|
||||
Host *host = getHostToHandleInjection ( ir->ptr_url );
|
||||
|
||||
// . ok, forward it to another host now
|
||||
// . and call got gotForwardedReplyWrapper when reply comes in
|
||||
@ -245,24 +259,24 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
|
||||
}
|
||||
|
||||
// if no url do not inject
|
||||
if ( ! ir->m_url ) {
|
||||
if ( ! ir->ptr_url ) {
|
||||
log("inject: no url provied to inject");
|
||||
g_errno = EBADURL;
|
||||
return sendHttpReply ( msg7 );
|
||||
}
|
||||
|
||||
|
||||
InjectionRequest *ir = &m_ir;
|
||||
InjectionRequest *ir = &msg7->m_injectionRequest;
|
||||
|
||||
m_state = state;
|
||||
m_callback = callback;
|
||||
|
||||
// this will be NULL if the "content" was empty or not given
|
||||
char *content = ir->m_content;
|
||||
char *content = ir->ptr_content;
|
||||
|
||||
// . try the uploaded file if nothing in the text area
|
||||
// . this will be NULL if the "content" was empty or not given
|
||||
if ( ! content ) content = ir->m_contentFile;
|
||||
if ( ! content ) content = ir->ptr_contentFile;
|
||||
|
||||
// forward it to another shard?
|
||||
//Host *host = getHostToHandleInjection ( ir->ptr_url );
|
||||
@ -281,20 +295,8 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
|
||||
// return g_httpServer.sendErrorReply(sock,g_errno,msg,NULL);
|
||||
// }
|
||||
|
||||
int32_t sirSize = 0;
|
||||
InjectionRequest *sir = ir->serializeMsg ( &sirSize );
|
||||
if ( ! sir ) {
|
||||
// oom?
|
||||
log("inject: error serializing injection request",
|
||||
mstrerror(g_errno));
|
||||
return sendHttpReply ( msg7 );
|
||||
}
|
||||
|
||||
// when we receive the udp reply then send back the http reply
|
||||
if ( ! sendInjectionRequestToHost ( sir ,
|
||||
sirSize ,
|
||||
msg7 ,
|
||||
sendHttpReplyWrapper ) )
|
||||
if ( ! msg7->sendInjectionRequestToHost (ir, msg7 , sendHttpReplyWrapper ) )
|
||||
return false;
|
||||
|
||||
// error?
|
||||
@ -328,7 +330,7 @@ bool sendHttpReply ( void *state ) {
|
||||
char format = msg7->m_format;
|
||||
|
||||
// no url parm?
|
||||
if ( ! g_errno && ! ir->m_url && format != FORMAT_HTML )
|
||||
if ( ! g_errno && ! ir->ptr_url && format != FORMAT_HTML )
|
||||
g_errno = EMISSINGINPUT;
|
||||
|
||||
if ( g_errno && g_errno != EDOCUNCHANGED ) {
|
||||
@ -434,7 +436,7 @@ bool sendHttpReply ( void *state ) {
|
||||
// end debug
|
||||
//
|
||||
|
||||
char *url = ir->m_url;
|
||||
char *url = ir->ptr_url;
|
||||
|
||||
// . if we're talking w/ a robot he doesn't care about this crap
|
||||
// . send him back the error code (0 means success)
|
||||
@ -466,11 +468,11 @@ bool sendHttpReply ( void *state ) {
|
||||
sb.safePrintf ( "<center>Error injecting url: <b>%s[%i]</b>"
|
||||
"</center>",
|
||||
mstrerror(g_errno) , g_errno);
|
||||
else if ( (ir->m_url&&ir->m_url[0]) ||
|
||||
(ir->m_queryToScrape&&ir->m_queryToScrape[0]) )
|
||||
else if ( (ir->ptr_url && ir->ptr_url[0]) ||
|
||||
(ir->ptr_queryToScrape&&ir->ptr_queryToScrape[0]) )
|
||||
sb.safePrintf ( "<center><b>Sucessfully injected %s"
|
||||
"</center><br>"
|
||||
, ir->m_url
|
||||
, ir->ptr_url
|
||||
//, xd->m_firstUrl.m_url
|
||||
);
|
||||
|
||||
@ -580,6 +582,8 @@ void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {
|
||||
|
||||
Msg7::Msg7 () {
|
||||
m_xd = NULL;
|
||||
m_sir = NULL;
|
||||
m_inUse = false;
|
||||
reset();
|
||||
}
|
||||
|
||||
@ -593,6 +597,7 @@ Msg7::~Msg7 () {
|
||||
|
||||
void Msg7::reset() {
|
||||
m_round = 0;
|
||||
//if ( m_inUse ) { char *xx=NULL;*xx=0; }
|
||||
//m_firstTime = true;
|
||||
//m_fixMe = false;
|
||||
//m_injectCount = 0;
|
||||
@ -604,6 +609,10 @@ void Msg7::reset() {
|
||||
delete (m_xd);
|
||||
m_xd = NULL;
|
||||
}
|
||||
if ( m_sir ) {
|
||||
mfree ( m_sir , m_sirSize , "m7ir" );
|
||||
m_sir = NULL;
|
||||
}
|
||||
}
|
||||
|
||||
// when XmlDoc::inject() complets it calls this
|
||||
@ -853,7 +862,7 @@ bool Msg7::scrapeQuery ( ) {
|
||||
GigablastRequest *ir = &m_ir;
|
||||
|
||||
// error?
|
||||
char *qts = ir->m_queryToScrape;
|
||||
char *qts = ir->ptr_queryToScrape;
|
||||
if ( ! qts ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
if ( gbstrlen(qts) > 500 ) {
|
||||
@ -907,8 +916,8 @@ bool Msg7::scrapeQuery ( ) {
|
||||
// parent docid is 0
|
||||
sreq.setKey(firstIp,0LL,false);
|
||||
|
||||
char *coll2 = ir->m_coll;
|
||||
CollectionRec *cr = g_collectiondb.getRec ( coll2 );
|
||||
//char *coll2 = ir->m_coll;
|
||||
CollectionRec *cr = g_collectiondb.getRec ( ir->m_collnum );//coll2 );
|
||||
|
||||
// forceDEl = false, niceness = 0
|
||||
m_xd.set4 ( &sreq , NULL , cr->m_coll , NULL , 0 );
|
||||
|
11
PageInject.h
11
PageInject.h
@ -62,6 +62,7 @@ class Msg7 {
|
||||
public:
|
||||
|
||||
//GigablastRequest m_gr;
|
||||
InjectionRequest m_injectionRequest;
|
||||
|
||||
//SafeBuf m_injectUrlBuf;
|
||||
//bool m_firstTime;
|
||||
@ -71,6 +72,9 @@ public:
|
||||
//int32_t m_injectCount;
|
||||
//bool m_isDoneInjecting;
|
||||
|
||||
char *m_sir;
|
||||
int32_t m_sirSize;
|
||||
|
||||
bool m_needsSet;
|
||||
XmlDoc *m_xd;
|
||||
TcpSocket *m_socket;
|
||||
@ -79,6 +83,9 @@ public:
|
||||
char m_useAhrefs;
|
||||
HashTableX m_linkDedupTable;
|
||||
|
||||
// referenced by InjectionRequest::ptr_content
|
||||
SafeBuf m_contentBuf;
|
||||
|
||||
SafeBuf m_sbuf; // for holding entire titlerec for importing
|
||||
|
||||
void *m_state;
|
||||
@ -94,7 +101,9 @@ public:
|
||||
//void constructor();
|
||||
Msg7 ();
|
||||
~Msg7 ();
|
||||
//bool m_inUse;
|
||||
bool m_inUse;
|
||||
|
||||
class XmlDoc *m_stashxd;
|
||||
|
||||
void reset();
|
||||
|
||||
|
104
Parms.cpp
104
Parms.cpp
@ -32,7 +32,7 @@
|
||||
#include "Test.h"
|
||||
#include "Rebalance.h"
|
||||
#include "SpiderProxy.h" // buildProxyTable()
|
||||
#include "PageInject.h"
|
||||
#include "PageInject.h" // InjectionRequest
|
||||
|
||||
// width of input box in characters for url filter expression
|
||||
#define REGEX_TXT_MAX 80
|
||||
@ -4886,6 +4886,8 @@ void Parms::init ( ) {
|
||||
|
||||
GigablastRequest gr;
|
||||
|
||||
InjectionRequest ir;
|
||||
|
||||
/*
|
||||
m->m_title = "delete collection";
|
||||
m->m_desc = "A collection name to delete. You can specify multiple "
|
||||
@ -14896,45 +14898,45 @@ void Parms::init ( ) {
|
||||
//m->m_cgi2 = "u";
|
||||
//m->m_cgi3 = "seed"; // pagerawlbot
|
||||
//m->m_cgi4 = "injecturl";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_API | PF_REQUIRED;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_url - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
|
||||
m++;
|
||||
|
||||
// alias #1
|
||||
m->m_title = "url";
|
||||
m->m_cgi = "u";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_HIDDEN;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_url - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
|
||||
m++;
|
||||
|
||||
// alias #2
|
||||
m->m_title = "url";
|
||||
m->m_cgi = "seed";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_HIDDEN | PF_DIFFBOT;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_url - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
|
||||
m++;
|
||||
|
||||
// alias #3
|
||||
m->m_title = "url";
|
||||
m->m_cgi = "injecturl";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_HIDDEN;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_url - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_url - (char *)&ir;
|
||||
m++;
|
||||
|
||||
|
||||
@ -14943,24 +14945,24 @@ void Parms::init ( ) {
|
||||
"and inject their links. You are not required to supply "
|
||||
"the <i>url</i> parm if you supply this parm.";
|
||||
m->m_cgi = "qts";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_queryToScrape - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_queryToScrape - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "inject links";
|
||||
m->m_desc = "Should we inject the links found in the injected "
|
||||
"content as well?";
|
||||
m->m_cgi = "injectlinks";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_injectLinks - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_injectLinks - (char *)&ir;
|
||||
m++;
|
||||
|
||||
|
||||
@ -14968,47 +14970,47 @@ void Parms::init ( ) {
|
||||
m->m_desc = "Add the outlinks of the injected content into spiderdb "
|
||||
"for spidering?";
|
||||
m->m_cgi = "spiderlinks";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
// leave off because could start spidering whole web unintentionally
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_spiderLinks - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_spiderLinks - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "int16_t reply";
|
||||
m->m_desc = "Should the injection response be int16_t and simple?";
|
||||
m->m_title = "short reply";
|
||||
m->m_desc = "Should the injection response be short and simple?";
|
||||
m->m_cgi = "quick";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_HIDDEN;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_shortReply - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_shortReply - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "only inject content if new";
|
||||
m->m_desc = "If the specified url is already in the index then "
|
||||
"skip the injection.";
|
||||
m->m_cgi = "newonly";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_newOnly - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_newOnly - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "delete from index";
|
||||
m->m_desc = "Delete the specified url from the index.";
|
||||
m->m_cgi = "deleteurl";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_deleteUrl - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_deleteUrl - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "recycle content";
|
||||
@ -15016,68 +15018,68 @@ void Parms::init ( ) {
|
||||
"re-download the content, just use the content that was "
|
||||
"stored in the cache from last time.";
|
||||
m->m_cgi = "recycle";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_recycle - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_recycle - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "dedup url";
|
||||
m->m_desc = "Do not index the url if there is already another "
|
||||
"url in the index with the same content.";
|
||||
m->m_cgi = "dedup";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_dedup - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_dedup - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "do consistency checking";
|
||||
m->m_desc = "Turn this on for debugging.";
|
||||
m->m_cgi = "consist";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_HIDDEN; // | PF_API
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_doConsistencyTesting - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_doConsistencyTesting - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "hop count";
|
||||
m->m_desc = "Use this hop count when injecting the page.";
|
||||
m->m_cgi = "hopcount";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_HIDDEN; // | PF_API
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_hopCount - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_hopCount - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "last spider time";
|
||||
m->m_desc = "Override last time spidered";
|
||||
m->m_cgi = "lastspidered";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_HIDDEN; // | PF_API
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_lastSpidered - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_lastSpidered - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "first indexed";
|
||||
m->m_desc = "Override first indexed time";
|
||||
m->m_cgi = "firstindexed";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_HIDDEN; // | PF_API
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_firstIndexed - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_firstIndexed - (char *)&ir;
|
||||
m++;
|
||||
|
||||
|
||||
@ -15085,12 +15087,12 @@ void Parms::init ( ) {
|
||||
m->m_desc = "If the content of the url is provided below, does "
|
||||
"it begin with an HTTP mime header?";
|
||||
m->m_cgi = "hasmime";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHECKBOX;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_hasMime - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_hasMime - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "content delimeter";
|
||||
@ -15104,12 +15106,12 @@ void Parms::init ( ) {
|
||||
"injected url. Otherwise it will append numbers to the "
|
||||
"url you provide above.";
|
||||
m->m_cgi = "delim";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_contentDelim - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_contentDelim - (char *)&ir;
|
||||
m++;
|
||||
|
||||
|
||||
@ -15120,12 +15122,12 @@ void Parms::init ( ) {
|
||||
"Possible values: <b>text/html text/plain text/xml "
|
||||
"application/json</b>";
|
||||
m->m_cgi = "contenttype";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR; //text/html application/json application/xml
|
||||
m->m_def = "text/html";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_contentTypeStr - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_contentTypeStr - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "content charset";
|
||||
@ -15135,24 +15137,24 @@ void Parms::init ( ) {
|
||||
"which is 106. "
|
||||
"See iana_charset.h for the numeric values.";
|
||||
m->m_cgi = "charset";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_def = "106";
|
||||
m->m_flags = PF_API;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_charset - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_charset - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "upload content file";
|
||||
m->m_desc = "Instead of specifying the content to be injected in "
|
||||
"the text box below, upload this file for it.";
|
||||
m->m_cgi = "file";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_FILEUPLOADBUTTON;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_NOAPI;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_contentFile - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_contentFile - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "content";
|
||||
@ -15166,35 +15168,35 @@ void Parms::init ( ) {
|
||||
"inject empty content, otherwise the content will "
|
||||
"be downloaded from the url.";
|
||||
m->m_cgi = "content";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_API|PF_TEXTAREA;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_content - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_content - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "get sectiondb voting info";
|
||||
m->m_desc = "Return section information of injected content for "
|
||||
"the injected subdomain. ";
|
||||
m->m_cgi = "sections";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_BOOL;
|
||||
m->m_def = "0";
|
||||
m->m_flags = PF_API|PF_NOHTML;
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_getSections - (char *)&gr;
|
||||
m->m_off = (char *)&ir.m_getSections - (char *)&ir;
|
||||
m++;
|
||||
|
||||
m->m_title = "diffbot reply";
|
||||
m->m_desc = "Used exclusively by diffbot. Do not use.";
|
||||
m->m_cgi = "diffbotreply";
|
||||
m->m_obj = OBJ_GBREQUEST;
|
||||
m->m_obj = OBJ_IR;
|
||||
m->m_type = TYPE_CHARPTR;
|
||||
m->m_def = NULL;
|
||||
m->m_flags = PF_API|PF_TEXTAREA|PF_NOHTML; // do not show in our api
|
||||
m->m_page = PAGE_INJECT;
|
||||
m->m_off = (char *)&gr.m_diffbotReply - (char *)&gr;
|
||||
m->m_off = (char *)&ir.ptr_diffbotReply - (char *)&ir;
|
||||
m++;
|
||||
|
||||
|
||||
|
1
Parms.h
1
Parms.h
@ -39,6 +39,7 @@ enum {
|
||||
OBJ_COLL ,
|
||||
OBJ_SI , // SearchInput class
|
||||
OBJ_GBREQUEST , // for GigablastRequest class of parms
|
||||
OBJ_IR , // InjectionRequest class from PageInject.h
|
||||
OBJ_NONE
|
||||
};
|
||||
|
||||
|
92
XmlDoc.cpp
92
XmlDoc.cpp
@ -3238,8 +3238,10 @@ bool XmlDoc::indexContainerDoc ( ) {
|
||||
|
||||
|
||||
void doneInjectingArchiveRec ( void *state ) {
|
||||
XmlDoc *THIS = (XmlDoc *)state;
|
||||
THIS->m_numInjectionsOut--;
|
||||
Msg7 *THIS = (Msg7 *)state;
|
||||
XmlDoc *xd = THIS->m_stashxd;
|
||||
THIS->m_inUse = false;
|
||||
xd->m_numInjectionsOut--;
|
||||
log("build: archive: injection thread returned. %"INT32" out now.",
|
||||
THIS->m_numInjectionsOut);
|
||||
THIS->m_masterLoop ( THIS );
|
||||
@ -3681,48 +3683,47 @@ bool XmlDoc::indexWarcOrArc ( char ctype ) {
|
||||
|
||||
|
||||
// inject input parms:
|
||||
GigablastRequest *gr = &msg7->m_gr;
|
||||
// init the input parms
|
||||
//memset ( gr , 0 , sizeof(GigablastRequest) );
|
||||
InjectionRequest *ir = &msg7->m_injectionRequest;
|
||||
// reset it
|
||||
gr->m_hopCount = *hc + 1;
|
||||
ir->m_hopCount = *hc + 1;
|
||||
if ( ! m_collnumValid ) { char *xx=NULL;*xx=0; }
|
||||
gr->m_collnum = m_collnum;
|
||||
ir->m_collnum = m_collnum;
|
||||
// will this work on a content delimeterized doc?
|
||||
gr->m_deleteUrl = m_deleteFromIndex;
|
||||
ir->m_deleteUrl = m_deleteFromIndex;
|
||||
// each subdoc will have a mime since it is a warc
|
||||
gr->m_hasMime = true;
|
||||
ir->m_hasMime = true;
|
||||
// it has a mime so we shouldn't need to set this
|
||||
gr->m_contentTypeStr = NULL;
|
||||
ir->ptr_contentTypeStr = NULL;
|
||||
// we are injecting a single page, not a container file
|
||||
gr->m_contentDelim = NULL;
|
||||
ir->ptr_contentDelim = NULL;
|
||||
// miscelleaneous. faster than memsetting the whole gr class (32k)
|
||||
gr->m_getSections = 0;
|
||||
gr->m_gotSections = 0;
|
||||
gr->m_queryToScrape = NULL;
|
||||
gr->m_contentFile = NULL;
|
||||
gr->m_diffbotReply = NULL;
|
||||
gr->m_spiderLinks = false;
|
||||
gr->m_injectLinks = false;
|
||||
gr->m_shortReply = false;
|
||||
gr->m_newOnly = false;
|
||||
gr->m_recycle = false;
|
||||
gr->m_dedup = true;
|
||||
gr->m_doConsistencyTesting = false;
|
||||
gr->m_charset = 0;
|
||||
ir->m_getSections = 0;
|
||||
ir->m_gotSections = 0;
|
||||
ir->m_spiderLinks = false;
|
||||
ir->m_injectLinks = false;
|
||||
ir->m_shortReply = false;
|
||||
ir->m_newOnly = false;
|
||||
ir->m_recycle = false;
|
||||
ir->m_dedup = true;
|
||||
ir->m_doConsistencyTesting = false;
|
||||
ir->m_charset = 0;
|
||||
|
||||
ir->ptr_queryToScrape = NULL;
|
||||
ir->ptr_contentFile = NULL;
|
||||
ir->ptr_diffbotReply = NULL;
|
||||
|
||||
|
||||
//
|
||||
// set 'timestamp' for injection
|
||||
//
|
||||
gr->m_firstIndexed = recTime;
|
||||
gr->m_lastSpidered = recTime;
|
||||
ir->m_firstIndexed = recTime;
|
||||
ir->m_lastSpidered = recTime;
|
||||
|
||||
|
||||
//
|
||||
// set 'ip' for injection
|
||||
//
|
||||
gr->m_injectDocIp = 0;
|
||||
ir->m_injectDocIp = 0;
|
||||
// get the record IP address from the warc header if there
|
||||
if ( recIp ) {
|
||||
// get end of ip
|
||||
@ -3731,21 +3732,21 @@ bool XmlDoc::indexWarcOrArc ( char ctype ) {
|
||||
while ( *ipEnd && ! is_wspace_a(*ipEnd) ) ipEnd++;
|
||||
// we now have the ip address for doing ip: searches
|
||||
// this func is in ip.h
|
||||
gr->m_injectDocIp = atoip ( recIp, ipEnd-recIp );
|
||||
ir->m_injectDocIp = atoip ( recIp, ipEnd-recIp );
|
||||
}
|
||||
|
||||
// we end up repopulating m_fileBuf to read the next warc sometimes
|
||||
// so do not destroy the content we are injecting from the original
|
||||
// m_fileBuf. so we have to copy it.
|
||||
gr->m_contentBuf.reset();
|
||||
gr->m_contentBuf.reserve ( httpReplySize + 1 );
|
||||
gr->m_contentBuf.safeMemcpy ( httpReply , httpReplySize );
|
||||
gr->m_contentBuf.nullTerm();
|
||||
msg7->m_contentBuf.reset();
|
||||
msg7->m_contentBuf.reserve ( httpReplySize + 1 );
|
||||
msg7->m_contentBuf.safeMemcpy ( httpReply , httpReplySize );
|
||||
msg7->m_contentBuf.nullTerm();
|
||||
|
||||
//
|
||||
// set 'content' for injection
|
||||
//
|
||||
gr->m_content = gr->m_contentBuf.getBufStart();
|
||||
ir->ptr_content = msg7->m_contentBuf.getBufStart();
|
||||
|
||||
// null term it and hope it doesn't hurt anything!!!!!
|
||||
//httpReply [ httpReplySize ] = '\0';
|
||||
@ -3754,25 +3755,30 @@ bool XmlDoc::indexWarcOrArc ( char ctype ) {
|
||||
|
||||
|
||||
// set the rest of the injection parms
|
||||
gr->m_hopCount = -1;
|
||||
gr->m_diffbotReply = 0;
|
||||
gr->m_newOnly = 0;
|
||||
ir->m_hopCount = -1;
|
||||
ir->m_newOnly = 0;
|
||||
// all warc records have the http mime
|
||||
gr->m_hasMime = true;
|
||||
gr->m_url = recUrl;
|
||||
ir->m_hasMime = true;
|
||||
ir->ptr_url = recUrl;
|
||||
|
||||
// load balance over the shards
|
||||
gr->m_forwardRequest = 1;
|
||||
// stash this
|
||||
m_msg7->m_stashxd = this;
|
||||
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
// log it
|
||||
log("build: archive: injecting archive url %s",recUrl);
|
||||
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
if ( ! msg7->inject2 ( this , doneInjectingArchiveRec ) )
|
||||
if (msg7->sendInjectionRequestToHost(ir,msg7,doneInjectingArchiveRec)){
|
||||
m_numInjectionsOut++;
|
||||
else
|
||||
log("build: index archive: msg7: %s",mstrerror(g_errno));
|
||||
msg7->m_inUse = true;
|
||||
goto loop;
|
||||
}
|
||||
|
||||
log("build: index archive: msg7 inject: %s",
|
||||
mstrerror(g_errno));
|
||||
|
||||
goto loop;
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user