fix injection stuff

This commit is contained in:
mwells 2014-12-04 09:29:17 -07:00
parent 8157c5be14
commit a7462ed1f4
5 changed files with 62 additions and 19 deletions

View File

@ -258,7 +258,7 @@ bool sendReply ( void *state ) {
// . if we're talking w/ a robot he doesn't care about this crap
// . send him back the error code (0 means success)
if ( url && gr->m_int16_tReply ) {
if ( url && gr->m_shortReply ) {
char buf[1024*32];
char *p = buf;
// return docid and hostid
@ -634,7 +634,11 @@ bool Msg7::inject ( void *state ,
gr->m_newOnly, // index iff new
this ,
doneInjectingWrapper9 ) )
doneInjectingWrapper9 ,
// optional overrides for the first-indexed and last-spidered times
gr->m_firstIndexed,
gr->m_lastSpidered ) )
// we blocked...
return false;
@ -1302,7 +1306,7 @@ bool ImportState::importLoop ( ) {
// gr->m_diffbotReply = NULL;
// gr->m_injectLinks = false;
// gr->m_spiderLinks = true;
// gr->m_int16_tReply = false;
// gr->m_shortReply = false;
// gr->m_newOnly = false;
// gr->m_deleteUrl = false;
// gr->m_recycle = true; // recycle content? or sitelinks?

View File

@ -14816,7 +14816,7 @@ void Parms::init ( ) {
m->m_def = "0";
m->m_flags = PF_HIDDEN;
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_int16_tReply - (char *)&gr;
m->m_off = (char *)&gr.m_shortReply - (char *)&gr;
m++;
m->m_title = "only inject content if new";
@ -14889,6 +14889,28 @@ void Parms::init ( ) {
m->m_off = (char *)&gr.m_hopCount - (char *)&gr;
m++;
m->m_title = "last spider time";
m->m_desc = "Override last time spidered";
m->m_cgi = "lastspidered";
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_lastSpidered - (char *)&gr;
m++;
m->m_title = "first indexed";
m->m_desc = "Override first indexed time";
m->m_cgi = "firstindexed";
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_LONG;
m->m_def = "0";
m->m_flags = PF_HIDDEN; // | PF_API
m->m_page = PAGE_INJECT;
m->m_off = (char *)&gr.m_firstIndexed - (char *)&gr;
m++;
m->m_title = "content has mime";
m->m_desc = "If the content of the url is provided below, does "

10
Parms.h
View File

@ -129,7 +129,7 @@ class GigablastRequest {
char *m_diffbotReply; // secret thing from dan
char m_injectLinks;
char m_spiderLinks;
char m_int16_tReply;
char m_shortReply;
char m_newOnly;
char m_deleteUrl;
char m_recycle;
@ -139,7 +139,13 @@ class GigablastRequest {
char m_getSections;
char m_gotSections;
int32_t m_charset;
int32_t m_hopCount;
int32_t m_hopCount; // hopcount
// older ones
uint32_t m_firstIndexed; // firstindexed
uint32_t m_lastSpidered; // lastspidered
///////////

View File

@ -1973,7 +1973,10 @@ bool XmlDoc::injectDoc ( char *url ,
bool newOnly, // index iff new
void *state,
void (*callback)(void *state) ) {
void (*callback)(void *state) ,
uint32_t firstIndexed,
uint32_t lastSpidered ) {
// wait until we are synced with host #0
if ( ! isClockInSync() ) {
@ -2007,6 +2010,9 @@ bool XmlDoc::injectDoc ( char *url ,
SpiderRequest sreq;
sreq.setFromInject ( cleanUrl );
if ( lastSpidered )
sreq.m_addedTime = lastSpidered;
if ( deleteUrl )
sreq.m_forceDelete = 1;
@ -2035,7 +2041,7 @@ bool XmlDoc::injectDoc ( char *url ,
deleteUrl, // false, // deleteFromIndex ,
0,//forcedIp ,
contentType ,
0,//lastSpidered ,
lastSpidered,//lastSpidered override
contentHasMime )) {
// g_errno should be set if that returned false
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
@ -2058,15 +2064,15 @@ bool XmlDoc::injectDoc ( char *url ,
//if ( recycleContent ) m_recycleContent = true;
// othercrap
//if ( firstIndexed ) {
// m_firstIndexedDate = firstIndexed;
// m_firstIndexedDateValid = true;
//}
if ( firstIndexed ) {
m_firstIndexedDate = firstIndexed;
m_firstIndexedDateValid = true;
}
//if ( lastSpidered ) {
// m_spideredTime = lastSpidered;
// m_spideredTimeValid = true;
//}
if ( lastSpidered ) {
m_spideredTime = lastSpidered;
m_spideredTimeValid = true;
}
if ( hopCount != -1 ) {
m_hopCount = hopCount;
@ -22697,7 +22703,9 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
m_sreq.m_fakeFirstIp &&
! m_sreq.m_forceDelete &&
// do not rebuild spiderdb if only rebuilding posdb
m_useSpiderdb &&
// this is explicitly for injecting so we need to add
// the spider request to spiderdb...
//m_useSpiderdb &&
/// don't add requests like http://xyz.com/xxx-diffbotxyz0 though
! m_isDiffbotJSONObject )
needSpiderdb3 = m_sreq.getRecSize() + 1;
@ -23229,7 +23237,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
if ( m_useSecondaryRdbs )
getRebuiltSpiderRequest ( &revisedReq );
// this fills it in
// this fills it in for doing injections
if ( ! m_useSecondaryRdbs ) {
getRevisedSpiderRequest ( &revisedReq );
// sanity log

View File

@ -2359,7 +2359,10 @@ class XmlDoc {
bool newOnly, // index iff new
void *state,
void (*callback)(void *state) );
void (*callback)(void *state) ,
uint32_t firstIndexedTime = 0,
uint32_t lastSpideredDate = 0 );
bool injectLinks ( HashTableX *linkDedupTable ,