From a7462ed1f423ddaad75b5892fc2c59d9a1946c11 Mon Sep 17 00:00:00 2001 From: mwells Date: Thu, 4 Dec 2014 09:29:17 -0700 Subject: [PATCH] fix injection stuff --- PageInject.cpp | 10 +++++++--- Parms.cpp | 24 +++++++++++++++++++++++- Parms.h | 10 ++++++++-- XmlDoc.cpp | 32 ++++++++++++++++++++------------ XmlDoc.h | 5 ++++- 5 files changed, 62 insertions(+), 19 deletions(-) diff --git a/PageInject.cpp b/PageInject.cpp index f94c1acd..7933c529 100644 --- a/PageInject.cpp +++ b/PageInject.cpp @@ -258,7 +258,7 @@ bool sendReply ( void *state ) { // . if we're talking w/ a robot he doesn't care about this crap // . send him back the error code (0 means success) - if ( url && gr->m_int16_tReply ) { + if ( url && gr->m_shortReply ) { char buf[1024*32]; char *p = buf; // return docid and hostid @@ -634,7 +634,11 @@ bool Msg7::inject ( void *state , gr->m_newOnly, // index iff new this , - doneInjectingWrapper9 ) ) + doneInjectingWrapper9 , + + // extra shit + gr->m_firstIndexed, + gr->m_lastSpidered ) ) // we blocked... return false; @@ -1302,7 +1306,7 @@ bool ImportState::importLoop ( ) { // gr->m_diffbotReply = NULL; // gr->m_injectLinks = false; // gr->m_spiderLinks = true; - // gr->m_int16_tReply = false; + // gr->m_shortReply = false; // gr->m_newOnly = false; // gr->m_deleteUrl = false; // gr->m_recycle = true; // recycle content? or sitelinks? diff --git a/Parms.cpp b/Parms.cpp index 06918e9f..4a8710c7 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -14816,7 +14816,7 @@ void Parms::init ( ) { m->m_def = "0"; m->m_flags = PF_HIDDEN; m->m_page = PAGE_INJECT; - m->m_off = (char *)&gr.m_int16_tReply - (char *)&gr; + m->m_off = (char *)&gr.m_shortReply - (char *)&gr; m++; m->m_title = "only inject content if new"; @@ -14889,6 +14889,28 @@ void Parms::init ( ) { m->m_off = (char *)&gr.m_hopCount - (char *)&gr; m++; + m->m_title = "last spider time"; + m->m_desc = "Override last time spidered"; + m->m_cgi = "lastspidered"; + m->m_obj = OBJ_GBREQUEST; + m->m_type = TYPE_LONG; + m->m_def = "0"; + m->m_flags = PF_HIDDEN; // | PF_API + m->m_page = PAGE_INJECT; + m->m_off = (char *)&gr.m_lastSpidered - (char *)&gr; + m++; + + m->m_title = "first indexed"; + m->m_desc = "Override first indexed time"; + m->m_cgi = "firstindexed"; + m->m_obj = OBJ_GBREQUEST; + m->m_type = TYPE_LONG; + m->m_def = "0"; + m->m_flags = PF_HIDDEN; // | PF_API + m->m_page = PAGE_INJECT; + m->m_off = (char *)&gr.m_firstIndexed - (char *)&gr; + m++; + m->m_title = "content has mime"; m->m_desc = "If the content of the url is provided below, does " diff --git a/Parms.h b/Parms.h index 2701afd2..7b63795b 100644 --- a/Parms.h +++ b/Parms.h @@ -129,7 +129,7 @@ class GigablastRequest { char *m_diffbotReply; // secret thing from dan char m_injectLinks; char m_spiderLinks; - char m_int16_tReply; + char m_shortReply; char m_newOnly; char m_deleteUrl; char m_recycle; @@ -139,7 +139,13 @@ class GigablastRequest { char m_getSections; char m_gotSections; int32_t m_charset; - int32_t m_hopCount; + int32_t m_hopCount; // hopcount + // older ones + uint32_t m_firstIndexed; // firstimdexed + uint32_t m_lastSpidered; // lastspidered; + + + /////////// diff --git a/XmlDoc.cpp b/XmlDoc.cpp index e876bc9b..c7ebd183 100644 --- a/XmlDoc.cpp +++ b/XmlDoc.cpp @@ -1973,7 +1973,10 @@ bool XmlDoc::injectDoc ( char *url , bool newOnly, // index iff new void *state, - void (*callback)(void *state) ) { + void (*callback)(void *state) , + + uint32_t firstIndexed, + uint32_t lastSpidered ) { // wait until we are synced with host #0 if ( ! isClockInSync() ) { @@ -2007,6 +2010,9 @@ bool XmlDoc::injectDoc ( char *url , SpiderRequest sreq; sreq.setFromInject ( cleanUrl ); + if ( lastSpidered ) + sreq.m_addedTime = lastSpidered; + if ( deleteUrl ) sreq.m_forceDelete = 1; @@ -2035,7 +2041,7 @@ bool XmlDoc::injectDoc ( char *url , deleteUrl, // false, // deleteFromIndex , 0,//forcedIp , contentType , - 0,//lastSpidered , + lastSpidered,//lastSpidered overide contentHasMime )) { // g_errno should be set if that returned false if ( ! g_errno ) { char *xx=NULL;*xx=0; } @@ -2058,15 +2064,15 @@ bool XmlDoc::injectDoc ( char *url , //if ( recycleContent ) m_recycleContent = true; // othercrap - //if ( firstIndexed ) { - // m_firstIndexedDate = firstIndexed; - // m_firstIndexedDateValid = true; - //} + if ( firstIndexed ) { + m_firstIndexedDate = firstIndexed; + m_firstIndexedDateValid = true; + } - //if ( lastSpidered ) { - // m_spideredTime = lastSpidered; - // m_spideredTimeValid = true; - //} + if ( lastSpidered ) { + m_spideredTime = lastSpidered; + m_spideredTimeValid = true; + } if ( hopCount != -1 ) { m_hopCount = hopCount; @@ -22697,7 +22703,9 @@ char *XmlDoc::getMetaList ( bool forDelete ) { m_sreq.m_fakeFirstIp && ! m_sreq.m_forceDelete && // do not rebuild spiderdb if only rebuilding posdb - m_useSpiderdb && + // this is explicitly for injecting so we need to add + // the spider request to spiderdb... + //m_useSpiderdb && /// don't add requests like http://xyz.com/xxx-diffbotxyz0 though ! m_isDiffbotJSONObject ) needSpiderdb3 = m_sreq.getRecSize() + 1; @@ -23229,7 +23237,7 @@ char *XmlDoc::getMetaList ( bool forDelete ) { if ( m_useSecondaryRdbs ) getRebuiltSpiderRequest ( &revisedReq ); - // this fills it in + // this fills it in for doing injections if ( ! m_useSecondaryRdbs ) { getRevisedSpiderRequest ( &revisedReq ); // sanity log diff --git a/XmlDoc.h b/XmlDoc.h index 9f03ffc0..c893c7f6 100644 --- a/XmlDoc.h +++ b/XmlDoc.h @@ -2359,7 +2359,10 @@ class XmlDoc { bool newOnly, // index iff new void *state, - void (*callback)(void *state) ); + void (*callback)(void *state) , + + uint32_t firstIndexedTime = 0, + uint32_t lastSpideredDate = 0 ); bool injectLinks ( HashTableX *linkDedupTable ,