speed up large query reindexes by using fake firstips

limited to 0-64k to avoid excessive doledb winner generations. fix bug when injecting a content-less url that has the canonical tag in it. force it to go through.
2024-10-04 12:17:35 +03:00 · 2014-12-17 16:19:04 -08:00 · 2014-12-17 16:19:04 -08:00 · 0f9cb96b91
commit 0f9cb96b91
parent d57f2264c4
3 changed files with 23 additions and 17 deletions
--- a/PageReindex.cpp
+++ b/PageReindex.cpp
@ -435,7 +435,11 @@ bool Msg1c::gotList ( ) {
 		// url is a docid!
 		sprintf ( sr.m_url , "%"UINT64"" , docId );
 		// make a fake first ip
-		int32_t firstIp = (docId & 0xffffffff);
+		// use only 64k values so we don't stress doledb/waittrees/etc.
+		// for large #'s of docids
+		int32_t firstIp = (docId & 0x0000ffff);
+		// 0 is not a legit val. it'll core below.
+		if ( firstIp == 0 ) firstIp = 1;
 		// use a fake ip
 		sr.m_firstIp        =  firstIp;//nowGlobal;
 		// we are not really injecting...
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -111,7 +111,7 @@ XmlDoc::XmlDoc() {
 	for ( int32_t i = 0 ; i < MAX_XML_DOCS ; i++ ) m_xmlDocs[i] = NULL;
 	m_freed = false;
 	m_contentInjected = false;
-	m_wasInjected = false;
+	m_wasContentInjected = false;
 	//m_coll  = NULL;
 	m_ubuf = NULL;
 	m_pbuf = NULL;
@ -544,7 +544,7 @@ void XmlDoc::reset ( ) {
 	// reset this
 	m_contentInjected = false;
 	m_rawUtf8ContentValid = false;
-	m_wasInjected = false;
+	m_wasContentInjected = false;

 	m_rootDoc = NULL;

@ -1192,7 +1192,7 @@ bool XmlDoc::set4 ( SpiderRequest *sreq      ,
 		//m_utf8ContentValid         = true;

 		m_contentInjected     = true;
-		m_wasInjected         = true;
+		m_wasContentInjected         = true;
 		m_contentType         = contentType;
 		m_contentTypeValid    = true;
 		// use this ip as well for now to avoid ip lookup
@ -3081,11 +3081,11 @@ int32_t *XmlDoc::getIndexCode2 ( ) {

 	// if this is an injection and "newonly" is not zero then we
 	// only want to do the injection if the url is "new", meaning not
-	// already indexed. "m_wasInjected" will be true if this is
+	// already indexed. "m_wasContentInjected" will be true if this is
 	// an injection. "m_newOnly" will be true if the injector only
 	// wants to proceed with the injection if this url is not already
 	// indexed.
-	if ( m_wasInjected && m_newOnly ) {
+	if ( m_wasContentInjected && m_newOnly ) {
 		XmlDoc **pod = getOldXmlDoc ( );
 		if ( ! pod || pod == (XmlDoc **)-1 ) return (int32_t *)pod;
 		XmlDoc *od = *pod;
@ -3093,7 +3093,7 @@ int32_t *XmlDoc::getIndexCode2 ( ) {
 		// then abandon this injection. it was spidered the old
 		// fashioned way and we want to preserve it and NOT overwrite
 		// it with this injection.
-		if ( od && ! od->m_wasInjected ) {
+		if ( od && ! od->m_wasContentInjected ) {
 			m_indexCode = EABANDONED;
 			m_indexCodeValid = true;
 			return &m_indexCode;
@ -3102,7 +3102,7 @@ int32_t *XmlDoc::getIndexCode2 ( ) {
 		// in the special case that m_newOnly is "2". otherwise
 		// if m_newOnly is 1 then we will overwrite any existing
 		// titlerecs that were injected themselves.
-		if ( od && od->m_wasInjected && m_newOnly == 2 ) {
+		if ( od && od->m_wasContentInjected && m_newOnly == 2 ) {
 			m_indexCode = EABANDONED;
 			m_indexCodeValid = true;
 			return &m_indexCode;
@ -3297,7 +3297,8 @@ int32_t *XmlDoc::getIndexCode2 ( ) {
 	// . returns NULL if we are the canonical url
 	// . do not do this check if the page was injected
 	bool checkCanonical = true;
-	if ( m_wasInjected ) checkCanonical = false;
+	if ( m_wasContentInjected ) checkCanonical = false;
+	if ( m_isInjecting && m_isInjectingValid ) checkCanonical = false;
 	// do not do canonical deletion if recycling content either i guess
 	if ( m_sreqValid && m_sreq.m_recycleContent ) checkCanonical = false;
 	// do not delete from being canonical if doing a query reindex
@ -15679,7 +15680,7 @@ char **XmlDoc::getHttpReply2 ( ) {
 	//	isTestColl = false;

 	// sanity check. keep injections fast. no downloading!
-	if ( m_wasInjected ) { 
+	if ( m_wasContentInjected ) { 
 		log("xmldoc: url injection failed! error!");
 		char *xx=NULL;*xx=0; 
 	}
@ -17556,7 +17557,7 @@ char **XmlDoc::getExpandedUtf8Content ( ) {
 	}

 	// do not do iframe expansion in order to keep injections fast
-	if ( m_wasInjected ) {
+	if ( m_wasContentInjected ) {
 		m_expandedUtf8Content     = m_rawUtf8Content;
 		m_expandedUtf8ContentSize = m_rawUtf8ContentSize;
 		m_expandedUtf8ContentValid = true;
@ -20030,13 +20031,14 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
 		sb->safePrintf("hasrssitem=1 ");

 	// was the content itself injected?
-	if ( m_wasInjected ) // m_sreqValid && m_sreq.m_isInjecting )
+	if ( m_wasContentInjected )
 		sb->safePrintf("contentinjected=1 ");
 	else
 		sb->safePrintf("contentinjected=0 ");

 	// might have just injected the url and downloaded the content?
-	if ( m_sreqValid && m_sreq.m_isInjecting )
+	if ( (m_sreqValid && m_sreq.m_isInjecting) || 
+	     (m_isInjecting && m_isInjectingValid) )
 		sb->safePrintf("urlinjected=1 ");
 	else
 		sb->safePrintf("urlinjected=0 ");
@ -37244,7 +37246,7 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
 	if ( now-timestamp > 10*86400 ) addRootLang = true;
 	// injects do not download the root doc for speed reasons, so do not 
 	// bother for them unless the doc itself is the root.
-	if ( m_wasInjected && !*isRoot ) addRootLang = false;
+	if ( m_wasContentInjected && !*isRoot ) addRootLang = false;
 	// . get the two letter (usually) language code from the id
 	// . i think the two chinese languages are 5 letters
 	if ( addRootLang ) {
@ -37364,7 +37366,7 @@ SafeBuf *XmlDoc::getNewTagBuf ( ) {
 	// or if it is 10 days old or more
 	if ( now-timestamp > 10*86400 ) addRootTitle = true;
 	// but not if injected
-	if ( m_wasInjected && ! *isRoot ) addRootTitle = false;
+	if ( m_wasContentInjected && ! *isRoot ) addRootTitle = false;
 	// add it then
 	if ( addRootTitle &&
 	     ! tbuf->addTag(mysite,"roottitles",now,"xmldoc",
--- a/XmlDoc.h
+++ b/XmlDoc.h
@ -326,7 +326,7 @@ class XmlDoc {
 	uint16_t  m_isRSS:1;
 	uint16_t  m_isPermalink:1;
 	uint16_t  m_isAdult:1;
-	uint16_t  m_wasInjected:1;//eliminateMenus:1;
+	uint16_t  m_wasContentInjected:1;//eliminateMenus:1;
 	uint16_t  m_spiderLinks:1;
 	uint16_t  m_isContentTruncated:1;
 	uint16_t  m_isLinkSpam:1;
@ -1385,7 +1385,7 @@ class XmlDoc {
 	bool m_hasUseFakeIpsMetaTagValid;
 	bool m_outlinkIsIndexedVectorValid;
 	bool m_isSiteRootValid;
-	bool m_wasInjectedValid;
+	bool m_wasContentInjectedValid;
 	bool m_outlinkHopCountVectorValid;
 	//bool m_isSpamValid;
 	bool m_isFilteredValid;