now we pass both injection tests in qa.cpp

2024-10-04 12:17:35 +03:00 · 2015-05-02 12:32:13 -07:00 · 2015-05-02 12:32:13 -07:00 · 16b73a9bdd
commit 16b73a9bdd
parent b55359a95d
3 changed files with 111 additions and 20 deletions
--- a/PageInject.cpp
+++ b/PageInject.cpp
@ -550,7 +550,7 @@ bool Msg7::inject2 ( void *state ,
 	// shortcut
 	XmlDoc *xd = &m_xd;

-	if ( ! gr->m_url ) {
+	if ( ! gr->m_url || ! gr->m_url[0] ) {
 	     // if there is a record delimeter, we form a new fake url
 	     // for each record based on content hash
 	     //! gr->m_contentDelim ) {
@ -692,6 +692,8 @@ bool Msg7::inject2 ( void *state ,
 			       cr ,
 			       content , // start , // content ,
 			       gr->m_diffbotReply,
+			       // if this doc is a 'container doc' then
+			       // hasMime applies to the SUBDOCS only!!
 			       gr->m_hasMime, // content starts with http mime?
 			       gr->m_hopCount,
 			       gr->m_charset,
@ -714,7 +716,8 @@ bool Msg7::inject2 ( void *state ,
 			       gr->m_lastSpidered ,
 			       // the ip of the url being injected.
 			       // use 0 if unknown and it won't be valid.
-			       gr->m_injectDocIp
+			       gr->m_injectDocIp ,
+			       gr->m_contentDelim
 			       ) )
 		// we blocked...
 		return false;
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -1154,7 +1154,8 @@ bool XmlDoc::set4 ( SpiderRequest *sreq      ,
 		    int32_t           forcedIp ,
 		    uint8_t        contentType ,
 		    uint32_t         spideredTime ,
-		    bool           contentHasMime ) {
+		    bool           contentHasMimeArg ,
+		    char          *contentDelim ) {

 	// sanity check
 	if ( sreq->m_dataSize == 0 ) { char *xx=NULL;*xx=0; }
@ -1179,6 +1180,21 @@ bool XmlDoc::set4 ( SpiderRequest *sreq      ,
 	if ( sreq->m_forceDelete )
 		m_deleteFromIndex = true;

+	// if we are a container doc then we need the content delimiter,
+	// unless we are a warc or arc, since we already know how those
+	// delimit their records.
+	m_contentDelim = contentDelim;
+	m_contentDelimValid = true;
+
+	bool contentHasMime = contentHasMimeArg;
+	// but if we are a container doc then this parm applies to each subdoc
+	// not to us, so turn it off for this part.
+	if ( isContainerDoc() )	{
+		contentHasMime    = false;
+		m_subDocsHaveMime = contentHasMimeArg;
+	}
+
+
 	char *utf8Content = utf8ContentArg;

 	if ( contentHasMime && utf8Content ) {
@ -2016,7 +2032,7 @@ bool XmlDoc::injectDoc ( char *url ,
 			 CollectionRec *cr ,
 			 char *content ,
 			 char *diffbotReply, // usually null
-			 bool contentHasMime ,
+			 bool contentHasMimeArg ,
 			 int32_t hopCount,
 			 int32_t charset,

@ -2030,7 +2046,8 @@ bool XmlDoc::injectDoc ( char *url ,

 			 uint32_t firstIndexed,
 			 uint32_t lastSpidered ,
-			 int32_t injectDocIp ) {
+			 int32_t injectDocIp ,
+			 char *contentDelim ) {

 	// wait until we are synced with host #0
 	if ( ! isClockInSync() ) {
@ -2098,7 +2115,8 @@ bool XmlDoc::injectDoc ( char *url ,
 		      injectDocIp, // 0,//forcedIp ,
 		      contentType ,
 		      lastSpidered,//lastSpidered overide
-		      contentHasMime )) {
+		      contentHasMimeArg ,
+		      contentDelim )) {
 		// g_errno should be set if that returned false
 		if ( ! g_errno ) { char *xx=NULL;*xx=0; }
 		return true;
@ -2726,7 +2744,8 @@ bool XmlDoc::indexDoc2 ( ) {
 	// handle docs that consist of subdocs that need to be injected
 	// or indexed individually.
 	if ( m_firstUrlValid && m_firstUrl.isWarc() ) {
-		// this returns false if it would block and callback will be called
+		// this returns false if it would block and callback will be 
+		// called
 		if ( ! indexWarc () )
 			return false;
 		// all done! no need to add the parent doc.
@ -2734,7 +2753,8 @@ bool XmlDoc::indexDoc2 ( ) {
 	}

 	if ( m_firstUrlValid && m_firstUrl.isArc() ) {
-		// this returns false if it would block and callback will be called
+		// this returns false if it would block and callback will be 
+		// called
 		if ( ! indexArc () )
 			return false;
 		// all done! no need to add the parent doc.
@ -3009,6 +3029,7 @@ bool isRobotsTxtFile ( char *u , int32_t ulen ) {
 bool XmlDoc::isContainerDoc ( ) {
 	if ( m_firstUrlValid && m_firstUrl.isWarc() ) return true;
 	if ( m_firstUrlValid && m_firstUrl.isArc () ) return true;
+	if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
 	if ( m_contentDelim ) return true;
 	return false;
 }
@ -3052,6 +3073,11 @@ bool XmlDoc::indexContainerDoc ( ) {
 	if ( ! m_anyContentPtr ) {
 		// init the content cursor to point to the first subdoc
 		m_anyContentPtr = *cpp;
+		// but skip over an initial separator if one is there. a
+		// leading delimiter is a faux pas.
+		int32_t dlen = gbstrlen(m_contentDelim);
+		if ( strncmp(m_anyContentPtr,m_contentDelim,dlen) == 0 )
+			m_anyContentPtr += dlen;
 		// init the input parms
 		memset ( gr , 0 , sizeof(GigablastRequest) );
 		// reset it
@ -3063,32 +3089,30 @@ bool XmlDoc::indexContainerDoc ( ) {
 		// will this work on a content delimeterized doc?
 		gr->m_deleteUrl = m_deleteFromIndex;
 		// each subdoc will have a mime since it is an arc
-		gr->m_hasMime = true;
+		gr->m_hasMime = m_subDocsHaveMime;//true;
 	}

 subdocLoop:

 	QUICKPOLL ( m_niceness );

+	// EOF?
+	if ( m_anyContentPtr == (char *)-1 ) 
+		return true;
+
 	// we had \0 terminated the end of the previous record, so put back
 	if ( m_savedChar && ! *m_anyContentPtr ) {
 		*m_anyContentPtr = m_savedChar;
 		m_anyContentPtr += gbstrlen(m_contentDelim);
 	}

-	// EOF?
-	if ( ! *m_anyContentPtr ) return true;
-
-	// . should have the url as well.
-	// . the url, ip etc. are on a single \n terminated line for an arc!
-	char *separator = strstr(m_anyContentPtr,m_contentDelim);

 	// index this subdoc
 	gr->m_content = m_anyContentPtr;

-	// these are not defined. will be autoset in set4() i guess.
-	gr->m_firstIndexed = 0;
-	gr->m_lastSpidered = 0;
+	// . should have the url as well.
+	// . the url, ip etc. are on a single \n terminated line for an arc!
+	char *separator = strstr(m_anyContentPtr,m_contentDelim);

 	if ( separator ) {
 		m_savedChar = *separator;
@ -3096,6 +3120,63 @@ bool XmlDoc::indexContainerDoc ( ) {
 		*m_anyContentPtr = '\0';
 	}

+	// if no separator found, this is our last injection
+	if ( ! separator )
+		m_anyContentPtr = (char *)-1;
+
+	// these are not defined. will be autoset in set4() i guess.
+	gr->m_firstIndexed = 0;
+	gr->m_lastSpidered = 0;
+
+	bool setUrl = false;
+
+	// HOWEVER, if hasMime is true and an http:// or https:// url
+	// follows the delimiter then use that as the url...
+	// this way we can specify our own urls.
+	if ( gr->m_hasMime ) {
+		char *du = gr->m_content;
+		//du += gbstrlen(delim);
+		if ( du && is_wspace_a ( *du ) ) du++;
+		if ( du && is_wspace_a ( *du ) ) du++;
+		if ( du && is_wspace_a ( *du ) ) du++;
+		if ( gr->m_hasMime && 
+		     (strncasecmp( du,"http://",7) == 0 ||
+		      strncasecmp( du,"https://",8) == 0 ) ) {
+			// flag it
+			setUrl = true;
+			// find end of it
+			char *uend = du + 7;
+			for ( ; *uend && ! is_wspace_a(*uend) ; uend++ );
+			// inject that then
+			m_injectUrlBuf.reset();
+			m_injectUrlBuf.safeMemcpy ( du , uend - du );
+			m_injectUrlBuf.nullTerm();
+			// then point the content just past the url, skipping
+			// the char that ended it (normally a whitespace char)
+			gr->m_content = uend + 1;
+			gr->m_url = m_injectUrlBuf.getBufStart();
+		}
+	}
+
+
+	QUICKPOLL ( m_niceness );
+
+	// make the url from parent url
+	// use hash of the content
+	int64_t ch64 = hash64n ( gr->m_content , 0LL );
+
+	QUICKPOLL ( m_niceness );
+
+	if ( ! setUrl ) {
+		// reset it
+		m_injectUrlBuf.reset();
+		// by default append a -<ch64> to the provided url
+		m_injectUrlBuf.safePrintf("%s-%"UINT64"",
+					  m_firstUrl.getUrl(),ch64);
+		gr->m_url = m_injectUrlBuf.getBufStart();
+	}
+
+
 	if ( ! m_msg7->inject2 ( m_masterState , m_masterLoop ) )
 		// it would block, callback will be called later
 		return false;
--- a/XmlDoc.h
+++ b/XmlDoc.h
@ -473,7 +473,9 @@ class XmlDoc {
 		    int32_t             forcedIp = 0 ,
 		    uint8_t          contentType = CT_HTML ,
 		    uint32_t           spideredTime = 0 , // time_t
-		    bool             contentHasMime = false ) ;
+		    bool             contentHasMime = false ,
+		    // for container docs, what is the separator of subdocs?
+		    char            *contentDelim = NULL ) ;

 	// we now call this right away rather than at download time!
 	int32_t getSpideredTime();
@ -1057,6 +1059,8 @@ class XmlDoc {
 	char *m_arcContentPtr;
 	char *m_anyContentPtr;
 	char *m_contentDelim;
+	SafeBuf m_injectUrlBuf;
+	bool m_subDocsHaveMime;

 	// . same thing, a little more complicated
 	// . these classes are only set on demand
@ -1181,6 +1185,7 @@ class XmlDoc {
 	char     m_addedSpiderReplySizeValid;
 	char     m_addedStatusDocSizeValid;
 	char     m_downloadStartTimeValid;
+	char     m_contentDelimValid;
 	//char   m_docQualityValid;
 	char     m_siteValid;
 	char     m_startTimeValid;
@ -2418,7 +2423,9 @@ class XmlDoc {

 			 uint32_t firstIndexedTime = 0,
 			 uint32_t lastSpideredDate = 0 ,
-			 int32_t  injectDocIp = 0 );
+			 int32_t  injectDocIp = 0 ,
+			 // for container docs consisting of subdocs to inject
+			 char *contentDelim = NULL );


 	bool injectLinks  ( HashTableX *linkDedupTable ,