Merge branch 'ia' into ia-zak

Matt 2015-09-30 07:58:31 -06:00
commit cb4bbe8892
4 changed files with 171 additions and 21 deletions


@@ -5702,10 +5702,12 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
*/
if ( mr->size_metadataBuf && si->m_format == FORMAT_JSON) {
sb->safePrintf("\t\t\"metadata\":");
sb->safeMemcpy(mr->ptr_metadataBuf, mr->size_metadataBuf);
sb->pushChar(',');
sb->safePrintf("\t\t\"metadata\":[");
//sb->safeMemcpy(mr->ptr_metadataBuf, mr->size_metadataBuf);
sb->safeStrcpy(mr->ptr_metadataBuf);
// without this \n we seem to lose our ']'; i guess it gets
// backed up over
sb->safePrintf("],\n");
}
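
To make the change above concrete, here is a minimal standalone sketch in plain C++ (std::string rather than Gigablast's SafeBuf; the buffer contents are made up) of why the new code brackets the buffer: after an EDOCUNCHANGED re-injection appends more metadata (see appendNewMetaInfo() below), ptr_metadataBuf holds several comma-separated JSON objects, and the old bare-object-plus-trailing-comma output would no longer be valid JSON.

#include <cstdio>
#include <string>

int main() {
	// what ptr_metadataBuf might hold once a second injection's
	// metadata has been merged in (illustrative values)
	std::string metadataBuf = "{\"qatesttime\":197},\n{\"qatesttime\":198}";
	std::string out;
	out += "\t\t\"metadata\":[";   // new code opens a JSON array
	out += metadataBuf;            // already-comma-separated objects
	out += "],\n";                 // and closes it, keeping the serp valid
	printf("%s", out.c_str());
	return 0;
}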


@@ -246,6 +246,8 @@ void XmlDoc::reset ( ) {
m_redirUrl.reset();
m_updatedMetaData = false;
m_ipStartTime = 0;
m_ipEndTime = 0;
m_diffbotReplyRetries = 0;
@@ -21681,7 +21683,6 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
else
sb->safePrintf("addlistsize=%05"INT32" ",(int32_t)0);
if ( m_addedSpiderRequestSizeValid )
sb->safePrintf("addspiderreqsize=%05"INT32" ",
m_addedSpiderRequestSize);
@@ -21969,6 +21970,9 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
if ( m_httpStatusValid && m_httpStatus != 200 )
sb->safePrintf("httpstatus=%"INT32" ",(int32_t)m_httpStatus);
if ( m_updatedMetaData )
sb->safePrintf("updatedmetadata=1 ");
if ( m_isDupValid && m_isDup )
sb->safePrintf("dupofdocid=%"INT64" ",m_docIdWeAreADupOf);
@@ -23683,9 +23687,21 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
size_linkInfo1 = 0;
m_linkInfo1Valid = false;
bool indexNewTimeStamp = false;
if ( getUseTimeAxis() &&
     od &&
     m_hasMetadata &&
     *indexCode == EDOCUNCHANGED
     //m_spideredTimeValid &&
     //od->m_spideredTime != m_spideredTime
     )
	indexNewTimeStamp = true;
// . if not using spiderdb we are done at this point
// . this happens for diffbot json replies (m_dx)
if ( ! m_useSpiderdb ) {
if ( ! m_useSpiderdb && ! indexNewTimeStamp ) {
m_metaList = NULL;
m_metaListSize = 0;
return (char *)0x01;
@@ -23724,6 +23740,19 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
// blocked?
if (spiderStatusDocMetaList==(void *)-1)
return (char *)-1;
// . now append the new stuff.
// . we overwrite the old titlerec with the new one that has
// some more json in the ptr_metadata buffer, so we hash
// its new timestamp, 'gbspiderdate', and any meta info
// given in the injection request, if there. this allows you
// to tag each document, even multiple versions of the same
// url with the same content, so if you spider the doc again
// and it is unchanged since last time we still index some
// of this meta stuff.
if ( indexNewTimeStamp )
appendNewMetaInfo (spiderStatusDocMetaList,forDelete);
// need to alloc space for it too
int32_t len = spiderStatusDocMetaList->length();
needx += len;
@@ -28667,6 +28696,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
// . this is in LinkInfo::hash
//if ( ! hashMinInlinks ( table , linkInfo ) ) return NULL;
if ( ! hashMetaData ( table ) ) return NULL;
// return true if we don't need to print parser info
//if ( ! m_pbuf ) return true;
// print out the table into g_bufPtr now if we need to
@@ -28695,6 +28726,78 @@ int32_t XmlDoc::getBoostFromSiteNumInlinks ( int32_t inlinks ) {
return boost1;
}
bool XmlDoc::appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) {
// set4() called from the inject sets these two things for meta data
// which is basically json that augments the doc, tags it with stuff
if ( ! m_hasMetadata ) return true;
if ( ! ptr_metadata ) return true;
XmlDoc **pod = getOldXmlDoc ( );
if ( ! pod ) { char *xx=NULL;*xx=0; }
if ( pod == (XmlDoc **)-1 ) { char *xx=NULL;*xx=0; }
// this is non-NULL if it existed
XmlDoc *od = *pod;
// wtf?
if ( ! od ) return true;
SafeBuf md;
// copy over and append
if ( ! md.safeMemcpy ( od->ptr_metadata , od->size_metadata ) )
return false;
// remove trailing \0 if there
md.removeLastChar ( '\0' );
// separate from the new stuff
if ( ! md.safePrintf(",\n") )
return false;
if ( ! md.safeMemcpy ( ptr_metadata , size_metadata ) )
return false;
if ( ! md.nullTerm ( ) )
return false;
// update his meta data
od->ptr_metadata = md.getBufStart();
od->size_metadata = md.length();
// size a temp posdb table generously for the keys we will hash
int32_t nw = gbstrlen(ptr_metadata) * 4;
HashTableX tt1;
int32_t need4 = nw * 4 + 5000;
if ( ! tt1.set ( 18 , 4 , need4,NULL,0,false,m_niceness,"posdb-i2"))
return false;
od->hashMetaData ( &tt1 );
// store the posdb keys from tt1 into our safebuf, tmp
SafeBuf sb;
if ( m_usePosdb && ! addTable144 ( &tt1 , od->m_docId , &sb ) )
return false;
int64_t uh48 = m_firstUrl.getUrlHash48();
// and re-formulate (and compress) his new title rec
SafeBuf trec;
if ( ! od->setTitleRecBuf ( &trec , od->m_docId , uh48 ) )
return false;
// store the posdb keys in the meta list
if ( m_usePosdb && ! metaList->safeMemcpy ( &sb ) )
return false;
// store the updated titlerec into the meta list
if ( m_useTitledb && ! metaList->pushChar(RDB_TITLEDB) )
return false;
if ( m_useTitledb && ! metaList->safeMemcpy(&trec) )
return false;
m_updatedMetaData = true;
return true;
}
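
As a rough sketch of the buffer surgery appendNewMetaInfo() performs, here is the same merge expressed with std::string (function name and parameters are stand-ins, assuming, as the code above does, that both buffers hold JSON objects and the stored one is null-terminated):

#include <string>

// oldMeta/newMeta stand in for od->ptr_metadata and ptr_metadata.
std::string mergeMetadataSketch(std::string oldMeta, const std::string &newMeta) {
	// drop the trailing '\0' the stored buffer carries,
	// like md.removeLastChar('\0')
	if (!oldMeta.empty() && oldMeta.back() == '\0')
		oldMeta.pop_back();
	// separate old object(s) from the new one, like md.safePrintf(",\n")
	oldMeta += ",\n";
	oldMeta += newMeta;
	// result: a comma-separated run of JSON objects, which is exactly
	// why printResult() above wraps ptr_metadataBuf in [...]
	return oldMeta;
}

The rest of the function then re-hashes the merged buffer into posdb keys and rewrites the old doc's titlerec so the augmented metadata persists.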
// . this is kinda hacky because it uses a short XmlDoc on the stack
// . no need to hash this stuff for regular documents since all the terms
// are fielded by gberrorstr, gberrornum or gbisreply.
@@ -29555,16 +29658,45 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
}
if(ptr_metadata) {
	Json jpMetadata;
	if (jpMetadata.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)){
		hashJSONFields2 ( tt , &hi , &jpMetadata , false );
	} else {
		log("XmlDoc had error parsing json in metadata %s", ptr_metadata);
	}
}
return true;
}
bool XmlDoc::hashMetaData ( HashTableX *tt ) {
if ( ! ptr_metadata ) return true;
Json jp;
if ( ! jp.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)) {
log("XmlDoc had error parsing json in metadata %s",
ptr_metadata);
return false;
}
// set up the hashing parms
HashInfo hi;
hi.m_hashGroup = HASHGROUP_INMETATAG;
hi.m_tt = tt;
hi.m_desc = "meta data";
hi.m_useCountTable = false;
// always reset the word pos to 0 now when hashing a json field
// since it shouldn't matter because they are in a field, so we
// have to search like myfield:whatever. this way we can
// augment ptr_metadata on an EDOCUNCHANGED error and
// not end up with undeleteable data in posdb. if we have
// duplicate fields in our doc and our doc is json, we could have
// some word position conflicts, which kinda sucks, but can be
// avoided because this is HASHGROUP_INMETATAG, but should really
// be HASHGROUP_INMETADATA just to be sure.
int32_t saved = m_dist;
m_dist = 0;
hashJSONFields2 ( tt , &hi , &jp , false );
m_dist = saved;
return true;
}
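
A toy illustration of the m_dist = 0 trick described in the comment above (key layout heavily simplified; real posdb keys are 18 bytes and carry far more state): if the word position folded into a key never depends on where the field landed in the document, re-hashing identical metadata yields identical keys, so a later delete can match exactly what an earlier EDOCUNCHANGED augment added.

#include <cassert>
#include <cstdint>

// Toy posdb key: just term id, doc id and word position.
struct ToyKey { int64_t termId; int64_t docId; int32_t wordPos; };

ToyKey hashField(int64_t termId, int64_t docId, int32_t dist) {
	// dist is the running word position; pinning it to 0 makes the
	// key reproducible across separate indexing passes
	return ToyKey{ termId, docId, dist };
}

int main() {
	ToyKey first  = hashField(42, 7, 0);  // initial injection
	ToyKey second = hashField(42, 7, 0);  // EDOCUNCHANGED re-augment
	assert(first.wordPos == second.wordPos);  // keys match, so deletable
	return 0;
}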
@@ -52104,6 +52236,7 @@ char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
totalHash32 ^= vh32;
}
*/
// index like "title:whatever"
hi->m_prefix = name;
hashString ( val , vlen , hi );
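
For reference, a sketch of what field-prefixed indexing amounts to; the hash function and the way prefix and value combine below are illustrative stand-ins, not Gigablast's actual scheme. The point is that the field name is folded into the term hash, so a value only matches fielded queries like title:whatever or qatesttime:197.

#include <cstdint>

// FNV-1a stand-in for the engine's own 64-bit term hash.
static uint64_t h64(const char *s) {
	uint64_t v = 1469598103934665603ULL;
	for ( ; *s ; s++ ) { v ^= (uint8_t)*s; v *= 1099511628211ULL; }
	return v;
}

// "whatever" indexed under prefix "title" becomes a different term
// than bare "whatever", so only title:whatever retrieves it.
static uint64_t fieldedTermHash(const char *prefix, const char *value) {
	return h64(prefix) ^ ( h64(value) * 0x9E3779B97F4A7C15ULL );
}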


@@ -521,6 +521,7 @@ class XmlDoc {
bool setTitleRecBuf ( SafeBuf *buf , int64_t docId, int64_t uh48 );
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
SafeBuf *getTitleRecBuf ( );
bool appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) ;
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
bool forDelete ) ;
SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
@@ -768,6 +769,8 @@ class XmlDoc {
uint64_t m_ipStartTime;
uint64_t m_ipEndTime;
bool m_updatedMetaData;
void copyFromOldDoc ( class XmlDoc *od ) ;
class SpiderReply *getFakeSpiderReply ( );
@@ -813,6 +816,7 @@ class XmlDoc {
int32_t getBoostFromSiteNumInlinks ( int32_t inlinks ) ;
bool hashSpiderReply (class SpiderReply *reply ,class HashTableX *tt) ;
bool hashMetaTags ( class HashTableX *table ) ;
bool hashMetaData ( class HashTableX *table ) ;
bool hashIsClean ( class HashTableX *table ) ;
bool hashZipCodes ( class HashTableX *table ) ;
bool hashMetaZip ( class HashTableX *table ) ;

qa.cpp

@@ -1455,6 +1455,13 @@ bool qaTimeAxis ( ) {
"format=xml&u=");
sb.urlEncode ( s_urlPtrs[s_flags[URL_COUNTER]]);
sb.safePrintf("&hasmime=1");
// add some meta data now, the current time stamp, so we can
// make sure the meta data is updated even if it's EDOCUNCHANGED
sb.safePrintf("&metadata=");
static int32_t s_count9 = 0;
SafeBuf tmp;
tmp.safePrintf("{\"qatesttime\":%"INT32"}\n",s_count9++);
sb.urlEncode ( tmp.getBufStart(), tmp.getLength() );
sb.safePrintf("&content=");
sb.urlEncode(s_contentPtrs[contentIndex]);
@@ -1490,13 +1497,17 @@ bool qaTimeAxis ( ) {
return false;
}
// if ( ! s_flags[EXAMINE_RESULTS] ) {
// s_flags[16] = true;
// if ( ! getUrl ( "/search?c=qatest123&qa=1&q=%2Bthe"
// "&dsrt=500",
// 702467314 ) )
// return false;
// }
// this doc should have qatesttime:197 and qatesttime:198
// since it had an EDOCUNCHANGED error the 2nd time around but
// different metadata.
if ( ! s_flags[EXAMINE_RESULTS1] ) {
s_flags[16] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&"
"format=json&"
"q=qatesttime:197",
702467314 ) )
return false;
}
return true;
}
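
Tying the test back to the merge logic, a sketch of the metadata fragment the serp entry for q=qatesttime:197 should carry after the second, EDOCUNCHANGED injection (counter values mirror the comment above; the exact serp layout is illustrative):

#include <cstdio>

int main() {
	// both timestamps survive because appendNewMetaInfo() merged the
	// second injection's metadata instead of dropping it
	printf("\"metadata\":[{\"qatesttime\":197},\n"
	       "{\"qatesttime\":198}],\n");
	return 0;
}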
@@ -1634,7 +1645,7 @@ bool qaInjectMetadata ( ) {
char* metadata = "{\"testtest\":42,\"a-hyphenated-name\":5, "
"\"a-string-value\":\"can we search for this\", "
"an array:['a','b', 'c', 1,2,3], "
"\"an array\":[\"a\",\"b\", \"c\", 1,2,3], "
"\"a field with spaces\":6, \"compound\":{\"field\":7}}";
s_flags[ADD_INITIAL_URLS]++;