mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
Merge branch 'ia' into ia-zak
This commit is contained in:
commit
cb4bbe8892
@@ -5702,10 +5702,12 @@ bool printResult ( State0 *st, int32_t ix , int32_t *numPrintedSoFar ) {
|
||||
*/
|
||||
|
||||
if ( mr->size_metadataBuf && si->m_format == FORMAT_JSON) {
|
||||
sb->safePrintf("\t\t\"metadata\":");
|
||||
sb->safeMemcpy(mr->ptr_metadataBuf, mr->size_metadataBuf);
|
||||
sb->pushChar(',');
|
||||
|
||||
sb->safePrintf("\t\t\"metadata\":[");
|
||||
//sb->safeMemcpy(mr->ptr_metadataBuf, mr->size_metadataBuf);
|
||||
sb->safeStrcpy(mr->ptr_metadataBuf);
|
||||
// without this \n we seem to lose our ] i guess it gets
|
||||
// backed up over
|
||||
sb->safePrintf("],\n");
|
||||
}
|
||||
|
||||
|
||||
|
151
XmlDoc.cpp
151
XmlDoc.cpp
@@ -246,6 +246,8 @@ void XmlDoc::reset ( ) {
|
||||
|
||||
m_redirUrl.reset();
|
||||
|
||||
m_updatedMetaData = false;
|
||||
|
||||
m_ipStartTime = 0;
|
||||
m_ipEndTime = 0;
|
||||
m_diffbotReplyRetries = 0;
|
||||
@ -21681,7 +21683,6 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
|
||||
else
|
||||
sb->safePrintf("addlistsize=%05"INT32" ",(int32_t)0);
|
||||
|
||||
|
||||
if ( m_addedSpiderRequestSizeValid )
|
||||
sb->safePrintf("addspiderreqsize=%05"INT32" ",
|
||||
m_addedSpiderRequestSize);
|
||||
@ -21969,6 +21970,9 @@ bool XmlDoc::logIt ( SafeBuf *bb ) {
|
||||
if ( m_httpStatusValid && m_httpStatus != 200 )
|
||||
sb->safePrintf("httpstatus=%"INT32" ",(int32_t)m_httpStatus);
|
||||
|
||||
if ( m_updatedMetaData )
|
||||
sb->safePrintf("updatedmetadata=1 ");
|
||||
|
||||
if ( m_isDupValid && m_isDup )
|
||||
sb->safePrintf("dupofdocid=%"INT64" ",m_docIdWeAreADupOf);
|
||||
|
||||
@ -23683,9 +23687,21 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
size_linkInfo1 = 0;
|
||||
m_linkInfo1Valid = false;
|
||||
|
||||
bool indexNewTimeStamp = false;
|
||||
if ( getUseTimeAxis() &&
|
||||
od &&
|
||||
m_hasMetadata &&
|
||||
*indexCode == EDOCUNCHANGED
|
||||
//m_spideredTimeValid &&
|
||||
//od->m_spideredTime != m_spideredTime
|
||||
)
|
||||
indexNewTimeStamp = true;
|
||||
|
||||
|
||||
|
||||
// . if not using spiderdb we are done at this point
|
||||
// . this happens for diffbot json replies (m_dx)
|
||||
if ( ! m_useSpiderdb ) {
|
||||
if ( ! m_useSpiderdb && ! indexNewTimeStamp ) {
|
||||
m_metaList = NULL;
|
||||
m_metaListSize = 0;
|
||||
return (char *)0x01;
|
||||
@ -23724,6 +23740,19 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
// blocked?
|
||||
if (spiderStatusDocMetaList==(void *)-1)
|
||||
return (char *)-1;
|
||||
|
||||
// . now append the new stuff.
|
||||
// . we overwrite the old titlerec with the new one that has
|
||||
// some more json in the ptr_metaInfo buffer so we hash
|
||||
// its new timestamp. 'gbspiderdate' and any info from
|
||||
// the meta info given in the injection request if there.
|
||||
// this allows you to tag each document, even multiple
|
||||
// versions of the same url with the same content. so if
|
||||
// you spider the doc again and it is unchanged since last
|
||||
// time we still index some of this meta stuff.
|
||||
if ( indexNewTimeStamp )
|
||||
appendNewMetaInfo (spiderStatusDocMetaList,forDelete);
|
||||
|
||||
// need to alloc space for it too
|
||||
int32_t len = spiderStatusDocMetaList->length();
|
||||
needx += len;
|
||||
@ -28667,6 +28696,8 @@ char *XmlDoc::hashAll ( HashTableX *table ) {
|
||||
// . this is in LinkInfo::hash
|
||||
//if ( ! hashMinInlinks ( table , linkInfo ) ) return NULL;
|
||||
|
||||
if ( ! hashMetaData ( table ) ) return NULL;
|
||||
|
||||
// return true if we don't need to print parser info
|
||||
//if ( ! m_pbuf ) return true;
|
||||
// print out the table into g_bufPtr now if we need to
|
||||
@ -28695,6 +28726,78 @@ int32_t XmlDoc::getBoostFromSiteNumInlinks ( int32_t inlinks ) {
|
||||
return boost1;
|
||||
}
|
||||
|
||||
// . Merge newly-injected metadata (ptr_metadata) into the OLD version of this
//   document and append the resulting posdb keys + rewritten titlerec to the
//   caller's meta list.
// . Called from getMetaList() when an unchanged doc (EDOCUNCHANGED) was
//   re-injected with fresh metadata, so the new json tags still get indexed.
// . metaList  : output buffer the rdb records are appended to.
// . forDelete : accepted for signature parity with getMetaList(); not read
//               anywhere in this body.
// . Returns true on success or benign no-op, false on alloc/serialize failure.
bool XmlDoc::appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) {

	// set4() called from the inject sets these two things for meta data
	// which is basically json that augments the doc, tags it with stuff
	if ( ! m_hasMetadata ) return true;
	if ( ! ptr_metadata ) return true;

	// the previously-indexed version of this url. by the time we are
	// called this must already be resolved: getOldXmlDoc() returning
	// NULL or -1 (would-block) here is a logic error, so crash loudly.
	XmlDoc **pod = getOldXmlDoc ( );
	if ( ! pod ) { char *xx=NULL;*xx=0; }
	if ( pod == (XmlDoc **)-1 ) { char *xx=NULL;*xx=0; }
	// this is non-NULL if it existed
	XmlDoc *od = *pod;

	// wtf? no old doc means nothing to augment -- treat as a no-op
	if ( ! od ) return true;

	// scratch buffer holding old-metadata + "," + new-metadata
	SafeBuf md;

	// copy over and append
	if ( ! md.safeMemcpy ( od->ptr_metadata , od->size_metadata ) )
		return false;
	// remove trailing \0 if there
	md.removeLastChar ( '\0' );
	// separate from the new stuff
	if ( ! md.safePrintf(",\n") )
		return false;

	if ( ! md.safeMemcpy ( ptr_metadata , size_metadata ) )
		return false;

	if ( ! md.nullTerm ( ) )
		return false;
	// update his meta data
	// NOTE(review): md is a stack SafeBuf, so od->ptr_metadata points at
	// memory that dies when this function returns. That is fine for the
	// hashMetaData()/setTitleRecBuf() calls below, but dangles afterwards
	// -- presumably od is not used again after this; TODO confirm.
	od->ptr_metadata = md.getBufStart();
	od->size_metadata = md.length();

	// rough sizing: treat every byte of metadata as a potential word,
	// over-reserving (x4, then x4 again below) to avoid table growth
	int32_t nw = gbstrlen(ptr_metadata) * 4;

	// temp posdb term table for the merged metadata
	HashTableX tt1;
	int32_t need4 = nw * 4 + 5000;
	// 18-byte keys / 4-byte data = posdb key format
	if ( ! tt1.set ( 18 , 4 , need4,NULL,0,false,m_niceness,"posdb-i2"))
		return false;

	// hash the merged metadata under the OLD doc so term positions and
	// docid-derived fields match what is already in the index
	od->hashMetaData ( &tt1 );

	// store the posdb keys from tt1 into our safebuf, tmp
	SafeBuf sb;
	if ( m_usePosdb && ! addTable144 ( &tt1 , od->m_docId , &sb ) )
		return false;

	int64_t uh48 = m_firstUrl.getUrlHash48();

	// and re-formulate (and compress) his new title rec
	SafeBuf trec;
	if ( ! od->setTitleRecBuf ( &trec , od->m_docId , uh48 ) )
		return false;

	// store the posdb keys in the meta list
	if ( m_usePosdb && ! metaList->safeMemcpy ( &sb ) )
		return false;

	// store the updated titlerec into the meta list
	if ( m_useTitledb && ! metaList->pushChar(RDB_TITLEDB) )
		return false;
	if ( m_useTitledb && ! metaList->safeMemcpy(&trec) )
		return false;

	// flag for logIt() so the "updatedmetadata=1 " line gets logged
	m_updatedMetaData = true;

	return true;
}
|
||||
|
||||
// . this is kinda hacky because it uses a short XmlDoc on the stack
|
||||
// . no need to hash this stuff for regular documents since all the terms
|
||||
// are fielded by gberrorstr, gberrornum or gbisreply.
|
||||
@@ -29555,16 +29658,45 @@ bool XmlDoc::hashMetaTags ( HashTableX *tt ) {
|
||||
}
|
||||
|
||||
|
||||
if(ptr_metadata) {
|
||||
Json jpMetadata;
|
||||
return true;
|
||||
}
|
||||
|
||||
if (jpMetadata.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)){
|
||||
hashJSONFields2 ( tt , &hi , &jpMetadata , false );
|
||||
} else {
|
||||
log("XmlDoc had error parsing json in metadata %s", ptr_metadata);
|
||||
}
|
||||
|
||||
// . Hash the document's injected json metadata (ptr_metadata) into the posdb
//   term table 'tt' so its fields are searchable as myfield:value terms.
// . Returns true if there is nothing to hash or hashing succeeded; false when
//   ptr_metadata is not parseable json (the error is logged).
bool XmlDoc::hashMetaData ( HashTableX *tt ) {

	// no metadata attached to this doc -- nothing to do
	if ( ! ptr_metadata ) return true;

	Json jp;

	// ptr_metadata must be a json object string; bail out (and log the
	// raw string) if the parse fails
	if ( ! jp.parseJsonStringIntoJsonItems (ptr_metadata, m_niceness)) {
		log("XmlDoc had error parsing json in metadata %s",
		    ptr_metadata);
		return false;
	}

	// set up the hashing parms
	HashInfo hi;
	hi.m_hashGroup = HASHGROUP_INMETATAG;
	hi.m_tt = tt;
	hi.m_desc = "meta data";
	hi.m_useCountTable = false;

	// always reset to word pos to 0 now when hashing a json field
	// since it shouldn't matter because they are in a field so we
	// have to search like myfield:whatever. this way we can
	// augment ptr_metadata on an EDOCUNCHANGED error and
	// not end up with undeleteable data in posdb. if we have
	// duplicate fields in our doc and our doc is json, we could have
	// some word position conflicts, which kinda sucks, but can be
	// avoided becomes this is HASHGROUP_INMETATAG, but should really
	// be HASHGROUP_INMETADATA just to be sure.
	int32_t saved = m_dist;
	m_dist = 0;

	// reuse the json-document field hasher for the metadata object
	hashJSONFields2 ( tt , &hi , &jp , false );

	// restore the word-position cursor for subsequent hashing
	m_dist = saved;

	return true;
}
|
||||
|
||||
@ -52104,6 +52236,7 @@ char *XmlDoc::hashJSONFields2 ( HashTableX *table ,
|
||||
totalHash32 ^= vh32;
|
||||
}
|
||||
*/
|
||||
|
||||
// index like "title:whatever"
|
||||
hi->m_prefix = name;
|
||||
hashString ( val , vlen , hi );
|
||||
|
4
XmlDoc.h
4
XmlDoc.h
@ -521,6 +521,7 @@ class XmlDoc {
|
||||
bool setTitleRecBuf ( SafeBuf *buf , int64_t docId, int64_t uh48 );
|
||||
// sets m_titleRecBuf/m_titleRecBufValid/m_titleRecKey[Valid]
|
||||
SafeBuf *getTitleRecBuf ( );
|
||||
bool appendNewMetaInfo ( SafeBuf *metaList , bool forDelete ) ;
|
||||
SafeBuf *getSpiderStatusDocMetaList ( class SpiderReply *reply ,
|
||||
bool forDelete ) ;
|
||||
SafeBuf *getSpiderStatusDocMetaList2 ( class SpiderReply *reply ) ;
|
||||
@ -768,6 +769,8 @@ class XmlDoc {
|
||||
uint64_t m_ipStartTime;
|
||||
uint64_t m_ipEndTime;
|
||||
|
||||
bool m_updatedMetaData;
|
||||
|
||||
void copyFromOldDoc ( class XmlDoc *od ) ;
|
||||
|
||||
class SpiderReply *getFakeSpiderReply ( );
|
||||
@ -813,6 +816,7 @@ class XmlDoc {
|
||||
int32_t getBoostFromSiteNumInlinks ( int32_t inlinks ) ;
|
||||
bool hashSpiderReply (class SpiderReply *reply ,class HashTableX *tt) ;
|
||||
bool hashMetaTags ( class HashTableX *table ) ;
|
||||
bool hashMetaData ( class HashTableX *table ) ;
|
||||
bool hashIsClean ( class HashTableX *table ) ;
|
||||
bool hashZipCodes ( class HashTableX *table ) ;
|
||||
bool hashMetaZip ( class HashTableX *table ) ;
|
||||
|
27
qa.cpp
27
qa.cpp
@ -1455,6 +1455,13 @@ bool qaTimeAxis ( ) {
|
||||
"format=xml&u=");
|
||||
sb.urlEncode ( s_urlPtrs[s_flags[URL_COUNTER]]);
|
||||
sb.safePrintf("&hasmime=1");
|
||||
// add some meta data now, the current time stamp so we can
|
||||
// make sure the meta data is updated even if its EDOCUNCHANGED
|
||||
sb.safePrintf("&metadata=");
|
||||
static int32_t s_count9 = 0;
|
||||
SafeBuf tmp;
|
||||
tmp.safePrintf("{\"qatesttime\":%"INT32"}\n",s_count9++);
|
||||
sb.urlEncode ( tmp.getBufStart(), tmp.getLength() );
|
||||
sb.safePrintf("&content=");
|
||||
sb.urlEncode(s_contentPtrs[contentIndex]);
|
||||
|
||||
@ -1490,13 +1497,17 @@ bool qaTimeAxis ( ) {
|
||||
return false;
|
||||
}
|
||||
|
||||
// if ( ! s_flags[EXAMINE_RESULTS] ) {
|
||||
// s_flags[16] = true;
|
||||
// if ( ! getUrl ( "/search?c=qatest123&qa=1&q=%2Bthe"
|
||||
// "&dsrt=500",
|
||||
// 702467314 ) )
|
||||
// return false;
|
||||
// }
|
||||
// this doc should have qatesttime:197 and qatesttime:198
|
||||
// since it had a EDOCUNCHANGED error the 2nd time around but
|
||||
// different metadata.
|
||||
if ( ! s_flags[EXAMINE_RESULTS1] ) {
|
||||
s_flags[16] = true;
|
||||
if ( ! getUrl ( "/search?c=qatest123&qa=1&"
|
||||
"format=json&"
|
||||
"q=qatesttime:197",
|
||||
702467314 ) )
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
@ -1634,7 +1645,7 @@ bool qaInjectMetadata ( ) {
|
||||
|
||||
char* metadata = "{\"testtest\":42,\"a-hyphenated-name\":5, "
|
||||
"\"a-string-value\":\"can we search for this\", "
|
||||
"an array:['a','b', 'c', 1,2,3], "
|
||||
"\"an array\":[\"a\",\"b\", \"c\", 1,2,3], "
|
||||
"\"a field with spaces\":6, \"compound\":{\"field\":7}}";
|
||||
|
||||
s_flags[ADD_INITIAL_URLS]++;
|
||||
|
Loading…
Reference in New Issue
Block a user