mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
spaces in links fix.
added gbssDiffbotUri to gbss docs.
This commit is contained in:
parent
e6a914d882
commit
0a48930ba3
24
Linkdb.cpp
24
Linkdb.cpp
@ -5759,6 +5759,30 @@ bool Links::addLink ( char *link , int32_t linkLen , int32_t nodeNum ,
|
||||
// we now use everything as is for sites like file.org
|
||||
bool addWWW = false;
|
||||
|
||||
/////
|
||||
//
|
||||
// hack fix. if link has spaces in it convert to +'s
|
||||
// will fix urls like those in anchor tags on
|
||||
// http://www.birmingham-boxes.co.uk/catagory.asp
|
||||
//
|
||||
/////
|
||||
bool hasSpaces = false;
|
||||
char tmp[MAX_URL_LEN+1];
|
||||
for ( int32_t k = 0 ; k < linkLen ; k++ ) {
|
||||
if ( link[k] == ' ' ) hasSpaces = true;
|
||||
// watch out for unterminated quotes
|
||||
if ( link[k] == '>' ) { hasSpaces = false; break; }
|
||||
}
|
||||
for ( int32_t k=0;hasSpaces && linkLen<MAX_URL_LEN && k<linkLen ;k++){
|
||||
tmp[k ] = link[k];
|
||||
if ( tmp[k] == ' ' ) tmp[k] = '+';
|
||||
tmp[k+1] = '\0';
|
||||
}
|
||||
if ( hasSpaces )
|
||||
link = tmp;
|
||||
|
||||
|
||||
|
||||
url.set ( m_baseUrl ,
|
||||
link ,
|
||||
linkLen ,
|
||||
|
32
XmlDoc.cpp
32
XmlDoc.cpp
@ -27227,6 +27227,12 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
|
||||
CollectionRec *cr = getCollRec();
|
||||
if ( ! cr ) return NULL;
|
||||
|
||||
Json *jp1 = NULL;
|
||||
if ( cr->m_isCustomCrawl && m_isDiffbotJSONObject ) {
|
||||
jp1 = getParsedJson();
|
||||
if ( ! jp1 || jp1 == (void *)-1) return (SafeBuf *)jp1;
|
||||
}
|
||||
|
||||
// sanity
|
||||
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
|
||||
|
||||
@ -27287,11 +27293,23 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
|
||||
jd.safePrintf("\"gbssAgeInIndex\":"
|
||||
"%"UINT32",\n",now - od->m_spideredTime);
|
||||
|
||||
if ( cr->m_isCustomCrawl ) {
|
||||
if ( m_isDiffbotJSONObject )
|
||||
jd.safePrintf("\"gbssIsDiffbotObject\":1,\n");
|
||||
if ( cr->m_isCustomCrawl && m_isDiffbotJSONObject ) {
|
||||
jd.safePrintf("\"gbssIsDiffbotObject\":1,\n");
|
||||
JsonItem *jsonItem = NULL;
|
||||
if ( jp1 ) jsonItem = jp1->getItem("diffbotUri");
|
||||
if ( jsonItem ) {
|
||||
jd.safePrintf("\"gbssDiffbotUri\":\"");
|
||||
int32_t vlen;
|
||||
char *val = jsonItem->getValueAsString( &vlen );
|
||||
if ( val ) jd.safeMemcpy ( val , vlen );
|
||||
jd.safePrintf("\",\n");
|
||||
}
|
||||
else
|
||||
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
|
||||
jd.safePrintf("\"gbssDiffbotUri\":"
|
||||
"\"none\",\n");
|
||||
}
|
||||
else if ( cr->m_isCustomCrawl ) {
|
||||
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
|
||||
}
|
||||
|
||||
jd.safePrintf("\"gbssDomain\":\"");
|
||||
@ -27482,8 +27500,8 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
|
||||
return NULL;
|
||||
|
||||
|
||||
Json jp;
|
||||
if ( ! jp.parseJsonStringIntoJsonItems ( jd.getBufStart(),m_niceness)){
|
||||
Json jp2;
|
||||
if (! jp2.parseJsonStringIntoJsonItems ( jd.getBufStart(),m_niceness)){
|
||||
g_errno = EBADJSONPARSER;
|
||||
return NULL;
|
||||
}
|
||||
@ -27502,7 +27520,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
|
||||
hi.m_useSections = false;
|
||||
|
||||
// fill up tt4. false -> do not hash without field prefixes.
|
||||
hashJSONFields2 ( &tt4 , &hi , &jp , false );
|
||||
hashJSONFields2 ( &tt4 , &hi , &jp2 , false );
|
||||
|
||||
/*
|
||||
char buf[64];
|
||||
|
Loading…
Reference in New Issue
Block a user