spaces in links fix.

added gbssDiffbotUri to gbss docs.
This commit is contained in:
Matt Wells 2015-04-24 10:23:07 -06:00
parent e6a914d882
commit 0a48930ba3
2 changed files with 49 additions and 7 deletions

View File

@ -5759,6 +5759,30 @@ bool Links::addLink ( char *link , int32_t linkLen , int32_t nodeNum ,
// we now use everything as is for sites like file.org
bool addWWW = false;
/////
//
// hack fix. if link has spaces in it convert to +'s
// will fix urls like those in anchor tags on
// http://www.birmingham-boxes.co.uk/catagory.asp
//
/////
// Rewrite spaces in the link as '+' before handing it to url.set().
// The rewritten copy lives in tmp, so link is only repointed at tmp
// when that copy was actually made.
bool hasSpaces = false;
char tmp[MAX_URL_LEN+1];
for ( int32_t k = 0 ; k < linkLen ; k++ ) {
	if ( link[k] == ' ' ) hasSpaces = true;
	// watch out for unterminated quotes. a '>' inside the link
	// presumably means we ran past the attribute value into tag
	// markup, so leave such a link untouched.
	if ( link[k] == '>' ) { hasSpaces = false; break; }
}
// only rewrite if the link fits in tmp. previously a link with spaces
// that was MAX_URL_LEN or longer skipped the copy but still got
// pointed at the uninitialized tmp buffer. now it is left as is.
if ( hasSpaces && linkLen < MAX_URL_LEN ) {
	for ( int32_t k = 0 ; k < linkLen ; k++ )
		tmp[k] = ( link[k] == ' ' ) ? '+' : link[k];
	// terminate once, after the full copy
	tmp[linkLen] = '\0';
	link = tmp;
}
url.set ( m_baseUrl ,
link ,
linkLen ,

View File

@ -27227,6 +27227,12 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
CollectionRec *cr = getCollRec();
if ( ! cr ) return NULL;
Json *jp1 = NULL;
if ( cr->m_isCustomCrawl && m_isDiffbotJSONObject ) {
jp1 = getParsedJson();
if ( ! jp1 || jp1 == (void *)-1) return (SafeBuf *)jp1;
}
// sanity
if ( ! m_indexCodeValid ) { char *xx=NULL;*xx=0; }
@ -27287,11 +27293,23 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
jd.safePrintf("\"gbssAgeInIndex\":"
"%"UINT32",\n",now - od->m_spideredTime);
if ( cr->m_isCustomCrawl ) {
if ( m_isDiffbotJSONObject )
jd.safePrintf("\"gbssIsDiffbotObject\":1,\n");
if ( cr->m_isCustomCrawl && m_isDiffbotJSONObject ) {
jd.safePrintf("\"gbssIsDiffbotObject\":1,\n");
JsonItem *jsonItem = NULL;
if ( jp1 ) jsonItem = jp1->getItem("diffbotUri");
if ( jsonItem ) {
jd.safePrintf("\"gbssDiffbotUri\":\"");
int32_t vlen;
char *val = jsonItem->getValueAsString( &vlen );
if ( val ) jd.safeMemcpy ( val , vlen );
jd.safePrintf("\",\n");
}
else
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
jd.safePrintf("\"gbssDiffbotUri\":"
"\"none\",\n");
}
else if ( cr->m_isCustomCrawl ) {
jd.safePrintf("\"gbssIsDiffbotObject\":0,\n");
}
jd.safePrintf("\"gbssDomain\":\"");
@ -27482,8 +27500,8 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
return NULL;
Json jp;
if ( ! jp.parseJsonStringIntoJsonItems ( jd.getBufStart(),m_niceness)){
Json jp2;
if (! jp2.parseJsonStringIntoJsonItems ( jd.getBufStart(),m_niceness)){
g_errno = EBADJSONPARSER;
return NULL;
}
@ -27502,7 +27520,7 @@ SafeBuf *XmlDoc::getSpiderStatusDocMetaList2 ( SpiderReply *reply ) {
hi.m_useSections = false;
// fill up tt4. false -> do not hash without field prefixes.
hashJSONFields2 ( &tt4 , &hi , &jp , false );
hashJSONFields2 ( &tt4 , &hi , &jp2 , false );
/*
char buf[64];