Bring back meta tag display in search results.

Added QA tests for advanced search and API parms.
Various API parm fixes and hides.
Do not test proxies with the test URL if the test URL is empty.
This commit is contained in:
mwells 2014-09-27 15:54:55 -07:00
parent 9d738cdb8b
commit afd41676d2
8 changed files with 270 additions and 64 deletions

View File

@ -51,6 +51,8 @@ bool printPairScore ( SafeBuf *sb , SearchInput *si , PairScore *ps ,
bool printScoresHeader ( SafeBuf *sb ) ;
// prints the values of the user-requested meta tags (si->m_displayMetas)
// for search result #i into "sb", using the display buffer in that
// result's Msg20Reply. defined at the bottom of this file.
bool printMetaContent ( Msg40 *msg40 , long i ,State0 *st, SafeBuf *sb );
bool printSingleScore ( SafeBuf *sb , SearchInput *si , SingleScore *ss ,
Msg20Reply *mr , Msg40 *msg40 ) ;
@ -2279,7 +2281,7 @@ bool printSearchResultsHeader ( State0 *st ) {
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t\t<omitCount>%li</omitCount>\n",
sb->safePrintf("\t<omitCount>%li</omitCount>\n",
msg40->m_omitCount);
if ( si->m_format == FORMAT_JSON )
@ -4436,6 +4438,16 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
if ( si->m_format == FORMAT_HTML && strLen )
sb->safePrintf("<br>\n");
/////////
//
// meta tag values for &dt=keywords ...
//
/////////
if ( mr->ptr_dbuf && mr->size_dbuf>1 )
printMetaContent ( msg40 , ix,st,sb);
////////////
//
// . print DMOZ topics under the summary
@ -8774,41 +8786,53 @@ bool printSearchFiltersBar ( SafeBuf *sb , HttpRequest *hr ) {
s_mi[n].m_icon = NULL;
n++;
// META TAGS
s_mi[n].m_menuNum = 9;
s_mi[n].m_title = "No Meta Tags";
s_mi[n].m_cgi = "dt=";
s_mi[n].m_icon = NULL;
n++;
s_mi[n].m_menuNum = 9;
s_mi[n].m_title = "Show Meta Tags";
s_mi[n].m_cgi = "dt=keywords+description";
s_mi[n].m_icon = NULL;
n++;
// ADMIN
s_mi[n].m_menuNum = 9;
s_mi[n].m_menuNum = 10;
s_mi[n].m_title = "Show Admin View";
s_mi[n].m_cgi = "admin=1";
s_mi[n].m_icon = NULL;
n++;
s_mi[n].m_menuNum = 9;
s_mi[n].m_menuNum = 10;
s_mi[n].m_title = "Show User View";
s_mi[n].m_cgi = "admin=0";
s_mi[n].m_icon = NULL;
n++;
s_mi[n].m_menuNum = 10;
s_mi[n].m_menuNum = 11;
s_mi[n].m_title = "Action";
s_mi[n].m_cgi = "";
s_mi[n].m_icon = NULL;
n++;
s_mi[n].m_menuNum = 10;
s_mi[n].m_menuNum = 11;
s_mi[n].m_title = "Respider all results";
s_mi[n].m_cgi = "/admin/reindex";
s_mi[n].m_icon = NULL;
n++;
s_mi[n].m_menuNum = 10;
s_mi[n].m_menuNum = 11;
s_mi[n].m_title = "Delete all results";
s_mi[n].m_cgi = "/admin/reindex";
s_mi[n].m_icon = NULL;
n++;
s_mi[n].m_menuNum = 10;
s_mi[n].m_menuNum = 11;
s_mi[n].m_title = "Scrape from google/bing";
s_mi[n].m_cgi = "/admin/inject";
s_mi[n].m_icon = NULL;
@ -9097,3 +9121,90 @@ bool replaceParm2 ( char *cgi , SafeBuf *newUrl ,
if ( ! newUrl->nullTerm() ) return false;
return true;
}
// . print the values of the user-requested meta tags for search result #i
// . the requested tag names are in si->m_displayMetas, separated by
//   whitespace. a name may be followed by ":<something>" (a max length
//   specifier), which is skipped here and not applied
// . the values are read from the result's Msg20Reply::ptr_dbuf, which is
//   treated as a sequence of \0-terminated strings; the Nth name is paired
//   with the Nth string. size_dbuf is expected to include the final \0
// . a value is cut off at the first '"', '<', '>', '\r' or '\n' before
//   being printed, and empty values print nothing
// . output depends on si->m_format: a <display> element for XML, a
//   "display.<name>" member for JSON, otherwise a colored HTML line
// . always returns true
// . NOTE(review): the tag name comes from the request and is printed
//   unescaped in all three formats. it should be encoded for the target
//   format -- confirm which SafeBuf encoder to use
bool printMetaContent ( Msg40 *msg40 , long i , State0 *st, SafeBuf *sb ) {
	SearchInput *si = &st->m_si;
	// the list of requested meta tag names
	char *pp = si->m_displayMetas;
	// nothing requested?
	if ( ! pp ) return true;
	char *ppend = pp + gbstrlen(pp);
	if ( pp >= ppend ) return true;
	// the buffer of meta tag values for this result
	Msg20 *m = msg40->m_msg20[i];
	Msg20Reply *mr = m->m_r;
	char *dbuf = mr->ptr_dbuf;
	// need at least one byte of content plus the terminating \0
	if ( ! dbuf || mr->size_dbuf <= 1 ) return true;
	// offset of the last byte of the buffer, which must be the \0 that
	// terminates the last value
	long  dbufLen = mr->size_dbuf - 1;
	char *dbufEnd = dbuf + dbufLen;
	// . assure last byte of dbuf is \0
	// . this insures gbstrlen won't run off the end of dbuf below
	// . it does not change, so check it once instead of every iteration
	if ( *dbufEnd != '\0' ) {
		log(LOG_LOGIC,"query: Meta tag buffer has no \\0.");
		return true;
	}
	char *dptr = dbuf;
	// . loop over the names of the requested meta tags
	// . stop once dptr reaches the final \0. comparing against the byte
	//   before it would drop a one-character last value
	while ( pp < ppend && dptr < dbufEnd ) {
		// skip initial spaces
		while ( pp < ppend && is_wspace_a(*pp) ) pp++;
		// break if done
		if ( ! *pp ) break;
		// that's the start of the meta tag name
		char *ss = pp;
		// . find end of that meta tag name
		// . can end in :<integer> -- specifies max len
		while ( pp < ppend &&
			! is_wspace_a(*pp) &&
			*pp != ':' ) pp++;
		// save current char so we can restore it below
		char  c  = *pp;
		char *cp = pp;
		// temporarily NULL terminate the name in place
		*pp++ = '\0';
		// if ':' was specified, skip the rest
		if ( c == ':' )
			while ( pp < ppend && ! is_wspace_a(*pp) ) pp++;
		// . full length of this value, up to its \0
		// . safe because dptr < dbufEnd and *dbufEnd is \0
		long valLen = gbstrlen ( dptr );
		// . only print up to the first char that could break the
		//   markup we wrap the value in
		long ddlen = valLen;
		for ( long ti = 0 ; ti < valLen ; ti++ ) {
			char d = dptr[ti];
			if ( d != '"'  &&
			     d != '>'  &&
			     d != '<'  &&
			     d != '\r' &&
			     d != '\n'  ) continue;
			ddlen = ti;
			break;
		}
		if ( ddlen > 0 ) {
			// ship it out
			if ( si->m_format == FORMAT_XML ) {
				sb->safePrintf ( "\t\t<display name=\"%s\">"
						 "<![CDATA[", ss );
				sb->cdataEncode ( dptr, ddlen );
				sb->safePrintf ( "]]></display>\n" );
			}
			else if ( si->m_format == FORMAT_JSON ) {
				sb->safePrintf ( "\t\t\"display.%s\":\"",ss);
				sb->jsonEncode ( dptr, ddlen );
				sb->safePrintf ( "\",\n");
			}
			// otherwise, print as a colored html line
			else {
				sb->safePrintf("<font color=#c62939>"
					       "<b>%s</b>: ", ss );
				sb->safeMemcpy ( dptr, ddlen );
				sb->safePrintf ( "</font><br>" );
			}
		}
		// restore tag name buffer
		*cp = c;
		// . point to the next tag's value
		// . advance by the full value length, not the printed length,
		//   otherwise a truncated value leaves dptr in the middle of
		//   this value and every later name gets the wrong content
		dptr += valLen + 1;
	}
	return true;
}

View File

@ -3202,7 +3202,8 @@ bool printApiForPage ( SafeBuf *sb , long PAGENUM , CollectionRec *cr ) {
"<td>STRING</td>"
"<td>output format</td>"
"<td>html</td>"
"<td>Display output in this format.</td>"
"<td>Display output in this format. Can be "
"<i>html</i>, <i>json</i> or <i>xml</i>.</td>"
"</tr>"
, blues[count%2]
, count

129
Parms.cpp
View File

@ -6794,6 +6794,7 @@ void Parms::init ( ) {
m->m_flags = PF_API;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
/*
@ -8694,6 +8695,17 @@ void Parms::init ( ) {
m++;
m->m_title = "use cache";
m->m_desc = "Use 0 if Gigablast should not read or write from "
"any caches at any level.";
m->m_def = "-1";
m->m_off = (char *)&si.m_useCache - y;
m->m_type = TYPE_CHAR;
m->m_cgi = "usecache";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "read from cache";
m->m_desc = "Should we read search results from the cache? Set "
"to false to fix dmoz bug.";
@ -8708,17 +8720,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_SI;
m++;
m->m_title = "use cache";
m->m_desc = "Use 0 if Gigablast should not read or write from "
"any caches at any level.";
m->m_def = "-1";
m->m_off = (char *)&si.m_useCache - y;
m->m_type = TYPE_CHAR;
m->m_cgi = "usecache";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "write to cache";
m->m_desc = "Use 0 if Gigablast should not write to "
"any caches at any level.";
@ -8772,6 +8773,7 @@ void Parms::init ( ) {
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "restrict search to pages that link to this url";
@ -8787,7 +8789,8 @@ void Parms::init ( ) {
m++;
m->m_title = "search for this phrase quoted";
m->m_desc = "The phrase which will be quoted.";
m->m_desc = "The phrase which will be quoted in the query. From the "
"advanced search page, adv.html.";
m->m_off = (char *)&si.m_quote1 - y;
m->m_type = TYPE_CHARPTR;//STRING;
//m->m_size = 512;
@ -8796,10 +8799,12 @@ void Parms::init ( ) {
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "search for this second phrase quoted";
m->m_desc = "The phrase which will be quoted.";
m->m_desc = "The phrase which will be quoted in the query. From the "
"advanced search page, adv.html.";
m->m_off = (char *)&si.m_quote2 - y;
m->m_type = TYPE_CHARPTR;//STRING;
//m->m_size = 512;
@ -8808,6 +8813,7 @@ void Parms::init ( ) {
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
/*
@ -8840,7 +8846,8 @@ void Parms::init ( ) {
m++;
m->m_title = "require these query terms";
m->m_desc = "Returned results will have all the words in X.";
m->m_desc = "Returned results will have all the words in X. "
"From the advanced search page, adv.html.";
m->m_off = (char *)&si.m_plus - y;
m->m_def = NULL;
m->m_type = TYPE_CHARPTR;//STRING;
@ -8850,10 +8857,12 @@ void Parms::init ( ) {
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "avoid these query terms";
m->m_desc = "Returned results will NOT have any of the words in X.";
m->m_desc = "Returned results will NOT have any of the words in X. "
"From the advanced search page, adv.html.";
m->m_off = (char *)&si.m_minus - y;
m->m_type = TYPE_CHARPTR;//STRING;
m->m_cgi = "minus";
@ -8862,6 +8871,7 @@ void Parms::init ( ) {
m->m_sprpp = 0;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "format of the returned search results";
@ -8873,6 +8883,7 @@ void Parms::init ( ) {
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_cgi = "format";
m->m_flags = PF_NOAPI; // alread in the api, so don't repeat
m++;
m->m_title = "family filter";
@ -8903,10 +8914,8 @@ void Parms::init ( ) {
m++;
m->m_title = "cached page highlight query";
m->m_desc = "Highlight the terms in this query instead. For "
"display of the cached page.";
m->m_desc = "Highlight the terms in this query instead.";
m->m_def = NULL;
m->m_off = (char *)&si.m_highlightQuery - y;
m->m_type = TYPE_CHARPTR;//STRING;
@ -8918,6 +8927,7 @@ void Parms::init ( ) {
m->m_obj = OBJ_SI;
m++;
/*
m->m_title = "highlight event date in summaries.";
m->m_desc = "Can be 0 or 1 to respectively disable or enable "
@ -8946,8 +8956,8 @@ void Parms::init ( ) {
*/
m->m_title = "Query match offsets";
m->m_desc = "Return a list of the offsets of each query word"
"actually matched in the document. 1 means byte offset,"
m->m_desc = "Return a list of the offsets of each query word "
"actually matched in the document. 1 means byte offset, "
"and 2 means word offset.";
m->m_def = "0";
m->m_off = (char *)&si.m_queryMatchOffsets - y;
@ -8957,6 +8967,7 @@ void Parms::init ( ) {
m->m_smax = 2;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "boolean status";
@ -9064,21 +9075,46 @@ void Parms::init ( ) {
m++;
*/
m->m_title = "niceness";
m->m_desc = "Can be 0 or 1. 0 is usually a faster, high-priority "
"query, 1 is a slower, lower-priority query.";
m->m_def = "0";
m->m_off = (char *)&si.m_niceness - y;
m->m_type = TYPE_LONG;
m->m_cgi = "niceness";
m->m_smin = 0;
m->m_smax = 1;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "debug flag";
m->m_desc = "Is 1 to log debug information, 0 otherwise.";
m->m_def = "0";
m->m_off = (char *)&si.m_debug - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "debug";
//m->m_priv = 1;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "return number of docs per topic";
m->m_desc = "Use 1 if you want Gigablast to return the number of "
"documents in the search results that contained each topic.";
"documents in the search results that contained each topic "
"(gigabit).";
m->m_def = "1";
m->m_off = (char *)&si.m_returnDocIdCount - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "rdc";
m->m_flags = PF_HIDDEN | PF_NOSAVE;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "return docids per topic";
m->m_desc = "Use 1 if you want Gigablast to return the list of "
"docIds from the search results that contained each topic.";
"docIds from the search results that contained each topic "
"(gigabit).";
m->m_def = "0";
m->m_off = (char *)&si.m_returnDocIds - y;
m->m_type = TYPE_BOOL;
@ -9089,7 +9125,7 @@ void Parms::init ( ) {
m->m_title = "return popularity per topic";
m->m_desc = "Use 1 if you want Gigablast to return the popularity "
"of each topic.";
"of each topic (gigabit).";
m->m_def = "0";
m->m_off = (char *)&si.m_returnPops - y;
m->m_type = TYPE_BOOL;
@ -9099,19 +9135,6 @@ void Parms::init ( ) {
m->m_obj = OBJ_SI;
m++;
m->m_title = "niceness";
m->m_desc = "Can be 0 or 1. 0 is usually a faster, high-priority "
"query, 1 is a slower, lower-priority query.";
m->m_def = "0";
m->m_off = (char *)&si.m_niceness - y;
m->m_type = TYPE_LONG;
m->m_cgi = "niceness";
m->m_smin = 0;
m->m_smax = 1;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
//m->m_title = "compound list max size";
//m->m_desc = "Is the max size in bytes of the compound termlist. "
// "Each document id is 6 bytes.";
@ -9124,23 +9147,12 @@ void Parms::init ( ) {
//m++;
m->m_title = "debug flag";
m->m_desc = "Is 1 to log debug information, 0 otherwise.";
m->m_def = "0";
m->m_off = (char *)&si.m_debug - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "debug";
//m->m_priv = 1;
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
m->m_title = "debug gigabits flag";
m->m_desc = "Is 1 to log gigabits debug information, 0 otherwise.";
m->m_def = "0";
m->m_off = (char *)&si.m_debugGigabits - y;
m->m_type = TYPE_BOOL;
m->m_cgi = "debug";
m->m_cgi = "debuggigabits";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m++;
@ -9165,6 +9177,7 @@ void Parms::init ( ) {
m->m_cgi = "iu";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "image link";
@ -9177,6 +9190,7 @@ void Parms::init ( ) {
m->m_cgi = "ix";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
m->m_title = "image width";
@ -9187,6 +9201,7 @@ void Parms::init ( ) {
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_def = "200";
m->m_flags = PF_NOAPI;
m++;
m->m_title = "image height";
@ -9198,6 +9213,7 @@ void Parms::init ( ) {
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_def = "200";
m->m_flags = PF_NOAPI;
m++;
// m->m_title = "password";
@ -9273,6 +9289,7 @@ void Parms::init ( ) {
m->m_cgi = "gbcountry";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
/*
@ -9374,6 +9391,7 @@ void Parms::init ( ) {
m->m_cgi = "qcs";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
// buzz
@ -9385,6 +9403,7 @@ void Parms::init ( ) {
m->m_cgi = "inlinks";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
// buzz
@ -9398,6 +9417,7 @@ void Parms::init ( ) {
m->m_cgi = "outlinks";
m->m_page = PAGE_RESULTS;
m->m_obj = OBJ_SI;
m->m_flags = PF_NOAPI;
m++;
// buzz
@ -9541,6 +9561,17 @@ void Parms::init ( ) {
m->m_flags = PF_API;
m++;
m->m_title = "query";
m->m_desc = "Highlight this query in the page.";
m->m_def = "";
m->m_type = TYPE_CHARPTR;
m->m_page = PAGE_GET;
m->m_obj = OBJ_GBREQUEST;
m->m_cgi = "q";
m->m_off = (char *)&gr.m_query - (char *)&gr;
m->m_flags = PF_API;
m++;
/*
// for /get
m->m_title = "query highlighting query";

View File

@ -159,6 +159,7 @@ class GigablastRequest {
long long m_docId;
long m_strip;
char m_includeHeader;
char m_highlightQuery;
///////////
//

View File

@ -505,6 +505,9 @@ bool downloadTestUrlFromProxies ( ) {
// only host #0 should do the testing i guess
//if ( g_hostdb.m_myHost->m_hostId != 0 ) return true;
// no need if no url
if ( g_conf.m_proxyTestUrl.length() <= 1 ) return true;
// if host #0 dies then host #1 must take its place managing the
// spider proxies
Host *h0 = g_hostdb.getFirstAliveHost();

View File

@ -29549,11 +29549,11 @@ Msg20Reply *XmlDoc::getMsg20Reply ( ) {
// returns values of specified meta tags
if ( ! reply->ptr_dbuf && m_req->size_displayMetas > 1 ) {
long dlen; char *d;
d = getDescriptionBuf(m_req->ptr_displayMetas,&dlen);
long dsize; char *d;
d = getDescriptionBuf(m_req->ptr_displayMetas,&dsize);
if ( ! d || d == (char *)-1 ) return (Msg20Reply *)d;
reply->ptr_dbuf = d;
reply->size_dbuf = dlen + 1;
reply->size_dbuf = dsize; // includes \0
}
// breathe
@ -30437,9 +30437,9 @@ Matches *XmlDoc::getMatches () {
}
// sender wants meta description, custom tags, etc.
char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) {
char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dsize ) {
// return the buffer if we got it
if ( m_dbufValid ) { *dlen = m_dbufLen; return m_dbuf; }
if ( m_dbufValid ) { *dsize = m_dbufSize; return m_dbuf; }
Xml *xml = getXml();
if ( ! xml || xml == (Xml *)-1 ) return (char *)xml;
// now get the content of the requested display meta tags
@ -30483,6 +30483,14 @@ char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) {
gbstrlen(s) , // name len
"name" , // http-equiv/name
false );// convert &#'s?
dptr[wlen] = '\0';
// test it out
if ( ! verifyUtf8 ( dptr ) ) {
log("xmldoc: invalid utf8 content for meta tag %s.",s);
continue;
}
// advance and NULL terminate
dptr += wlen;
*dptr++ = '\0';
@ -30492,8 +30500,9 @@ char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) {
"was encountered. Truncating.",dbufEnd-m_dbuf);
}
// what is the size of the content of displayed meta tags?
m_dbufLen = dptr - m_dbuf;
m_dbufSize = dptr - m_dbuf;
m_dbufValid = true;
*dsize = m_dbufSize;
return m_dbuf;
}

View File

@ -2032,7 +2032,7 @@ class XmlDoc {
Query m_query;
Matches m_matches;
// meta description buf
long m_dbufLen;
long m_dbufSize;
char m_dbuf[1024];
SafeBuf m_htb;
Title m_title;

50
qa.cpp
View File

@ -605,6 +605,55 @@ bool qainject1 ( ) {
return false;
}
//
// adv.html test
//
// query for 'test' using adv.html advanced search interface
if ( ! s_flags[27] ) {
s_flags[27] = true;
if ( ! getUrl (
"/search?c=qatest123&qa=17&format=xml&"
"dr=1&pss=50&sc=1&hacr=1&quotea=web+site&"
"gblang=1&minus=transcripts&n=150",
123 ) )
return false;
}
// &sites= test
if ( ! s_flags[28] ) {
s_flags[28] = true;
if ( ! getUrl (
"/search?c=qatest123&qa=17&format=xml&q=web&"
"sortby=2&"
// html only:
"sw=20&"
"filetype=html&"
"ff=1&"
"facet=gbfacetint:gbhopcount&"
"sites=mindtools.com+www.redcross.org"
, 123 ) )
return false;
}
// html test of summary width
if ( ! s_flags[29] ) {
s_flags[29] = true;
if ( ! getUrl (
"/search?c=qatest123&qa=17&format=html&q=web&"
// html only:
"sw=20&tml=10&ns=1&smxcpl=30&qh=0&n=100&"
"dt=keywords+description&"
"facet=gbfacetint:gbspiderdate&"
, 123 ) )
return false;
}
// stop for now
return true;
//
// eject/delete the urls
//
@ -1227,6 +1276,7 @@ bool qareindex() {
return true;
}
/*
static char *s_urls1 =
" walmart.com"