query scrape fixes

mwells 2014-07-29 19:55:31 -07:00
parent 9f70d43a4b
commit 3f584ecdaa
3 changed files with 73 additions and 5 deletions

View File

@@ -43,6 +43,7 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {
}
mnew ( msg7, sizeof(Msg7) , "PageInject" );
msg7->m_socket = sock;
char format = hr->getReplyFormat();
@@ -566,7 +567,7 @@ void doneInjectingLinksWrapper ( void *state ) {
// return if it blocks
if ( ! msg7->scrapeQuery() ) return;
}
TcpSocket *s = msg7->m_socket;
// otherwise, parse out the search results so steve can display them
if ( g_errno )
sb->safePrintf("<error><![CDATA[%s]]></error>\n",
@@ -580,7 +581,8 @@ void doneInjectingLinksWrapper ( void *state ) {
//p += sprintf ( p , "scraping status ");
// print error msg out, too or "Success"
//p += sprintf ( p , "%s", mstrerror(g_errno));
g_httpServer.sendDynamicPage ( s,
TcpSocket *sock = msg7->m_socket;
g_httpServer.sendDynamicPage ( sock,
sb->getBufStart(),
sb->length(),
-1/*cachetime*/);
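
For context, the hunks above stash the client socket on the Msg7 state object at request time so the asynchronous completion callback can retrieve it later and send the reply on the right connection. Below is a minimal standalone sketch of that pattern; SimpleSocket, SimpleState and doneWrapper are illustrative stand-ins, not Gigablast's TcpSocket/Msg7 types.

#include <cstdio>
#include <string>

// stand-in for TcpSocket; only carries a file descriptor here
struct SimpleSocket { int fd; };

// stand-in for Msg7: the async state object that survives until the callback runs
struct SimpleState {
	SimpleSocket *m_socket = nullptr;  // stashed when the request arrives
	std::string   m_reply;             // built up while the work runs
};

// completion callback: only the opaque state pointer comes back, so the
// socket has to be recovered from it before the reply can be sent
void doneWrapper ( void *state ) {
	SimpleState  *st   = static_cast<SimpleState *>(state);
	SimpleSocket *sock = st->m_socket;
	std::printf ( "sending %zu bytes on fd %d\n", st->m_reply.size(), sock->fd );
}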
@@ -610,6 +612,7 @@ bool Msg7::scrapeQuery ( ) {
// first encode the query
SafeBuf ebuf;
ebuf.urlEncode ( qts ); // queryUNEncoded );
ebuf.nullTerm();
char *uf;
if ( m_round == 1 )
@@ -672,7 +675,7 @@ bool Msg7::scrapeQuery ( ) {
if ( m_useAhrefs )
m_xd.m_useAhrefs = true;
m_xd.m_reallyInjectLinks = gr->m_injectLinks;
m_xd.m_reallyInjectLinks = true;//gr->m_injectLinks;
//
// rather than just add the links of the page to spiderdb,

View File

@@ -14239,7 +14239,8 @@ void Parms::init ( ) {
m->m_title = "query to scrape";
m->m_desc = "Scrape popular search engines for this query "
"and inject their links.";
"and inject their links. You are not required to supply "
"the <i>url</i> parm if you supply this parm.";
m->m_cgi = "qts";
m->m_obj = OBJ_GBREQUEST;
m->m_type = TYPE_CHARPTR;

66 qa.cpp
View File

@@ -1193,6 +1193,66 @@ bool qaspider2 ( ) {
return true;
}
bool qascrape ( ) {
//
// delete the 'qatest123' collection
//
//static bool s_x1 = false;
if ( ! s_flags[0] ) {
s_flags[0] = true;
if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) )
return false;
}
//
// add the 'qatest123' collection
//
//static bool s_x2 = false;
if ( ! s_flags[1] ) {
s_flags[1] = true;
if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" ,
// checksum of reply expected
238170006 ) )
return false;
}
// scrape it
if ( ! s_flags[3] ) {
s_flags[3] = true;
SafeBuf sb;
sb.safePrintf( "/admin/inject?c=qatest123&"
"format=xml&qts=test");
if ( ! getUrl ( sb.getBufStart() , 0 ) )
return false;
}
// verify the search results for the 'test' query match the expected checksum
//static bool s_y4 = false;
if ( ! s_flags[6] ) {
s_flags[6] = true;
if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&"
"q=test",
-1310551262 ) )
return false;
}
//static bool s_fee2 = false;
if ( ! s_flags[13] ) {
s_flags[13] = true;
log("qa: SUCCESSFULLY COMPLETED "
"QA SCRAPE TEST");
return true;
}
return true;
}
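
qascrape() above follows the convention used by the other QA tests: each step is guarded by a persistent s_flags[] entry so the function can be re-entered after an asynchronous reply arrives, and getUrl() checks a checksum of the reply against the expected value. A compressed standalone sketch of that flag-gated flow follows; fetchAndCheck is a stub standing in for getUrl() plus the checksum verification, while the URLs and checksums are the ones used above.

#include <cstdio>

static bool s_done[4];   // persists across re-entries, like s_flags[]

// stub standing in for getUrl(): issue the request and verify the reply
// checksum; the real call returns false while the request is outstanding
// and the harness re-enters the test when the reply arrives
static bool fetchAndCheck ( const char *path, long expected ) {
	std::printf ( "GET %s (expect checksum %ld)\n", path, expected );
	return true;
}

bool qaScrapeSketch ( ) {
	// 1. delete, then 2. re-add, the qatest123 collection
	if ( ! s_done[0] ) { s_done[0] = true;
		if ( ! fetchAndCheck("/admin/delcoll?xml=1&delcoll=qatest123", 0) ) return false; }
	if ( ! s_done[1] ) { s_done[1] = true;
		if ( ! fetchAndCheck("/admin/addcoll?addcoll=qatest123&xml=1", 238170006) ) return false; }
	// 3. scrape and inject results for the query, 4. verify the search reply
	if ( ! s_done[2] ) { s_done[2] = true;
		if ( ! fetchAndCheck("/admin/inject?c=qatest123&format=xml&qts=test", 0) ) return false; }
	if ( ! s_done[3] ) { s_done[3] = true;
		if ( ! fetchAndCheck("/search?c=qatest123&qa=1&format=xml&q=test", -1310551262) ) return false; }
	return true;   // all steps done
}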
/*
bool qaspider ( ) {
@@ -1229,7 +1289,11 @@ static QATest s_qatests[] = {
{qaspider2,
"spiderHopCountTest",
"Test spidering walmart.com and ibm.com using hopcount limit."}
"Test spidering ibm.com using hopcount limit."},
{qascrape,
"queryScrapeTest",
"Scrape and inject results from google and bing."}
};