From 3f584ecdaa7dc12b710dac9a4f8b6d4da991f784 Mon Sep 17 00:00:00 2001 From: mwells Date: Tue, 29 Jul 2014 19:55:31 -0700 Subject: [PATCH] query scrape fixes --- PageInject.cpp | 9 ++++--- Parms.cpp | 3 ++- qa.cpp | 66 +++++++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 5 deletions(-) diff --git a/PageInject.cpp b/PageInject.cpp index da3b47a4..a8fd0052 100644 --- a/PageInject.cpp +++ b/PageInject.cpp @@ -43,6 +43,7 @@ bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) { } mnew ( msg7, sizeof(Msg7) , "PageInject" ); + msg7->m_socket = sock; char format = hr->getReplyFormat(); @@ -566,7 +567,7 @@ void doneInjectingLinksWrapper ( void *state ) { // return if it blocks if ( ! msg7->scrapeQuery() ) return; } - TcpSocket *s = msg7->m_socket; + // otherwise, parse out the search results so steve can display them if ( g_errno ) sb->safePrintf("\n", @@ -580,7 +581,8 @@ void doneInjectingLinksWrapper ( void *state ) { //p += sprintf ( p , "scraping status "); // print error msg out, too or "Success" //p += sprintf ( p , "%s", mstrerror(g_errno)); - g_httpServer.sendDynamicPage ( s, + TcpSocket *sock = msg7->m_socket; + g_httpServer.sendDynamicPage ( sock, sb->getBufStart(), sb->length(), -1/*cachetime*/); @@ -610,6 +612,7 @@ bool Msg7::scrapeQuery ( ) { // first encode the query SafeBuf ebuf; ebuf.urlEncode ( qts ); // queryUNEncoded ); + ebuf.nullTerm(); char *uf; if ( m_round == 1 ) @@ -672,7 +675,7 @@ bool Msg7::scrapeQuery ( ) { if ( m_useAhrefs ) m_xd.m_useAhrefs = true; - m_xd.m_reallyInjectLinks = gr->m_injectLinks; + m_xd.m_reallyInjectLinks = true;//gr->m_injectLinks; // // rather than just add the links of the page to spiderdb, diff --git a/Parms.cpp b/Parms.cpp index c934d12c..751749f7 100644 --- a/Parms.cpp +++ b/Parms.cpp @@ -14239,7 +14239,8 @@ void Parms::init ( ) { m->m_title = "query to scrape"; m->m_desc = "Scrape popular search engines for this query " - "and inject their links."; + "and inject their links. You are not required to supply "
+ "the url parm if you supply this parm."; m->m_cgi = "qts"; m->m_obj = OBJ_GBREQUEST; m->m_type = TYPE_CHARPTR; diff --git a/qa.cpp b/qa.cpp index 40d960c2..e38f3646 100644 --- a/qa.cpp +++ b/qa.cpp @@ -1193,6 +1193,66 @@ bool qaspider2 ( ) { return true; } + +bool qascrape ( ) { + // + // delete the 'qatest123' collection + // + //static bool s_x1 = false; + if ( ! s_flags[0] ) { + s_flags[0] = true; + if ( ! getUrl ( "/admin/delcoll?xml=1&delcoll=qatest123" ) ) + return false; + } + + // + // add the 'qatest123' collection + // + //static bool s_x2 = false; + if ( ! s_flags[1] ) { + s_flags[1] = true; + if ( ! getUrl ( "/admin/addcoll?addcoll=qatest123&xml=1" , + // checksum of reply expected + 238170006 ) ) + return false; + } + + + // scrape it + if ( ! s_flags[3] ) { + s_flags[3] = true; + SafeBuf sb; + sb.safePrintf( "/admin/inject?c=qatest123&" + "format=xml&qts=test"); + if ( ! getUrl ( sb.getBufStart() , 0 ) ) + return false; + } + + + + // verify no results for gbhopcount:2 query + //static bool s_y4 = false; + if ( ! s_flags[6] ) { + s_flags[6] = true; + if ( ! getUrl ( "/search?c=qatest123&qa=1&format=xml&" + "q=test", + -1310551262 ) ) + return false; + } + + + //static bool s_fee2 = false; + if ( ! s_flags[13] ) { + s_flags[13] = true; + log("qa: SUCCESSFULLY COMPLETED " + "QA SCRAPE TEST"); + return true; + } + + return true; +} + + /* bool qaspider ( ) { @@ -1229,7 +1289,11 @@ static QATest s_qatests[] = { {qaspider2, "spiderHopCountTest", - "Test spidering walmart.com and ibm.com using hopcount limit."} + "Test spidering ibm.com using hopcount limit."}, + + {qascrape, + "queryScrapeTest", + "Scrape and inject results from google and bing."} };