open-source-search-engine/PageInject.cpp

#include "gb-include.h"

#include "PageInject.h"
#include "HttpServer.h"
#include "Pages.h"
#include "Users.h"
#include "XmlDoc.h"
#include "PageParser.h"
#include "Repair.h"
#include "PageCrawlBot.h"

// . builds and sends the http reply for a finished injection request
// . defined further down in this file
static bool sendReply ( void *state );

// . callback adapter with the void(void *) shape that Msg7::inject() takes
// . forwards "state" to sendReply() and discards its return value
static void sendReplyWrapper ( void *state ) {
	(void) sendReply ( state );
}

// . http handler for the inject page
// . looks up the collection named by the "c" cgi parm, allocates a Msg7
//   to hold the request state and then either scrapes the query given
//   in "qts" or injects the url/content described by the request
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageInject ( TcpSocket *s , HttpRequest *r ) {
	// get the collection
	long  collLen = 0;
	char *coll  = r->getString ( "c" , &collLen  , NULL /*default*/);

	long crawlbotAPI = r->getLong("crawlbotapi",0);

	// get collection rec
	CollectionRec *cr = g_collectiondb.getRec ( coll );
	// complain if no collection rec found
	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		// "c" has a NULL default, so never hand a NULL to %s
		log("build: Injection from %s failed. "
		    "Collection \"%s\" does not exist.",
		    iptoa(s->m_ip), coll ? coll : "(null)" );
		return g_httpServer.sendErrorReply(s,500,
					      "collection does not exist");
	}

	// make a new state
	Msg7 *msg7;
	try { msg7= new (Msg7); }
	catch ( ... ) {
		g_errno = ENOMEM;
		// sizeof() yields a size_t, cast it to match the %i
		log("PageInject: new(%i): %s",
		    (int)sizeof(Msg7),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
	}
	mnew ( msg7, sizeof(Msg7) , "PageInject" );

	msg7->m_socket = s;

	msg7->m_isScrape = false;

	msg7->m_crawlbotAPI = crawlbotAPI;

	// . strncpy() does not terminate the destination when the source
	//   fills it, so bound the copy and terminate explicitly
	// . NOTE(review): assumes Msg7::m_coll is a fixed-size char array
	//   so that sizeof() gives its capacity -- confirm in PageInject.h
	strncpy ( msg7->m_coll , cr->m_coll , sizeof(msg7->m_coll) - 1 );
	msg7->m_coll [ sizeof(msg7->m_coll) - 1 ] = '\0';

	// keep our own copy of the request, sendReply() passes it to
	// printAdminTop()
	msg7->m_hr.copy ( r );

	// a scrape request?
	char *qts = r->getString("qts",NULL);
	if ( qts && ! qts[0] ) qts = NULL;
	if ( qts ) {
		msg7->m_isScrape = true;
		msg7->m_qbuf.safeStrcpy(qts);
		msg7->m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab");
		msg7->m_useAhrefs = r->getLong("useahrefs",0);
		// injecting the scraped links defaults to no
		msg7->m_injectLinks = r->getLong("injectlinks",0);
		// if it blocked, doneInjectingLinksWrapper() replies later
		if ( ! msg7->scrapeQuery ( ) ) return false;
		return sendReply ( msg7 );
	}

	// if it blocked, sendReplyWrapper() is called when done
	if ( ! msg7->inject ( s , r , msg7 , sendReplyWrapper ) )
		return false;

	// it did not block, i guess we are done
	return sendReply ( msg7 );
}

// . sends the http reply for an injection request and frees its Msg7
// . "state" is the Msg7 allocated in sendPageInject()
// . if Msg7::m_quickReply is set we send back one short text line,
//   otherwise we send the html inject form with any error shown on top
// . the Msg7 is deleted before the page is sent, so everything needed for
//   the send is pulled out of it first
// . returns whatever HttpServer::sendDynamicPage() returns
bool sendReply ( void *state ) {
	// get the state properly
	Msg7 *msg7= (Msg7 *) state;
	// extract info from state
	TcpSocket *s = msg7->m_socket;

	XmlDoc *xd = &msg7->m_xd;

	// xd has the docid for what we injected, iff g_errno is not set
	long long docId  = xd->m_docId;
	// the host id is not tracked, we always report 0
	long      hostId = 0;

	// . if we're talking w/ a robot he doesn't care about the form
	// . send him back the error code (0 means success)
	if ( msg7->m_quickReply ) {
		char buf[1024*32];
		char *p = buf;
		// set g_errno to index code
		if ( xd->m_indexCodeValid &&
		     xd->m_indexCode &&
		     ! g_errno )
			g_errno = xd->m_indexCode;
		// return docid and hostid
		if ( ! g_errno ) p += sprintf ( p ,
					   "0,docId=%lli,hostId=%li," ,
					   docId , hostId );
		// print error number here
		else  p += sprintf ( p , "%li,0,0,", (long)g_errno );
		// print error msg out, too or "Success"
		p += sprintf ( p , "%s", mstrerror(g_errno));
		// buf is on our stack so it is safe to nuke the state now
		mdelete ( msg7, sizeof(Msg7) , "PageInject" );
		delete (msg7);
		return g_httpServer.sendDynamicPage ( s, buf , gbstrlen(buf) ,
						      -1/*cachetime*/);
	}

	SafeBuf sb;

	// print admin bar
	g_pages.printAdminTop ( &sb, s , &msg7->m_hr );

	// if there was an error let them know
	char msg[1024];
	const char *pm = "";
	if ( g_errno ) {
		// bounded so a long error string can not overrun msg[]
		snprintf ( msg , sizeof(msg) ,
			   "Error injecting url: <b>%s[%i]</b>",
			   mstrerror(g_errno) , (int)g_errno);
		pm = msg;
	}

	sb.safePrintf(
		      "<style>"
		      ".poo { background-color:#%s;}\n"
		      "</style>\n" ,
		      LIGHT_BLUE );

	// make a table, each row will be an injectable parameter
	sb.safePrintf (
		  "<center>"
		  "<b>%s</b>\n\n" // the url msg

		  "<FORM method=GET action=/inject>\n\n"

		  "<table %s>"
		  "<tr class=hdrow><td colspan=2>"
		  "<center>"
		  "<b>"
		  "Inject URL</b>%s"
		  "</td></tr>\n\n"

		  "<tr class=poo><td><b>url</b>"
		  "<br>"
		  "<font size=-2>"
		  "Specify the URL that will be immediately crawled and "
		  "indexed in real time "
		  "while you wait. The browser will return the "
		  "final index status code. Alternatively, "
		  "use the <a href=/admin/addurl>add url</a> page "
		  "to add urls individually or in bulk "
		  "without having to wait for the pages to be "
		  "actually indexed in realtime. "

		  "By default, injected urls "
		  "take precedence over the \"insitelist\" directive in the "
		  "<a href=/admin/filters>url filters</a> "
		  "so injected urls need not match the "
		  "<a href=/admin/sites>spider sites</a> patterns. You can "
		  "change that behavior in the <a href=/admin/filters>url "
		  "filters</a> if you want. "
		  "Injected urls will have a "
		  "<a href=/admin/filters#hopcount>hopcount</a> of 0. "
		  "The injection api is described on the "
		  "<a href=/admin/api>api</a> page."

		  "</font>"
		  "</td>"

		  "<td width=50%%>\n"
		  "<input type=text name=u value=\"\" size=50>"
		  "</td></tr>\n\n"

		  "<tr class=poo><td><b>query to scrape</b>"

		  "<br>"
		  "<font size=-2>"
		  "Scrape other search engines and inject their links "
		  "for this query. "
		  "</font>"

		  "</td>"
		  "<td>\n"
		  "<input type=text name=qts value=\"\" size=50>"
		  "</td></tr>\n\n"

		  "<tr class=poo><td><b>spider links</b></td>"
		  "<td>\n"
		  "<input type=radio name=spiderlinks value=0>no &nbsp; "
		  "<input type=radio name=spiderlinks value=1 checked>yes "
		  "<br>"
		  "<font size=1>Should we add the page's outlinks to "
		  "spiderdb for spidering? "
		  "Default: yes"
		  "</font>"
		  "</td></tr>\n\n"

		  "<tr class=poo><td><b>inject scraped links</b></td>"
		  "<td>\n"
		  "<input type=radio name=injectlinks value=0 checked>no &nbsp; "
		  "<input type=radio name=injectlinks value=1>yes "
		  "</td></tr>\n\n"

		  "<tr class=poo><td><b>collection</b></td>"
		  "<td>\n"
		  "<input type=text name=c value=\"%s\" size=15>"
		  "</td></tr>\n\n"

		  "<tr class=poo><td><b>quick reply?</b><br>"
		  "<font size=1>Should reply be short? "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=quick value=0 checked>no &nbsp; "
		  "<input type=radio name=quick value=1>yes "
		  "</td></tr>\n\n"

		  "<tr class=poo><td><b>only inject new docs?</b><br>"
		  "<font size=1>Skips injection if docs already indexed. "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=newonly value=0 checked>no &nbsp; "
		  "<input type=radio name=newonly value=1>yes "
		  "</td></tr>\n\n"

		  "<tr class=poo><td><b>delete url?</b><br>"
		  "<font size=1>Should this url be deleted from the index? "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=deleteurl value=0 checked>no &nbsp; "
		  "<input type=radio name=deleteurl value=1>yes "
		  "</td></tr>\n\n"

		  "<tr class=poo><td><b>recycle content?</b><br>"
		  "<font size=1>Should page content be recycled if "
		  "reindexing? "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=recycle value=0 checked>no &nbsp; "
		  "<input type=radio name=recycle value=1>yes "
		  "</td></tr>\n\n"

		  "<tr class=poo><td><b>dedup?</b><br>"
		  "<font size=1>Should this url be skipped if there is "
		  "already  a url in the index from this same domain with "
		  "this same content? "
		  "Default: yes"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=dedup value=0>no &nbsp; "
		  "<input type=radio name=dedup value=1 checked>yes "
		  "</td></tr>\n\n" ,
		  // the args, in the order of the %s's above:
		  pm ,           // error msg, if any
		  TABLE_STYLE ,  // <table> attributes
		  "" ,           // nothing is printed after "Inject URL"
		  msg7->m_coll );// pre-fills the collection box

	// the remaining rows and the end of the form
	sb.safePrintf (
		  "<tr class=poo><td><b>content has mime</b><br>"
		  // this used to carry the description of the old "ip" row
		  "<font size=1>Does the content below start with an "
		  "HTTP MIME header? "
		  "Default: no"
		  "</td>"
		  "<td>\n"
		  "<input type=radio name=hasmime value=0 checked>no &nbsp; "
		  "<input type=radio name=hasmime value=1>yes "
		  "</td></tr>\n\n"

		  "<tr class=poo><td colspan=2>"
		  "<center>"
		  "<b>content</b><br>"
		  "<font size=1>If you want to supply the URL's content "
		  "rather than have Gigablast download it, then "
		  "enter the content here. "
		  "Enter MIME header "
		  "first if \"content has mime\" is set to true above. "
		  "Separate MIME from actual content with two returns."
		  "<br>"
		  "<input type=submit value=Submit>"
		  "<br>"
		  "\n"
		  "<textarea rows=32 cols=80 name=content>"
		  "</textarea>"
		  "</center>"
		  "</td></tr></table>\n"

		  "<br>"
		  "<br>\n\n"
		  "<center>"
		  "<input type=submit value=Submit>"
		  "</center>"

		  "</form>\n"
		  );

	sb.safePrintf( "\n</body>\n</html>\n");
	// clear g_errno, if any, so our reply send goes through
	g_errno = 0;
	// nuke state, the page lives in our local "sb" so this is safe
	mdelete ( msg7, sizeof(Msg7) , "PageInject" );
	delete (msg7);
	// send this page
	return g_httpServer.sendDynamicPage (s,
					     sb.getBufStart(),
					     sb.length(),
					     -1/*cachetime*/);
}


// . puts every scalar member that this file reads into a defined state
// . sendReply() reads m_quickReply even on paths where inject() never got
//   far enough to set it (scrape requests and early error returns), so
//   it must have a defined value from the start
Msg7::Msg7 () {
	// no private copy of the injected content yet
	m_contentAllocSize = 0;
	m_content = NULL;
	// scrapeQuery() bumps this before each scrape
	m_round = 0;
	m_socket      = NULL;
	// default to the full html reply
	m_quickReply  = false;
	m_isScrape    = false;
	m_crawlbotAPI = 0;
	m_useAhrefs   = 0;
	m_injectLinks = 0;
}

// releases our private copy of the injected content, if we made one
Msg7::~Msg7 () {
	// nothing was duplicated, nothing to give back
	if ( ! m_content ) return;
	mfree ( m_content , m_contentAllocSize , "injcont" );
	m_content = NULL;
}

bool Msg7::inject ( TcpSocket *s ,
		    HttpRequest *r ,
		    void *state ,
		    void (*callback)(void *state) ,
		    long spiderLinksDefault ,
		    char *collOveride ) {

	// save socket
	// socket is responsible for free the HTTP request, which contains
	// the POSTed content, so if he gets destroyed we have to make sure
	// we no longer reference that content.
	m_socket  = s;

	long  contentLen;

	// get the junk
	//char *coll        = r->getString ( "c" , NULL  , NULL /*default*/);
	//if ( ! coll ) coll = "main";
	// sometimes crawlbot will add or reset a coll and do an inject
	// in PageCrawlBot.cpp
	//if ( ! coll ) coll = r->getString("addcoll");
	//if ( ! coll ) coll = r->getString("resetcoll");
	//if ( ! coll ) coll = collOveride;

	// default to main
	//if ( ! coll || ! coll[0] ) coll = "main";

	if ( collOveride && ! collOveride[0] ) collOveride = NULL;

	CollectionRec *cr = NULL;
	if ( collOveride ) cr = g_collectiondb.getRec ( collOveride );
	else cr = g_collectiondb.getRec ( r );

	if ( ! cr ) {
		g_errno = ENOCOLLREC;
		return true;
	}

	char *coll = cr->m_coll;

	bool  quickReply     = r->getLong   ( "quick" , 0 );
	//char *pwd            = r->getString ( "pwd" , NULL );
	char *url            = r->getString ( "u" , NULL , NULL /*default*/);
	// for diffbot.cpp api
	if ( ! url ) url = r->getString("injecturl",NULL,NULL);
	if ( ! url ) url = r->getString("url",NULL,NULL);
	// PageCrawlBot.cpp uses "seed"
	if ( ! url ) url = r->getString("seed",NULL,NULL);

	bool  recycleContent = r->getLong   ( "recycle",0);
	//char *ips            = r->getString ( "ip" , NULL , NULL );
	//char *username       = g_users.getUsername(r);
	//long firstIndexed = r->getLongLong("firstindexed",0LL);
	//long lastSpidered = r->getLongLong("lastspidered",0LL);
	long hopCount     = r->getLong("hopcount",-1);
	long newOnly      = r->getLong("newonly",0);
	long charset      = r->getLong("charset",-1);
	long deleteUrl    = r->getLong("deleteurl",0);
	char hasMime      = r->getLong("hasmime",0);
	// do consistency testing?
	bool doConsistencyTesting = r->getLong("dct",0);
	// . default spiderlinks to no for injects. no, not for
	//   seed urls from PageCrawlbot.cpp, ppl expect links to be spidered.
	// . support both camel and all-lower cases
	long spiderLinks = r->getLong("spiderLinks",spiderLinksDefault);
	spiderLinks = r->getLong("spiderlinks",spiderLinks);

	long  forcedIp  = 0;

	//if ( ips ) forcedIp = atoip ( ips , gbstrlen(ips) );

	char *content        = r->getString ( "content" , &contentLen , NULL );
	// mark doesn't like to url-encode his content
	if ( ! content ) {
		content    = r->getUnencodedContent    ();
		contentLen = r->getUnencodedContentLen ();
		//contentIsEncoded = false;
	}

	// a secret thing from dan
	long dbrLen = 0;
	char *diffbotReply = r->getString("diffbotreply",&dbrLen,NULL);

	// test
	//diffbotReply = "{\"request\":{\"pageUrl\":\"http://www.washingtonpost.com/2011/03/10/ABe7RaQ_moreresults.html\",\"api\":\"article\",\"version\":3},\"objects\":[{\"icon\":\"http://www.washingtonpost.com/favicon.ico\",\"text\":\"In Case You Missed It\nWeb Hostess Live: The latest from the Web (vForum, May 15, 2014; 3:05 PM)\nGot Plans: Advice from the Going Out Guide (vForum, May 15, 2014; 2:05 PM)\nWhat to Watch: TV chat with Hank Stuever (vForum, May 15, 2014; 1:10 PM)\nColor of Money Live (vForum, May 15, 2014; 1:05 PM)\nWeb Hostess Live: The latest from the Web (vForum, May 15, 2014; 12:25 PM)\nMichael Devine outdoor entertaining and design | Home Front (vForum, May 15, 2014; 12:20 PM)\nThe Answer Sheet: Education chat with Valerie Strauss (vForum, May 14, 2014; 2:00 PM)\nThe Reliable Source Live (vForum, May 14, 2014; 1:05 PM)\nAsk Tom: Rants, raves and questions on the DC dining scene (vForum, May 14, 2014; 12:15 PM)\nOn Parenting with Meghan Leahy (vForum, May 14, 2014; 12:10 PM)\nAsk Aaron: The week in politics (vForum, May 13, 2014; 3:05 PM)\nEugene Robinson Live (vForum, May 13, 2014; 2:05 PM)\nTuesdays with Moron: Chatological Humor Update (vForum, May 13, 2014; 12:00 PM)\nComPost Live with Alexandra Petri (vForum, May 13, 2014; 11:05 AM)\nAsk Boswell: Redskins, Nationals and Washington sports (vForum, May 12, 2014; 1:50 PM)\nAdvice from Slate's 'Dear Prudence' (vForum, May 12, 2014; 1:40 PM)\nDr. 
Gridlock (vForum, May 12, 2014; 1:35 PM)\nSwitchback: Talking Tech (vForum, May 9, 2014; 12:05 PM)\nThe Fix Live (vForum, May 9, 2014; 12:00 PM)\nWhat to Watch: TV chat with Hank Stuever (vForum, May 8, 2014; 1:10 PM)\nMore News\",\"title\":\"The Washington Post\",\"diffbotUri\":\"article|3|828850106\",\"pageUrl\":\"http://www.washingtonpost.com/2011/03/10/ABe7RaQ_moreresults.html\",\"humanLanguage\":\"en\",\"html\":\"<p>In Case You Missed It<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/web-hostess-140515-new.html\\\">Web Hostess Live: The latest from the Web<\\/a>  <\\/p>\n<p>(vForum, May 15, 2014; 3:05 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/got-plans-05-15-2014.html\\\">Got Plans: Advice from the Going Out Guide<\\/a>  <\\/p>\n<p>(vForum, May 15, 2014; 2:05 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/tv-chat-140515.html\\\">What to Watch: TV chat with Hank Stuever<\\/a>  <\\/p>\n<p>(vForum, May 15, 2014; 1:10 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/color-of-money-live-20140515.html\\\">Color of Money Live<\\/a>  <\\/p>\n<p>(vForum, May 15, 2014; 1:05 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/web-hostess-140515-new.html\\\">Web Hostess Live: The latest from the Web<\\/a>  <\\/p>\n<p>(vForum, May 15, 2014; 12:25 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/home-front-0515.html\\\">Michael Devine outdoor entertaining and design | Home Front<\\/a>  <\\/p>\n<p>(vForum, May 15, 2014; 12:20 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/the-answer-sheet-20140514.html\\\">The Answer Sheet: Education chat with Valerie Strauss<\\/a>  <\\/p>\n<p>(vForum, May 14, 2014; 2:00 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/the-reliable-source-140514-new.html\\\">The Reliable Source Live<\\/a>  <\\/p>\n<p>(vForum, May 14, 2014; 1:05 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/ask-tom-5-14-14.html\\\">Ask Tom: Rants, raves and questions on the DC 
dining scene <\\/a>  <\\/p>\n<p>(vForum, May 14, 2014; 12:15 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/parenting-0514.html\\\">On Parenting with Meghan Leahy<\\/a>  <\\/p>\n<p>(vForum, May 14, 2014; 12:10 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/post-politics-ask-aaron-051313.html\\\">Ask Aaron: The week in politics<\\/a>  <\\/p>\n<p>(vForum, May 13, 2014; 3:05 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/opinion-focus-with-eugene-robinson-20140513.html\\\">Eugene Robinson Live<\\/a>  <\\/p>\n<p>(vForum, May 13, 2014; 2:05 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/gene-weingarten-140513.html\\\">Tuesdays with Moron: Chatological Humor Update<\\/a>  <\\/p>\n<p>(vForum, May 13, 2014; 12:00 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/compost-live-140513.html\\\">ComPost Live with Alexandra Petri<\\/a>  <\\/p>\n<p>(vForum, May 13, 2014; 11:05 AM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/ask-boswell-1400512.html\\\">Ask Boswell: Redskins, Nationals and Washington sports<\\/a>  <\\/p>\n<p>(vForum, May 12, 2014; 1:50 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/dear-prudence-140512.html\\\">Advice from Slate's 'Dear Prudence'<\\/a>  <\\/p>\n<p>(vForum, May 12, 2014; 1:40 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/gridlock-0512.html\\\">Dr. 
Gridlock <\\/a>  <\\/p>\n<p>(vForum, May 12, 2014; 1:35 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/switchback-20140509.html\\\">Switchback: Talking Tech<\\/a>  <\\/p>\n<p>(vForum, May 9, 2014; 12:05 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/live-fix-140509.html\\\">The Fix Live<\\/a>  <\\/p>\n<p>(vForum, May 9, 2014; 12:00 PM)<\\/p>\n<p>  <a href=\\\"http://live.washingtonpost.com/tv-chat-140508.html\\\">What to Watch: TV chat with Hank Stuever<\\/a>  <\\/p>\n<p>(vForum, May 8, 2014; 1:10 PM)<\\/p>\n<p>  <a href=\\\"http://www.washingtonpost.com/2011/03/10/ /2011/03/10/ABe7RaQ_moreresults.html ?startIndex=20&dwxLoid=\\\">More News <\\/a>  <\\/p>\",\"date\":\"Tue, 13 May 2014 00:00:00 GMT\",\"type\":\"article\"}]}";


	// we do not want the parser every holding up a query really
	long niceness = 1;

	// tell xmldoc to download the doc
	if ( contentLen == 0 ) content = NULL;

	// the http request gets freed if this blocks, so we have to
	// copy the content!!!
	if ( content ) {
		m_contentAllocSize = contentLen + 1;
		m_content = mdup ( content , contentLen + 1 , "injcont" );
	}
	else {
		m_content = NULL;
		m_contentAllocSize = 0;
	}

	return inject ( url,
			forcedIp,
			m_content,
			contentLen,
			diffbotReply,
			recycleContent,
			CT_HTML, // contentType,
			coll,
			quickReply ,
			NULL,//username ,
			NULL,//pwd,
			niceness,
			state,
			callback,
			//firstIndexed,
			//lastSpidered,
			hopCount,
			newOnly,
			charset,
			spiderLinks,
			deleteUrl,
			hasMime,
			doConsistencyTesting);
}

// . returns false if blocked, true otherwise
// . if returns false will call your callback(state) when is done
// . returns true and sets g_errno on error
bool Msg7::inject ( char *url ,
		    long  forcedIp ,
		    char *content ,
		    long  contentLen ,
		    char *diffbotReply,
		    bool  recycleContent,
		    uint8_t contentType,
		    char *coll ,
		    bool  quickReply ,
		    char *username ,
		    char *pwd ,
		    long  niceness,
		    void *state ,
		    void (*callback)(void *state),
		    //long firstIndexed,
		    //long lastSpidered,
		    long hopCount,
		    char newOnly,
		    short charset,
		    char spiderLinks,
		    char deleteUrl,
		    char hasMime,
		    bool doConsistencyTesting
		    ) {

	m_quickReply = quickReply;

	// store coll
	//if ( ! coll ) { g_errno = ENOCOLLREC; return true; }
	//      long collLen = gbstrlen ( coll );
	//if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
	//strncpy ( m_coll , coll , collLen );
	//m_coll [ collLen ] = '\0';

	CollectionRec *cr = g_collectiondb.getRec ( coll );
	if ( ! cr ) { g_errno = ENOCOLLREC; return true; }

	// store user
	//long ulen = 0;
	//if ( username ) ulen = gbstrlen(username);
	//if ( ulen >= MAX_USER_SIZE-1 ) {g_errno = EBUFOVERFLOW; return true;}
	//if ( username ) strcpy( m_username, username );

	// store password
	//long pwdLen = 0;
	//if ( pwd ) pwdLen = gbstrlen(pwd);
	//m_pwd [ 0 ] ='\0';
	//if ( pwdLen > 31 ) pwdLen = 31;
	//if ( pwdLen > 0 ) strncpy ( m_pwd , pwd , pwdLen );
	//m_pwd [ pwdLen ] = '\0';

	// store url
	if ( ! url ) { g_errno = 0; return true; }
	long urlLen = gbstrlen(url);
	if ( urlLen > MAX_URL_LEN ) {g_errno = EBADENGINEER; return true; }
	// skip injecting if no url given! just print the admin page.
	if ( urlLen <= 0 ) return true;
	//strcpy ( m_url , url );

	if ( g_repairMode ) { g_errno = EREPAIRING; return true; }

	// send template reply if no content supplied
	//if ( ! content && ! recycleContent ) {
	//	log("inject: no content supplied to inject command and "
	//	    "recycleContent is false.");
	//	//return true;
	//}

	// shortcut
	XmlDoc *xd = &m_xd;

	if ( ! xd->injectDoc ( url ,
			       cr ,
			       content ,
			       diffbotReply,
			       hasMime , // content starts with http mime?
			       hopCount,
			       charset,

			       deleteUrl,
			       contentType, // CT_HTML, CT_XML
			       spiderLinks ,
			       newOnly, // index iff new

			       state,
			       callback ) )
		// we blocked...
		return false;

	return true;
}


///////////////
//
// SCRAPE GOOGLE
//
// and inject the serps
//
///////////////


// . callback for when the XmlDoc has finished a scrape round
// . "state" is the Msg7 that owns the scrape
// . appends the round's results to Msg7::m_sb as xml: round 1 is tagged
//   googleResults, any other round bingResults
// . after round 1 it launches the next scrape and returns if that blocks
// . otherwise it closes the xml, sends it on the socket and deletes the
//   Msg7
void doneInjectingLinksWrapper ( void *state ) {
	Msg7 *msg7 = (Msg7 *)state;
	SafeBuf *sb = &msg7->m_sb;
	// . print header of page the first time we are called
	// . do this even on error because we always print the closing
	//   </response> below, otherwise an error in the first round
	//   gave a reply with no xml declaration and no opening tag
	if ( sb->length() == 0 )
		sb->safePrintf("<?xml version=\"1.0\" "
			       "encoding=\"UTF-8\" ?>\n"
			       "<response>\n" );
	// copy the serps into our buf
	if ( ! g_errno ) {
		// scrapeQuery() bumps m_round before each scrape, so 1
		// means the google scrape just finished
		const char *tag = "bingResults";
		if ( msg7->m_round == 1 ) tag = "googleResults";
		// serp header
		sb->safePrintf("\t<%s>\n",tag);
		// print results
		sb->safeMemcpy(&msg7->m_xd.m_serpBuf);
		// end that
		sb->safePrintf("\t</%s>\n",tag);
	}
	// do bing now
	if ( msg7->m_round == 1 ) {
		// return if it blocks, we get called again when it is done
		if ( ! msg7->scrapeQuery() ) return;
	}
	TcpSocket *s = msg7->m_socket;
	// let them know if something went wrong
	if ( g_errno )
		sb->safePrintf("<error><![CDATA[%s]]></error>\n",
			       mstrerror(g_errno));
	// print tail of page
	sb->safePrintf("</response>\n");
	g_httpServer.sendDynamicPage ( s,
				       sb->getBufStart(),
				       sb->length(),
				       -1/*cachetime*/);
	// . sb is a member of msg7, so this frees the buffer we just sent
	// . NOTE(review): relies on sendDynamicPage() having copied the
	//   buffer before it returns -- confirm in HttpServer
	mdelete ( msg7, sizeof(Msg7) , "PageInject" );
	delete (msg7);
}

// . "uf" is printf url format to scrape with a %s for the query
// . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0";
bool Msg7::scrapeQuery ( ) {

	// advance round now in case we return early
	m_round++;

	// error?
	if ( m_qbuf.length() > 500 ) {
		g_errno = EQUERYTOOBIG;
		return true;
	}

	// first encode the query
	SafeBuf ebuf;
	ebuf.urlEncode ( m_qbuf.getBufStart() ); // queryUNEncoded );

	char *uf;
	if ( m_round == 1 )
		// set to 1 for debugging
		uf="http://www.google.com/search?num=20&"
			"q=%s&scoring=d&filter=0";
		//uf = "https://startpage.com/do/search?q=%s";
		//uf = "http://www.google.com/"
		//	"/cse?cx=013269018370076798483%3A8eec3papwpi&"
		//	"ie=UTF-8&q=%s&"
		//	"num=20";
	else
		uf="http://www.bing.com/search?q=%s";

	// skip bing for now
	//if ( m_round == 2 )
	//	return true;
	//if ( m_round == 1 )
	//	return true;

	// make the url we will download
	char ubuf[2048];
	sprintf ( ubuf , uf , ebuf.getBufStart() );

	// log it
	log("inject: SCRAPING %s",ubuf);

	SpiderRequest sreq;
	sreq.reset();
	// set the SpiderRequest
	strcpy(sreq.m_url, ubuf);
	// . tell it to only add the hosts of each outlink for now!
	// . that will be passed on to when XmlDoc calls Links::set() i guess
	// . xd will not reschedule the scraped url into spiderdb either
	sreq.m_isScraping = 1;
	sreq.m_fakeFirstIp = 1;
	long firstIp = hash32n(ubuf);
	if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
	sreq.m_firstIp = firstIp;
	// parent docid is 0
	sreq.setKey(firstIp,0LL,false);

	// forceDEl = false, niceness = 0
	m_xd.set4 ( &sreq , NULL , m_coll , NULL , 0 );

	//m_xd.m_isScraping = true;

	// download without throttling
	//m_xd.m_throttleDownload = false;

	// disregard this
	m_xd.m_useRobotsTxt = false;

	// this will tell it to index ahrefs first before indexing
	// the doc. but do NOT do this if we are from ahrefs.com
	// ourselves to avoid recursive explosion!!
	if ( m_useAhrefs )
		m_xd.m_useAhrefs = true;

	m_xd.m_reallyInjectLinks = m_injectLinks;

	//
	// rather than just add the links of the page to spiderdb,
	// let's inject them!
	//
	m_xd.setCallback ( this , doneInjectingLinksWrapper );

	// niceness is 0
	m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab2");

	// do we actually inject the links, or just scrape?
	if ( ! m_xd.injectLinks ( &m_linkDedupTable ,
				  NULL,
				  this ,
				  doneInjectingLinksWrapper ) )
		return false;
	// otherwise, just download the google/bing search results so we
	// can display them in xml
	//else if ( m_xd.getUtf8Content() == (char **)-1 )
	//	return false;

	// print reply..
	//printReply();
	return true;
}