more awesome fixes

2024-10-04 04:07:13 +03:00 · 2014-04-09 13:31:11 -07:00 · 2014-04-09 13:31:11 -07:00 · 2adf5b9bc5
commit 2adf5b9bc5
parent 72dc660598
5 changed files with 54 additions and 25 deletions
--- a/Collectiondb.cpp
+++ b/Collectiondb.cpp
@@ -2389,14 +2389,17 @@ bool CollectionRec::rebuildUrlFilters ( ) {
 		// maybe this is good enough
 		//if ( sc ) sc->m_waitingTreeNeedsRebuild = true;
 		
-		CollectionRec *cr = sc->m_cr;
+		//CollectionRec *cr = sc->m_cr;

 		// . rebuild sitetable? in PageBasic.cpp.
 		// . re-adds seed spdierrequests using msg4
 		// . true = addSeeds
-		updateSiteListTables ( m_collnum , 
-				       true , 
-				       cr->m_siteListBuf.getBufStart() );
+		// . no, don't do this now because we call updateSiteList()
+		//   when we have &sitelist=xxxx in the request which will
+		//   handle updating those tables
+		//updateSiteListTables ( m_collnum , 
+		//		       true , 
+		//		       cr->m_siteListBuf.getBufStart() );
 	}


--- a/Log.cpp
+++ b/Log.cpp
@@ -289,8 +289,8 @@ bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
 	if ( *x == ':' ) x++;
 	if ( *x == ' ' ) x++;
 	strncpy ( p , x , avail );
-	// capitalize for consistency
-	if ( is_alpha_a(*p) ) *p = to_upper_a(*p);
+	// capitalize for consistency. no, makes grepping log msgs harder.
+	//if ( is_alpha_a(*p) ) *p = to_upper_a(*p);
 	p += gbstrlen(p);
 	// back up over spaces
 	while ( p[-1] == ' ' ) p--;
--- a/PageResults.cpp
+++ b/PageResults.cpp
@@ -5647,7 +5647,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
 	long width  = hr->getLong("width",100);
 	long height = hr->getLong("height",300);
 	long refresh = hr->getLong("refresh",300);
-	char *def = "<style>html {font-size:12px;font-family:arial;background-color:transparent;color:black;}span.dayheader { font-size:14px;font-weight:bold;}span.title { font-size:16px;font-weight:bold;}span.countdown { font-size:12px;color:red;}span.summary { font-size:12px;}span.address { font-size:12px;color:purple;}span.times { font-size:12px;color:green;}span.dates { font-size:12px;}span.prevnext { font-size:12px;font-weight:bold;}</style>";//<h2>News</h2>";
+	char *def = "<style>html {font-size:12px;font-family:arial;background-color:transparent;color:black;}span.title { font-size:16px;font-weight:bold;}span.summary { font-size:12px;} span.date { font-size:12px;}span.prevnext { font-size:12px;font-weight:bold;}</style>";//<h2>News</h2>";
 	long len1,len2,len3,len4;
 	char *header = hr->getString("header",&len1,def);
 	char *sites = hr->getString("sites",&len2,"");
@@ -5952,7 +5952,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
 			 "<br>"
 			//"<br><br><br>"
 			"<font style=\"font-size:16px;\">"
-			"Insert the following code into your website to "
+			"Insert the following code into your webpage to "
 			"generate the widget %s. "
 			//"<br>"
 			//"<b><u>"
@@ -6016,6 +6016,9 @@ bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {

 	SafeBuf parmList;

+	collnum_t cn = -1;
+	if ( cr ) cn = cr->m_collnum;
+
 	// . first update their collection with the sites to crawl
 	// . this is NOT a custom diffbot crawl, just a regular one using
 	//   the new crawl filters logic, "siteList"
@@ -6024,35 +6027,38 @@ bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {
 	if ( sites && ! cr && token ) {
 		// we need to add the new collnum, so reserve it
 		collnum_t newCollnum = g_collectiondb.reserveCollNum();
+		// use that
+		cn = newCollnum;
 		// add the new colection named <token>-widget123
-		g_parms.addNewParmToList1 ( &parmList,newCollnum,
-					    coll,0,"addColl");
+		g_parms.addNewParmToList1 ( &parmList,cn,coll,0,"addColl");
+		// note it
+		log("widget: adding new widget coll %s",coll);
+	}
+
+
+	if ( cn >= 0 && token ) {
 		// use special url filters profile that spiders sites
 		// shallowly and frequently to pick up new news stories
 		// "1" = (long)UFP_NEWS
 		char ttt[12];
 		sprintf(ttt,"%li",(long)UFP_NEWS);
-		g_parms.addNewParmToList1 ( &parmList,newCollnum,ttt,0,
+		g_parms.addNewParmToList1 ( &parmList,cn,ttt,0,
 					    "urlfiltersprofile");
 		// use diffbot analyze
 		char durl[1024];
 		sprintf(durl,
-			"http://www.diffbot.com/api?mode=analyze&token=%s",
+			"http://api.diffbot.com/v2/analyze?mode=auto&token=%s",
 			token);
 		// TODO: ensure we call diffbot ok
-		g_parms.addNewParmToList1 ( &parmList,newCollnum,
-					    durl,0,"apiUrl");
-		// the list of sites to spider
-		g_parms.addNewParmToList1 ( &parmList,newCollnum,
-					    sites,0,"sitelist");
-		// note it
-		log("widget: adding new widget coll %s",coll);
+		g_parms.addNewParmToList1 ( &parmList,cn,durl,0,"apiUrl");
 	}

-	// update the list of sites to crawl and search and show in widget
-	if ( sites && token && cr )
-		g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,
-					    sites,0,"sitelist");
+	if ( ! sites ) sites = "";
+
+	// . update the list of sites to crawl and search and show in widget
+	// . if they give an empty list then allow that, it will stop crawling
+	if ( cn >= 0 && token )
+		g_parms.addNewParmToList1 ( &parmList,cn,sites,0,"sitelist");


 	if ( parmList.length() ) {
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@@ -18303,6 +18303,11 @@ bool XmlDoc::isSpam ( char   *u         ,
 // should we index the doc? if already indexed, and is filtered, we delete it
 char *XmlDoc::getIsFiltered ( ) {
 	if ( m_isFilteredValid ) return &m_isFiltered;
+	if ( m_isDiffbotJSONObject ) {
+		m_isFiltered = false;
+		m_isFilteredValid = true;
+		return &m_isFiltered;
+	}
 	long *priority = getSpiderPriority();
 	if ( ! priority || priority == (void *)-1 ) return (char *)priority;
 	m_isFiltered = false;
@@ -18513,6 +18518,12 @@ bool XmlDoc::logIt ( ) {
 	if ( m_contentHash32Valid )
 		sb.safePrintf("ch32=%010lu ",m_contentHash32);

+	if ( m_domHash32Valid )
+		sb.safePrintf("dh32=%010lu ",m_domHash32);
+
+	if ( m_siteHash32Valid )
+		sb.safePrintf("sh32=%010lu ",m_siteHash32);
+
 	if ( m_isPermalinkValid )
 		sb.safePrintf("ispermalink=%li ",(long)m_isPermalink);

@@ -20787,6 +20798,11 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
 			sreq.m_hopCountValid = 1;
 			sreq.m_fakeFirstIp   = 1;
 			sreq.m_firstIp       = firstIp;
+			// so we can match url filters' "insitelist" directive
+			// in Spider.cpp::getUrlFilterNum()
+			sreq.m_domHash32  = m_domHash32;
+			sreq.m_siteHash32 = m_siteHash32;
+			sreq.m_hostHash32 = m_siteHash32;
 			// set this
 			if (!m_dx->set4 ( &sreq       ,
 					  NULL        ,
--- a/main.cpp
+++ b/main.cpp
@@ -16875,8 +16875,12 @@ char *getcwd2 ( char *arg ) {
 	getcwd ( s_cwdBuf , 1024 );
 	char *end = s_cwdBuf + gbstrlen(s_cwdBuf);

-	memcpy ( end , arg , alen );
-	end += alen;
+	// if "arg" is a RELATIVE path then append it
+	if ( arg && arg[0]!='/' ) {
+		memcpy ( end , arg , alen );
+		end += alen;
+	}
+
 	*end = '\0';

 	// size of the whole thing