more updates

2024-10-04 12:17:35 +03:00 · 2014-04-09 11:03:31 -07:00 · 2014-04-09 11:03:31 -07:00 · be99155986
commit be99155986
parent 9e1199f113
10 changed files with 424 additions and 57 deletions
--- a/Collectiondb.cpp
+++ b/Collectiondb.cpp
@ -1807,31 +1807,232 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {

 	long n = 0;

-	//strcpy(m_regExs   [n],"default");
+	/*
 	m_regExs[n].set("default");
 	m_regExs[n].nullTerm();
-	m_numRegExs++;
-
 	m_spiderFreqs     [n] = 30; // 30 days default
-	m_numRegExs2++;
-
 	m_spiderPriorities[n] = 0;
-	m_numRegExs3++;
-
 	m_maxSpidersPerRule[n] = 99;
-	m_numRegExs10++;
-
 	m_spiderIpWaits[n] = 1000;
-	m_numRegExs5++;
-
 	m_spiderIpMaxSpiders[n] = 7;
-	m_numRegExs6++;
-
-	//m_spidersEnabled[n] = 1;
-	//m_numRegExs7++;
-
 	m_harvestLinks[n] = 1;
-	m_numRegExs8++;
+	*/
+
+	m_regExs[n].set("isdocidbased");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 0; // days b4 respider
+	m_maxSpidersPerRule  [n] = 99; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 80;
+	n++;
+
+	m_regExs[n].set("ismedia");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 0; // days b4 respider
+	m_maxSpidersPerRule  [n] = 99; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = -3; // delete!
+	n++;
+
+	// if not in the site list then nuke it
+	m_regExs[n].set("!insitelist");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 0; // days b4 respider
+	m_maxSpidersPerRule  [n] = 99; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = -3; // delete!
+	n++;
+
+	m_regExs[n].set("errorcount>=3 && hastmperror");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 1; // days b4 respider
+	m_maxSpidersPerRule  [n] = 1; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 3;
+	n++;
+
+	m_regExs[n].set("errorcount>=1 && hastmperror");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 1; // days b4 respider
+	m_maxSpidersPerRule  [n] = 1; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 45;
+	if ( m_urlFiltersProfile == UFP_NEWS )
+		m_spiderFreqs [n] = .00347; // 5 mins
+	n++;
+
+	m_regExs[n].set("isaddurl");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 7; // days b4 respider
+	m_maxSpidersPerRule  [n] = 99; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 85;
+	if ( m_urlFiltersProfile == UFP_NEWS )
+		m_spiderFreqs [n] = .00347; // 5 mins
+	n++;
+
+	m_regExs[n].set("hopcount==0 && iswww && isnew");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 7; // days b4 respider
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 50;
+	if ( m_urlFiltersProfile == UFP_NEWS )
+		m_spiderFreqs [n] = .00347; // 5 mins
+	n++;
+
+
+	m_regExs[n].set("hopcount==0 && iswww");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 7.0; // days b4 respider
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 48;
+	if ( m_urlFiltersProfile == UFP_NEWS )
+		m_spiderFreqs [n] = .00347; // 5 mins
+	n++;
+
+	m_regExs[n].set("hopcount==0 && isnew");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 7.0;
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 49;
+	if ( m_urlFiltersProfile == UFP_NEWS )
+		m_spiderFreqs [n] = .00347; // 5 mins
+	n++;
+
+	m_regExs[n].set("hopcount==0");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 10.0;
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 47;
+	if ( m_urlFiltersProfile == UFP_NEWS )
+		m_spiderFreqs [n] = .00347; // 5 mins
+	n++;
+
+	m_regExs[n].set("hopcount==1 && isnew");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 20.0;
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 40;
+	if ( m_urlFiltersProfile == UFP_NEWS )
+		m_spiderFreqs [n] = .04166; // 60 minutes
+	n++;
+
+	m_regExs[n].set("hopcount==1");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 20.0;
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 39;
+	if ( m_urlFiltersProfile == UFP_NEWS )
+		m_spiderFreqs [n] = .04166; // 60 minutes
+	n++;
+
+	m_regExs[n].set("hopcount==2 && isnew");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 40;
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 30;
+	// do not harvest links if we are spidering NEWS
+	if ( m_urlFiltersProfile == UFP_NEWS ) {
+		m_spiderFreqs  [n] = 5.0;
+		m_harvestLinks [n] = 0;
+	}
+	n++;
+
+	m_regExs[n].set("hopcount==2");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 40;
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 29;
+	// do not harvest links if we are spidering NEWS
+	if ( m_urlFiltersProfile == UFP_NEWS ) {
+		m_spiderFreqs  [n] = 5.0;
+		m_harvestLinks [n] = 0;
+	}
+	n++;
+
+	m_regExs[n].set("hopcount>=3 && isnew");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 60;
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 20;
+	// turn off spidering if hopcount is too big and we are spidering NEWS
+	if ( m_urlFiltersProfile == UFP_NEWS ) {
+		m_maxSpidersPerRule [n] = 0;
+		m_harvestLinks      [n] = 0;
+	}
+	n++;
+
+	m_regExs[n].set("hopcount>=3");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 60;
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 19;
+	// turn off spidering if hopcount is too big and we are spidering NEWS
+	if ( m_urlFiltersProfile == UFP_NEWS ) {
+		m_maxSpidersPerRule [n] = 0;
+		m_harvestLinks      [n] = 0;
+	}
+	n++;
+
+	/*
+	m_regExs[n].set("isnew");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = resp4;
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 2;
+	n++;
+	*/
+
+	m_regExs[n].set("default");
+	m_harvestLinks       [n] = 1;
+	m_spiderFreqs        [n] = 60;
+	m_maxSpidersPerRule  [n] = 9; // max spiders
+	m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
+	m_spiderIpWaits      [n] = 1000; // same ip wait
+	m_spiderPriorities   [n] = 1;
+	n++;
+
+
+	m_numRegExs   = n;
+	m_numRegExs2  = n;
+	m_numRegExs3  = n;
+	m_numRegExs10 = n;
+	m_numRegExs5  = n;
+	m_numRegExs6  = n;
+	m_numRegExs8  = n;
+
+	// more rules
+
+
+

 	//m_spiderDiffbotApiNum[n] = 1;
 	//m_numRegExs11++;
@ -2145,6 +2346,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
 		// . rebuild sitetable? in PageBasic.cpp.
 		// . re-adds seed spdierrequests using msg4
 		// . true = addSeeds
+		// . rebuilds url filters there too i think
 		updateSiteList ( m_collnum , true );
 	}

--- a/Collectiondb.h
+++ b/Collectiondb.h
@ -683,6 +683,9 @@ class CollectionRec {
 	SafeBuf m_siteListBuf;
 	char m_spiderToo;

+	// see UFP_* values in Parms.h. e.g. UFP_NEWS for crawling for NEWS
+	long m_urlFiltersProfile;
+
 	// . now the url regular expressions
 	// . we chain down the regular expressions
 	// . if a url matches we use that tagdb rec #
--- a/Errno.cpp
+++ b/Errno.cpp
@ -168,6 +168,7 @@ case	EBADHOSTSCONF: return "A hosts.conf is out of sync";
 case    EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
 case	EDOCNONCANONICAL: return "Url was dup of canonical page";
 case    ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
+case    ENOTOKEN: return "Missing token";
 	}
 	// if the remote error bit is clear it must be a regulare errno
 	//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
--- a/Errno.h
+++ b/Errno.h
@ -171,6 +171,7 @@ enum {
 	EBADHOSTSCONF,
 	EWAITINGTOSYNCHOSTSCONF,
 	EDOCNONCANONICAL,
-	ECUSTOMCRAWLMISMATCH  // a crawl request was made with a name that already existed for bulk request (or the other way around)
+	ECUSTOMCRAWLMISMATCH, // a crawl request was made with a name that already existed for bulk request (or the other way around)
+	ENOTOKEN
 };
 #endif
--- a/PageBasic.cpp
+++ b/PageBasic.cpp
@ -156,7 +156,7 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {

 	Url u;

-	for ( ; *pn ; pn++ , lineNum++ ) {
+	for ( ; *pn ; lineNum++ ) {

 		// get end
 		char *s = pn;
@ -169,6 +169,9 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
 	        char *pe = pn;
 		for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- );

+		// skip over the \n so pn points to next line for next time
+		if ( *pn == '\n' ) pn++;
+
 		// make hash of the line
 		long h32 = hash32 ( s , pe - s );

--- a/PageResults.cpp
+++ b/PageResults.cpp
@ -203,9 +203,8 @@ bool printCSSHead ( SafeBuf *sb , char format ) {
 			      "font-family:Arial, Helvetica, sans-serif;"
 			      );

-
-	if ( format == FORMAT_WIDGET )
-		sb->safePrintf("background-color:000000;");
+	//if ( format == FORMAT_WIDGET )
+	//	sb->safePrintf("background-color:000000;");

 	sb->safePrintf(	      "color: #000000;"
 			      "font-size: 12px;"
@ -975,6 +974,15 @@ bool printSearchResultsHeader ( State0 *st ) {
 		sb->safePrintf("<body>");
 	}

+	HttpRequest *hr = &st->m_hr;
+
+	// lead with user's widget header which usually has custom style tags
+	if ( si->m_format == FORMAT_WIDGET ) {
+		char *header = hr->getString("header",NULL);
+		if ( header ) sb->safeStrcpy ( header );
+	}
+
+
 	if ( ! g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) {
 		printLogoAndSearchBox ( sb , &st->m_hr , -1 ); // catId = -1
 	}
@ -1048,7 +1056,7 @@ bool printSearchResultsHeader ( State0 *st ) {
 	*/


-	// save how many docs are in it
+	// save how many docs are in this collection
 	long long docsInColl = -1;
 	//RdbBase *base = getRdbBase ( RDB_CHECKSUMDB , si->m_coll );
 	//RdbBase *base = getRdbBase ( (uint8_t)RDB_CLUSTERDB , si->m_coll2 );
@ -1081,10 +1089,10 @@ bool printSearchResultsHeader ( State0 *st ) {
 	    sb->safePrintf("\"moreResultsFollow\":%li,\n", (long)moreFollow);
 	}

-    if ( st->m_header && si->m_format == FORMAT_JSON ) {
-        sb->safePrintf("\"results\":[\n");
-        return true;
-    }
+	if ( st->m_header && si->m_format == FORMAT_JSON ) {
+		sb->safePrintf("\"results\":[\n");
+		return true;
+	}

 	// . did he get a spelling recommendation?
 	// . do not use htmlEncode() on this anymore since receiver
@ -1142,7 +1150,7 @@ bool printSearchResultsHeader ( State0 *st ) {
 			float tfwi = getTermFreqWeight(ssi->m_listSize);
 			for ( long j = i+1; j< nr ; j++ ) {
 				SingleScore *ssj = &dpx->m_singleScores[j];
-				float tfwj = getTermFreqWeight(ssj->m_listSize);
+				float tfwj =getTermFreqWeight(ssj->m_listSize);
 				max += (lw * tfwi * tfwj)/3.0;
 			}
 		}
@ -1237,11 +1245,14 @@ bool printSearchResultsHeader ( State0 *st ) {
 	long collLen = gbstrlen(coll);

 	// otherwise, we had no error
-	if ( numResults == 0 && 
-	     (si->m_format == FORMAT_HTML || si->m_format==FORMAT_WIDGET) ) {
+	if ( numResults == 0 && si->m_format == FORMAT_HTML ) {
 		sb->safePrintf ( "No results found in <b>%s</b> collection.",
 				cr->m_coll);
 	}
+	// the token is currently in the collection name so do not show that
+	else if ( numResults == 0 && si->m_format == FORMAT_WIDGET ) {
+		sb->safePrintf ( "No results found.");
+	}
 	else if ( moreFollow && si->m_format == FORMAT_HTML ) {
 		if ( isAdmin && si->m_docsToScanForReranking > 1 )
 			sb->safePrintf ( "PQR'd " );
@ -1924,6 +1935,10 @@ bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si ,
 			frontTag = "<b>";
 			backTag  = "</b>";
 		}
+		if ( si->m_format == FORMAT_WIDGET ) {
+			frontTag = "<font style=\"background-color:yellow\">" ;
+		}
+
 		Highlight hi;
 		SafeBuf hb;
 		long hlen = hi.set ( &hb,//tt , 
@ -2336,7 +2351,7 @@ bool printResult ( State0 *st, long ix ) {
 		
 		sb->safePrintf("<b style=\""
 			       "text-decoration:none;"
-			       "font-size: 20px;"
+			       "font-size: 15px;"
 			       "font-weight:bold;"
 			       "background-color:rgba(0,0,0,.5);"
 			       "color:white;"
@ -2456,6 +2471,9 @@ bool printResult ( State0 *st, long ix ) {
 		frontTag = "<b>";
 		backTag  = "</b>";
 	}
+	if ( si->m_format == FORMAT_WIDGET ) {
+		frontTag = "<font style=\"background-color:yellow\">" ;
+	}
 	long cols = 80;
 	if ( si->m_format == FORMAT_XML ) 
 		sb->safePrintf("\t\t<title><![CDATA[");
@ -2556,7 +2574,15 @@ bool printResult ( State0 *st, long ix ) {

 	if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t<sum><![CDATA[");

-	sb->brify ( str , strLen, 0 , cols ); // niceness = 0
+	bool printSummary = true;
+	// do not print summaries for widgets by default unless overridden
+	// with &summaries=1
+	if ( si->m_format == FORMAT_WIDGET && hr->getLong("summaries",0) == 0 )
+		printSummary = false;
+
+	if ( printSummary )
+		sb->brify ( str , strLen, 0 , cols ); // niceness = 0
+
 	// close xml tag
 	if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></sum>\n");
 	// new line if not xml
@ -5525,24 +5551,20 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
 }


-bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
+bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
 	//
 	// begin print controls
 	//

 	sb->safePrintf("<html>"
 		       "<body bgcolor=#e8e8e8>"
-		      "<title>Widget Creator</title>"
+		       "<title>Widget Creator</title>"
 		      );


-	char *coll = "GLOBAL-INDEX";
-	CollectionRec *cr = g_collectiondb.getRec(coll);
-	if ( ! cr ) {
-		sb->safePrintf("Error. collection %s does not exist",
-			       coll);
-		return true;
-	}
+	//char *coll = "GLOBAL-INDEX";
+	CollectionRec *cr = NULL;
+	if ( coll ) cr = g_collectiondb.getRec(coll);

 	// if admin clicks "edit" in the live widget itself put up
 	// some simpler content editing boxes. token required!
@ -5616,8 +5638,8 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
 	char *c1 = "";
 	char *c2 = "";
 	char *c3 = "";
-	long x1 = hr->getLong("dates"    ,1);
-	long x2 = hr->getLong("summaries",1);
+	long x1 = hr->getLong("dates"    ,0);
+	long x2 = hr->getLong("summaries",0);
 	long x3 = hr->getLong("border"   ,1);
 	if ( x1 ) c1 = " checked";
 	if ( x2 ) c2 = " checked";
@ -5635,7 +5657,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
 	sb->safePrintf("<form method=GET action=/widget>"
 		       "<input type=hidden name=c value=\"%s\">"
 		       "<input type=hidden name=format value=\"widget\">"
-		       , cr->m_coll
+		       , coll
 		       );


@ -5665,6 +5687,10 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
 		      "<b style=font-size:22px;><font style=font-size:27px;>"
 		      "W</font>"
 		      "idget <font style=font-size:27px;>C</font>reator</b>"
+
+		      "<img align=right height=50 width=52 "
+		      "src=http://www.diffbot.com/img/diffy-b.png>"
+
 		      "</td>"
 		      "</tr>"

@ -5694,20 +5720,20 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {


 		      "Show Dates "
-		      "<input type=checkbox "
-		      "onclick=\"toggleBool(this,'dates');reload();\" "
+		      "<input type=checkbox value=1 "
+		      //"onclick=\"toggleBool(this,'dates');reload();\" "
 		      "name=dates%s>"
 		      "<br>"

 		      "Show Summaries "
-		      "<input type=checkbox "
-		      "onclick=\"toggleBool(this,'summaries');reload();\" "
+		      "<input type=checkbox value=1 "
+		      //"onclick=\"toggleBool(this,'summaries');reload();\" "
 		      "name=summaries%s>"
 		      "<br>"

 		      "Frame border "
-		      "<input type=checkbox "
-		      "onclick=\"toggleBool(this,'border');reload();\" "
+		      "<input type=checkbox value=1 "
+		      //"onclick=\"toggleBool(this,'border');reload();\" "
 		      "name=border%s>"
 		      "<br>"

@ -5819,6 +5845,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
 		      //">"
 		       "</td>"
 		      "<td valign=top>"
+		       "<br>"
 		      "<img src=/gears32.png width=64 height=64>"
 		      "<br><br>"
 		      );
@ -5826,6 +5853,9 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {

 	long start = sb->length();

+	char *border = "frameborder=no ";
+	if ( x3 ) border = "";
+
 	// this iframe contains the WIDGET
 	sb->safePrintf (
 		       /*
@ -5858,21 +5888,39 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
 			//"src=\"http://neo.diffbot.com:8000/search?"
 			*/

+			// frameborder=no
+			"%s"
+
 			"src=\""
 			"http://127.0.0.1:8000/search?"
 			"format=widget&"
 			"widgetwidth=%li&widgetheight=%li&"
-			"c=GLOBAL-INDEX&"
+			"c=%s&"
+			"refresh=%li"
 			// show articles sorted by newest pubdate first
-			"q=type%%3Aarticle+gbsortbyint%%3Adate"

-			"\">"
 			, width
 			, height
+			, border
 			, width
 			, height
+			, coll
+			, refresh
 			);

+	sb->safePrintf("&dates=%li",x1);
+	sb->safePrintf("&summaries=%li",x2);
+
+	sb->safePrintf("&q=");
+	sb->urlEncode ( query );
+
+	// widget content header, usually a style tag
+	sb->safePrintf("&header=");
+	sb->urlEncode ( header );
+
+
+
+	sb->safePrintf("\">");

 	sb->safePrintf ( // do not reset the user's "where" cookie
 			// to NYC from looking at this widget!
@ -5901,6 +5949,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
 		//	}

 	sb->safePrintf ( "\n\n"
+			 "<br>"
 			//"<br><br><br>"
 			"<font style=\"font-size:16px;\">"
 			"Insert the following code into your website to "
@ -5941,7 +5990,82 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
 bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {
 	SafeBuf sb;

-	printWidgetPage ( &sb , hr );
+	char *token = hr->getString("token",NULL);
+	if ( token && ! token[0] ) token = NULL;
+
+	if ( ! token ) {
+		g_errno = ENOTOKEN;
+		char *msg = mstrerror(g_errno);
+		return g_httpServer.sendErrorReply(s,g_errno,msg);
+	}
+
+	long tlen = 0;
+	if ( token ) tlen = gbstrlen(token);
+	if ( tlen > 64 ) { 
+		g_errno = ENOCOLLREC;
+		char *msg = mstrerror(g_errno);
+		return g_httpServer.sendErrorReply(s,g_errno,msg);
+	}
+
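+	char coll[MAX_COLL_LEN]; // NOTE(review): must hold a 64-char token + "-widget123" + NUL -- confirm MAX_COLL_LEN is big enough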
+	CollectionRec *cr = NULL;
+	if ( token ) {
+		sprintf(coll,"%s-widget123",token);
+		cr = g_collectiondb.getRec(coll);
+	}
+
+	SafeBuf parmList;
+
+	// . first update their collection with the sites to crawl
+	// . this is NOT a custom diffbot crawl, just a regular one using
+	//   the new crawl filters logic, "siteList"
+	char *sites = hr->getString("sites",NULL);
+	// add the collection if does not exist
+	if ( sites && ! cr && token ) {
+		// we need to add the new collnum, so reserve it
+		collnum_t newCollnum = g_collectiondb.reserveCollNum();
+		// add the new colection named <token>-widget123
+		g_parms.addNewParmToList1 ( &parmList,newCollnum,
+					    coll,0,"addColl");
+		// use special url filters profile that spiders sites
+		// shallowly and frequently to pick up new news stories
+		// "1" = (long)UFP_NEWS
+		char ttt[12];
+		sprintf(ttt,"%li",(long)UFP_NEWS);
+		g_parms.addNewParmToList1 ( &parmList,newCollnum,ttt,0,
+					    "urlfiltersprofile");
+		// use diffbot analyze
+		char durl[1024];
+		sprintf(durl,
+			"http://www.diffbot.com/api?mode=analyze&token=%s",
+			token);
+		// TODO: ensure we call diffbot ok
+		g_parms.addNewParmToList1 ( &parmList,newCollnum,
+					    durl,0,"apiUrl");
+		// the list of sites to spider
+		g_parms.addNewParmToList1 ( &parmList,newCollnum,
+					    sites,0,"sitelist");
+		// note it
+		log("widget: adding new widget coll %s",coll);
+	}
+
+	// update the list of sites to crawl and search and show in widget
+	if ( sites && token && cr )
+		g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,
+					    sites,0,"sitelist");
+
+
+	if ( parmList.length() ) {
+		// send the parms to all hosts in the network
+		g_parms.broadcastParmList ( &parmList , 
+					    NULL,//s,// state is socket i guess
+					    NULL);//doneBroadcastingParms2 );
+	}
+
+
+
+	// now display the widget controls and the widget and the iframe code
+	printWidgetPage ( &sb , hr , coll );

 	return g_httpServer.sendDynamicPage(s,
 					    sb.getBufStart(),
--- a/Parms.cpp
+++ b/Parms.cpp
@ -13310,6 +13310,16 @@ void Parms::init ( ) {
 		"expressions\": "
 		"link:gigablast and doc:quality<X and doc:quality>X.";
 		*/
+
+	m->m_cgi   = "ufp";
+	m->m_xml   = "urlFiltersProfile";
+	m->m_off   = (char *)&cr.m_urlFiltersProfile - x;
+	m->m_type  = TYPE_LONG;
+	m->m_page  = PAGE_FILTERS;
+	m->m_def   = "0"; // UFP_NONE
+	m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS;
+	m++;
+
 	m->m_cgi   = "fe";
 	m->m_xml   = "filterExpression";
 	m->m_max   = MAX_FILTERS;
@ -16192,8 +16202,15 @@ void Parms::init ( ) {
 		if ( m_parms[i].m_off   > mm ||
 		     m_parms[i].m_soff  > mm ||
 		     m_parms[i].m_smaxc > mm   ) {
-			log(LOG_LOGIC,"conf: Bad offset in parm #%li %s.",
-			    i,m_parms[i].m_title);
+			log(LOG_LOGIC,"conf: Bad offset in parm #%li %s."
+			    " (%li,%li,%li,%li). Did you FORGET to include "
+			    "an & before the cr.myVariable when setting "
+			    "m_off for this parm?",
+			    i,m_parms[i].m_title,
+			    mm,
+			    m_parms[i].m_off,
+			    m_parms[i].m_soff,
+			    m_parms[i].m_smaxc);
 			exit(-1);
 		}
 		// do not allow numbers in cgi parms, they are used for
--- a/Parms.h
+++ b/Parms.h
@ -14,6 +14,15 @@
 void handleRequest3e ( UdpSlot *slot , long niceness ) ;
 void handleRequest3f ( UdpSlot *slot , long niceness ) ;

+// "url filters profile" values, stored in CollectionRec::m_urlFiltersProfile.
+// used to pick the default crawl rules set up in Collectiondb.cpp's
+// CollectionRec::setUrlFiltersToDefaults(). for instance, UFP_NEWS spiders
+// sites more frequently but less deep to get "news" pages and articles
+enum {
+	UFP_CUSTOM = 0 , // same value as UFP_NONE; the two names are aliases
+	UFP_NONE   = 0 , // default for the "ufp" parm (m_def "0" in Parms.cpp)
+	UFP_NEWS = 1 };  // shorter respider intervals, no spidering at hopcount>=3
+
 // special priorities for the priority drop down 
 // in the url filters table
 enum {
--- a/XmlDoc.cpp
+++ b/XmlDoc.cpp
@ -14103,7 +14103,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
 		m_diffbotUrl.pushChar('&');

 	//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
-	m_diffbotUrl.safePrintf("token=%s",cr->m_diffbotToken.getBufStart());
+	// only print token if we have one, because if user provides their
+	// own diffbot url (apiUrl in Parms.cpp) then they  might include
+	// the token in that for their non-custom crawl. m_customCrawl=0.
+	if ( cr->m_diffbotToken.length())
+		m_diffbotUrl.safePrintf("token=%s",
+					cr->m_diffbotToken.getBufStart());
+
 	m_diffbotUrl.safePrintf("&url=");
 	// give diffbot the url to process
 	m_diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
--- a/coll.main.0/coll.conf
+++ b/coll.main.0/coll.conf
@ -288,6 +288,7 @@
 # <i>spider priority</i> of <i>DELETE</i> will cause the URL to not be
 # spidered, or if it has already been indexed, it will be deleted when it is
 # respidered.<br><br>
+<urlFiltersProfile>0</>
 <filterExpression><![CDATA[isdocidbased]]></>
 <filterExpression><![CDATA[ismedia]]></>
 <filterExpression><![CDATA[errorcount&gt;=3 &amp;&amp; hastmperror]]></>