more updates

mwells 2014-04-09 11:03:31 -07:00
parent 9e1199f113
commit be99155986
10 changed files with 424 additions and 57 deletions

View File

@@ -1807,31 +1807,232 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
long n = 0;
//strcpy(m_regExs [n],"default");
/*
m_regExs[n].set("default");
m_regExs[n].nullTerm();
m_numRegExs++;
m_spiderFreqs [n] = 30; // 30 days default
m_numRegExs2++;
m_spiderPriorities[n] = 0;
m_numRegExs3++;
m_maxSpidersPerRule[n] = 99;
m_numRegExs10++;
m_spiderIpWaits[n] = 1000;
m_numRegExs5++;
m_spiderIpMaxSpiders[n] = 7;
m_numRegExs6++;
//m_spidersEnabled[n] = 1;
//m_numRegExs7++;
m_harvestLinks[n] = 1;
m_numRegExs8++;
*/
m_regExs[n].set("isdocidbased");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // days before respider
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 80;
n++;
m_regExs[n].set("ismedia");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // days before respider
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;
// if not in the site list then nuke it
m_regExs[n].set("!insitelist");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 0; // days before respider
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = -3; // delete!
n++;
m_regExs[n].set("errorcount>=3 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // days before respider
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 3;
n++;
m_regExs[n].set("errorcount>=1 && hastmperror");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 1; // days before respider
m_maxSpidersPerRule [n] = 1; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 45;
if ( m_urlFiltersProfile == UFP_NEWS )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("isaddurl");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // days before respider
m_maxSpidersPerRule [n] = 99; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 85;
if ( m_urlFiltersProfile == UFP_NEWS )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && iswww && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7; // days before respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 50;
if ( m_urlFiltersProfile == UFP_NEWS )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && iswww");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0; // days before respider
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 48;
if ( m_urlFiltersProfile == UFP_NEWS )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 7.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 49;
if ( m_urlFiltersProfile == UFP_NEWS )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==0");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 10.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 47;
if ( m_urlFiltersProfile == UFP_NEWS )
m_spiderFreqs [n] = .00347; // 5 mins
n++;
m_regExs[n].set("hopcount==1 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 40;
if ( m_urlFiltersProfile == UFP_NEWS )
m_spiderFreqs [n] = .04166; // 60 minutes
n++;
m_regExs[n].set("hopcount==1");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 20.0;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 39;
if ( m_urlFiltersProfile == UFP_NEWS )
m_spiderFreqs [n] = .04166; // 60 minutes
n++;
m_regExs[n].set("hopcount==2 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 30;
// do not harvest links if we are spidering NEWS
if ( m_urlFiltersProfile == UFP_NEWS ) {
m_spiderFreqs [n] = 5.0;
m_harvestLinks [n] = 0;
}
n++;
m_regExs[n].set("hopcount==2");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 40;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 29;
// do not harvest links if we are spidering NEWS
if ( m_urlFiltersProfile == UFP_NEWS ) {
m_spiderFreqs [n] = 5.0;
m_harvestLinks [n] = 0;
}
n++;
m_regExs[n].set("hopcount>=3 && isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 20;
// turn off spidering if hopcount is too big and we are spidering NEWS
if ( m_urlFiltersProfile == UFP_NEWS ) {
m_maxSpidersPerRule [n] = 0;
m_harvestLinks [n] = 0;
}
n++;
m_regExs[n].set("hopcount>=3");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 19;
// turn off spidering if hopcount is too big and we are spidering NEWS
if ( m_urlFiltersProfile == UFP_NEWS ) {
m_maxSpidersPerRule [n] = 0;
m_harvestLinks [n] = 0;
}
n++;
/*
m_regExs[n].set("isnew");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = resp4;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 2;
n++;
*/
m_regExs[n].set("default");
m_harvestLinks [n] = 1;
m_spiderFreqs [n] = 60;
m_maxSpidersPerRule [n] = 9; // max spiders
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
m_spiderIpWaits [n] = 1000; // same ip wait
m_spiderPriorities [n] = 1;
n++;
m_numRegExs = n;
m_numRegExs2 = n;
m_numRegExs3 = n;
m_numRegExs10 = n;
m_numRegExs5 = n;
m_numRegExs6 = n;
m_numRegExs8 = n;
// more rules
//m_spiderDiffbotApiNum[n] = 1;
//m_numRegExs11++;
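For reference, the respider frequencies set above are stored in days, so the UFP_NEWS overrides are just minutes converted to days. A minimal sketch of that conversion (the helper name is illustrative, not part of the codebase):

// illustrative only: spider frequencies are expressed in days
static double minutesToDays ( double minutes ) {
	return minutes / ( 24.0 * 60.0 );
}
// minutesToDays ( 5.0 )  ~= .00347 -> the "5 mins" override above
// minutesToDays ( 60.0 ) ~= .04166 -> the "60 minutes" override above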
@@ -2145,6 +2346,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// . rebuild sitetable? in PageBasic.cpp.
// . re-adds seed spider requests using msg4
// . true = addSeeds
// . rebuilds url filters there too i think
updateSiteList ( m_collnum , true );
}

View File

@@ -683,6 +683,9 @@ class CollectionRec {
SafeBuf m_siteListBuf;
char m_spiderToo;
// see UFP_* values in Parms.h. i.e. UFP_NEWS for crawling for NEWS
long m_urlFiltersProfile;
// . now the url regular expressions
// . we chain down the regular expressions
// . if a url matches we use that tagdb rec #

View File

@@ -168,6 +168,7 @@ case EBADHOSTSCONF: return "A hosts.conf is out of sync";
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
case EDOCNONCANONICAL: return "Url was dup of canonical page";
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
case ENOTOKEN: return "Missing token";
}
// if the remote error bit is clear it must be a regular errno
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );

View File

@@ -171,6 +171,7 @@ enum {
EBADHOSTSCONF,
EWAITINGTOSYNCHOSTSCONF,
EDOCNONCANONICAL,
ECUSTOMCRAWLMISMATCH // a crawl request was made with a name that already existed for bulk request (or the other way around)
ECUSTOMCRAWLMISMATCH, // a crawl request was made with a name that already existed for bulk request (or the other way around)
ENOTOKEN
};
#endif

View File

@@ -156,7 +156,7 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
Url u;
for ( ; *pn ; pn++ , lineNum++ ) {
for ( ; *pn ; lineNum++ ) {
// get end
char *s = pn;
@@ -169,6 +169,9 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
char *pe = pn;
for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- );
// skip over the \n so pn points to next line for next time
if ( *pn == '\n' ) pn++;
// make hash of the line
long h32 = hash32 ( s , pe - s );
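For clarity, here is a minimal sketch of the scanning pattern after this change (the buffer name and the end-of-line scan are assumed, not copied from the file): the loop body advances pn through the current line itself, so the newline is consumed explicitly only when pn is actually sitting on one, instead of unconditionally in the for-increment.

// sketch only -- siteListBuf is an assumed name for the NUL-terminated
// site list text
char *pn      = siteListBuf;
long  lineNum = 1;
for ( ; *pn ; lineNum++ ) {
	// get start of this line
	char *s = pn;
	// advance to the end of the line
	while ( *pn && *pn != '\n' ) pn++;
	// trim trailing whitespace
	char *pe = pn;
	for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- );
	// skip over the \n so pn points to the next line for next time
	if ( *pn == '\n' ) pn++;
	// make hash of the trimmed line
	long h32 = hash32 ( s , pe - s );
	// ... use h32 / the [s,pe) line here ...
}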

View File

@@ -203,9 +203,8 @@ bool printCSSHead ( SafeBuf *sb , char format ) {
"font-family:Arial, Helvetica, sans-serif;"
);
if ( format == FORMAT_WIDGET )
sb->safePrintf("background-color:000000;");
//if ( format == FORMAT_WIDGET )
// sb->safePrintf("background-color:000000;");
sb->safePrintf( "color: #000000;"
"font-size: 12px;"
@@ -975,6 +974,15 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf("<body>");
}
HttpRequest *hr = &st->m_hr;
// lead with user's widget header which usually has custom style tags
if ( si->m_format == FORMAT_WIDGET ) {
char *header = hr->getString("header",NULL);
if ( header ) sb->safeStrcpy ( header );
}
if ( ! g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) {
printLogoAndSearchBox ( sb , &st->m_hr , -1 ); // catId = -1
}
@@ -1048,7 +1056,7 @@ bool printSearchResultsHeader ( State0 *st ) {
*/
// save how many docs are in it
// save how many docs are in this collection
long long docsInColl = -1;
//RdbBase *base = getRdbBase ( RDB_CHECKSUMDB , si->m_coll );
//RdbBase *base = getRdbBase ( (uint8_t)RDB_CLUSTERDB , si->m_coll2 );
@@ -1081,10 +1089,10 @@ bool printSearchResultsHeader ( State0 *st ) {
sb->safePrintf("\"moreResultsFollow\":%li,\n", (long)moreFollow);
}
if ( st->m_header && si->m_format == FORMAT_JSON ) {
sb->safePrintf("\"results\":[\n");
return true;
}
if ( st->m_header && si->m_format == FORMAT_JSON ) {
sb->safePrintf("\"results\":[\n");
return true;
}
// . did he get a spelling recommendation?
// . do not use htmlEncode() on this anymore since receiver
@@ -1142,7 +1150,7 @@ bool printSearchResultsHeader ( State0 *st ) {
float tfwi = getTermFreqWeight(ssi->m_listSize);
for ( long j = i+1; j< nr ; j++ ) {
SingleScore *ssj = &dpx->m_singleScores[j];
float tfwj = getTermFreqWeight(ssj->m_listSize);
float tfwj =getTermFreqWeight(ssj->m_listSize);
max += (lw * tfwi * tfwj)/3.0;
}
}
@@ -1237,11 +1245,14 @@ bool printSearchResultsHeader ( State0 *st ) {
long collLen = gbstrlen(coll);
// otherwise, we had no error
if ( numResults == 0 &&
(si->m_format == FORMAT_HTML || si->m_format==FORMAT_WIDGET) ) {
if ( numResults == 0 && si->m_format == FORMAT_HTML ) {
sb->safePrintf ( "No results found in <b>%s</b> collection.",
cr->m_coll);
}
// the token is currently in the collection name so do not show that
else if ( numResults == 0 && si->m_format == FORMAT_WIDGET ) {
sb->safePrintf ( "No results found.");
}
else if ( moreFollow && si->m_format == FORMAT_HTML ) {
if ( isAdmin && si->m_docsToScanForReranking > 1 )
sb->safePrintf ( "PQR'd " );
@@ -1924,6 +1935,10 @@ bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si ,
frontTag = "<b>";
backTag = "</b>";
}
if ( si->m_format == FORMAT_WIDGET ) {
frontTag = "<font style=\"background-color:yellow\">" ;
}
Highlight hi;
SafeBuf hb;
long hlen = hi.set ( &hb,//tt ,
@@ -2336,7 +2351,7 @@ bool printResult ( State0 *st, long ix ) {
sb->safePrintf("<b style=\""
"text-decoration:none;"
"font-size: 20px;"
"font-size: 15px;"
"font-weight:bold;"
"background-color:rgba(0,0,0,.5);"
"color:white;"
@@ -2456,6 +2471,9 @@ bool printResult ( State0 *st, long ix ) {
frontTag = "<b>";
backTag = "</b>";
}
if ( si->m_format == FORMAT_WIDGET ) {
frontTag = "<font style=\"background-color:yellow\">" ;
}
long cols = 80;
if ( si->m_format == FORMAT_XML )
sb->safePrintf("\t\t<title><![CDATA[");
@@ -2556,7 +2574,15 @@ bool printResult ( State0 *st, long ix ) {
if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t<sum><![CDATA[");
sb->brify ( str , strLen, 0 , cols ); // niceness = 0
bool printSummary = true;
// do not print summaries for widgets by default unless overridden
// with &summaries=1
if ( si->m_format == FORMAT_WIDGET && hr->getLong("summaries",0) == 0 )
printSummary = false;
if ( printSummary )
sb->brify ( str , strLen, 0 , cols ); // niceness = 0
// close xml tag
if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></sum>\n");
// new line if not xml
@@ -5525,24 +5551,20 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
}
bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
//
// begin print controls
//
sb->safePrintf("<html>"
"<body bgcolor=#e8e8e8>"
"<title>Widget Creator</title>"
"<title>Widget Creator</title>"
);
char *coll = "GLOBAL-INDEX";
CollectionRec *cr = g_collectiondb.getRec(coll);
if ( ! cr ) {
sb->safePrintf("Error. collection %s does not exist",
coll);
return true;
}
//char *coll = "GLOBAL-INDEX";
CollectionRec *cr = NULL;
if ( coll ) cr = g_collectiondb.getRec(coll);
// if admin clicks "edit" in the live widget itself put up
// some simpler content editing boxes. token required!
@@ -5616,8 +5638,8 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
char *c1 = "";
char *c2 = "";
char *c3 = "";
long x1 = hr->getLong("dates" ,1);
long x2 = hr->getLong("summaries",1);
long x1 = hr->getLong("dates" ,0);
long x2 = hr->getLong("summaries",0);
long x3 = hr->getLong("border" ,1);
if ( x1 ) c1 = " checked";
if ( x2 ) c2 = " checked";
@@ -5635,7 +5657,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
sb->safePrintf("<form method=GET action=/widget>"
"<input type=hidden name=c value=\"%s\">"
"<input type=hidden name=format value=\"widget\">"
, cr->m_coll
, coll
);
@@ -5665,6 +5687,10 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
"<b style=font-size:22px;><font style=font-size:27px;>"
"W</font>"
"idget <font style=font-size:27px;>C</font>reator</b>"
"<img align=right height=50 width=52 "
"src=http://www.diffbot.com/img/diffy-b.png>"
"</td>"
"</tr>"
@@ -5694,20 +5720,20 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
"Show Dates "
"<input type=checkbox "
"onclick=\"toggleBool(this,'dates');reload();\" "
"<input type=checkbox value=1 "
//"onclick=\"toggleBool(this,'dates');reload();\" "
"name=dates%s>"
"<br>"
"Show Summaries "
"<input type=checkbox "
"onclick=\"toggleBool(this,'summaries');reload();\" "
"<input type=checkbox value=1 "
//"onclick=\"toggleBool(this,'summaries');reload();\" "
"name=summaries%s>"
"<br>"
"Frame border "
"<input type=checkbox "
"onclick=\"toggleBool(this,'border');reload();\" "
"<input type=checkbox value=1 "
//"onclick=\"toggleBool(this,'border');reload();\" "
"name=border%s>"
"<br>"
@@ -5819,6 +5845,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
//">"
"</td>"
"<td valign=top>"
"<br>"
"<img src=/gears32.png width=64 height=64>"
"<br><br>"
);
@@ -5826,6 +5853,9 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
long start = sb->length();
char *border = "frameborder=no ";
if ( x3 ) border = "";
// this iframe contains the WIDGET
sb->safePrintf (
/*
@@ -5858,21 +5888,39 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
//"src=\"http://neo.diffbot.com:8000/search?"
*/
// frameborder=no
"%s"
"src=\""
"http://127.0.0.1:8000/search?"
"format=widget&"
"widgetwidth=%li&widgetheight=%li&"
"c=GLOBAL-INDEX&"
"c=%s&"
"refresh=%li"
// show articles sorted by newest pubdate first
"q=type%%3Aarticle+gbsortbyint%%3Adate"
"\">"
, width
, height
, border
, width
, height
, coll
, refresh
);
sb->safePrintf("&dates=%li",x1);
sb->safePrintf("&summaries=%li",x2);
sb->safePrintf("&q=");
sb->urlEncode ( query );
// widget content header, usually a style tag
sb->safePrintf("&header=");
sb->urlEncode ( header );
sb->safePrintf("\">");
sb->safePrintf ( // do not reset the user's "where" cookie
// to NYC from looking at this widget!
@@ -5901,6 +5949,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
// }
sb->safePrintf ( "\n\n"
"<br>"
//"<br><br><br>"
"<font style=\"font-size:16px;\">"
"Insert the following code into your website to "
@@ -5941,7 +5990,82 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {
SafeBuf sb;
printWidgetPage ( &sb , hr );
char *token = hr->getString("token",NULL);
if ( token && ! token[0] ) token = NULL;
if ( ! token ) {
g_errno = ENOTOKEN;
char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(s,g_errno,msg);
}
long tlen = 0;
if ( token ) tlen = gbstrlen(token);
if ( tlen > 64 ) {
g_errno = ENOCOLLREC;
char *msg = mstrerror(g_errno);
return g_httpServer.sendErrorReply(s,g_errno,msg);
}
char coll[MAX_COLL_LEN];
CollectionRec *cr = NULL;
if ( token ) {
sprintf(coll,"%s-widget123",token);
cr = g_collectiondb.getRec(coll);
}
SafeBuf parmList;
// . first update their collection with the sites to crawl
// . this is NOT a custom diffbot crawl, just a regular one using
// the new crawl filters logic, "siteList"
char *sites = hr->getString("sites",NULL);
// add the collection if it does not exist
if ( sites && ! cr && token ) {
// we need to add the new collnum, so reserve it
collnum_t newCollnum = g_collectiondb.reserveCollNum();
// add the new collection named <token>-widget123
g_parms.addNewParmToList1 ( &parmList,newCollnum,
coll,0,"addColl");
// use special url filters profile that spiders sites
// shallowly and frequently to pick up new news stories
// "1" = (long)UFP_NEWS
char ttt[12];
sprintf(ttt,"%li",(long)UFP_NEWS);
g_parms.addNewParmToList1 ( &parmList,newCollnum,ttt,0,
"urlfiltersprofile");
// use diffbot analyze
char durl[1024];
sprintf(durl,
"http://www.diffbot.com/api?mode=analyze&token=%s",
token);
// TODO: ensure we call diffbot ok
g_parms.addNewParmToList1 ( &parmList,newCollnum,
durl,0,"apiUrl");
// the list of sites to spider
g_parms.addNewParmToList1 ( &parmList,newCollnum,
sites,0,"sitelist");
// note it
log("widget: adding new widget coll %s",coll);
}
// update the list of sites to crawl and search and show in widget
if ( sites && token && cr )
g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,
sites,0,"sitelist");
if ( parmList.length() ) {
// send the parms to all hosts in the network
g_parms.broadcastParmList ( &parmList ,
NULL,//s,// state is socket i guess
NULL);//doneBroadcastingParms2 );
}
// now display the widget controls and the widget and the iframe code
printWidgetPage ( &sb , hr , coll );
return g_httpServer.sendDynamicPage(s,
sb.getBufStart(),
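Putting this handler together, the intended flow is roughly the following (the token and sites values are illustrative, not from the commit):

// GET /widget?token=abc123&sites=nytimes.com+cnn.com
//
// 1. reject the request with ENOTOKEN if no token was given
// 2. derive the backing collection name from the token:
char coll[MAX_COLL_LEN];
sprintf ( coll , "%s-widget123" , token );   // e.g. "abc123-widget123"
// 3. if that collection does not exist yet, queue parms to create it with
//    the UFP_NEWS url filter profile, a diffbot "analyze" apiUrl carrying
//    the token, and the supplied site list, then broadcast them to all
//    hosts; otherwise just update its "sitelist" parm
// 4. render the widget creator page against that collection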

View File

@@ -13310,6 +13310,16 @@ void Parms::init ( ) {
"expressions\": "
"link:gigablast and doc:quality<X and doc:quality>X.";
*/
m->m_cgi = "ufp";
m->m_xml = "urlFiltersProfile";
m->m_off = (char *)&cr.m_urlFiltersProfile - x;
m->m_type = TYPE_LONG;
m->m_page = PAGE_FILTERS;
m->m_def = "0"; // UFP_NONE
m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS;
m++;
m->m_cgi = "fe";
m->m_xml = "filterExpression";
m->m_max = MAX_FILTERS;
@@ -16192,8 +16202,15 @@ void Parms::init ( ) {
if ( m_parms[i].m_off > mm ||
m_parms[i].m_soff > mm ||
m_parms[i].m_smaxc > mm ) {
log(LOG_LOGIC,"conf: Bad offset in parm #%li %s.",
i,m_parms[i].m_title);
log(LOG_LOGIC,"conf: Bad offset in parm #%li %s."
" (%li,%li,%li,%li). Did you FORGET to include "
"an & before the cr.myVariable when setting "
"m_off for this parm?",
i,m_parms[i].m_title,
mm,
m_parms[i].m_off,
m_parms[i].m_soff,
m_parms[i].m_smaxc);
exit(-1);
}
// do not allow numbers in cgi parms, they are used for
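The expanded log message above is aimed at one specific mistake when registering a parm; a brief illustration using the urlFiltersProfile parm added in this commit:

// correct: take the address of the member so m_off becomes its byte
// offset inside CollectionRec
m->m_off = (char *)&cr.m_urlFiltersProfile - x;
// the mistake the new message warns about: without the '&' the member's
// *value* is cast to a pointer, producing a bogus offset that trips the
// bounds check above
// m->m_off = (char *)cr.m_urlFiltersProfile - x;   // wrong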

View File

@@ -14,6 +14,15 @@
void handleRequest3e ( UdpSlot *slot , long niceness ) ;
void handleRequest3f ( UdpSlot *slot , long niceness ) ;
// "url filters profile" values. used to set default crawl rules
// in Collectiondb.cpp's CollectionRec::setUrlFiltersToDefaults().
// for instance, UFP_NEWS spiders sites more frequently but less deeply in
// order to get "news" pages and articles
enum {
UFP_CUSTOM = 0 ,
UFP_NONE = 0 ,
UFP_NEWS = 1 };
// special priorities for the priority drop down
// in the url filters table
enum {

View File

@@ -14103,7 +14103,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
m_diffbotUrl.pushChar('&');
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
m_diffbotUrl.safePrintf("token=%s",cr->m_diffbotToken.getBufStart());
// only print the token if we have one, because if the user provides their
// own diffbot url (apiUrl in Parms.cpp) they might already include
// the token in it for their non-custom crawl (m_customCrawl=0).
if ( cr->m_diffbotToken.length())
m_diffbotUrl.safePrintf("token=%s",
cr->m_diffbotToken.getBufStart());
m_diffbotUrl.safePrintf("&url=");
// give diffbot the url to process
m_diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
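The effect is easiest to see from the resulting URL; a hedged illustration (token and page url are made up):

// widget collections from this commit: the apiUrl set in sendPageWidget
// already carries the token, e.g.
//   http://www.diffbot.com/api?mode=analyze&token=abc123
// so getDiffbotReply() only appends the encoded page url to it;
// custom crawls that store the token on the collection still get an
// explicit "token=..." appended before "&url=", as before.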

View File

@@ -288,6 +288,7 @@
# <i>spider priority</i> of <i>DELETE</i> will cause the URL to not be
# spidered, or if it has already been indexed, it will be deleted when it is
# respidered.<br><br>
<urlFiltersProfile>0</>
<filterExpression><![CDATA[isdocidbased]]></>
<filterExpression><![CDATA[ismedia]]></>
<filterExpression><![CDATA[errorcount&gt;=3 &amp;&amp; hastmperror]]></>