more awesome fixes

This commit is contained in:
mwells 2014-04-09 13:31:11 -07:00
parent 72dc660598
commit 2adf5b9bc5
5 changed files with 54 additions and 25 deletions

View File

@ -2389,14 +2389,17 @@ bool CollectionRec::rebuildUrlFilters ( ) {
// maybe this is good enough
//if ( sc ) sc->m_waitingTreeNeedsRebuild = true;
CollectionRec *cr = sc->m_cr;
//CollectionRec *cr = sc->m_cr;
// . rebuild sitetable? in PageBasic.cpp.
// . re-adds seed spider requests using msg4
// . true = addSeeds
updateSiteListTables ( m_collnum ,
true ,
cr->m_siteListBuf.getBufStart() );
// . no, don't do this now because we call updateSiteList()
// when we have &sitelist=xxxx in the request which will
// handle updating those tables
//updateSiteListTables ( m_collnum ,
// true ,
// cr->m_siteListBuf.getBufStart() );
}

View File

@ -289,8 +289,8 @@ bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
if ( *x == ':' ) x++;
if ( *x == ' ' ) x++;
strncpy ( p , x , avail );
// capitalize for consistency
if ( is_alpha_a(*p) ) *p = to_upper_a(*p);
// capitalize for consistency. no, makes grepping log msgs harder.
//if ( is_alpha_a(*p) ) *p = to_upper_a(*p);
p += gbstrlen(p);
// back up over spaces
while ( p[-1] == ' ' ) p--;

View File

@ -5647,7 +5647,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
long width = hr->getLong("width",100);
long height = hr->getLong("height",300);
long refresh = hr->getLong("refresh",300);
char *def = "<style>html {font-size:12px;font-family:arial;background-color:transparent;color:black;}span.dayheader { font-size:14px;font-weight:bold;}span.title { font-size:16px;font-weight:bold;}span.countdown { font-size:12px;color:red;}span.summary { font-size:12px;}span.address { font-size:12px;color:purple;}span.times { font-size:12px;color:green;}span.dates { font-size:12px;}span.prevnext { font-size:12px;font-weight:bold;}</style>";//<h2>News</h2>";
char *def = "<style>html {font-size:12px;font-family:arial;background-color:transparent;color:black;}span.title { font-size:16px;font-weight:bold;}span.summary { font-size:12px;} span.date { font-size:12px;}span.prevnext { font-size:12px;font-weight:bold;}</style>";//<h2>News</h2>";
long len1,len2,len3,len4;
char *header = hr->getString("header",&len1,def);
char *sites = hr->getString("sites",&len2,"");
@ -5952,7 +5952,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
"<br>"
//"<br><br><br>"
"<font style=\"font-size:16px;\">"
"Insert the following code into your website to "
"Insert the following code into your webpage to "
"generate the widget %s. "
//"<br>"
//"<b><u>"
@ -6016,6 +6016,9 @@ bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {
SafeBuf parmList;
collnum_t cn = -1;
if ( cr ) cn = cr->m_collnum;
// . first update their collection with the sites to crawl
// . this is NOT a custom diffbot crawl, just a regular one using
// the new crawl filters logic, "siteList"
@ -6024,35 +6027,38 @@ bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {
if ( sites && ! cr && token ) {
// we need to add the new collnum, so reserve it
collnum_t newCollnum = g_collectiondb.reserveCollNum();
// use that
cn = newCollnum;
// add the new collection named <token>-widget123
g_parms.addNewParmToList1 ( &parmList,newCollnum,
coll,0,"addColl");
g_parms.addNewParmToList1 ( &parmList,cn,coll,0,"addColl");
// note it
log("widget: adding new widget coll %s",coll);
}
if ( cn >= 0 && token ) {
// use special url filters profile that spiders sites
// shallowly and frequently to pick up new news stories
// "1" = (long)UFP_NEWS
char ttt[12];
sprintf(ttt,"%li",(long)UFP_NEWS);
g_parms.addNewParmToList1 ( &parmList,newCollnum,ttt,0,
g_parms.addNewParmToList1 ( &parmList,cn,ttt,0,
"urlfiltersprofile");
// use diffbot analyze
char durl[1024];
sprintf(durl,
"http://www.diffbot.com/api?mode=analyze&token=%s",
"http://api.diffbot.com/v2/analyze?mode=auto&token=%s",
token);
// TODO: ensure we call diffbot ok
g_parms.addNewParmToList1 ( &parmList,newCollnum,
durl,0,"apiUrl");
// the list of sites to spider
g_parms.addNewParmToList1 ( &parmList,newCollnum,
sites,0,"sitelist");
// note it
log("widget: adding new widget coll %s",coll);
g_parms.addNewParmToList1 ( &parmList,cn,durl,0,"apiUrl");
}
// update the list of sites to crawl and search and show in widget
if ( sites && token && cr )
g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,
sites,0,"sitelist");
if ( ! sites ) sites = "";
// . update the list of sites to crawl and search and show in widget
// . if they give an empty list then allow that, it will stop crawling
if ( cn >= 0 && token )
g_parms.addNewParmToList1 ( &parmList,cn,sites,0,"sitelist");
if ( parmList.length() ) {

View File

@ -18303,6 +18303,11 @@ bool XmlDoc::isSpam ( char *u ,
// should we index the doc? if already indexed, and is filtered, we delete it
char *XmlDoc::getIsFiltered ( ) {
if ( m_isFilteredValid ) return &m_isFiltered;
if ( m_isDiffbotJSONObject ) {
m_isFiltered = false;
m_isFilteredValid = true;
return &m_isFiltered;
}
long *priority = getSpiderPriority();
if ( ! priority || priority == (void *)-1 ) return (char *)priority;
m_isFiltered = false;
@ -18513,6 +18518,12 @@ bool XmlDoc::logIt ( ) {
if ( m_contentHash32Valid )
sb.safePrintf("ch32=%010lu ",m_contentHash32);
if ( m_domHash32Valid )
sb.safePrintf("dh32=%010lu ",m_domHash32);
if ( m_siteHash32Valid )
sb.safePrintf("sh32=%010lu ",m_siteHash32);
if ( m_isPermalinkValid )
sb.safePrintf("ispermalink=%li ",(long)m_isPermalink);
@ -20787,6 +20798,11 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
sreq.m_hopCountValid = 1;
sreq.m_fakeFirstIp = 1;
sreq.m_firstIp = firstIp;
// so we can match url filters' "insitelist" directive
// in Spider.cpp::getUrlFilterNum()
sreq.m_domHash32 = m_domHash32;
sreq.m_siteHash32 = m_siteHash32;
sreq.m_hostHash32 = m_siteHash32;
// set this
if (!m_dx->set4 ( &sreq ,
NULL ,

View File

@ -16875,8 +16875,12 @@ char *getcwd2 ( char *arg ) {
getcwd ( s_cwdBuf , 1024 );
char *end = s_cwdBuf + gbstrlen(s_cwdBuf);
memcpy ( end , arg , alen );
end += alen;
// if "arg" is a RELATIVE path then append it
if ( arg && arg[0]!='/' ) {
memcpy ( end , arg , alen );
end += alen;
}
*end = '\0';
// size of the whole thing