mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 04:07:13 +03:00
more awesome fixes
This commit is contained in:
parent
72dc660598
commit
2adf5b9bc5
@ -2389,14 +2389,17 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
// maybe this is good enough
|
||||
//if ( sc ) sc->m_waitingTreeNeedsRebuild = true;
|
||||
|
||||
CollectionRec *cr = sc->m_cr;
|
||||
//CollectionRec *cr = sc->m_cr;
|
||||
|
||||
// . rebuild sitetable? in PageBasic.cpp.
|
||||
// . re-adds seed spider requests using msg4
|
||||
// . true = addSeeds
|
||||
updateSiteListTables ( m_collnum ,
|
||||
true ,
|
||||
cr->m_siteListBuf.getBufStart() );
|
||||
// . no, don't do this now because we call updateSiteList()
|
||||
// when we have &sitelist=xxxx in the request which will
|
||||
// handle updating those tables
|
||||
//updateSiteListTables ( m_collnum ,
|
||||
// true ,
|
||||
// cr->m_siteListBuf.getBufStart() );
|
||||
}
|
||||
|
||||
|
||||
|
4
Log.cpp
4
Log.cpp
@ -289,8 +289,8 @@ bool Log::logR ( long long now , long type , char *msg , bool asterisk ,
|
||||
if ( *x == ':' ) x++;
|
||||
if ( *x == ' ' ) x++;
|
||||
strncpy ( p , x , avail );
|
||||
// capitalize for consistency
|
||||
if ( is_alpha_a(*p) ) *p = to_upper_a(*p);
|
||||
// capitalize for consistency. no, makes grepping log msgs harder.
|
||||
//if ( is_alpha_a(*p) ) *p = to_upper_a(*p);
|
||||
p += gbstrlen(p);
|
||||
// back up over spaces
|
||||
while ( p[-1] == ' ' ) p--;
|
||||
|
@ -5647,7 +5647,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
|
||||
long width = hr->getLong("width",100);
|
||||
long height = hr->getLong("height",300);
|
||||
long refresh = hr->getLong("refresh",300);
|
||||
char *def = "<style>html {font-size:12px;font-family:arial;background-color:transparent;color:black;}span.dayheader { font-size:14px;font-weight:bold;}span.title { font-size:16px;font-weight:bold;}span.countdown { font-size:12px;color:red;}span.summary { font-size:12px;}span.address { font-size:12px;color:purple;}span.times { font-size:12px;color:green;}span.dates { font-size:12px;}span.prevnext { font-size:12px;font-weight:bold;}</style>";//<h2>News</h2>";
|
||||
char *def = "<style>html {font-size:12px;font-family:arial;background-color:transparent;color:black;}span.title { font-size:16px;font-weight:bold;}span.summary { font-size:12px;} span.date { font-size:12px;}span.prevnext { font-size:12px;font-weight:bold;}</style>";//<h2>News</h2>";
|
||||
long len1,len2,len3,len4;
|
||||
char *header = hr->getString("header",&len1,def);
|
||||
char *sites = hr->getString("sites",&len2,"");
|
||||
@ -5952,7 +5952,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
|
||||
"<br>"
|
||||
//"<br><br><br>"
|
||||
"<font style=\"font-size:16px;\">"
|
||||
"Insert the following code into your website to "
|
||||
"Insert the following code into your webpage to "
|
||||
"generate the widget %s. "
|
||||
//"<br>"
|
||||
//"<b><u>"
|
||||
@ -6016,6 +6016,9 @@ bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {
|
||||
|
||||
SafeBuf parmList;
|
||||
|
||||
collnum_t cn = -1;
|
||||
if ( cr ) cn = cr->m_collnum;
|
||||
|
||||
// . first update their collection with the sites to crawl
|
||||
// . this is NOT a custom diffbot crawl, just a regular one using
|
||||
// the new crawl filters logic, "siteList"
|
||||
@ -6024,35 +6027,38 @@ bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {
|
||||
if ( sites && ! cr && token ) {
|
||||
// we need to add the new collnum, so reserve it
|
||||
collnum_t newCollnum = g_collectiondb.reserveCollNum();
|
||||
// use that
|
||||
cn = newCollnum;
|
||||
// add the new collection named <token>-widget123
|
||||
g_parms.addNewParmToList1 ( &parmList,newCollnum,
|
||||
coll,0,"addColl");
|
||||
g_parms.addNewParmToList1 ( &parmList,cn,coll,0,"addColl");
|
||||
// note it
|
||||
log("widget: adding new widget coll %s",coll);
|
||||
}
|
||||
|
||||
|
||||
if ( cn >= 0 && token ) {
|
||||
// use special url filters profile that spiders sites
|
||||
// shallowly and frequently to pick up new news stories
|
||||
// "1" = (long)UFP_NEWS
|
||||
char ttt[12];
|
||||
sprintf(ttt,"%li",(long)UFP_NEWS);
|
||||
g_parms.addNewParmToList1 ( &parmList,newCollnum,ttt,0,
|
||||
g_parms.addNewParmToList1 ( &parmList,cn,ttt,0,
|
||||
"urlfiltersprofile");
|
||||
// use diffbot analyze
|
||||
char durl[1024];
|
||||
sprintf(durl,
|
||||
"http://www.diffbot.com/api?mode=analyze&token=%s",
|
||||
"http://api.diffbot.com/v2/analyze?mode=auto&token=%s",
|
||||
token);
|
||||
// TODO: ensure we call diffbot ok
|
||||
g_parms.addNewParmToList1 ( &parmList,newCollnum,
|
||||
durl,0,"apiUrl");
|
||||
// the list of sites to spider
|
||||
g_parms.addNewParmToList1 ( &parmList,newCollnum,
|
||||
sites,0,"sitelist");
|
||||
// note it
|
||||
log("widget: adding new widget coll %s",coll);
|
||||
g_parms.addNewParmToList1 ( &parmList,cn,durl,0,"apiUrl");
|
||||
}
|
||||
|
||||
// update the list of sites to crawl and search and show in widget
|
||||
if ( sites && token && cr )
|
||||
g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,
|
||||
sites,0,"sitelist");
|
||||
if ( ! sites ) sites = "";
|
||||
|
||||
// . update the list of sites to crawl and search and show in widget
|
||||
// . if they give an empty list then allow that, it will stop crawling
|
||||
if ( cn >= 0 && token )
|
||||
g_parms.addNewParmToList1 ( &parmList,cn,sites,0,"sitelist");
|
||||
|
||||
|
||||
if ( parmList.length() ) {
|
||||
|
16
XmlDoc.cpp
16
XmlDoc.cpp
@ -18303,6 +18303,11 @@ bool XmlDoc::isSpam ( char *u ,
|
||||
// should we index the doc? if already indexed, and is filtered, we delete it
|
||||
char *XmlDoc::getIsFiltered ( ) {
|
||||
if ( m_isFilteredValid ) return &m_isFiltered;
|
||||
if ( m_isDiffbotJSONObject ) {
|
||||
m_isFiltered = false;
|
||||
m_isFilteredValid = true;
|
||||
return &m_isFiltered;
|
||||
}
|
||||
long *priority = getSpiderPriority();
|
||||
if ( ! priority || priority == (void *)-1 ) return (char *)priority;
|
||||
m_isFiltered = false;
|
||||
@ -18513,6 +18518,12 @@ bool XmlDoc::logIt ( ) {
|
||||
if ( m_contentHash32Valid )
|
||||
sb.safePrintf("ch32=%010lu ",m_contentHash32);
|
||||
|
||||
if ( m_domHash32Valid )
|
||||
sb.safePrintf("dh32=%010lu ",m_domHash32);
|
||||
|
||||
if ( m_siteHash32Valid )
|
||||
sb.safePrintf("sh32=%010lu ",m_siteHash32);
|
||||
|
||||
if ( m_isPermalinkValid )
|
||||
sb.safePrintf("ispermalink=%li ",(long)m_isPermalink);
|
||||
|
||||
@ -20787,6 +20798,11 @@ char *XmlDoc::getMetaList ( bool forDelete ) {
|
||||
sreq.m_hopCountValid = 1;
|
||||
sreq.m_fakeFirstIp = 1;
|
||||
sreq.m_firstIp = firstIp;
|
||||
// so we can match url filters' "insitelist" directive
|
||||
// in Spider.cpp::getUrlFilterNum()
|
||||
sreq.m_domHash32 = m_domHash32;
|
||||
sreq.m_siteHash32 = m_siteHash32;
|
||||
sreq.m_hostHash32 = m_siteHash32;
|
||||
// set this
|
||||
if (!m_dx->set4 ( &sreq ,
|
||||
NULL ,
|
||||
|
8
main.cpp
8
main.cpp
@ -16875,8 +16875,12 @@ char *getcwd2 ( char *arg ) {
|
||||
getcwd ( s_cwdBuf , 1024 );
|
||||
char *end = s_cwdBuf + gbstrlen(s_cwdBuf);
|
||||
|
||||
memcpy ( end , arg , alen );
|
||||
end += alen;
|
||||
// if "arg" is a RELATIVE path then append it
|
||||
if ( arg && arg[0]!='/' ) {
|
||||
memcpy ( end , arg , alen );
|
||||
end += alen;
|
||||
}
|
||||
|
||||
*end = '\0';
|
||||
|
||||
// size of the whole thing
|
||||
|
Loading…
Reference in New Issue
Block a user