mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
more updates
This commit is contained in:
parent
9e1199f113
commit
be99155986
236
Collectiondb.cpp
236
Collectiondb.cpp
@ -1807,31 +1807,232 @@ void CollectionRec::setUrlFiltersToDefaults ( ) {
|
||||
|
||||
long n = 0;
|
||||
|
||||
//strcpy(m_regExs [n],"default");
|
||||
/*
|
||||
m_regExs[n].set("default");
|
||||
m_regExs[n].nullTerm();
|
||||
m_numRegExs++;
|
||||
|
||||
m_spiderFreqs [n] = 30; // 30 days default
|
||||
m_numRegExs2++;
|
||||
|
||||
m_spiderPriorities[n] = 0;
|
||||
m_numRegExs3++;
|
||||
|
||||
m_maxSpidersPerRule[n] = 99;
|
||||
m_numRegExs10++;
|
||||
|
||||
m_spiderIpWaits[n] = 1000;
|
||||
m_numRegExs5++;
|
||||
|
||||
m_spiderIpMaxSpiders[n] = 7;
|
||||
m_numRegExs6++;
|
||||
|
||||
//m_spidersEnabled[n] = 1;
|
||||
//m_numRegExs7++;
|
||||
|
||||
m_harvestLinks[n] = 1;
|
||||
m_numRegExs8++;
|
||||
*/
|
||||
|
||||
m_regExs[n].set("isdocidbased");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 80;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("ismedia");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
n++;
|
||||
|
||||
// if not in the site list then nuke it
|
||||
m_regExs[n].set("!insitelist");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 0; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = -3; // delete!
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=3 && hastmperror");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 1; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 3;
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("errorcount>=1 && hastmperror");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 1; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 1; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 45;
|
||||
if ( m_urlFiltersProfile == UFP_NEWS )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("isaddurl");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 99; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 85;
|
||||
if ( m_urlFiltersProfile == UFP_NEWS )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==0 && iswww && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7; // 30 days default
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 50;
|
||||
if ( m_urlFiltersProfile == UFP_NEWS )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
|
||||
m_regExs[n].set("hopcount==0 && iswww");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0; // days b4 respider
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 48;
|
||||
if ( m_urlFiltersProfile == UFP_NEWS )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==0 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 7.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 49;
|
||||
if ( m_urlFiltersProfile == UFP_NEWS )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==0");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 10.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 47;
|
||||
if ( m_urlFiltersProfile == UFP_NEWS )
|
||||
m_spiderFreqs [n] = .00347; // 5 mins
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==1 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 40;
|
||||
if ( m_urlFiltersProfile == UFP_NEWS )
|
||||
m_spiderFreqs [n] = .04166; // 60 minutes
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==1");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 20.0;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 39;
|
||||
if ( m_urlFiltersProfile == UFP_NEWS )
|
||||
m_spiderFreqs [n] = .04166; // 60 minutes
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==2 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 30;
|
||||
// do not harvest links if we are spidering NEWS
|
||||
if ( m_urlFiltersProfile == UFP_NEWS ) {
|
||||
m_spiderFreqs [n] = 5.0;
|
||||
m_harvestLinks [n] = 0;
|
||||
}
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount==2");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 40;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 29;
|
||||
// do not harvest links if we are spidering NEWS
|
||||
if ( m_urlFiltersProfile == UFP_NEWS ) {
|
||||
m_spiderFreqs [n] = 5.0;
|
||||
m_harvestLinks [n] = 0;
|
||||
}
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount>=3 && isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 20;
|
||||
// turn off spidering if hopcount is too big and we are spidering NEWS
|
||||
if ( m_urlFiltersProfile == UFP_NEWS ) {
|
||||
m_maxSpidersPerRule [n] = 0;
|
||||
m_harvestLinks [n] = 0;
|
||||
}
|
||||
n++;
|
||||
|
||||
m_regExs[n].set("hopcount>=3");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 19;
|
||||
// turn off spidering if hopcount is too big and we are spidering NEWS
|
||||
if ( m_urlFiltersProfile == UFP_NEWS ) {
|
||||
m_maxSpidersPerRule [n] = 0;
|
||||
m_harvestLinks [n] = 0;
|
||||
}
|
||||
n++;
|
||||
|
||||
/*
|
||||
m_regExs[n].set("isnew");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = resp4;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 2;
|
||||
n++;
|
||||
*/
|
||||
|
||||
m_regExs[n].set("default");
|
||||
m_harvestLinks [n] = 1;
|
||||
m_spiderFreqs [n] = 60;
|
||||
m_maxSpidersPerRule [n] = 9; // max spiders
|
||||
m_spiderIpMaxSpiders [n] = 1; // max spiders per ip
|
||||
m_spiderIpWaits [n] = 1000; // same ip wait
|
||||
m_spiderPriorities [n] = 1;
|
||||
n++;
|
||||
|
||||
|
||||
m_numRegExs = n;
|
||||
m_numRegExs2 = n;
|
||||
m_numRegExs3 = n;
|
||||
m_numRegExs10 = n;
|
||||
m_numRegExs5 = n;
|
||||
m_numRegExs6 = n;
|
||||
m_numRegExs8 = n;
|
||||
|
||||
// more rules
|
||||
|
||||
|
||||
|
||||
|
||||
//m_spiderDiffbotApiNum[n] = 1;
|
||||
//m_numRegExs11++;
|
||||
@ -2145,6 +2346,7 @@ bool CollectionRec::rebuildUrlFilters ( ) {
|
||||
// . rebuild sitetable? in PageBasic.cpp.
|
||||
// . re-adds seed spider requests using msg4
|
||||
// . true = addSeeds
|
||||
// . rebuilds url filters there too i think
|
||||
updateSiteList ( m_collnum , true );
|
||||
}
|
||||
|
||||
|
@ -683,6 +683,9 @@ class CollectionRec {
|
||||
SafeBuf m_siteListBuf;
|
||||
char m_spiderToo;
|
||||
|
||||
// see UFP_* values in Parms.h, e.g. UFP_NEWS for crawling for NEWS
|
||||
long m_urlFiltersProfile;
|
||||
|
||||
// . now the url regular expressions
|
||||
// . we chain down the regular expressions
|
||||
// . if a url matches we use that tagdb rec #
|
||||
|
@ -168,6 +168,7 @@ case EBADHOSTSCONF: return "A hosts.conf is out of sync";
|
||||
case EWAITINGTOSYNCHOSTSCONF: return "Wait to ensure hosts.conf in sync";
|
||||
case EDOCNONCANONICAL: return "Url was dup of canonical page";
|
||||
case ECUSTOMCRAWLMISMATCH: return "Job name/type mismatch. Job name has already been used for a crawl or bulk job.";
|
||||
case ENOTOKEN: return "Missing token";
|
||||
}
|
||||
// if the remote error bit is clear it must be a regular errno
|
||||
//if ( ! ( errnum & REMOTE_ERROR_BIT ) ) return strerror ( errnum );
|
||||
|
3
Errno.h
3
Errno.h
@ -171,6 +171,7 @@ enum {
|
||||
EBADHOSTSCONF,
|
||||
EWAITINGTOSYNCHOSTSCONF,
|
||||
EDOCNONCANONICAL,
|
||||
ECUSTOMCRAWLMISMATCH // a crawl request was made with a name that already existed for bulk request (or the other way around)
|
||||
ECUSTOMCRAWLMISMATCH, // a crawl request was made with a name that already existed for bulk request (or the other way around)
|
||||
ENOTOKEN
|
||||
};
|
||||
#endif
|
||||
|
@ -156,7 +156,7 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
|
||||
Url u;
|
||||
|
||||
for ( ; *pn ; pn++ , lineNum++ ) {
|
||||
for ( ; *pn ; lineNum++ ) {
|
||||
|
||||
// get end
|
||||
char *s = pn;
|
||||
@ -169,6 +169,9 @@ bool updateSiteList ( collnum_t collnum , bool addSeeds ) {
|
||||
char *pe = pn;
|
||||
for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- );
|
||||
|
||||
// skip over the \n so pn points to next line for next time
|
||||
if ( *pn == '\n' ) pn++;
|
||||
|
||||
// make hash of the line
|
||||
long h32 = hash32 ( s , pe - s );
|
||||
|
||||
|
194
PageResults.cpp
194
PageResults.cpp
@ -203,9 +203,8 @@ bool printCSSHead ( SafeBuf *sb , char format ) {
|
||||
"font-family:Arial, Helvetica, sans-serif;"
|
||||
);
|
||||
|
||||
|
||||
if ( format == FORMAT_WIDGET )
|
||||
sb->safePrintf("background-color:000000;");
|
||||
//if ( format == FORMAT_WIDGET )
|
||||
// sb->safePrintf("background-color:000000;");
|
||||
|
||||
sb->safePrintf( "color: #000000;"
|
||||
"font-size: 12px;"
|
||||
@ -975,6 +974,15 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
sb->safePrintf("<body>");
|
||||
}
|
||||
|
||||
HttpRequest *hr = &st->m_hr;
|
||||
|
||||
// lead with user's widget header which usually has custom style tags
|
||||
if ( si->m_format == FORMAT_WIDGET ) {
|
||||
char *header = hr->getString("header",NULL);
|
||||
if ( header ) sb->safeStrcpy ( header );
|
||||
}
|
||||
|
||||
|
||||
if ( ! g_conf.m_isMattWells && si->m_format == FORMAT_HTML ) {
|
||||
printLogoAndSearchBox ( sb , &st->m_hr , -1 ); // catId = -1
|
||||
}
|
||||
@ -1048,7 +1056,7 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
*/
|
||||
|
||||
|
||||
// save how many docs are in it
|
||||
// save how many docs are in this collection
|
||||
long long docsInColl = -1;
|
||||
//RdbBase *base = getRdbBase ( RDB_CHECKSUMDB , si->m_coll );
|
||||
//RdbBase *base = getRdbBase ( (uint8_t)RDB_CLUSTERDB , si->m_coll2 );
|
||||
@ -1081,10 +1089,10 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
sb->safePrintf("\"moreResultsFollow\":%li,\n", (long)moreFollow);
|
||||
}
|
||||
|
||||
if ( st->m_header && si->m_format == FORMAT_JSON ) {
|
||||
sb->safePrintf("\"results\":[\n");
|
||||
return true;
|
||||
}
|
||||
if ( st->m_header && si->m_format == FORMAT_JSON ) {
|
||||
sb->safePrintf("\"results\":[\n");
|
||||
return true;
|
||||
}
|
||||
|
||||
// . did he get a spelling recommendation?
|
||||
// . do not use htmlEncode() on this anymore since receiver
|
||||
@ -1142,7 +1150,7 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
float tfwi = getTermFreqWeight(ssi->m_listSize);
|
||||
for ( long j = i+1; j< nr ; j++ ) {
|
||||
SingleScore *ssj = &dpx->m_singleScores[j];
|
||||
float tfwj = getTermFreqWeight(ssj->m_listSize);
|
||||
float tfwj =getTermFreqWeight(ssj->m_listSize);
|
||||
max += (lw * tfwi * tfwj)/3.0;
|
||||
}
|
||||
}
|
||||
@ -1237,11 +1245,14 @@ bool printSearchResultsHeader ( State0 *st ) {
|
||||
long collLen = gbstrlen(coll);
|
||||
|
||||
// otherwise, we had no error
|
||||
if ( numResults == 0 &&
|
||||
(si->m_format == FORMAT_HTML || si->m_format==FORMAT_WIDGET) ) {
|
||||
if ( numResults == 0 && si->m_format == FORMAT_HTML ) {
|
||||
sb->safePrintf ( "No results found in <b>%s</b> collection.",
|
||||
cr->m_coll);
|
||||
}
|
||||
// the token is currently in the collection name so do not show that
|
||||
else if ( numResults == 0 && si->m_format == FORMAT_WIDGET ) {
|
||||
sb->safePrintf ( "No results found.");
|
||||
}
|
||||
else if ( moreFollow && si->m_format == FORMAT_HTML ) {
|
||||
if ( isAdmin && si->m_docsToScanForReranking > 1 )
|
||||
sb->safePrintf ( "PQR'd " );
|
||||
@ -1924,6 +1935,10 @@ bool printInlinkText ( SafeBuf *sb , Msg20Reply *mr , SearchInput *si ,
|
||||
frontTag = "<b>";
|
||||
backTag = "</b>";
|
||||
}
|
||||
if ( si->m_format == FORMAT_WIDGET ) {
|
||||
frontTag = "<font style=\"background-color:yellow\">" ;
|
||||
}
|
||||
|
||||
Highlight hi;
|
||||
SafeBuf hb;
|
||||
long hlen = hi.set ( &hb,//tt ,
|
||||
@ -2336,7 +2351,7 @@ bool printResult ( State0 *st, long ix ) {
|
||||
|
||||
sb->safePrintf("<b style=\""
|
||||
"text-decoration:none;"
|
||||
"font-size: 20px;"
|
||||
"font-size: 15px;"
|
||||
"font-weight:bold;"
|
||||
"background-color:rgba(0,0,0,.5);"
|
||||
"color:white;"
|
||||
@ -2456,6 +2471,9 @@ bool printResult ( State0 *st, long ix ) {
|
||||
frontTag = "<b>";
|
||||
backTag = "</b>";
|
||||
}
|
||||
if ( si->m_format == FORMAT_WIDGET ) {
|
||||
frontTag = "<font style=\"background-color:yellow\">" ;
|
||||
}
|
||||
long cols = 80;
|
||||
if ( si->m_format == FORMAT_XML )
|
||||
sb->safePrintf("\t\t<title><![CDATA[");
|
||||
@ -2556,7 +2574,15 @@ bool printResult ( State0 *st, long ix ) {
|
||||
|
||||
if ( si->m_format == FORMAT_XML ) sb->safePrintf("\t\t<sum><![CDATA[");
|
||||
|
||||
sb->brify ( str , strLen, 0 , cols ); // niceness = 0
|
||||
bool printSummary = true;
|
||||
// do not print summaries for widgets by default unless overridden
|
||||
// with &summaries=1
|
||||
if ( si->m_format == FORMAT_WIDGET && hr->getLong("summaries",0) == 0 )
|
||||
printSummary = false;
|
||||
|
||||
if ( printSummary )
|
||||
sb->brify ( str , strLen, 0 , cols ); // niceness = 0
|
||||
|
||||
// close xml tag
|
||||
if ( si->m_format == FORMAT_XML ) sb->safePrintf("]]></sum>\n");
|
||||
// new line if not xml
|
||||
@ -5525,24 +5551,20 @@ bool printJsonItemInCSV ( char *json , SafeBuf *sb , State0 *st ) {
|
||||
}
|
||||
|
||||
|
||||
bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr , char *coll ) {
|
||||
//
|
||||
// begin print controls
|
||||
//
|
||||
|
||||
sb->safePrintf("<html>"
|
||||
"<body bgcolor=#e8e8e8>"
|
||||
"<title>Widget Creator</title>"
|
||||
"<title>Widget Creator</title>"
|
||||
);
|
||||
|
||||
|
||||
char *coll = "GLOBAL-INDEX";
|
||||
CollectionRec *cr = g_collectiondb.getRec(coll);
|
||||
if ( ! cr ) {
|
||||
sb->safePrintf("Error. collection %s does not exist",
|
||||
coll);
|
||||
return true;
|
||||
}
|
||||
//char *coll = "GLOBAL-INDEX";
|
||||
CollectionRec *cr = NULL;
|
||||
if ( coll ) cr = g_collectiondb.getRec(coll);
|
||||
|
||||
// if admin clicks "edit" in the live widget itself put up
|
||||
// some simpler content editing boxes. token required!
|
||||
@ -5616,8 +5638,8 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
char *c1 = "";
|
||||
char *c2 = "";
|
||||
char *c3 = "";
|
||||
long x1 = hr->getLong("dates" ,1);
|
||||
long x2 = hr->getLong("summaries",1);
|
||||
long x1 = hr->getLong("dates" ,0);
|
||||
long x2 = hr->getLong("summaries",0);
|
||||
long x3 = hr->getLong("border" ,1);
|
||||
if ( x1 ) c1 = " checked";
|
||||
if ( x2 ) c2 = " checked";
|
||||
@ -5635,7 +5657,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
sb->safePrintf("<form method=GET action=/widget>"
|
||||
"<input type=hidden name=c value=\"%s\">"
|
||||
"<input type=hidden name=format value=\"widget\">"
|
||||
, cr->m_coll
|
||||
, coll
|
||||
);
|
||||
|
||||
|
||||
@ -5665,6 +5687,10 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
"<b style=font-size:22px;><font style=font-size:27px;>"
|
||||
"W</font>"
|
||||
"idget <font style=font-size:27px;>C</font>reator</b>"
|
||||
|
||||
"<img align=right height=50 width=52 "
|
||||
"src=http://www.diffbot.com/img/diffy-b.png>"
|
||||
|
||||
"</td>"
|
||||
"</tr>"
|
||||
|
||||
@ -5694,20 +5720,20 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
|
||||
|
||||
"Show Dates "
|
||||
"<input type=checkbox "
|
||||
"onclick=\"toggleBool(this,'dates');reload();\" "
|
||||
"<input type=checkbox value=1 "
|
||||
//"onclick=\"toggleBool(this,'dates');reload();\" "
|
||||
"name=dates%s>"
|
||||
"<br>"
|
||||
|
||||
"Show Summaries "
|
||||
"<input type=checkbox "
|
||||
"onclick=\"toggleBool(this,'summaries');reload();\" "
|
||||
"<input type=checkbox value=1 "
|
||||
//"onclick=\"toggleBool(this,'summaries');reload();\" "
|
||||
"name=summaries%s>"
|
||||
"<br>"
|
||||
|
||||
"Frame border "
|
||||
"<input type=checkbox "
|
||||
"onclick=\"toggleBool(this,'border');reload();\" "
|
||||
"<input type=checkbox value=1 "
|
||||
//"onclick=\"toggleBool(this,'border');reload();\" "
|
||||
"name=border%s>"
|
||||
"<br>"
|
||||
|
||||
@ -5819,6 +5845,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
//">"
|
||||
"</td>"
|
||||
"<td valign=top>"
|
||||
"<br>"
|
||||
"<img src=/gears32.png width=64 height=64>"
|
||||
"<br><br>"
|
||||
);
|
||||
@ -5826,6 +5853,9 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
|
||||
long start = sb->length();
|
||||
|
||||
char *border = "frameborder=no ";
|
||||
if ( x3 ) border = "";
|
||||
|
||||
// this iframe contains the WIDGET
|
||||
sb->safePrintf (
|
||||
/*
|
||||
@ -5858,21 +5888,39 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
//"src=\"http://neo.diffbot.com:8000/search?"
|
||||
*/
|
||||
|
||||
// frameborder=no
|
||||
"%s"
|
||||
|
||||
"src=\""
|
||||
"http://127.0.0.1:8000/search?"
|
||||
"format=widget&"
|
||||
"widgetwidth=%li&widgetheight=%li&"
|
||||
"c=GLOBAL-INDEX&"
|
||||
"c=%s&"
|
||||
"refresh=%li"
|
||||
// show articles sorted by newest pubdate first
|
||||
"q=type%%3Aarticle+gbsortbyint%%3Adate"
|
||||
|
||||
"\">"
|
||||
, width
|
||||
, height
|
||||
, border
|
||||
, width
|
||||
, height
|
||||
, coll
|
||||
, refresh
|
||||
);
|
||||
|
||||
sb->safePrintf("&dates=%li",x1);
|
||||
sb->safePrintf("&summaries=%li",x2);
|
||||
|
||||
sb->safePrintf("&q=");
|
||||
sb->urlEncode ( query );
|
||||
|
||||
// widget content header, usually a style tag
|
||||
sb->safePrintf("&header=");
|
||||
sb->urlEncode ( header );
|
||||
|
||||
|
||||
|
||||
sb->safePrintf("\">");
|
||||
|
||||
sb->safePrintf ( // do not reset the user's "where" cookie
|
||||
// to NYC from looking at this widget!
|
||||
@ -5901,6 +5949,7 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
// }
|
||||
|
||||
sb->safePrintf ( "\n\n"
|
||||
"<br>"
|
||||
//"<br><br><br>"
|
||||
"<font style=\"font-size:16px;\">"
|
||||
"Insert the following code into your website to "
|
||||
@ -5941,7 +5990,82 @@ bool printWidgetPage ( SafeBuf *sb , HttpRequest *hr ) {
|
||||
bool sendPageWidget ( TcpSocket *s , HttpRequest *hr ) {
|
||||
SafeBuf sb;
|
||||
|
||||
printWidgetPage ( &sb , hr );
|
||||
char *token = hr->getString("token",NULL);
|
||||
if ( token && ! token[0] ) token = NULL;
|
||||
|
||||
if ( ! token ) {
|
||||
g_errno = ENOTOKEN;
|
||||
char *msg = mstrerror(g_errno);
|
||||
return g_httpServer.sendErrorReply(s,g_errno,msg);
|
||||
}
|
||||
|
||||
long tlen = 0;
|
||||
if ( token ) tlen = gbstrlen(token);
|
||||
if ( tlen > 64 ) {
|
||||
g_errno = ENOCOLLREC;
|
||||
char *msg = mstrerror(g_errno);
|
||||
return g_httpServer.sendErrorReply(s,g_errno,msg);
|
||||
}
|
||||
|
||||
char coll[MAX_COLL_LEN];
|
||||
CollectionRec *cr = NULL;
|
||||
if ( token ) {
|
||||
sprintf(coll,"%s-widget123",token);
|
||||
cr = g_collectiondb.getRec(coll);
|
||||
}
|
||||
|
||||
SafeBuf parmList;
|
||||
|
||||
// . first update their collection with the sites to crawl
|
||||
// . this is NOT a custom diffbot crawl, just a regular one using
|
||||
// the new crawl filters logic, "siteList"
|
||||
char *sites = hr->getString("sites",NULL);
|
||||
// add the collection if it does not exist
|
||||
if ( sites && ! cr && token ) {
|
||||
// we need to add the new collnum, so reserve it
|
||||
collnum_t newCollnum = g_collectiondb.reserveCollNum();
|
||||
// add the new collection named <token>-widget123
|
||||
g_parms.addNewParmToList1 ( &parmList,newCollnum,
|
||||
coll,0,"addColl");
|
||||
// use special url filters profile that spiders sites
|
||||
// shallowly and frequently to pick up new news stories
|
||||
// "1" = (long)UFP_NEWS
|
||||
char ttt[12];
|
||||
sprintf(ttt,"%li",(long)UFP_NEWS);
|
||||
g_parms.addNewParmToList1 ( &parmList,newCollnum,ttt,0,
|
||||
"urlfiltersprofile");
|
||||
// use diffbot analyze
|
||||
char durl[1024];
|
||||
sprintf(durl,
|
||||
"http://www.diffbot.com/api?mode=analyze&token=%s",
|
||||
token);
|
||||
// TODO: ensure we call diffbot ok
|
||||
g_parms.addNewParmToList1 ( &parmList,newCollnum,
|
||||
durl,0,"apiUrl");
|
||||
// the list of sites to spider
|
||||
g_parms.addNewParmToList1 ( &parmList,newCollnum,
|
||||
sites,0,"sitelist");
|
||||
// note it
|
||||
log("widget: adding new widget coll %s",coll);
|
||||
}
|
||||
|
||||
// update the list of sites to crawl and search and show in widget
|
||||
if ( sites && token && cr )
|
||||
g_parms.addNewParmToList1 ( &parmList,cr->m_collnum,
|
||||
sites,0,"sitelist");
|
||||
|
||||
|
||||
if ( parmList.length() ) {
|
||||
// send the parms to all hosts in the network
|
||||
g_parms.broadcastParmList ( &parmList ,
|
||||
NULL,//s,// state is socket i guess
|
||||
NULL);//doneBroadcastingParms2 );
|
||||
}
|
||||
|
||||
|
||||
|
||||
// now display the widget controls and the widget and the iframe code
|
||||
printWidgetPage ( &sb , hr , coll );
|
||||
|
||||
return g_httpServer.sendDynamicPage(s,
|
||||
sb.getBufStart(),
|
||||
|
21
Parms.cpp
21
Parms.cpp
@ -13310,6 +13310,16 @@ void Parms::init ( ) {
|
||||
"expressions\": "
|
||||
"link:gigablast and doc:quality<X and doc:quality>X.";
|
||||
*/
|
||||
|
||||
m->m_cgi = "ufp";
|
||||
m->m_xml = "urlFiltersProfile";
|
||||
m->m_off = (char *)&cr.m_urlFiltersProfile - x;
|
||||
m->m_type = TYPE_LONG;
|
||||
m->m_page = PAGE_FILTERS;
|
||||
m->m_def = "0"; // UFP_NONE
|
||||
m->m_flags = PF_HIDDEN | PF_REBUILDURLFILTERS;
|
||||
m++;
|
||||
|
||||
m->m_cgi = "fe";
|
||||
m->m_xml = "filterExpression";
|
||||
m->m_max = MAX_FILTERS;
|
||||
@ -16192,8 +16202,15 @@ void Parms::init ( ) {
|
||||
if ( m_parms[i].m_off > mm ||
|
||||
m_parms[i].m_soff > mm ||
|
||||
m_parms[i].m_smaxc > mm ) {
|
||||
log(LOG_LOGIC,"conf: Bad offset in parm #%li %s.",
|
||||
i,m_parms[i].m_title);
|
||||
log(LOG_LOGIC,"conf: Bad offset in parm #%li %s."
|
||||
" (%li,%li,%li,%li). Did you FORGET to include "
|
||||
"an & before the cr.myVariable when setting "
|
||||
"m_off for this parm?",
|
||||
i,m_parms[i].m_title,
|
||||
mm,
|
||||
m_parms[i].m_off,
|
||||
m_parms[i].m_soff,
|
||||
m_parms[i].m_smaxc);
|
||||
exit(-1);
|
||||
}
|
||||
// do not allow numbers in cgi parms, they are used for
|
||||
|
9
Parms.h
9
Parms.h
@ -14,6 +14,15 @@
|
||||
void handleRequest3e ( UdpSlot *slot , long niceness ) ;
|
||||
void handleRequest3f ( UdpSlot *slot , long niceness ) ;
|
||||
|
||||
// "url filters profile" values. used to set default crawl rules
|
||||
// in Collectiondb.cpp's CollectionRec::setUrlFiltersToDefaults().
|
||||
// for instance, UFP_NEWS spiders sites more frequently but less deep in
|
||||
// order to get "news" pages and articles
|
||||
enum {
|
||||
UFP_CUSTOM = 0 ,
|
||||
UFP_NONE = 0 ,
|
||||
UFP_NEWS = 1 };
|
||||
|
||||
// special priorities for the priority drop down
|
||||
// in the url filters table
|
||||
enum {
|
||||
|
@ -14103,7 +14103,13 @@ SafeBuf *XmlDoc::getDiffbotReply ( ) {
|
||||
m_diffbotUrl.pushChar('&');
|
||||
|
||||
//diffbotUrl.safePrintf("http://54.212.86.74/api/%s?token=%s&u="
|
||||
m_diffbotUrl.safePrintf("token=%s",cr->m_diffbotToken.getBufStart());
|
||||
// only print token if we have one, because if user provides their
|
||||
// own diffbot url (apiUrl in Parms.cpp) then they might include
|
||||
// the token in that for their non-custom crawl. m_customCrawl=0.
|
||||
if ( cr->m_diffbotToken.length())
|
||||
m_diffbotUrl.safePrintf("token=%s",
|
||||
cr->m_diffbotToken.getBufStart());
|
||||
|
||||
m_diffbotUrl.safePrintf("&url=");
|
||||
// give diffbot the url to process
|
||||
m_diffbotUrl.urlEncode ( m_firstUrl.getUrl() );
|
||||
|
@ -288,6 +288,7 @@
|
||||
# <i>spider priority</i> of <i>DELETE</i> will cause the URL to not be
|
||||
# spidered, or if it has already been indexed, it will be deleted when it is
|
||||
# respidered.<br><br>
|
||||
<urlFiltersProfile>0</>
|
||||
<filterExpression><![CDATA[isdocidbased]]></>
|
||||
<filterExpression><![CDATA[ismedia]]></>
|
||||
<filterExpression><![CDATA[errorcount>=3 && hastmperror]]></>
|
||||
|
Loading…
Reference in New Issue
Block a user