2014-02-09 02:10:06 +04:00
|
|
|
#include "SafeBuf.h"
|
|
|
|
#include "HttpRequest.h"
|
2014-02-09 03:24:33 +04:00
|
|
|
#include "SearchInput.h"
|
|
|
|
#include "Pages.h"
|
|
|
|
#include "Parms.h"
|
2014-02-09 09:57:30 +04:00
|
|
|
#include "Spider.h"
|
2014-02-09 02:10:06 +04:00
|
|
|
|
2014-02-10 09:51:16 +04:00
|
|
|
bool printExampleTable ( SafeBuf *sb , HttpRequest *hr ) ;
|
2014-02-09 02:10:06 +04:00
|
|
|
|
2014-02-10 02:09:48 +04:00
|
|
|
///////////
|
2014-02-09 02:10:06 +04:00
|
|
|
//
|
|
|
|
// main > Basic > Settings
|
|
|
|
//
|
2014-02-10 02:09:48 +04:00
|
|
|
///////////
|
2014-02-10 09:25:08 +04:00
|
|
|
/*
|
2014-02-09 02:10:06 +04:00
|
|
|
bool sendPageBasicSettings ( TcpSocket *socket , HttpRequest *hr ) {
|
|
|
|
|
|
|
|
char buf [ 128000 ];
|
|
|
|
SafeBuf sb(buf,128000);
|
|
|
|
|
2014-02-10 03:41:43 +04:00
|
|
|
// true = usedefault coll?
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec ( hr , true );
|
2014-02-09 02:10:06 +04:00
|
|
|
if ( ! cr ) {
|
|
|
|
g_httpServer.sendErrorReply(socket,500,"invalid collection");
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-02-10 09:51:16 +04:00
|
|
|
// . print standard header
|
|
|
|
// . this prints the <form tag as well
|
|
|
|
g_pages.printAdminTop ( &sb , socket , hr );
|
2014-02-09 02:10:06 +04:00
|
|
|
|
|
|
|
|
2014-02-10 09:51:16 +04:00
|
|
|
g_parms.printParms ( &sb , socket , hr );
|
2014-02-09 02:10:06 +04:00
|
|
|
|
2014-02-10 09:51:16 +04:00
|
|
|
|
|
|
|
printExampleTable ( &sb , hr );
|
2014-02-09 02:10:06 +04:00
|
|
|
|
2014-02-10 09:51:16 +04:00
|
|
|
// wrap up the form, print a submit button
|
|
|
|
g_pages.printAdminBottom ( &sb );
|
2014-02-09 02:10:06 +04:00
|
|
|
|
|
|
|
|
|
|
|
return g_httpServer.sendDynamicPage ( socket,
|
|
|
|
sb.getBufStart() ,
|
|
|
|
sb.length() ,
|
|
|
|
-1 ,
|
|
|
|
false,//POSTReply ,
|
|
|
|
NULL , // contType
|
|
|
|
-1 , // httpstatus
|
|
|
|
NULL,//cookie ,
|
|
|
|
NULL );// charset
|
|
|
|
}
|
2014-02-10 09:25:08 +04:00
|
|
|
*/
|
2014-02-09 02:10:06 +04:00
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
class PatternData {
|
|
|
|
public:
|
|
|
|
// hash of the subdomain or domain for this line in sitelist
|
|
|
|
long m_thingHash32;
|
|
|
|
// ptr to the line in CollectionRec::m_siteListBuf
|
|
|
|
char *m_patternStr;
|
|
|
|
// offset of the url path in the pattern, 0 means none
|
|
|
|
short m_pathOff;
|
|
|
|
short m_pathLen;
|
|
|
|
};
|
|
|
|
|
|
|
|
|
2014-02-09 23:38:40 +04:00
|
|
|
// . CommandUpdateSiteList() should call this
|
|
|
|
// . this returns false if it blocks
|
|
|
|
// . returns true and sets g_errno on error
|
|
|
|
// . uses msg4 to add seeds to spiderdb if necessary
|
|
|
|
// . only adds seeds for the shard we are on iff we are responsible for
|
|
|
|
// the fake firstip!!!
|
|
|
|
bool updateSiteList ( collnum_t collnum , char *siteList ) {
|
2014-02-09 09:57:30 +04:00
|
|
|
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec ( collnum );
|
2014-02-09 23:38:40 +04:00
|
|
|
if ( ! cr ) return true;
|
2014-02-09 09:57:30 +04:00
|
|
|
|
|
|
|
// this might make a new spidercoll...
|
|
|
|
SpiderColl *sc = g_spiderCache.getSpiderColl ( cr->m_collnum );
|
|
|
|
|
2014-02-09 23:38:40 +04:00
|
|
|
// sanity. if in use we should not even be here
|
|
|
|
if ( sc->m_msg4x.m_inUse ) {
|
|
|
|
log("basic: trying to update site list while previous "
|
|
|
|
"update still outstanding.");
|
|
|
|
g_errno = EBADENGINEER;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
// when sitelist is update Parms.cpp should invalidate this flag!
|
|
|
|
//if ( sc->m_siteListTableValid ) return true;
|
|
|
|
|
|
|
|
// hash current sitelist entries, each line so we don't add
|
|
|
|
// dup requests into spiderdb i guess...
|
|
|
|
HashTableX dedup;
|
2014-02-09 23:38:40 +04:00
|
|
|
if ( ! dedup.set ( 4,0,1024,NULL,0,false,0,"sldt") ) return true;
|
2014-02-09 09:57:30 +04:00
|
|
|
// this is a safebuf PARM in Parms.cpp now HOWEVER, not really
|
|
|
|
// because we set it here from a call to CommandUpdateSiteList()
|
|
|
|
// because it requires all this computational crap.
|
|
|
|
char *op = cr->m_siteListBuf.getBufStart();
|
|
|
|
// scan and hash each line in it
|
|
|
|
for ( ; *op ; op++ ) {
|
|
|
|
// get end
|
|
|
|
char *s = op;
|
|
|
|
// skip to end of line marker
|
|
|
|
for ( ; *op && *op != '\n' ; op++ ) ;
|
|
|
|
// keep it simple
|
|
|
|
long h32 = hash32 ( s , op - s );
|
|
|
|
// for deduping
|
2014-02-09 23:38:40 +04:00
|
|
|
if ( ! dedup.addKey ( &h32 ) ) return true;
|
2014-02-09 09:57:30 +04:00
|
|
|
}
|
|
|
|
|
|
|
|
// get the old sitelist Domain Hash to PatternData mapping table
|
|
|
|
HashTableX *dt = &sc->m_siteListDomTable;
|
|
|
|
|
|
|
|
// reset it
|
|
|
|
if ( ! dt->set ( 4 ,
|
|
|
|
sizeof(PatternData),
|
|
|
|
1024 ,
|
|
|
|
NULL ,
|
|
|
|
0 ,
|
|
|
|
true , // allow dup keys?
|
|
|
|
0 , // niceness - at least for now
|
|
|
|
"sldt" ) )
|
2014-02-09 23:38:40 +04:00
|
|
|
return true;
|
2014-02-09 09:57:30 +04:00
|
|
|
|
|
|
|
|
|
|
|
// we can now free the old site list methinks
|
|
|
|
cr->m_siteListBuf.purge();
|
|
|
|
|
|
|
|
// reset flags
|
|
|
|
sc->m_siteListAsteriskLine = NULL;
|
|
|
|
sc->m_siteListHasNegatives = false;
|
|
|
|
sc->m_siteListIsEmpty = true;
|
|
|
|
|
2014-02-09 23:38:40 +04:00
|
|
|
// use this so it will be free automatically when msg4 completes!
|
|
|
|
SafeBuf *spiderReqBuf = &sc->m_msg4x.m_tmpBuf;
|
2014-02-09 09:57:30 +04:00
|
|
|
|
|
|
|
// scan the list
|
|
|
|
char *pn = siteList;
|
|
|
|
|
|
|
|
// completely empty?
|
|
|
|
if ( ! pn ) return true;
|
|
|
|
|
|
|
|
long lineNum = 1;
|
|
|
|
|
|
|
|
Url u;
|
|
|
|
|
|
|
|
for ( ; *pn ; pn++ , lineNum++ ) {
|
|
|
|
|
|
|
|
// get end
|
|
|
|
char *s = pn;
|
|
|
|
// skip to end of line marker
|
|
|
|
for ( ; *pn && *pn != '\n' ; pn++ ) ;
|
|
|
|
|
|
|
|
char *start = s;
|
|
|
|
|
|
|
|
// back p up over spaces in case ended in spaces
|
|
|
|
char *pe = pn;
|
|
|
|
for ( ; pe > s && pe[-1] == ' ' ; pe-- );
|
|
|
|
|
|
|
|
// make hash of the line
|
|
|
|
long h32 = hash32 ( s , pe - s );
|
|
|
|
|
|
|
|
bool exact = false;
|
|
|
|
|
|
|
|
innerLoop:
|
|
|
|
// skip spaces at start of line
|
|
|
|
if ( *s == ' ' ) s++;
|
|
|
|
|
|
|
|
// comment?
|
|
|
|
if ( *s == '#' ) continue;
|
|
|
|
|
|
|
|
// empty line?
|
|
|
|
if ( *s == '\n' ) continue;
|
|
|
|
|
|
|
|
// all?
|
|
|
|
if ( *s == '*' ) {
|
|
|
|
sc->m_siteListAsteriskLine = start;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( *s == '-' ) {
|
|
|
|
sc->m_siteListHasNegatives = true;
|
|
|
|
s++;
|
|
|
|
}
|
|
|
|
|
|
|
|
// exact:?
|
|
|
|
if ( strncmp(s,"exact:",6) == 0 ) {
|
|
|
|
exact = true;
|
|
|
|
s += 6;
|
|
|
|
goto innerLoop;
|
|
|
|
}
|
|
|
|
|
|
|
|
u.set ( s , pe - s );
|
|
|
|
|
|
|
|
// error? skip it then...
|
|
|
|
if ( u.getHostLen() <= 0 ) {
|
|
|
|
log("basic: error on line #%li in sitelist",lineNum);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2014-02-09 23:38:40 +04:00
|
|
|
// see if in existing table for existing site list
|
|
|
|
if ( ! dedup.isInTable ( &h32 ) ) {
|
|
|
|
// make spider request
|
|
|
|
SpiderRequest sreq;
|
|
|
|
sreq.setFromAddUrl ( u.getUrl() );
|
|
|
|
// . add this url to spiderdb as a spiderrequest
|
|
|
|
// . calling msg4 will be the last thing we do
|
|
|
|
if(!spiderReqBuf->safeMemcpy(&sreq,sreq.getRecSize()))
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
// make the data node
|
|
|
|
PatternData pd;
|
|
|
|
// hash of the subdomain or domain for this line in sitelist
|
|
|
|
pd.m_thingHash32 = u.getHostHash32();
|
|
|
|
// . ptr to the line in CollectionRec::m_siteListBuf.
|
|
|
|
// . includes pointing to "exact:" too i guess and tag: later.
|
|
|
|
pd.m_patternStr = start;
|
|
|
|
// offset of the url path in the pattern, 0 means none
|
|
|
|
pd.m_pathOff = 0;
|
|
|
|
// scan url pattern, it should start at "s"
|
|
|
|
char *x = s;
|
|
|
|
// go all the way to the end
|
|
|
|
for ( ; *x && x < pe ; x++ ) {
|
|
|
|
// skip ://
|
|
|
|
if ( x[0] == ':' && x[1] =='/' && x[2] == '/' ) {
|
|
|
|
x += 2;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// stop if we hit another /, that is path start
|
|
|
|
if ( x[0] != '/' ) continue;
|
|
|
|
x++;
|
|
|
|
// empty path besides the /?
|
|
|
|
if ( x >= pe ) break;
|
|
|
|
// ok, we got something here i think
|
|
|
|
if ( u.getPathLen() <= 1 ) { char *xx=NULL;*xx=0; }
|
|
|
|
// calc length from "start" of line so we can
|
|
|
|
// jump to the path quickly for compares
|
|
|
|
pd.m_pathOff = x - start;
|
|
|
|
pd.m_pathLen = pe - x;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
|
|
|
// add to new dt
|
|
|
|
long domHash32 = u.getDomainHash32();
|
|
|
|
if ( ! dt->addKey ( &domHash32 , &pd ) )
|
2014-02-09 23:38:40 +04:00
|
|
|
return true;
|
2014-02-09 09:57:30 +04:00
|
|
|
|
|
|
|
// we have some patterns in there
|
|
|
|
sc->m_siteListIsEmpty = false;
|
|
|
|
}
|
|
|
|
|
|
|
|
// go back to a high niceness
|
|
|
|
dt->m_niceness = MAX_NICENESS;
|
|
|
|
|
|
|
|
long siteListLen = gbstrlen(siteList);
|
|
|
|
cr->m_siteListBuf.safeMemcpy ( siteList , siteListLen + 1 );
|
|
|
|
|
2014-02-09 23:38:40 +04:00
|
|
|
|
|
|
|
// use spidercoll to contain this msg4 but if in use it
|
|
|
|
// won't be able to be deleted until it comes back..
|
|
|
|
if ( ! sc->m_msg4x.addMetaList ( spiderReqBuf ,
|
|
|
|
sc->m_collnum ,
|
|
|
|
// no need for callback since m_msg4x
|
|
|
|
// should set msg4::m_inUse to false
|
|
|
|
// when it comes back
|
|
|
|
NULL , // state
|
|
|
|
NULL , // callback
|
|
|
|
MAX_NICENESS
|
|
|
|
) )
|
|
|
|
return false;
|
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
// . Spider.cpp calls this to see if a url it wants to spider is
|
|
|
|
// in our "site list"
|
|
|
|
// . we should return the row of the FIRST match really
|
|
|
|
// . the url patterns all contain a domain now, so this can use the domain
|
|
|
|
// hash to speed things up
|
|
|
|
// . return ptr to the start of the line in case it has "tag:" i guess
|
|
|
|
char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
|
|
|
|
|
|
|
|
// if it has * and no negatives, we are in!
|
|
|
|
if ( sc->m_siteListAsteriskLine && ! sc->m_siteListHasNegatives )
|
|
|
|
return sc->m_siteListAsteriskLine;
|
|
|
|
|
|
|
|
// if it is just a bunch of comments or blank lines, it is empty
|
|
|
|
if ( sc->m_siteListIsEmpty )
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
char *myPath = NULL;
|
|
|
|
|
|
|
|
// check domain specific tables
|
|
|
|
HashTableX *dt = &sc->m_siteListDomTable;
|
|
|
|
|
|
|
|
// sanity check
|
|
|
|
if ( dt->getNumSlotsUsed() == 0 ) { char *xx=NULL;*xx=0; }
|
|
|
|
|
|
|
|
long slot = dt->getSlot ( &sreq->m_domHash32 );
|
|
|
|
|
|
|
|
// loop over all the patterns that contain this domain and see
|
|
|
|
// the first one we match, and if we match a negative one.
|
|
|
|
for ( ; slot >= 0 ; slot = dt->getNextSlot(slot,&sreq->m_domHash32)) {
|
|
|
|
// get pattern
|
|
|
|
PatternData *pd = (PatternData *)dt->getValueFromSlot ( slot );
|
|
|
|
// is it negative? return NULL if so so url will be ignored
|
|
|
|
//if ( pd->m_patternStr[0] == '-' )
|
|
|
|
// return NULL;
|
|
|
|
// otherwise, it has a path. skip if we don't match path ptrn
|
|
|
|
if ( pd->m_pathOff ) {
|
2014-02-09 23:38:40 +04:00
|
|
|
if ( ! myPath ) myPath = sreq->getUrlPath();
|
2014-02-09 09:57:30 +04:00
|
|
|
if ( strncmp (myPath,
|
|
|
|
pd->m_patternStr + pd->m_pathOff,
|
|
|
|
pd->m_pathLen ) )
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
// is it just a plain domain?
|
|
|
|
if ( pd->m_thingHash32 == sreq->m_domHash32 )
|
|
|
|
// this will be false if negative pattern i guess
|
|
|
|
return pd->m_patternStr;
|
|
|
|
// is it just a subdomain?
|
|
|
|
if ( pd->m_thingHash32 == sreq->m_hostHash32 )
|
|
|
|
// this will be false if negative pattern i guess
|
|
|
|
return pd->m_patternStr;
|
|
|
|
}
|
|
|
|
|
|
|
|
// is there an '*' in the patterns?
|
|
|
|
if ( sc->m_siteListAsteriskLine ) return sc->m_siteListAsteriskLine;
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2014-02-10 09:51:16 +04:00
|
|
|
bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
|
2014-02-09 02:10:06 +04:00
|
|
|
|
2014-02-10 03:41:43 +04:00
|
|
|
// true = useDefault?
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec ( hr , true );
|
2014-02-09 02:10:06 +04:00
|
|
|
if ( ! cr ) return true;
|
|
|
|
|
2014-02-10 09:51:16 +04:00
|
|
|
/*
|
2014-02-09 02:10:06 +04:00
|
|
|
// it is a safebuf parm
|
|
|
|
char *siteList = cr->m_siteListBuf.getBufStart();
|
2014-02-09 09:57:30 +04:00
|
|
|
if ( ! siteList ) siteList = "";
|
2014-02-09 02:10:06 +04:00
|
|
|
|
|
|
|
SafeBuf msgBuf;
|
|
|
|
char *status = "";
|
2014-02-09 09:57:30 +04:00
|
|
|
long max = 1000000;
|
|
|
|
if ( cr->m_siteListBuf.length() > max ) {
|
2014-02-09 02:10:06 +04:00
|
|
|
msgBuf.safePrintf( "<font color=red><b>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"Site list is over %li bytes large, "
|
|
|
|
"too many to "
|
2014-02-09 02:10:06 +04:00
|
|
|
"display on this web page. Please use the "
|
|
|
|
"file upload feature only for now."
|
|
|
|
"</b></font>"
|
|
|
|
, max );
|
|
|
|
status = " disabled";
|
|
|
|
}
|
2014-02-10 09:51:16 +04:00
|
|
|
*/
|
2014-02-09 02:10:06 +04:00
|
|
|
|
|
|
|
|
2014-02-10 09:51:16 +04:00
|
|
|
/*
|
2014-02-09 23:38:40 +04:00
|
|
|
sb->safePrintf(
|
2014-02-09 02:10:06 +04:00
|
|
|
"On the command like you can issue a command like "
|
|
|
|
|
|
|
|
"<i>"
|
|
|
|
"gb addurls < fileofurls.txt"
|
|
|
|
"</i> or "
|
|
|
|
|
|
|
|
"<i>"
|
|
|
|
"gb addfile < *.html"
|
|
|
|
"</i> or "
|
|
|
|
|
|
|
|
"<i>"
|
|
|
|
"gb injecturls < fileofurls.txt"
|
|
|
|
"</i> or "
|
|
|
|
|
|
|
|
"<i>"
|
|
|
|
"gb injectfile < *.html"
|
|
|
|
"</i> or "
|
|
|
|
|
|
|
|
"to schedule downloads or inject content directly "
|
|
|
|
"into Gigablast."
|
|
|
|
|
2014-02-10 06:09:44 +04:00
|
|
|
"</td><td>"
|
|
|
|
|
|
|
|
"<input "
|
|
|
|
"size=20 "
|
|
|
|
"type=file "
|
|
|
|
"name=urls>"
|
|
|
|
"</td></tr>"
|
|
|
|
|
2014-02-09 02:10:06 +04:00
|
|
|
);
|
2014-02-10 09:51:16 +04:00
|
|
|
*/
|
2014-02-09 02:10:06 +04:00
|
|
|
|
|
|
|
// example table
|
2014-02-10 06:09:44 +04:00
|
|
|
sb->safePrintf ( "<a name=examples></a>"
|
|
|
|
"<table %s>"
|
|
|
|
"<tr class=hdrow><td colspan=2>"
|
|
|
|
"<center><b>Examples</b></tr></tr>"
|
|
|
|
"<tr bgcolor=#%s>"
|
|
|
|
"<td>"
|
|
|
|
,TABLE_STYLE , DARK_BLUE);
|
2014-02-09 02:10:06 +04:00
|
|
|
|
|
|
|
sb->safePrintf(
|
2014-02-10 06:09:44 +04:00
|
|
|
"*"
|
|
|
|
"</td>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"<td>Spider all urls encountered. If you just submit "
|
2014-02-09 02:10:06 +04:00
|
|
|
"this by itself, then Gigablast will initiate spidering "
|
|
|
|
"automatically at dmoz.org, an internet "
|
|
|
|
"directory of good sites.</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
|
|
|
|
// protocol and subdomain match
|
2014-02-09 02:10:06 +04:00
|
|
|
"<tr>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"<td>http://www.goodstuff.com/</td>"
|
2014-02-09 02:10:06 +04:00
|
|
|
"<td>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"Matches urls beginning with "
|
|
|
|
"http://www.goodstuff.com/. "
|
|
|
|
"This is also used as a seed."
|
2014-02-09 02:10:06 +04:00
|
|
|
"</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
// subdomain match
|
2014-02-09 02:10:06 +04:00
|
|
|
"<tr>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"<td>www.goodstuff.com</td>"
|
2014-02-09 02:10:06 +04:00
|
|
|
"<td>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"Exact subdomain match. "
|
|
|
|
"Spider urls on www.goodstuff.com."
|
|
|
|
"This is also used as a seed."
|
2014-02-09 02:10:06 +04:00
|
|
|
"</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
// domain match
|
2014-02-09 02:10:06 +04:00
|
|
|
"<tr>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"<td>goodstuff.com</td>"
|
2014-02-09 02:10:06 +04:00
|
|
|
"<td>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"Exact domain match. "
|
|
|
|
"Spider urls on goodstuff.com and on "
|
|
|
|
"any subdomain of goodstuff.com."
|
|
|
|
"This is also used as a seed, but you might "
|
|
|
|
"have to end up adding www.goodstuff.com separately."
|
2014-02-09 02:10:06 +04:00
|
|
|
"</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
// subdomain AND subdir match
|
2014-02-09 02:10:06 +04:00
|
|
|
"<tr>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"<td>www.goodstuff.com/goodir/anotherdir/</td>"
|
2014-02-09 02:10:06 +04:00
|
|
|
"<td>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"Exact subdomain match and parital subdir match. "
|
|
|
|
"Matches urls on www.goodstuff.com and in the "
|
|
|
|
"/gooddir/anotherdir/ subdirectory. "
|
|
|
|
"This is also used as a seed, but you might "
|
|
|
|
"have to end up adding another seed if it is 404."
|
2014-02-09 02:10:06 +04:00
|
|
|
"</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
|
|
|
|
// exact match
|
2014-02-09 02:10:06 +04:00
|
|
|
"<tr>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"<td>exact:http://xyz.goodstuff.com/</td>"
|
2014-02-09 02:10:06 +04:00
|
|
|
"<td>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"Seed url. "
|
|
|
|
"Matches the single url "
|
|
|
|
"http://xyz.goodstuff.com/."
|
|
|
|
"This is also used as a seed."
|
2014-02-09 02:10:06 +04:00
|
|
|
"</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
// exact match
|
2014-02-09 02:10:06 +04:00
|
|
|
"<tr>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"<td>exact:http://xyz.goodstuff.com/mypage.html</td>"
|
2014-02-09 02:10:06 +04:00
|
|
|
"<td>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"Matches the single url "
|
|
|
|
"http://xyz.goodstuff.com/mypage.html."
|
|
|
|
"This is also used as a seed."
|
2014-02-09 02:10:06 +04:00
|
|
|
"</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
// local subdir match
|
2014-02-09 02:10:06 +04:00
|
|
|
"<tr>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"<td>file://C/mydir/mysubdir/"
|
2014-02-09 02:10:06 +04:00
|
|
|
"<td>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"Matches all local files in the specified directory. "
|
|
|
|
"This is also used as a seed."
|
2014-02-09 02:10:06 +04:00
|
|
|
"</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
// connect to a device and index it as a stream
|
2014-02-10 09:25:08 +04:00
|
|
|
//"<tr>"
|
|
|
|
//"<td>stream:/dev/eth0"
|
|
|
|
//"<td>"
|
|
|
|
//"Connect to a device and index it as a stream. "
|
|
|
|
//"It will be treated like a single huge document for "
|
|
|
|
//"searching purposes with chunks being indexed in "
|
|
|
|
//"realtime. Or chunk it up into individual document "
|
|
|
|
//"chunks, but proximity term searching will have to "
|
|
|
|
//"be adjusted to compute query term distances "
|
|
|
|
//"inter-document."
|
|
|
|
//"</td>"
|
|
|
|
//"</tr>"
|
2014-02-09 02:10:06 +04:00
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
// negative subdomain match
|
2014-02-09 02:10:06 +04:00
|
|
|
"<tr>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"<td>-badstuff.com</td>"
|
|
|
|
"<td>Exclude all pages from the badstuff.com domain. "
|
|
|
|
"Start the url pattern with a - to exclude it from "
|
|
|
|
"the spider set.</td>"
|
2014-02-09 02:10:06 +04:00
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"<td>regexp:-pid=[0-9A-Z]+/</td>"
|
|
|
|
"<td>Url must match this regular expression. "
|
|
|
|
"Try to avoid using these if possible; they can slow "
|
|
|
|
"things down and are confusing to use."
|
|
|
|
"</td>"
|
2014-02-09 02:10:06 +04:00
|
|
|
"</tr>"
|
2014-02-09 09:57:30 +04:00
|
|
|
|
2014-02-09 02:10:06 +04:00
|
|
|
|
2014-02-09 09:57:30 +04:00
|
|
|
// tag match
|
2014-02-10 06:09:44 +04:00
|
|
|
"<tr><td>"
|
2014-02-09 09:57:30 +04:00
|
|
|
//"<td>tag:boots contains:boots<br>"
|
2014-02-10 06:09:44 +04:00
|
|
|
"tag:boots www.westernfootwear.com<br>"
|
|
|
|
"tag:boots www.cowboyshop.com<br>"
|
|
|
|
"tag:boots www.moreboots.com<br>"
|
|
|
|
"tag:boots www.lotsoffootwear.com<br>"
|
2014-02-09 09:57:30 +04:00
|
|
|
//"<td>t:boots -contains:www.cowboyshop.com/shoes/</td>"
|
2014-02-10 06:09:44 +04:00
|
|
|
"</td><td>"
|
2014-02-09 09:57:30 +04:00
|
|
|
"Advance users only. "
|
|
|
|
"Tag any urls matching these 4 url patterns "
|
|
|
|
"so we can use "
|
|
|
|
"the expression <i>tag:boots</i> in the "
|
|
|
|
"<a href=/scheduler>spider scheduler</a> and perhaps "
|
|
|
|
"give such urls higher spider priority."
|
|
|
|
"For more "
|
|
|
|
"precise spidering control over url subsets. "
|
|
|
|
"Preceed any pattern with the tagname followed by "
|
|
|
|
"space to tag it."
|
|
|
|
"</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td># This line is a comment.</td>"
|
|
|
|
"<td>Empty lines and lines starting with # are "
|
|
|
|
"ignored."
|
2014-02-09 02:10:06 +04:00
|
|
|
"</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"</table>"
|
|
|
|
);
|
2014-02-09 23:38:40 +04:00
|
|
|
|
|
|
|
return true;
|
2014-02-09 02:10:06 +04:00
|
|
|
}
|
2014-02-10 09:51:16 +04:00
|
|
|
|
2014-02-10 02:09:48 +04:00
|
|
|
|
|
|
|
// from pagecrawlbot.cpp for printCrawlDetailsInJson()
|
|
|
|
#include "PageCrawlBot.h"
|
|
|
|
|
|
|
|
///////////
|
|
|
|
//
|
|
|
|
// main > Basic > Status
|
|
|
|
//
|
|
|
|
///////////
|
|
|
|
bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
|
|
|
|
|
|
|
|
char buf [ 128000 ];
|
|
|
|
SafeBuf sb(buf,128000);
|
|
|
|
|
|
|
|
char *fs = hr->getString("format",NULL,NULL);
|
|
|
|
char fmt = FORMAT_HTML;
|
|
|
|
if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
|
|
|
|
if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
|
|
|
|
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML;
|
|
|
|
|
|
|
|
|
2014-02-10 03:41:43 +04:00
|
|
|
// true = usedefault coll?
|
|
|
|
CollectionRec *cr = g_collectiondb.getRec ( hr , true );
|
2014-02-10 02:09:48 +04:00
|
|
|
if ( ! cr ) {
|
|
|
|
g_httpServer.sendErrorReply(socket,500,"invalid collection");
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( fmt == FMT_JSON ) {
|
|
|
|
printCrawlDetailsInJson ( &sb , cr );
|
|
|
|
return g_httpServer.sendDynamicPage (socket,
|
|
|
|
sb.getBufStart(),
|
|
|
|
sb.length(),
|
|
|
|
0); // cachetime
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// print standard header
|
|
|
|
if ( fmt == FORMAT_HTML )
|
|
|
|
// this prints the <form tag as well
|
|
|
|
g_pages.printAdminTop ( &sb , socket , hr );
|
|
|
|
|
|
|
|
|
|
|
|
//
|
|
|
|
// show stats
|
|
|
|
//
|
|
|
|
if ( fmt == FMT_HTML ) {
|
|
|
|
|
|
|
|
char *seedStr = cr->m_diffbotSeeds.getBufStart();
|
|
|
|
if ( ! seedStr ) seedStr = "";
|
|
|
|
|
|
|
|
SafeBuf tmp;
|
|
|
|
long crawlStatus = -1;
|
|
|
|
getSpiderStatusMsg ( cr , &tmp , &crawlStatus );
|
|
|
|
CrawlInfo *ci = &cr->m_localCrawlInfo;
|
|
|
|
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
|
|
|
|
if ( sentAlert ) sentAlert = 1;
|
|
|
|
|
|
|
|
sb.safePrintf(
|
|
|
|
|
|
|
|
"<form method=get action=/crawlbot>"
|
|
|
|
"%s"
|
|
|
|
, sb.getBufStart() // hidden input token/name/..
|
|
|
|
);
|
|
|
|
sb.safePrintf("<TABLE border=0>"
|
|
|
|
"<TR><TD valign=top>"
|
|
|
|
|
|
|
|
"<table border=0 cellpadding=5>"
|
|
|
|
|
|
|
|
//
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Crawl Name:</td>"
|
|
|
|
"<td>%s</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Crawl Type:</td>"
|
|
|
|
"<td>%li</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
//"<tr>"
|
|
|
|
//"<td><b>Collection Alias:</td>"
|
|
|
|
//"<td>%s%s</td>"
|
|
|
|
//"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Token:</td>"
|
|
|
|
"<td>%s</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Seeds:</td>"
|
|
|
|
"<td>%s</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Crawl Status:</td>"
|
|
|
|
"<td>%li</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Crawl Status Msg:</td>"
|
|
|
|
"<td>%s</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Rounds Completed:</td>"
|
|
|
|
"<td>%li</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Has Urls Ready to Spider:</td>"
|
|
|
|
"<td>%li</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
|
|
|
|
// this will have to be in crawlinfo too!
|
|
|
|
//"<tr>"
|
|
|
|
//"<td><b>pages indexed</b>"
|
|
|
|
//"<td>%lli</td>"
|
|
|
|
//"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Objects Found</b></td>"
|
|
|
|
"<td>%lli</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>URLs Harvested</b> (inc. dups)</td>"
|
|
|
|
"<td>%lli</td>"
|
|
|
|
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
//"<tr>"
|
|
|
|
//"<td><b>URLs Examined</b></td>"
|
|
|
|
//"<td>%lli</td>"
|
|
|
|
//"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Page Crawl Attempts</b></td>"
|
|
|
|
"<td>%lli</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Page Crawl Successes</b></td>"
|
|
|
|
"<td>%lli</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Page Crawl Successes This Round</b></td>"
|
|
|
|
"<td>%lli</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Page Process Attempts</b></td>"
|
|
|
|
"<td>%lli</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Page Process Successes</b></td>"
|
|
|
|
"<td>%lli</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
"<tr>"
|
|
|
|
"<td><b>Page Process Successes This Round</b></td>"
|
|
|
|
"<td>%lli</td>"
|
|
|
|
"</tr>"
|
|
|
|
|
|
|
|
|
|
|
|
, cr->m_diffbotCrawlName.getBufStart()
|
|
|
|
|
|
|
|
, (long)cr->m_isCustomCrawl
|
|
|
|
|
|
|
|
, cr->m_diffbotToken.getBufStart()
|
|
|
|
|
|
|
|
, seedStr
|
|
|
|
|
|
|
|
, crawlStatus
|
|
|
|
, tmp.getBufStart()
|
|
|
|
, cr->m_spiderRoundNum
|
|
|
|
, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
|
|
|
|
|
|
|
|
, cr->m_globalCrawlInfo.m_objectsAdded -
|
|
|
|
cr->m_globalCrawlInfo.m_objectsDeleted
|
|
|
|
, cr->m_globalCrawlInfo.m_urlsHarvested
|
|
|
|
//, cr->m_globalCrawlInfo.m_urlsConsidered
|
|
|
|
|
|
|
|
, cr->m_globalCrawlInfo.m_pageDownloadAttempts
|
|
|
|
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
|
|
|
|
, cr->m_globalCrawlInfo.m_pageDownloadSuccessesThisRound
|
|
|
|
|
|
|
|
, cr->m_globalCrawlInfo.m_pageProcessAttempts
|
|
|
|
, cr->m_globalCrawlInfo.m_pageProcessSuccesses
|
|
|
|
, cr->m_globalCrawlInfo.m_pageProcessSuccessesThisRound
|
|
|
|
);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
if ( fmt != FORMAT_JSON )
|
|
|
|
// wrap up the form, print a submit button
|
|
|
|
g_pages.printAdminBottom ( &sb );
|
|
|
|
|
|
|
|
return g_httpServer.sendDynamicPage (socket,
|
|
|
|
sb.getBufStart(),
|
|
|
|
sb.length(),
|
|
|
|
0); // cachetime
|
|
|
|
}
|
|
|
|
|