// open-source-search-engine/PageBasic.cpp
// Matt Wells, commit b1cd0cac86: indexing spider replies now working.
// Use type:status to see them, or gbstatus:success, gbstatus:tcp or gbstatus:0.
// 2014-05-09 18:07:38 -07:00


#include "SafeBuf.h"
#include "HttpRequest.h"
#include "SearchInput.h"
#include "Pages.h"
#include "Parms.h"
#include "Spider.h"
#include "PageResults.h" // for RESULT_HEIGHT
// widget auto-reload interval in milliseconds (1000 ms = 1 second)
#define DEFAULT_WIDGET_RELOAD 1000
//bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) ;
///////////
//
// main > Basic > Settings
//
///////////
/*
bool sendPageBasicSettings ( TcpSocket *socket , HttpRequest *hr ) {
char buf [ 128000 ];
SafeBuf sb(buf,128000);
// true = use default coll?
CollectionRec *cr = g_collectiondb.getRec ( hr , true );
if ( ! cr ) {
g_httpServer.sendErrorReply(socket,500,"invalid collection");
return true;
}
// process any incoming request
handleSettingsRequest ( socket , hr );
// . print standard header
// . this prints the <form tag as well
g_pages.printAdminTop ( &sb , socket , hr );
g_parms.printParms ( &sb , socket , hr );
printSitePatternExamples ( &sb , hr );
// wrap up the form, print a submit button
g_pages.printAdminBottom ( &sb );
return g_httpServer.sendDynamicPage ( socket,
sb.getBufStart() ,
sb.length() ,
-1 ,
false,//POSTReply ,
NULL , // contType
-1 , // httpstatus
NULL,//cookie ,
NULL );// charset
}
*/
class PatternData {
public:
// hash of the subdomain or domain for this line in sitelist
long m_thingHash32;
// ptr to the line in CollectionRec::m_siteListBuf
char *m_patternStr;
// offset of the url path in the pattern, 0 means none
short m_pathOff;
short m_pathLen;
};
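// Illustrative sketch of how one site-list line maps onto a PatternData
// entry (example values, not from the original source). For a line like
//
//   site:http://www.goodstuff.com/goodir/anotherdir/
//
// m_thingHash32 holds the hash of the host "www.goodstuff.com",
// m_patternStr points at the start of that line in the site list buffer,
// and m_pathOff/m_pathLen cover the "/goodir/anotherdir/" part, measured
// from the start of the line (m_pathOff stays 0 when the line has no path).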
// . Collectiondb.cpp calls this when any parm flagged with
// PF_REBUILDURLFILTERS is updated
// . it only adds sites via msg4 that are in "siteListArg" but NOT in the
// current CollectionRec::m_siteListBuf
// . updates SpiderColl::m_siteListDomTable to see what doms we can spider
// . updates SpiderColl::m_negSubstringBuf and m_posSubStringBuf to
// see what substrings in urls are disallowed/allowable for spidering
// . this returns false if it blocks
// . returns true and sets g_errno on error
// . uses msg4 to add seeds to spiderdb if necessary if "siteListArg"
// has new urls that are not currently in cr->m_siteListBuf
// . only adds seeds for the shard we are on iff we are responsible for
// the fake firstip!!! that way only one shard does the add.
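// an illustrative site list (directives as documented in the examples
// table printed by printSitePatternExamples() below); each line is hashed
// and parsed by the loop in this function:
//
//   # comment lines and empty lines are ignored
//   goodstuff.com
//   seed:www.goodstuff.com/myurl.html
//   site:http://www.goodstuff.com/goodir/anotherdir/
//   contains:goodstuff
//   -contains:badstuff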
bool updateSiteListBuf ( collnum_t collnum ,
bool addSeeds ,
char *siteListArg ) {
CollectionRec *cr = g_collectiondb.getRec ( collnum );
if ( ! cr ) return true;
// this might make a new spidercoll...
SpiderColl *sc = g_spiderCache.getSpiderColl ( cr->m_collnum );
// sanity. if in use we should not even be here
if ( sc->m_msg4x.m_inUse ) {
log("basic: trying to update site list while previous "
"update still outstanding.");
g_errno = EBADENGINEER;
return true;
}
// when the site list is updated, Parms.cpp should invalidate this flag!
//if ( sc->m_siteListTableValid ) return true;
// hash each line of the current site list so we don't add
// duplicate requests into spiderdb
HashTableX dedup;
if ( ! dedup.set ( 4,0,1024,NULL,0,false,0,"sldt") ) return true;
// m_siteListBuf is a SafeBuf PARM in Parms.cpp now; however, in practice
// we set it here from a call to CommandUpdateSiteList()
// because it requires all of this extra computation.
char *op = cr->m_siteListBuf.getBufStart();
// scan and hash each line in it
for ( ; *op ; op++ ) {
// get end
char *s = op;
// skip to end of line marker
for ( ; *op && *op != '\n' ; op++ ) ;
// keep it simple
long h32 = hash32 ( s , op - s );
// for deduping
if ( ! dedup.addKey ( &h32 ) ) return true;
}
// get the old sitelist Domain Hash to PatternData mapping table
// which tells us what domains, subdomains or paths we can or
// can not spider...
HashTableX *dt = &sc->m_siteListDomTable;
// reset it
if ( ! dt->set ( 4 ,
sizeof(PatternData),
1024 ,
NULL ,
0 ,
true , // allow dup keys?
0 , // niceness - at least for now
"sldt" ) )
return true;
// clear the old substring buffers
sc->m_posSubstringBuf.purge();
sc->m_negSubstringBuf.purge();
// we can now free the old site list methinks
//cr->m_siteListBuf.purge();
// reset flags
//sc->m_siteListAsteriskLine = NULL;
sc->m_siteListHasNegatives = false;
sc->m_siteListIsEmpty = true;
// use this so it will be free automatically when msg4 completes!
SafeBuf *spiderReqBuf = &sc->m_msg4x.m_tmpBuf;
//char *siteList = cr->m_siteListBuf.getBufStart();
// scan the list
char *pn = siteListArg;
// completely empty?
if ( ! pn ) return true;
long lineNum = 1;
long added = 0;
Url u;
for ( ; *pn ; lineNum++ ) {
// get end
char *s = pn;
// skip to end of line marker
for ( ; *pn && *pn != '\n' ; pn++ ) ;
char *start = s;
// back pe up over trailing spaces in case the line ended in spaces
char *pe = pn;
for ( ; pe > s && is_wspace_a(pe[-1]) ; pe-- );
// skip over the \n so pn points to next line for next time
if ( *pn == '\n' ) pn++;
// make hash of the line
long h32 = hash32 ( s , pe - s );
bool seedMe = true;
bool isUrl = true;
bool isNeg = false;
bool isFilter = true;
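// how the directives handled below set these flags:
//   leading '-'  -> isNeg = true                  (negative pattern)
//   "seed:"      -> isFilter = false              (seed only, no filter rule)
//   "site:"      -> seedMe = false                (filter only, no seeding)
//   "contains:"  -> seedMe = false, isUrl = false (substring filter)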
innerLoop:
// skip spaces at start of line
if ( *s == ' ' ) s++;
// comment?
if ( *s == '#' ) continue;
// empty line?
if ( *s == '\n' ) continue;
// all?
//if ( *s == '*' ) {
// sc->m_siteListAsteriskLine = start;
// continue;
//}
if ( *s == '-' ) {
sc->m_siteListHasNegatives = true;
isNeg = true;
s++;
}
// exact:?
//if ( strncmp(s,"exact:",6) == 0 ) {
// s += 6;
// goto innerLoop;
//}
// these will be manual adds and should pass url filters
// because they have the "ismanual" directive override
if ( strncmp(s,"seed:",5) == 0 ) {
s += 5;
isFilter = false;
goto innerLoop;
}
if ( strncmp(s,"site:",5) == 0 ) {
s += 5;
seedMe = false;
goto innerLoop;
}
if ( strncmp(s,"contains:",9) == 0 ) {
s += 9;
seedMe = false;
isUrl = false;
goto innerLoop;
}
long slen = pe - s;
// empty line?
if ( slen <= 0 )
continue;
if ( ! isUrl ) {
// add to string buffers
if ( isNeg ) {
if ( !sc->m_negSubstringBuf.safeMemcpy(s,slen))
return true;
if ( !sc->m_negSubstringBuf.pushChar('\0') )
return true;
continue;
}
// add to string buffers
if ( ! sc->m_posSubstringBuf.safeMemcpy(s,slen) )
return true;
if ( ! sc->m_posSubstringBuf.pushChar('\0') )
return true;
continue;
}
u.set ( s , slen );
// error? skip it then...
if ( u.getHostLen() <= 0 ) {
log("basic: error on line #%li in sitelist",lineNum);
continue;
}
// is fake ip assigned to us?
long firstIp = getFakeIpForUrl2 ( &u );
if ( ! isAssignedToUs( firstIp ) ) continue;
// see if in existing table for existing site list
if ( addSeeds &&
// a "site:" directive means no seeding
// a "contains:" directive means no seeding
seedMe &&
! dedup.isInTable ( &h32 ) ) {
// make spider request
SpiderRequest sreq;
sreq.setFromAddUrl ( u.getUrl() );
if (
// . add this url to spiderdb as a spiderrequest
// . calling msg4 will be the last thing we do
!spiderReqBuf->safeMemcpy(&sreq,sreq.getRecSize()))
return true;
// count it
added++;
}
// a "seed:xyz.com" line is seed-only;
// do not use it for a filter rule
if ( ! isFilter ) continue;
// make the data node used for filtering urls during spidering
PatternData pd;
// hash of the subdomain or domain for this line in sitelist
pd.m_thingHash32 = u.getHostHash32();
// . ptr to the line in CollectionRec::m_siteListBuf.
// . includes pointing to "exact:" too i guess and tag: later.
pd.m_patternStr = start;
// offset of the url path in the pattern, 0 means none
pd.m_pathOff = 0;
// scan url pattern, it should start at "s"
char *x = s;
// go all the way to the end
for ( ; *x && x < pe ; x++ ) {
// skip ://
if ( x[0] == ':' && x[1] =='/' && x[2] == '/' ) {
x += 2;
continue;
}
// stop if we hit another /, that is path start
if ( x[0] != '/' ) continue;
x++;
// empty path besides the /?
if ( x >= pe ) break;
// sanity check: the url should have a non-trivial path here
if ( u.getPathLen() <= 1 ) { char *xx=NULL;*xx=0; }
// calc length from "start" of line so we can
// jump to the path quickly for compares. inc "/"
pd.m_pathOff = (x-1) - start;
pd.m_pathLen = pe - (x-1);
break;
}
// add to new dt
long domHash32 = u.getDomainHash32();
if ( ! dt->addKey ( &domHash32 , &pd ) )
return true;
// we have some patterns in there
sc->m_siteListIsEmpty = false;
}
// go back to a high niceness
dt->m_niceness = MAX_NICENESS;
//long siteListLen = gbstrlen(siteList);
//cr->m_siteListBuf.safeMemcpy ( siteList , siteListLen + 1 );
if ( ! addSeeds ) return true;
log("spider: adding %li seed urls",added);
// use the spidercoll to contain this msg4; while the msg4 is in use
// the spidercoll cannot be deleted, until the msg4 comes back.
if ( ! sc->m_msg4x.addMetaList ( spiderReqBuf ,
sc->m_collnum ,
// no need for callback since m_msg4x
// should set msg4::m_inUse to false
// when it comes back
NULL , // state
NULL , // callback
MAX_NICENESS ,
RDB_SPIDERDB
) )
return false;
return true;
}
// . Spider.cpp calls this to see if a url it wants to spider is
// in our "site list"
// . we should return the row of the FIRST match really
// . the url patterns all contain a domain now, so this can use the domain
// hash to speed things up
// . return ptr to the start of the line in case it has "tag:" i guess
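// a minimal, hypothetical call site (the real caller lives in Spider.cpp):
//
//   char *row = getMatchingUrlPattern ( sc , sreq );
//   if ( ! row ) { /* url matched no pattern (or a negative one): skip it */ }
//   else         { /* row points at the matching line in the site list    */ }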
char *getMatchingUrlPattern ( SpiderColl *sc , SpiderRequest *sreq ) {
// if it has * and no negatives, we are in!
//if ( sc->m_siteListAsteriskLine && ! sc->m_siteListHasNegatives )
// return sc->m_siteListAsteriskLine;
// if it is just a bunch of comments or blank lines, it is empty
if ( sc->m_siteListIsEmpty )
return NULL;
// if we had a list of contains: or regex: directives in the sitelist
// we have to linear scan those
char *nb = sc->m_negSubstringBuf.getBufStart();
char *nbend = nb + sc->m_negSubstringBuf.getLength();
for ( ; nb && nb < nbend ; ) {
// return NULL if matches a negative substring
if ( strstr ( sreq->m_url , nb ) ) return NULL;
// skip it
nb += strlen(nb) + 1;
}
char *myPath = NULL;
// check domain specific tables
HashTableX *dt = &sc->m_siteListDomTable;
// get this
CollectionRec *cr = sc->m_cr;
// need to build dom table for pattern matching?
if ( dt->getNumSlotsUsed() == 0 && cr ) {
// do not add seeds, just make siteListDomTable, etc.
updateSiteListBuf ( sc->m_collnum ,
false , // add seeds?
cr->m_siteListBuf.getBufStart() );
}
if ( dt->getNumSlotsUsed() == 0 ) {
// empty site list -- no matches
return NULL;
//char *xx=NULL;*xx=0; }
}
// this table maps a 32-bit domain hash of a domain to a
// patternData class. only for those urls that have firstIps that
// we handle.
long slot = dt->getSlot ( &sreq->m_domHash32 );
// loop over all the patterns that contain this domain and see
// the first one we match, and if we match a negative one.
for ( ; slot >= 0 ; slot = dt->getNextSlot(slot,&sreq->m_domHash32)) {
// get pattern
PatternData *pd = (PatternData *)dt->getValueFromSlot ( slot );
// is it negative? return NULL if so so url will be ignored
//if ( pd->m_patternStr[0] == '-' )
// return NULL;
// otherwise, it has a path. skip if we don't match path ptrn
if ( pd->m_pathOff ) {
if ( ! myPath ) myPath = sreq->getUrlPath();
if ( strncmp (myPath,
pd->m_patternStr + pd->m_pathOff,
pd->m_pathLen ) )
continue;
}
// was the line just a domain and not a subdomain?
if ( pd->m_thingHash32 == sreq->m_domHash32 )
// this will be false if negative pattern i guess
return pd->m_patternStr;
// was it just a subdomain?
if ( pd->m_thingHash32 == sreq->m_hostHash32 )
// this will be false if negative pattern i guess
return pd->m_patternStr;
}
// if we had a list of contains: or regex: directives in the sitelist
// we have to linear scan those
char *pb = sc->m_posSubstringBuf.getBufStart();
char *pend = pb + sc->m_posSubstringBuf.length();
for ( ; pb && pb < pend ; ) {
// return the pattern if the url contains a positive substring
if ( strstr ( sreq->m_url , pb ) ) return pb;
// skip it
pb += strlen(pb) + 1;
}
// is there an '*' in the patterns?
//if ( sc->m_siteListAsteriskLine ) return sc->m_siteListAsteriskLine;
return NULL;
}
bool printSitePatternExamples ( SafeBuf *sb , HttpRequest *hr ) {
// true = useDefault?
CollectionRec *cr = g_collectiondb.getRec ( hr , true );
if ( ! cr ) return true;
/*
// it is a safebuf parm
char *siteList = cr->m_siteListBuf.getBufStart();
if ( ! siteList ) siteList = "";
SafeBuf msgBuf;
char *status = "";
long max = 1000000;
if ( cr->m_siteListBuf.length() > max ) {
msgBuf.safePrintf( "<font color=red><b>"
"Site list is over %li bytes large, "
"too many to "
"display on this web page. Please use the "
"file upload feature only for now."
"</b></font>"
, max );
status = " disabled";
}
*/
/*
sb->safePrintf(
"On the command line you can issue a command like "
"<i>"
"gb addurls &lt; fileofurls.txt"
"</i> or "
"<i>"
"gb addfile &lt; *.html"
"</i> or "
"<i>"
"gb injecturls &lt; fileofurls.txt"
"</i> or "
"<i>"
"gb injectfile &lt; *.html"
"</i> or "
"to schedule downloads or inject content directly "
"into Gigablast."
"</td><td>"
"<input "
"size=20 "
"type=file "
"name=urls>"
"</td></tr>"
);
*/
// example table
sb->safePrintf ( "<a name=examples></a>"
"<table %s>"
"<tr class=hdrow><td colspan=2>"
"<center><b>Site List Examples</b></center></td></tr>"
//"<tr bgcolor=#%s>"
//"<td>"
,TABLE_STYLE );//, DARK_BLUE);
sb->safePrintf(
//"*"
//"</td>"
//"<td>Spider all urls encountered. If you just submit "
//"this by itself, then Gigablast will initiate spidering "
//"automatically at dmoz.org, an internet "
//"directory of good sites.</td>"
//"</tr>"
"<tr>"
"<td>goodstuff.com</td>"
"<td>"
"Spider the url <i>goodstuff.com/</i> and spider "
"any links we harvest that have the domain "
"<i>goodstuff.com</i>"
"</td>"
"</tr>"
// protocol and subdomain match
"<tr>"
"<td>http://www.goodstuff.com/</td>"
"<td>"
"Spider the url "
"<i>http://www.goodstuff.com/</i> and spider "
"any links we harvest that start with "
"<i>http://www.goodstuff.com/</i>"
"</td>"
"</tr>"
"<tr>"
"<td>seed:www.goodstuff.com/myurl.html</td>"
"<td>"
"Spider the url <i>www.goodstuff.com/myurl.html</i>. "
"Add any outlinks we find into the "
"spider queue, but those outlinks will only be "
"spidered if they "
"match ANOTHER line in this site list."
"</td>"
"</tr>"
// protocol and subdomain match
"<tr>"
"<td>site:http://www.goodstuff.com/</td>"
"<td>"
"Allow any urls starting with "
"<i>http://www.goodstuff.com/</i> to be spidered "
"if encountered."
"</td>"
"</tr>"
// subdomain match
"<tr>"
"<td>site:www.goodstuff.com</td>"
"<td>"
"Allow any urls starting with "
"<i>www.goodstuff.com/</i> to be spidered "
"if encountered."
"</td>"
"</tr>"
"<tr>"
"<td>-site:bad.goodstuff.com</td>"
"<td>"
"Do not spider any urls starting with "
"<i>bad.goodstuff.com/</i> "
"if encountered."
"</td>"
"</tr>"
// domain match
"<tr>"
"<td>site:goodstuff.com</td>"
"<td>"
"Allow any urls starting with "
"<i>goodstuff.com/</i> to be spidered "
"if encountered."
"</td>"
"</tr>"
// spider this subdir
"<tr>"
"<td><nobr>site:"
"http://www.goodstuff.com/goodir/anotherdir/</nobr></td>"
"<td>"
"Allow any urls starting with "
"<i>http://www.goodstuff.com/goodir/anotherdir/</i> "
"to be spidered "
"if encountered."
"</td>"
"</tr>"
// exact match
//"<tr>"
//"<td>exact:http://xyz.goodstuff.com/myurl.html</td>"
//"<td>"
//"Allow this specific url."
//"</td>"
//"</tr>"
/*
// local subdir match
"<tr>"
"<td>file://C/mydir/mysubdir/"
"<td>"
"Spider all files in the given subdirectory or lower. "
"</td>"
"</tr>"
"<tr>"
"<td>-file://C/mydir/mysubdir/baddir/"
"<td>"
"Do not spider files in this subdirectory."
"</td>"
"</tr>"
*/
// connect to a device and index it as a stream
//"<tr>"
//"<td>stream:/dev/eth0"
//"<td>"
//"Connect to a device and index it as a stream. "
//"It will be treated like a single huge document for "
//"searching purposes with chunks being indexed in "
//"realtime. Or chunk it up into individual document "
//"chunks, but proximity term searching will have to "
//"be adjusted to compute query term distances "
//"inter-document."
//"</td>"
//"</tr>"
// substring matches
"<tr>"
"<td>contains:goodstuff</td>"
"<td>Spider any url containing <i>goodstuff</i>."
"</td>"
"</tr>"
"<tr>"
"<td>-contains:badstuff</td>"
"<td>Do not spider any url containing <i>badstuff</i>."
"</td>"
"</tr>"
/*
"<tr>"
"<td>regexp:-pid=[0-9A-Z]+/</td>"
"<td>Url must match this regular expression. "
"Try to avoid using these if possible; they can slow "
"things down and are confusing to use."
"</td>"
"</tr>"
// tag match
"<tr><td>"
//"<td>tag:boots contains:boots<br>"
"<nobr>tag:boots site:www.westernfootwear."
"</nobr>com<br>"
"tag:boots site:www.cowboyshop.com<br>"
"tag:boots site:www.moreboots.com<br>"
"<nobr>tag:boots site:www.lotsoffootwear.com"
"</nobr><br>"
//"<td>t:boots -contains:www.cowboyshop.com/shoes/</td>"
"</td><td>"
"Advanced users only. "
"Tag any urls matching these 4 url patterns "
"so we can use "
"the expression <i>tag:boots</i> in the "
"<a href=/scheduler>spider scheduler</a> and perhaps "
"give such urls higher spider priority. "
"This gives more "
"precise spidering control over url subsets. "
"Precede any pattern with the tagname followed by a "
"space to tag it."
"</td>"
"</tr>"
*/
"<tr>"
"<td># This line is a comment.</td>"
"<td>Empty lines and lines starting with # are "
"ignored."
"</td>"
"</tr>"
"</table>"
);
return true;
}
// from pagecrawlbot.cpp for printCrawlDetailsInJson()
#include "PageCrawlBot.h"
///////////
//
// main > Basic > Status
//
///////////
bool sendPageBasicStatus ( TcpSocket *socket , HttpRequest *hr ) {
char buf [ 128000 ];
SafeBuf sb(buf,128000);
sb.reset();
char *fs = hr->getString("format",NULL,NULL);
char fmt = FORMAT_HTML;
if ( fs && strcmp(fs,"html") == 0 ) fmt = FORMAT_HTML;
if ( fs && strcmp(fs,"json") == 0 ) fmt = FORMAT_JSON;
if ( fs && strcmp(fs,"xml") == 0 ) fmt = FORMAT_XML;
// true = use default coll?
CollectionRec *cr = g_collectiondb.getRec ( hr , true );
if ( ! cr ) {
g_httpServer.sendErrorReply(socket,500,"invalid collection");
return true;
}
if ( fmt == FORMAT_JSON ) {
printCrawlDetailsInJson ( &sb , cr );
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
sb.length(),
0); // cachetime
}
// print standard header
if ( fmt == FORMAT_HTML )
// this prints the <form tag as well
g_pages.printAdminTop ( &sb , socket , hr );
// table to split between widget and stats in left and right panes
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("<TABLE id=pane>"
"<TR><TD valign=top>");
}
long savedLen1 = 0, savedLen2 = 0;
//
// widget
//
// put the widget in here, just sort results by spidered date
//
// the scripts do "infinite" scrolling both up and down.
// but if you are at the top then new results will load above
// you and we try to maintain your current visual state even though
// the scrollbar position will change.
//
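// for example (numbers illustrative): if 3 new results get prepended above
// the result that was previously at the top, the reload handler below bumps
// sd.scrollTop by 3*(RESULT_HEIGHT+2*PADDING) pixels so the result the user
// was looking at stays put on screen.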
if ( fmt == FORMAT_HTML ) {
// save position so we can output the widget code
// so user can embed it into their own web page
savedLen1 = sb.length();
sb.safePrintf("<script type=\"text/javascript\">\n\n");
// if the user has the scrollbar at the top of the widget
// we re-run the search every DEFAULT_WIDGET_RELOAD ms
// to try to load more recent results. we should
// return up to 10 results above your last
// top docid and 10 results below it. that way
// no matter which of the 10 results you were
// viewing your view should remain unchanged.
sb.safePrintf(
// global var
"var forcing;"
"function widget123_handler_reload() {"
// return if reply is not fully ready
"if(this.readyState != 4 )return;"
// if error or empty reply then do nothing
"if(!this.responseText)return;"
// get the widget container
"var w=document.getElementById(\"widget123\");"
// GET DOCID of first div/searchresult
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
"var cd;"
"if ( sd ) cd=sd.firstChild;"
"var fd=0;"
"if(cd) fd=cd.getAttribute('docid');"
// if the searchbox has the focus then do not
// update the content just yet...
"var qb=document.getElementById(\"qbox\");"
"if(qb&&qb==document.activeElement)"
"return;"
// or if not forced and they scrolled down
// don't jerk them back up again
"if(!forcing&&sd&&sd.scrollTop!=0)return;"
// just set the widget content to the reply
"w.innerHTML=this.responseText;"
//
// find that SAME docid in response and see
// how many new results were added above it
//
"var added=0;"
// did we find the docid?
"var found=0;"
// get div again since we updated innerHTML
"sd=document.getElementById("
"\"widget123_scrolldiv\");"
// scan the kids
"var kid=sd.firstChild;"
// begin the while loop to scan the kids
"while (kid) {"
// if div had no docid it might have been a line
// break div, so ignore
"if (!kid.hasAttribute('docid') ) {"
"kid=kid.nextSibling;"
"continue;"
"}"
// set kd to docid of kid
"var kd=kid.getAttribute('docid');"
// stop if we hit our original top docid
"if(kd==fd) {found=1;break;}"
// otherwise count it as a NEW result we got
"added++;"
// advance kid
"kid=kid.nextSibling;"
// end while loop
"}"
//"alert(\"added=\"+added);"
// how many results did we ADD above the
// reported "topdocid" of the widget?
// it should be in the ajax reply from the
// search engine. how many result were above
// the given "topdocid".
//"var ta=document.getElementById(\"topadd\");"
//"var added=0;"
//"if(ta)added=ta.value;"
// if nothing added do nothing
"if (added==0)return;"
// if original top docid not found, i guess we
// added too many new guys to the top of the
// search results, so don't bother scrolling
// just reset to top
"if (!found) return;"
// show that
//"alert(this.responseText);"
// get the div that has the scrollbar
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
// save current scroll pos
"var oldpos=parseInt(sd.scrollTop);"
// note it
//"alert (sd.scrollTop);"
// preserve the relative scroll position so we
// do not jerk around since we might have added
// "added" new results to the top.
"sd.scrollTop += added*%li;"
// try to scroll out new results if we are
// still at the top of the scrollbar and
// there are new results to scroll.
"if(oldpos==0)widget123_scroll();}\n\n"
// for preserving scrollbar position
,(long)RESULT_HEIGHT +2*PADDING
);
// scroll the widget up until we hit the 0 position
sb.safePrintf(
"function widget123_scroll() {"
// only scroll if at the top of the widget
// and not scrolled down so we do not
// interrupt
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
// TODO: need parseInt here?
"var pos=parseInt(sd.scrollTop);"
// note it
//"alert (sd.scrollTop);"
// if already at the top of widget, return
"if(pos==0)return;"
// decrement by 3 pixels
"pos=pos-3;"
// do not go negative
"if(pos<0)pos=0;"
// assign to scroll up. TODO: need +\"px\"; ?
"sd.scrollTop=pos;"
// all done, then return
"if(pos==0) return;"
// otherwise, scroll more in 3ms
// TODO: make this 1000ms on result boundaries
// so it delays on each new result. perhaps make
// it less than 1000ms if we have a lot of
// results above us!
"setTimeout('widget123_scroll()',3);}\n\n"
);
// this function appends the search results to what is
// already in the widget.
sb.safePrintf(
"function widget123_handler_append() {"
// return if reply is not fully ready
"if(this.readyState != 4 )return;"
// i guess we are done... release the lock
"outstanding=0;"
// if error or empty reply then do nothing
"if(!this.responseText)return;"
// if too small
"if(this.responseText.length<=3)return;"
// get the widget container
"var w=document.getElementById("
"\"widget123_scrolldiv\");"
// just set the widget content to the reply
"w.innerHTML+=this.responseText;"
"}\n\n"
);
//sb.safePrintf ( "</script>\n\n" );
long widgetWidth = 300;
long widgetHeight = 500;
// make the ajax url that gets the search results
SafeBuf ub;
ub.safePrintf("/search"
//"format=ajax"
"?c=%s"
//"&prepend=gbsortbyint%%3Agbspiderdate"
"&q=-gbstatus:0+gbsortbyint%%3Agbspiderdate"
"&sc=0" // no site clustering
"&dr=0" // no deduping
// 10 results at a time
"&n=10"
"&widgetheight=%li"
"&widgetwidth=%li"
, cr->m_coll
, widgetHeight
, widgetWidth
);
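// for illustration, with a hypothetical collection named "main" and the
// 300x500 widget above, ub would hold something like:
//   /search?c=main&q=-gbstatus:0+gbsortbyint%3Agbspiderdate&sc=0&dr=0&n=10&widgetheight=500&widgetwidth=300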
//ub.safePrintf("&topdocid="
// );
// get the search results from neo as soon as this div is
// being rendered, and set its contents to them
sb.safePrintf(//"<script type=text/javascript>"
"function widget123_reload(force) {"
// when the user submits a new query in the
// query box we set force to false when
// we call this (see PageResults.cpp) so that
// we do not register multiple timeouts
"if ( ! force ) "
"setTimeout('widget123_reload(0)',%li);"
// get the query box
"var qb=document.getElementById(\"qbox\");"
// if forced then turn off focus for searchbox
// since it was either 1) the initial call
// or 2) someone submitted a query and
// we got called from PageResults.cpp
// onsubmit event.
"if (force&&qb) qb.blur();"
// if the searchbox has the focus then do not
// reload!! unless force is true..
"if(qb&&qb==document.activeElement&&!force)"
"return;"
//"var ee=document.getElementById(\"sbox\");"
//"if (ee)alert('reloading '+ee.style.display);"
// do not do timer reload if searchbox is
// visible because we do not want to interrupt
// a possible search
//"if(!force&&ee && ee.style.display=='')return;"
// do not bother timed reloading if scrollbar pos
// not at top or near bottom
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
"if ( sd && !force ) {"
"var pos=parseInt(sd.scrollTop);"
"if (pos!=0) return;"
"}"
"var client=new XMLHttpRequest();"
"client.onreadystatechange="
"widget123_handler_reload;"
// . this url gets the search results
// . get them in "ajax" format so we can embed
// them into the base html as a widget
"var u='%s&format=ajax';"
// append our query from query box if there
"var qv;"
"if (qb) qv=qb.value;"
"if (qv){"
//"u+='&q=';"
"u+='&prepend=';"
"u+=encodeURI(qv);"
"}"
// set global var so handler knows if we were
// forced or not
"forcing=force;"
// get the docid at the top of the widget
// so we can get SURROUNDING search results,
// like 10 before it and 10 after it for
// our infinite scrolling
//"var td=document.getElementById('topdocid');"
//"if ( td ) u=u+\"&topdocid=\"+td.value;"
//"alert('reloading');"
"client.open('GET',u);"
"client.send();"
"}\n\n"
// when page loads, populate the widget immed.
"widget123_reload(1);\n\n"
// initiate the timer loop since it was
// not initiated on that call since we had to
// set force=1 to load in case the query box
// was currently visible.
"setTimeout('widget123_reload(0)',%li);"
//, widgetHeight
, (long)DEFAULT_WIDGET_RELOAD
, ub.getBufStart()
, (long)DEFAULT_WIDGET_RELOAD
);
//
// . call this when scrollbar gets 5 up from bottom
// . but if < 10 new results are appended, then stop!
//
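// i.e. the append fires once sd.scrollTop gets within
// widgetHeight + 5*(RESULT_HEIGHT+2*PADDING) pixels of sd.scrollHeight,
// roughly the height of the last five results (see the %li below).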
sb.safePrintf(
"var outstanding=0;\n\n"
"function widget123_append() {"
// bail if already outstanding
"if (outstanding) return;"
// if scrollbar not near bottom, then return
"var sd=document.getElementById("
"\"widget123_scrolldiv\");"
"if ( sd ) {"
"var pos=parseInt(sd.scrollTop);"
"if (pos < (sd.scrollHeight-%li)) "
"return;"
"}"
// . this url gets the search results
// . just get them so we can APPEND them to
// the widget, so it will be just the
// "results" divs
"var u='%s&format=append';"
// . get score of the last docid in our widget
// . it should be persistent.
// . it is like a bookmark for scrolling
// . append results AFTER it into the widget
// . this way we can deal with the fact that
// we may be adding 100s of results to this
// query per second, especially if spidering
// at a high rate. and this will keep the
// results we append persistent.
// . now we scan the children "search result"
// divs of the "widget123_scrolldiv" div
// container to get the last child and get
// its score/docid so we can re-do the search
// and just get the search results with
// a score/docid LESS THAN that. THEN our
// results should be contiguous.
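// . after the two parameters are appended below, u might look like,
//   for example (values illustrative):
//   /search?c=main&...&format=append&maxserpscore=1234567&minserpdocid=123456789012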
// . get the container div, "cd"
"var cd=document.getElementById("
"'widget123_scrolldiv');"
// must be there
"if(!cd)return;"
// get the last child div in there
"var d=cd.lastChild.previousSibling;"
// must be there
"if(!d)return;"
// get docid/score
"u=u+\"&maxserpscore=\"+d.getAttribute('score');"
"u=u+\"&minserpdocid=\"+d.getAttribute('docid');"
// append our query from query box if there
"var qb=document.getElementById(\"qbox\");"
"var qv;"
"if (qb) qv=qb.value;"
"if (qv){"
//"u+='&q=';"
"u+='&prepend=';"
"u+=encodeURI(qv);"
"}"
// turn on the lock to prevent excessive calls
"outstanding=1;"
//"alert(\"scrolling2 u=\"+u);"
"var client=new XMLHttpRequest();"
"client.onreadystatechange="
"widget123_handler_append;"
//"alert('appending scrollTop='+sd.scrollTop+' scrollHeight='+sd.scrollHeight+' 5results=%li'+u);"
"client.open('GET',u);"
"client.send();"
"}\n\n"
"</script>\n\n"
// if (pos < (sd.scrollHeight-%li)) return...
// once user scrolls down to within last 5
// results then try to append to the results.
, widgetHeight +5*((long)RESULT_HEIGHT+2*PADDING)
, ub.getBufStart()
//,widgetHeight +5*((long)RESULT_HEIGHT+2*PADDING
);
// then the WIDGET MASTER div. set the "id" so that the
// style tag the user sets can control its appearance.
// when the browser loads this the ajax sets the contents
// to the reply from neo.
// on scroll call widget123_append() which will append
// more search results if we are near the bottom of the
// widget.
sb.safePrintf("<div id=widget123 "
"style=\"border:2px solid black;"
"position:relative;border-radius:10px;"
"width:%lipx;height:%lipx;\">"
, widgetWidth
, widgetHeight
);
//sb.safePrintf("<style>"
// "a{color:white;}"
// "</style>");
sb.safePrintf("Waiting for Server...");
// end the containing div
sb.safePrintf("</div>");
savedLen2 = sb.length();
}
// the right table pane is the crawl stats
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("</TD><TD valign=top>");
}
//
// show stats
//
if ( fmt == FORMAT_HTML ) {
char *seedStr = cr->m_diffbotSeeds.getBufStart();
if ( ! seedStr ) seedStr = "";
SafeBuf tmp;
long crawlStatus = -1;
getSpiderStatusMsg ( cr , &tmp , &crawlStatus );
CrawlInfo *ci = &cr->m_localCrawlInfo;
long sentAlert = (long)ci->m_sentCrawlDoneAlert;
if ( sentAlert ) sentAlert = 1;
//sb.safePrintf(
// "<form method=get action=/crawlbot>"
// "%s"
// , sb.getBufStart() // hidden input token/name/..
// );
char *hurts = "No";
if ( cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider )
hurts = "Yes";
sb.safePrintf(//"<TABLE border=0>"
//"<TR><TD valign=top>"
"<table id=stats border=0 cellpadding=5>"
"<tr>"
"<td><b>Crawl Status Code:</b></td>"
"<td>%li</td>"
"</tr>"
"<tr>"
"<td><b>Crawl Status Msg:</b></td>"
"<td>%s</td>"
"</tr>"
//"<tr>"
//"<td><b>Rounds Completed:</td>"
//"<td>%li</td>"
//"</tr>"
"<tr>"
"<td><b>Has Urls Ready to Spider:</b></td>"
"<td>%s</td>"
"</tr>"
// this will have to be in crawlinfo too!
//"<tr>"
//"<td><b>pages indexed</b>"
//"<td>%lli</td>"
//"</tr>"
"<tr>"
"<td><nobr><b>URLs Harvested</b> "
"(may include dups)</nobr></td>"
"<td>%lli</td>"
"</tr>"
//"<tr>"
//"<td><b>URLs Examined</b></td>"
//"<td>%lli</td>"
//"</tr>"
"<tr>"
"<td><b>Page Crawl Attempts</b></td>"
"<td>%lli</td>"
"</tr>"
"<tr>"
"<td><b>Page Crawl Successes</b></td>"
"<td>%lli</td>"
"</tr>"
, crawlStatus
, tmp.getBufStart()
//, cr->m_spiderRoundNum
//, cr->m_globalCrawlInfo.m_hasUrlsReadyToSpider
, hurts
, cr->m_globalCrawlInfo.m_urlsHarvested
//, cr->m_globalCrawlInfo.m_urlsConsidered
, cr->m_globalCrawlInfo.m_pageDownloadAttempts
, cr->m_globalCrawlInfo.m_pageDownloadSuccesses
);
char tmp3[64];
struct tm *timeStruct;
timeStruct = localtime((time_t *)&cr->m_diffbotCrawlStartTime);
// Jan 01 1970 at 10:30:00
strftime ( tmp3,64 , "%b %d %Y at %H:%M:%S",timeStruct);
sb.safePrintf("<tr><td><b>Collection Created</b></td>"
"<td>%s (local time)</td></tr>",tmp3);
// print link to embed the code in their own site
SafeBuf embed;
embed.htmlEncode(sb.getBufStart()+savedLen1,
savedLen2-savedLen1,
false); // encodePoundSign #?
// convert single quotes to double quotes for php's echo '...'; command
embed.replaceChar('\'','\"');
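// for illustration, the PHP pane below ends up holding something like:
//   echo '<script type="text/javascript"> ... <div id=widget123 ...></div>';
// since the markup sits inside echo's single quotes, any single quote left
// in it would terminate the string early, hence the conversion above.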
sb.safePrintf("<tr>"
"<td valign=top>"
"<a onclick=\""
"var dd=document.getElementById('hcode');"
"if ( dd.style.display=='none' ) "
"dd.style.display=''; "
"else "
"dd.style.display='none';"
"\" style=color:blue;>"
"<u>"
"show Widget HTML code"
"</u>"
"</a>"
"</td><td>"
"<div id=hcode style=display:none;"
"max-width:800px;>"
"%s"
"</div>"
"</td></tr>"
, embed.getBufStart() );
sb.safePrintf("<tr>"
"<td valign=top>"
"<a onclick=\""
"var dd=document.getElementById('pcode');"
"if ( dd.style.display=='none' ) "
"dd.style.display=''; "
"else "
"dd.style.display='none';"
"\" style=color:blue;>"
"<u>"
"show Widget PHP code"
"</u>"
"</a>"
"</td>"
"<td>"
"<div id=pcode style=display:none;"
"max-width:800px;>"
"<i>"
"echo '"
"%s"
"';"
"</i>"
"</div>"
"</td></tr>"
, embed.getBufStart() );
sb.safePrintf("</table>\n\n");
}
// end the right table pane
if ( fmt == FORMAT_HTML ) {
sb.safePrintf("</TD></TR></TABLE>");
}
//if ( fmt != FORMAT_JSON )
// // wrap up the form, print a submit button
// g_pages.printAdminBottom ( &sb );
return g_httpServer.sendDynamicPage (socket,
sb.getBufStart(),
sb.length(),
0); // cachetime
}