// open-source-search-engine/PageAddUrl.cpp
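// Serves the /addurl page: cleans and validates the submitted url,
// rate-limits submissions per IP (iptop) via canSubmit(), optionally
// requires a turing test, then adds a SpiderRequest to spiderdb with Msg4
// and prints a status page back to the submitter.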

#include "gb-include.h"
#include "Pages.h"
#include "Collectiondb.h"
#include "HashTable.h"
#include "Msg4.h"
#include "TuringTest.h"
#include "AutoBan.h"
//#include "CollectionRec.h"
//#include "Links.h"
#include "Users.h"
#include "HashTableT.h"
#include "Spider.h"
static bool sendReply ( void *state , bool addUrlEnabled );
static bool canSubmit (unsigned long h, long now, long maxUrlsPerIpDom);
static void addedStuff ( void *state );
void resetPageAddUrl ( ) ;
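// State2 holds a per-engine link buffer (gb, google, yahoo, msn, ask -- see
// State1::m_state2); the scrape path that filled these is commented out
// below, so the buffers are only zeroed here and never filled.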
class State2 {
public:
Url m_url;
char *m_buf;
long m_bufLen;
long m_bufMaxLen;
};
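// per-request state for one add url submission; passed through the
// asynchronous Msg4 add and freed in sendReply()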
class State1 {
public:
Msg4 m_msg4;
TcpSocket *m_socket;
bool m_isAdmin;
char m_coll[MAX_COLL_LEN+1];
bool m_goodAnswer;
bool m_doTuringTest;
long m_ufuLen;
char m_ufu[MAX_URL_LEN];
long m_urlLen;
char m_url[MAX_URL_LEN];
char m_username[MAX_USER_SIZE];
bool m_strip;
bool m_spiderLinks;
bool m_forceRespider;
// buf filled by the links coming from google, msn, yahoo, etc
State2 m_state2[5]; // gb, goog, yahoo, msn, ask
long m_numSent;
long m_numReceived;
//long m_raw;
SpiderRequest m_sreq;
};
// only allow one add url (Msg4 add) to be in progress at a time
static bool s_inprogress = false;
// . returns false if blocked, true otherwise
// . sets g_errno on error
bool sendPageAddUrl ( TcpSocket *s , HttpRequest *r ) {
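// CGI parameters handled below: u=<url to add>, c=<collection>,
// strip=1 (strip session ids), spiderLinks=1 (harvest links from the page),
// force=1 (force a respider) and, for local/admin requests,
// ufu=<url of a file of urls>.
// e.g. GET /addurl?u=http://www.example.com/&c=main&strip=1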
// . get fields from cgi field of the requested url
// . get the search query
long urlLen = 0;
char *url = r->getString ( "u" , &urlLen , NULL /*default*/);
// see if they provided a url of a file of urls if they did not
// provide a url to add directly
//bool isAdmin = g_collectiondb.isAdmin ( r , s );
bool isAdmin = r->getIsLocal();
long ufuLen = 0;
char *ufu = NULL;
if ( isAdmin )
// get the url of a file of urls (ufu)
ufu = r->getString ( "ufu" , &ufuLen , NULL );
// can't be too long, that's obnoxious
if ( urlLen > MAX_URL_LEN || ufuLen > MAX_URL_LEN ) {
g_errno = EBUFTOOSMALL;
g_msg = " (error: url too long)";
return g_httpServer.sendErrorReply(s,500,"url too long");
}
// get the collection
long collLen = 0;
char *coll = r->getString("c",&collLen);
if ( ! coll || ! coll[0] ) {
//coll = g_conf.m_defaultColl;
coll = g_conf.getDefaultColl( r->getHost(), r->getHostLen() );
collLen = gbstrlen(coll);
}
// get collection rec
CollectionRec *cr = g_collectiondb.getRec ( coll );
// bitch if no collection rec found
if ( ! cr ) {
g_errno = ENOCOLLREC;
g_msg = " (error: no collection)";
return g_httpServer.sendErrorReply(s,500,"no coll rec");
}
// . make sure the ip is not banned
// . we may also have an exclusive list of IPs for private collections
if ( ! cr->hasSearchPermission ( s ) ) {
g_errno = ENOPERM;
g_msg = " (error: permission denied)";
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno));
}
// make a new state
State1 *st1 ;
try { st1 = new (State1); }
catch ( ... ) {
g_errno = ENOMEM;
log("PageAddUrl: new(%i): %s",
sizeof(State1),mstrerror(g_errno));
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); }
mnew ( st1 , sizeof(State1) , "PageAddUrl" );
// save socket and isAdmin
st1->m_socket = s;
st1->m_isAdmin = isAdmin;
// assume no url buf yet, set below
//st1->m_ubuf = NULL;
//st1->m_ubufAlloc = NULL;
//st1->m_metaList = NULL;
// save the url
st1->m_url[0] = '\0';
st1->m_urlLen = 0;
if ( url ) {
// normalize and add www. if it needs it
Url uu;
uu.set ( url , gbstrlen(url) , true );
// remove >'s i guess and store in st1->m_url[] buffer
st1->m_urlLen=cleanInput ( st1->m_url,
MAX_URL_LEN,
uu.getUrl(),
uu.getUrlLen() );
// point to that as the url "buf" to add
//st1->m_ubuf = st1->m_url;
//st1->m_ubufSize = urlLen;
//st1->m_ubufAlloc = NULL; // do not free it!
}
// save the "ufu" (url of file of urls)
st1->m_ufu[0] = '\0';
st1->m_ufuLen = ufuLen;
if ( ufu && ufuLen > 0 ) memcpy ( st1->m_ufu , ufu , ufuLen );
st1->m_ufu[ufuLen] = '\0';
st1->m_doTuringTest = cr->m_doTuringTest;
char *username = g_users.getUsername(r);
st1->m_username[0] = '\0';
if ( username ) strncpy ( st1->m_username , username , MAX_USER_SIZE-1 );
st1->m_username[MAX_USER_SIZE-1] = '\0';
//st1->m_user = g_pages.getUserType ( s , r );
st1->m_spiderLinks = true;
st1->m_strip = true;
//st1->m_raw = r->getLong("raw",0);
// init state2
for ( long i = 0; i < 5; i++ ){
st1->m_state2[i].m_buf = NULL;
st1->m_state2[i].m_bufLen = 0;
st1->m_state2[i].m_bufMaxLen = 0;
}
// save the collection name in the State1 class
if ( collLen > MAX_COLL_LEN ) collLen = MAX_COLL_LEN;
strncpy ( st1->m_coll , coll , collLen );
st1->m_coll [ collLen ] = '\0';
// assume they answered turing test correctly
st1->m_goodAnswer = true;
// if addurl is turned off, just print "disabled" msg
if ( ! g_conf.m_addUrlEnabled ) return sendReply ( st1 , false );
// can also be turned off in the collection rec
if ( ! cr->m_addUrlEnabled ) return sendReply ( st1 , false );
// or if in read-only mode
if ( g_conf.m_readOnlyMode ) return sendReply ( st1 , false );
// cannot add if another add url request from here is still in progress
if ( s_inprogress ) return sendReply ( st1 , true );
// use now as the spiderTime
// get ip of submitter
//unsigned long h = ipdom ( s->m_ip );
// . use top 2 bytes now, some isps have large blocks
// . if this causes problems, then they can do pay for inclusion
unsigned long h = iptop ( s->m_ip );
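// if the request carries a valid auto-ban "code", let it override the
// throttling IP with the user-supplied "uip" (used by raw/proxied queries)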
long codeLen = 0;
char* code = r->getString("code", &codeLen);
if(g_autoBan.hasCode(code, codeLen, s->m_ip)) {
long uipLen = 0;
char* uip = r->getString("uip",&uipLen);
long hip = 0;
//use the uip when we have a raw query to test if
//we can submit
if(uip) {
hip = atoip(uip, uipLen);
h = iptop( hip );
}
}
st1->m_strip = r->getLong("strip",0);
// Remember, for cgi, if the box is not checked, then it is not
// reported in the request, so set default return value to 0
long spiderLinks = r->getLong("spiderLinks",-1);
// also support all lowercase like PageInject.cpp uses
if ( spiderLinks == -1 )
spiderLinks = r->getLong("spiderlinks",0);
// remember it so the "spider links" checkbox below reflects the request
st1->m_spiderLinks = spiderLinks;
// . should we force it into spiderdb even if already in there
// . use to manually update spider times for a url
// . however, will not remove old scheduled spider times
// . mdw: made force on the default
st1->m_forceRespider = r->getLong("force",1); // default was 0
long now = getTimeGlobal();
// . allow 1 submit every 1 hour
// . restrict by submitter domain ip
if ( ! st1->m_isAdmin &&
! canSubmit ( h , now , cr->m_maxAddUrlsPerIpDomPerDay ) ) {
// return error page
g_errno = ETOOEARLY;
return sendReply ( st1 , true );
}
//st1->m_query = r->getString( "qts", &st1->m_queryLen );
// check it, if turing test is enabled for this collection
if ( ! st1->m_isAdmin && cr->m_doTuringTest &&
! g_turingTest.isHuman(r) ) {
// log note so we know it didn't make it
g_msg = " (error: bad answer)";
//log("PageAddUrl:: addurl failed for %s : bad answer",
// iptoa(s->m_ip));
st1->m_goodAnswer = false;
return sendReply ( st1 , true /*addUrl enabled?*/ );
}
//if ( st1->m_queryLen > 0 )
// return getPages( st1 );
// if no url given, just print a blank page
if ( ! url ) return sendReply ( st1 , true );
//
// make a SpiderRequest
//
SpiderRequest *sreq = &st1->m_sreq;
// reset it
sreq->reset();
// make the probable docid
long long probDocId = g_titledb.getProbableDocId ( st1->m_url );
// make one up, like we do in PageReindex.cpp
long firstIp = (probDocId & 0xffffffff);
// avoid ips of 0 or -1
if ( firstIp == 0 || firstIp == -1 ) firstIp = 1;
// . now fill it up
// . TODO: calculate the other values... lazy!!! (m_isRSSExt,
// m_siteNumInlinks,...)
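// . m_fakeFirstIp marks m_firstIp as synthesized from the probable docid;
//   (assumption) the spider resolves the real IP when it downloads the url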
sreq->m_isNewOutlink = 1;
sreq->m_isAddUrl = 1;
sreq->m_addedTime = now;
sreq->m_fakeFirstIp = 1;
sreq->m_probDocId = probDocId;
sreq->m_firstIp = firstIp;
sreq->m_hopCount = 0;
// its valid if root
Url uu; uu.set ( st1->m_url );
if ( uu.isRoot() ) sreq->m_hopCountValid = true;
// too big?
//long len = st1->m_urlLen;
// the url! includes \0
strcpy ( sreq->m_url , st1->m_url );
// call this to set sreq->m_dataSize now
sreq->setDataSize();
// make the key dude -- after setting url
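// (assumption) spiderdb is sharded by firstIp, so keying on this fake but
// stable firstIp keeps all add url requests for the url on one host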
sreq->setKey ( firstIp , 0LL, false );
// need a fake first ip lest we core!
//sreq->m_firstIp = (pdocId & 0xffffffff);
// how to set m_firstIp? i guess addurl can be throttled independently
// of the other urls??? use the hash of the domain for it!
long dlen;
char *dom = getDomFast ( st1->m_url , &dlen );
// fake it for this...
//sreq->m_firstIp = hash32 ( dom , dlen );
// sanity
if ( ! dom ) {
g_errno = EBADURL;
return sendReply ( st1 , true );
}
// shortcut
Msg4 *m = &st1->m_msg4;
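// (assumption) Msg4 buffers the record and forwards it to the hosts that
// own this spiderdb shard; addedStuff() is called back when the add completes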
// now add that to spiderdb using msg4
if ( ! m->addMetaList ( (char *)sreq ,
sreq->getRecSize() ,
coll ,
st1 , // state
addedStuff ,
MAX_NICENESS ,
RDB_SPIDERDB ) )
// we blocked
return false;
// send back the reply
return sendReply ( st1 , true );
}
void addedStuff ( void *state ) {
State1 *st1 = (State1 *)state;
// otherwise call gotResults which returns false if blocked, true else
// and sets g_errno on error
sendReply ( st1 , true );
}
bool sendReply ( void *state , bool addUrlEnabled ) {
// allow others to add now
s_inprogress = false;
// get the state properly
State1 *st1 = (State1 *) state;
// in order to see what sites are being added log it, then we can
// more easily remove sites from sitesearch.gigablast.com that are
// being added but not being searched
log(LOG_INFO,"http: add url %s (%s)",st1->m_url ,mstrerror(g_errno));
// extract info from state
TcpSocket *s = st1->m_socket;
bool isAdmin = st1->m_isAdmin;
char *url = NULL;
if ( st1->m_urlLen ) url = st1->m_url;
// do not print the url if it is empty or just "http://"
bool printUrl = true;
if ( st1->m_urlLen == 0 || st1->m_url[0] == '\0' ) printUrl = false;
if ( st1->m_urlLen == 7 && ! strncasecmp(st1->m_url,"http://",7) )
printUrl = false;
// page is not more than 32k
char buf[1024*32+MAX_URL_LEN*2];
SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2);
//char rawbuf[1024*8];
//SafeBuf rb(rawbuf, 1024*8);
//rb.safePrintf("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
//rb.safePrintf("<status>\n");
//CollectionRec *cr = g_collectiondb.getRec ( st1->m_coll );
// collection name
char tt [ 128 ];
tt[0] = '\0';
if ( st1->m_coll[0] != '\0' && ! isAdmin )
sprintf ( tt , " for %s", st1->m_coll );
// the bg colors and style
g_pages.printColors (&sb);
sb.safePrintf ( "<title>Gigablast Add a Url</title>"
"<table><tr><td valign=bottom><a href=/>"
//"<img width=200 length=25 border=0 src=/logo2.gif></a>"
"<img width=210 height=25 border=0 src=/logo2.gif></a>"
"&nbsp;&nbsp;</font></td><td><font size=+1>"
"<b>Add Url%s</td></tr></table>" , tt );
// watch out for NULLs
if ( ! url ) url = "http://";
// blank out url if adding a url of a file of urls
// if ( st1->m_ufu ) url = "http://";
// if there was an error let them know
char msg[MAX_URL_LEN + 1024];
char *pm = "";
if ( g_errno ) {
if ( g_errno == ETOOEARLY ) {
pm = "Error. 100 urls have "
"already been submitted by "
"this IP address for the last 24 hours. "
"<a href=/addurlerror.html>Explanation</a>.";
log("addurls: Failed for user at %s: "
"quota breeched.", iptoa(s->m_ip));
//rb.safePrintf("Error. %li urls have "
// "already been submitted by "
// "this IP address for the "
// "last 24 hours. ",
// cr->m_maxAddUrlsPerIpDomPerDay);
}
else {
sprintf ( msg ,"Error adding url(s): <b>%s[%i]</b>",
mstrerror(g_errno) , g_errno);
pm = msg;
//rb.safePrintf("Error adding url(s): %s[%i]",
// mstrerror(g_errno) , g_errno);
}
}
else {
if ( ! addUrlEnabled ) {//g_conf.m_addUrlEnabled )
pm = "<font color=#ff0000>"
"Sorry, this feature is temporarily disabled. "
"Please try again later.</font>";
if ( st1->m_urlLen )
log("addurls: failed for user at %s: "
"add url is disabled. "
"Enable add url on the "
"Master Controls page and "
"on the Spider Controls page for "
"this collection.",
iptoa(s->m_ip));
//rb.safePrintf("Sorry, this feature is temporarily "
// "disabled. Please try again later.");
}
else if ( s_inprogress ) {
pm = "Add url busy. Try again later.";
log("addurls: Failed for user at %s: "
"busy adding another.", iptoa(s->m_ip));
//rb.safePrintf("Add url busy. Try again later.");
}
// did they fail the turing test?
else if ( ! st1->m_goodAnswer ) {
pm = "<font color=#ff0000>"
"Oops, you did not enter the 4 large letters "
"you see below. Please try again.</font>";
//rb.safePrintf("could not add the url"
// " because the turing test"
// " is enabled.");
}
// otherwise, report the result of the add
else if ( url && ! st1->m_ufu[0] && url[0] && printUrl ) {
sprintf ( msg ,"<u>%s</u> added to spider "
"queue "
"successfully", url );
//rb.safePrintf("%s added to spider "
// "queue successfully", url );
pm = msg;
url = "http://";
}
else if ( st1->m_ufu[0] ) {
sprintf ( msg ,"urls in <u>%s</u> "
"added to spider queue "
"successfully", st1->m_ufu );
//rb.safePrintf("urls in %s added to spider "
// "queue successfully", url );
pm = msg;
url = "http://";
}
else {
sprintf(msg,"Add the url you want:");
//rb.safePrintf("Add the url you want:");
pm = msg;
url = "http://";
}
//else
// pm = "Don't forget to <a href=/gigaboost.html>"
// "Gigaboost</a> your URL.";
}
// TODO: show them a list of the urls they added
// print the addUrl page in here with a status msg
sb.safePrintf (
"<br><br><br><center>"
"<b>%s</b>" // the url msg
"<br><br>"
"<FORM method=get action=/addurl>"
"<input type=text name=u value=\"%s\" size=50> "
"<input type=submit value=\"add url\" border=0><br>",pm,url);
// if we're coming from local ip print the collection box
if ( isAdmin )
sb.safePrintf (
"\n"
"<br><b>or specify the url of a "
"file of urls to add:</b>"
"<br>\n"
"<input type=text name=ufu size=50> "
"<input type=submit value=\"add file\" border=0><br>"
"<br>"
//"<br><b>or a query to scrape from major engines:</b>"
//"<br>\n"
// qts = query to scrape
//"<input type=text name=qts size=49> "
//"<input type=submit value=\"add query\" border=0><br>"
//"<br>"
"<br><b>collection to add to:</b> "
"<input type=text name=c size=20 value=\"%s\">"
"<br><br>\n",
st1->m_coll );
// otherwise hide it
else
sb.safePrintf ( "<input type=hidden name=c value=\"%s\">" ,
st1->m_coll );
char *ss = "";
if ( st1->m_strip ) ss =" checked";
sb.safePrintf ("<br>"
"<input type=checkbox name=strip value=1%s> "
"strip sessionids<br>", ss );
sb.safePrintf("<br>\n");
//Adding spider links box
char *sl = "";
if ( st1->m_spiderLinks ) sl =" checked";
sb.safePrintf ("<input type=checkbox name=spiderLinks value=1%s> "
"spider (harvest) links from page<br><br>\n", sl );
if ( ! s_inprogress && addUrlEnabled && st1->m_doTuringTest ) {
g_turingTest.printTest(&sb);
}
// . print the url box, etc...
// . assume user is always forcing their url
// sprintf ( p ,
// "<br><br>"
// "<input type=checkbox name=force value=1 checked> "
// "force respider<br>" );
//p += gbstrlen ( p );
/*
sprintf ( p ,
"<br>"
"<a href=/?redir="
"http://www.searchengineguide.com/submit/gigablast.html>"
"<b>Search Engine Marketing News</b></a><br>"
"If you would like to stay up to date with the "
"latest articles on using search engines to market "
"your web site, we recommend subscribing to the "
"Search Engine Marketing weekly newsletter. Once a "
"week, a digest of articles from the top search "
"engine marketing experts is delivered straight to "
"your inbox for free.<br><br>");
p += gbstrlen(p);
*/
// print the final tail
g_pages.printTail ( &sb, st1->m_isAdmin ); // local?
// clear g_errno, if any, so our reply send goes through
g_errno = 0;
//bool raw = st1->m_raw;
// free the buffer
//if ( st1->m_ubufAlloc )
// mfree ( st1->m_ubufAlloc , st1->m_ubufAllocSize,"pau");
//if ( st1->m_metaList )
// mfree ( st1->m_metaList , st1->m_metaListAllocSize,"pau");
// nuke state
mdelete ( st1 , sizeof(State1) , "PageAddUrl" );
delete (st1);
// . send this page
// . encapsulates in html header and tail
// . make a Mime
// . i thought we need -2 for cacheTime, but i guess not
//rb.safePrintf("</status>\n");
//if(raw) return g_httpServer.sendDynamicPage (s,
// rb.getBufStart(),
// rb.length(),
// -1/*cachetime*/,
// false, // POSTREply?
// "text/xml"// content type
// );
return g_httpServer.sendDynamicPage (s, sb.getBufStart(),
sb.length(),
-1/*cachetime*/);
}
// we get like 100k submissions a day!!!
static HashTable s_htable;
static bool s_init = false;
static long s_lastTime = 0;
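// canSubmit(): in-memory rate limiter. Counts add url submissions per IP
// "top" (top two bytes of the IP, see iptop() above) in s_htable and rejects
// further submissions once an IP exceeds maxAddUrlsPerIpDomPerDay; the table
// is cleared every 24 hours or when it nears capacity.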
bool canSubmit ( unsigned long h , long now , long maxAddUrlsPerIpDomPerDay ) {
// . sometimes no limit
// . 0 means no limit because if they don't want any submission they
// can just turn off add url and we want to avoid excess
// troubleshooting for why a url can't be added
if ( maxAddUrlsPerIpDomPerDay <= 0 ) return true;
// init the table
if ( ! s_init ) {
s_htable.set ( 50000 );
s_init = true;
}
// clean out table every 24 hours
if ( now - s_lastTime > 24*60*60 ) {
s_lastTime = now;
s_htable.clear();
}
// . if table almost full clean out ALL slots
// . TODO: just clean out oldest slots
if ( s_htable.getNumSlotsUsed() > 47000 ) s_htable.clear ();
// . how many times has this IP domain submitted?
// . allow up to maxAddUrlsPerIpDomPerDay submissions per day
long n = s_htable.getValue ( h );
// if over 24hr limit then bail
if ( n >= maxAddUrlsPerIpDomPerDay ) return false;
// otherwise, inc it
n++;
// add to table, will replace old values
s_htable.addKey ( h , n );
return true;
}
void resetPageAddUrl ( ) {
s_htable.reset();
}