mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 20:27:43 +03:00
367 lines
9.9 KiB
C++
367 lines
9.9 KiB
C++
#include "gb-include.h"
|
|
|
|
#include "Pages.h"
|
|
#include "Collectiondb.h"
|
|
#include "Msg4.h"
|
|
#include "Spider.h"
|
|
|
|
// . file-local helpers, defined further down in this file
// . sendReply() renders the add-url page for the State1 passed in "state"
static bool sendReply  ( void *state , bool addUrlEnabled );
// . callback handed to Msg4::addMetaList(), "state" is the State1
static void addedStuff ( void *state );
|
|
|
|
class State1 {
|
|
public:
|
|
Msg4 m_msg4;
|
|
TcpSocket *m_socket;
|
|
|
|
long m_urlLen;
|
|
char m_url[MAX_URL_LEN];
|
|
|
|
bool m_strip;
|
|
bool m_spiderLinks;
|
|
|
|
long m_numSent;
|
|
long m_numReceived;
|
|
SpiderRequest m_sreq;
|
|
};
|
|
|
|
// . returns false if blocked, true otherwise
|
|
// . sets g_errno on error
|
|
// . add url page for admin, users use sendPageAddUrl() in PageRoot.cpp
|
|
bool sendPageAddUrl2 ( TcpSocket *s , HttpRequest *r ) {
|
|
// . get fields from cgi field of the requested url
|
|
// . get the search query
|
|
long urlLen = 0;
|
|
char *url = r->getString ( "u" , &urlLen , NULL /*default*/);
|
|
// also try "url" and "urls"
|
|
if ( ! url ) url = r->getString ( "url" , &urlLen , NULL );
|
|
if ( ! url ) url = r->getString ( "urls" , &urlLen , NULL );
|
|
|
|
// see if they provided a url of a file of urls if they did not
|
|
// provide a url to add directly
|
|
|
|
// can't be too long, that's obnoxious
|
|
if ( urlLen > MAX_URL_LEN ) {
|
|
g_errno = EBUFTOOSMALL;
|
|
g_msg = " (error: url too long)";
|
|
return g_httpServer.sendErrorReply(s,500,"url too long");
|
|
}
|
|
|
|
// get collection rec
|
|
CollectionRec *cr = g_collectiondb.getRec ( r );
|
|
// bitch if no collection rec found
|
|
if ( ! cr ) {
|
|
g_errno = ENOCOLLREC;
|
|
g_msg = " (error: no collection)";
|
|
return g_httpServer.sendErrorReply(s,500,"no coll rec");
|
|
}
|
|
|
|
|
|
// make a new state
|
|
State1 *st1 ;
|
|
try { st1 = new (State1); }
|
|
catch ( ... ) {
|
|
g_errno = ENOMEM;
|
|
log("PageAddUrl: new(%i): %s",
|
|
sizeof(State1),mstrerror(g_errno));
|
|
return g_httpServer.sendErrorReply(s,500,mstrerror(g_errno)); }
|
|
mnew ( st1 , sizeof(State1) , "PageAddUrl" );
|
|
|
|
|
|
st1->m_socket = s;
|
|
|
|
// assume no url buf yet, set below
|
|
//st1->m_ubuf = NULL;
|
|
//st1->m_ubufAlloc = NULL;
|
|
//st1->m_metaList = NULL;
|
|
|
|
// save the url
|
|
st1->m_url[0] = '\0';
|
|
if ( url ) {
|
|
// normalize and add www. if it needs it
|
|
Url uu;
|
|
uu.set ( url , gbstrlen(url) , true );
|
|
// remove >'s i guess and store in st1->m_url[] buffer
|
|
st1->m_urlLen=cleanInput ( st1->m_url,
|
|
MAX_URL_LEN,
|
|
uu.getUrl(),
|
|
uu.getUrlLen() );
|
|
// point to that as the url "buf" to add
|
|
//st1->m_ubuf = st1->m_url;
|
|
//st1->m_ubufSize = urlLen;
|
|
//st1->m_ubufAlloc = NULL; // do not free it!
|
|
}
|
|
|
|
st1->m_spiderLinks = true;
|
|
st1->m_strip = true;
|
|
|
|
// or if in read-only mode
|
|
if ( g_conf.m_readOnlyMode ) return sendReply ( st1 , false );
|
|
|
|
st1->m_strip = r->getLong("strip",0);
|
|
// Remember, for cgi, if the box is not checked, then it is not
|
|
// reported in the request, so set default return value to 0
|
|
long spiderLinks = r->getLong("spiderLinks",-1);
|
|
// also support all lowercase like PageInject.cpp uses
|
|
if ( spiderLinks == -1 )
|
|
spiderLinks = r->getLong("spiderlinks",0);
|
|
|
|
// . should we force it into spiderdb even if already in there
|
|
// . use to manually update spider times for a url
|
|
// . however, will not remove old scheduled spider times
|
|
// . mdw: made force on the default
|
|
//st1->m_forceRespider = r->getLong("force",1); // 0);
|
|
|
|
// if no url given, just print a blank page
|
|
if ( ! url ) return sendReply ( st1 , true );
|
|
|
|
|
|
//
|
|
// make a SpiderRequest
|
|
//
|
|
|
|
SpiderRequest *sreq = &st1->m_sreq;
|
|
// set the SpiderRequest from this add url
|
|
if ( ! sreq->setFromAddUrl ( st1->m_url ) ) {
|
|
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
|
// send reply back with g_errno set if this returned false
|
|
return sendReply ( st1 , true );
|
|
}
|
|
|
|
|
|
|
|
// shortcut
|
|
Msg4 *m = &st1->m_msg4;
|
|
// now add that to spiderdb using msg4
|
|
if ( ! m->addMetaList ( (char *)sreq ,
|
|
sreq->getRecSize() ,
|
|
cr->m_coll ,
|
|
st1 , // state
|
|
addedStuff ,
|
|
MAX_NICENESS ,
|
|
RDB_SPIDERDB ) )
|
|
// we blocked
|
|
return false;
|
|
|
|
// send back the reply
|
|
return sendReply ( st1 , true );
|
|
}
|
|
|
|
// . callback passed to Msg4::addMetaList() by sendPageAddUrl2()
// . "state" is the State1 of the request, just forward it to sendReply()
//   with add url enabled
void addedStuff ( void *state ) {
	sendReply ( state , true );
}
|
|
|
|
bool sendReply ( void *state , bool addUrlEnabled ) {
|
|
// allow others to add now
|
|
//s_inprogress = false;
|
|
// get the state properly
|
|
State1 *st1 = (State1 *) state;
|
|
// in order to see what sites are being added log it, then we can
|
|
// more easily remove sites from sitesearch.gigablast.com that are
|
|
// being added but not being searched
|
|
log(LOG_INFO,"http: add url %s (%s)",st1->m_url ,mstrerror(g_errno));
|
|
// extract info from state
|
|
TcpSocket *s = st1->m_socket;
|
|
char *url = NULL;
|
|
if ( st1->m_urlLen ) url = st1->m_url;
|
|
// re-null it out if just http://
|
|
bool printUrl = true;
|
|
if ( st1->m_urlLen == 0 ) printUrl = false;
|
|
if ( ! st1->m_url ) printUrl = false;
|
|
if (st1->m_urlLen==7&&st1->m_url&&!strncasecmp(st1->m_url,"http://",7))
|
|
printUrl = false;
|
|
// page is not more than 32k
|
|
char buf[1024*32+MAX_URL_LEN*2];
|
|
SafeBuf sb(buf, 1024*32+MAX_URL_LEN*2);
|
|
|
|
//char rawbuf[1024*8];
|
|
//SafeBuf rb(rawbuf, 1024*8);
|
|
//rb.safePrintf("<?xml version=\"1.0\" encoding=\"utf-8\"?>\n");
|
|
//rb.safePrintf("<status>\n");
|
|
//CollectionRec *cr = g_collectiondb.getRec ( st1->m_coll );
|
|
|
|
// collection name
|
|
|
|
char tt [ 128 ];
|
|
tt[0] = '\0';
|
|
|
|
// the bg colors and style
|
|
g_pages.printColors (&sb);
|
|
sb.safePrintf ( "<title>Gigablast Add a Url</title>"
|
|
"<table><tr><td valign=bottom><a href=/>"
|
|
//"<img width=200 length=25 border=0 src=/logo2.gif></a>"
|
|
"<img width=210 height=25 border=0 src=/logo2.gif></a>"
|
|
" </font></td><td><font size=+1>"
|
|
"<b>Add Url%s</td></tr></table>" , tt );
|
|
// watch out for NULLs
|
|
if ( ! url ) url = "http://";
|
|
|
|
// if there was an error let them know
|
|
char msg[MAX_URL_LEN + 1024];
|
|
char *pm = "";
|
|
if ( g_errno ) {
|
|
sprintf ( msg ,"Error adding url(s): <b>%s[%i]</b>",
|
|
mstrerror(g_errno) , g_errno);
|
|
pm = msg;
|
|
//rb.safePrintf("Error adding url(s): %s[%i]",
|
|
// mstrerror(g_errno) , g_errno);
|
|
}
|
|
else {
|
|
if ( url && printUrl && url[0] ) {
|
|
sprintf ( msg ,"<u>%s</u> added to spider "
|
|
"queue "
|
|
"successfully", url );
|
|
//rb.safePrintf("%s added to spider "
|
|
// "queue successfully", url );
|
|
}
|
|
else {
|
|
sprintf(msg,"Add the url you want:");
|
|
//rb.safePrintf("Add the url you want:");
|
|
}
|
|
|
|
pm = msg;
|
|
url = "http://";
|
|
//else
|
|
// pm = "Don't forget to <a href=/gigaboost.html>"
|
|
// "Gigaboost</a> your URL.";
|
|
}
|
|
|
|
|
|
|
|
// print the add url table
|
|
sb.safePrintf (
|
|
"<br><br><br><center>"
|
|
"<b>%s</b>" // the url msg
|
|
"<br><br>"
|
|
"<FORM method=post action=/addurl>"
|
|
|
|
"<table %s>"
|
|
"<tr class=hdrow><td colspan=2>"
|
|
"<center>"
|
|
//"<font size=+1>"
|
|
"<b>"
|
|
"Inject URL</b>"
|
|
//"</font>"
|
|
"</td></tr>\n\n"
|
|
|
|
"<tr class=poo><td><b>url</b>"
|
|
"<br>"
|
|
"<font size=-2>"
|
|
|
|
"Submit requests for Gigablast to index certain urls. "
|
|
"They must match the patterns you have specified in "
|
|
"the <a href=/admin/sites>spider sites</a> list. "
|
|
"You can override that behavior on the "
|
|
"<a href=/admin/scheduler>spider scheduler</a> by "
|
|
"telling Gigablast to always index manually added "
|
|
"urls. If your url does not index as you expect you "
|
|
"can check it's history. " // (spiderdb lookup)
|
|
"Added urls will have a "
|
|
"<a href=/admin/scheduler#hopcount>hopcount</a> of 0. "
|
|
"The add url api is described on the "
|
|
"<a href=/admin/api>api</a> page. "
|
|
|
|
"</font>"
|
|
"</td></tr>"
|
|
"<tr><td colspan=2>"
|
|
|
|
//"<input type=text name=url value=\"%s\" size=50> "
|
|
"<textarea cols=80 rows=20 name=urls>"
|
|
"%s"
|
|
"</textarea>"
|
|
|
|
, pm // msg
|
|
, TABLE_STYLE
|
|
, url // submitted urls
|
|
);
|
|
|
|
|
|
/*
|
|
sb.safePrintf (
|
|
"\n"
|
|
|
|
"<br><b>or specify the url of a "
|
|
"file of urls to add:</b>"
|
|
"<br>\n"
|
|
"<input type=text name=ufu size=50> "
|
|
"<input type=submit value=\"add file\" border=0><br>"
|
|
"<br>"
|
|
|
|
//"<br><b>or a query to scrape from major engines:</b>"
|
|
//"<br>\n"
|
|
// qts = query to scrape
|
|
//"<input type=text name=qts size=49> "
|
|
//"<input type=submit value=\"add query\" border=0><br>"
|
|
//"<br>"
|
|
|
|
"<br><b>collection to add to:</b> "
|
|
"<input type=text name=c size=20 value=\"%s\">"
|
|
"<br><br>\n",
|
|
st1->m_coll );
|
|
*/
|
|
|
|
// upload a file of urls to add
|
|
sb.safePrintf ( "<br>"
|
|
"<input "
|
|
"size=20 "
|
|
"type=file "
|
|
"name=\"Upload file of urls\">"
|
|
"</td></tr>"
|
|
);
|
|
|
|
|
|
char *ss = "";
|
|
if ( st1->m_strip ) ss =" checked";
|
|
sb.safePrintf ("<tr><td>"
|
|
"<input type=checkbox name=strip value=1%s>"
|
|
"</td><td>"
|
|
"strip sessionids"
|
|
"</td></tr>"
|
|
, ss );
|
|
|
|
|
|
// adding spider links box
|
|
char *sl = "";
|
|
if ( st1->m_spiderLinks ) sl =" checked";
|
|
sb.safePrintf ("<tr><td>"
|
|
"<input type=checkbox name=spiderLinks value=1%s>"
|
|
"</td><td>"
|
|
"spider (harvest) links from page"
|
|
"</td></tr>"
|
|
, sl );
|
|
|
|
|
|
sb.safePrintf(
|
|
"<tr><td colspan=2>"
|
|
"<input type=submit value=\"add urls\" border=0>"
|
|
"</td></tr>"
|
|
"</table>"
|
|
);
|
|
|
|
|
|
// . print the url box, etc...
|
|
// . assume user is always forcing their url
|
|
// sprintf ( p ,
|
|
// "<br><br>"
|
|
// "<input type=checkbox name=force value=1 checked> "
|
|
// "force respider<br>" );
|
|
//p += gbstrlen ( p );
|
|
|
|
// print the final tail
|
|
g_pages.printTail ( &sb, true ); // admin?
|
|
// clear g_errno, if any, so our reply send goes through
|
|
g_errno = 0;
|
|
|
|
// nuke state
|
|
mdelete ( st1 , sizeof(State1) , "PageAddUrl" );
|
|
delete (st1);
|
|
|
|
return g_httpServer.sendDynamicPage (s,
|
|
sb.getBufStart(),
|
|
sb.length(),
|
|
-1 ); // cachetime
|
|
}
|