2013-08-03 00:12:24 +04:00
# include "gb-include.h"
2014-06-15 19:26:27 +04:00
# include "PageInject.h"
2013-08-03 00:12:24 +04:00
# include "HttpServer.h"
# include "Pages.h"
# include "Users.h"
# include "XmlDoc.h"
# include "PageParser.h"
# include "Repair.h"
2013-09-27 08:41:05 +04:00
# include "PageCrawlBot.h"
2014-06-12 04:24:33 +04:00
# include "HttpRequest.h"
//
// HTML INJECTION PAGE CODE
//
2013-08-03 00:12:24 +04:00
// Forward declaration. Builds the http reply for an injection request from
// the Msg7 passed as "state", sends it and frees the Msg7.
static bool sendReply ( void *state );

// Void-returning adapter so sendReply() can be handed to APIs that expect a
// "void (*)(void *)" completion callback.
static void sendReplyWrapper ( void *state ) {
	// the bool result only matters to synchronous callers
	(void)sendReply ( state );
}
// . returns false if blocked, true otherwise
// . sets g_errno on error
2014-06-12 04:24:33 +04:00
// . we are called by Parms::sendPageGeneric() to handle this request
// which was called by Pages.cpp's sendDynamicReply() when it calls
// pg->function() which is called by HttpServer::sendReply(s,r) when it
// gets an http request
// . so "hr" is on the stack in HttpServer::requestHandler() which calls
// HttpServer::sendReply() so we gotta copy it here
// Handles an http injection request on "sock".
// . returns false if it blocked (a reply is sent later from a callback),
//   otherwise returns whatever the reply-sending call returned
// . allocates a Msg7 as the request state; sendReply() (or the scrape
//   callback) is responsible for freeing it
bool sendPageInject ( TcpSocket *sock , HttpRequest *hr ) {

	// injection can be switched off globally
	if ( ! g_conf.m_injectionEnabled ) {
		g_errno = EBADENGINEER;
		log("inject: injection disabled");
		return g_httpServer.sendErrorReply(sock,500,"injection is "
						   "disabled by "
						   "the administrator in "
						   "the master "
						   "controls");
	}

	// non-html (api) requests must supply a collection via "c".
	// this check is done BEFORE the Msg7 is allocated so that bailing
	// out here cannot leak the state.
	char format = hr->getReplyFormat();
	if ( format != FORMAT_HTML && ! hr->getString("c",NULL) ) {
		g_errno = ENOCOLLREC;
		char *msg = mstrerror(g_errno);
		return g_httpServer.sendErrorReply(sock,g_errno,msg,NULL);
	}

	// make a new state
	Msg7 *msg7;
	try { msg7 = new (Msg7); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("PageInject: new(%i): %s",
		    (int)sizeof(Msg7),mstrerror(g_errno));
		return g_httpServer.sendErrorReply(sock,500,mstrerror(g_errno));
	}
	mnew ( msg7 , sizeof(Msg7) , "PageInject" );

	msg7->m_socket = sock;

	// this fills in the GigablastRequest from the http request so all
	// the parms we need are set (the original notes it also sets
	// gr->m_hr)
	GigablastRequest *gr = &msg7->m_gr;
	g_parms.setGigablastRequest ( sock , hr , gr );

	// if content is "" make it NULL so XmlDoc will download it.
	// if user really wants empty content they can put a space in there.
	// TODO: update help then...
	if ( gr->m_content && ! gr->m_content[0] )
		gr->m_content = NULL;
	if ( gr->m_contentFile && ! gr->m_contentFile[0] )
		gr->m_contentFile = NULL;
	if ( gr->m_contentDelim && ! gr->m_contentDelim[0] )
		gr->m_contentDelim = NULL;

	// sections have not been fetched yet for this request
	gr->m_gotSections = false;

	// a delimiter without any content to split is meaningless
	char *content = gr->m_content;
	if ( ! content ) content = gr->m_contentFile;
	if ( ! content ) gr->m_contentDelim = NULL;

	// the collection must exist
	CollectionRec *cr = g_collectiondb.getRec ( gr->m_coll );
	if ( ! cr ) {
		// g_errno is set so sendReply() returns an error response
		g_errno = ENOCOLLREC;
		return sendReply ( msg7 );
	}

	// a scrape request?
	if ( gr->m_queryToScrape && gr->m_queryToScrape[0] ) {
		msg7->m_linkDedupTable.set(4,0,512,NULL,0,false,0,"ldtab");
		// false means it blocked and the scrape callback replies
		if ( ! msg7->scrapeQuery ( ) ) return false;
		return sendReply ( msg7 );
	}

	// if no url do not inject, just render the page
	if ( ! gr->m_url || gr->m_url[0] == '\0' )
		return sendReply ( msg7 );

	// only master or collection admins may inject
	bool isMasterAdmin = g_conf.isMasterAdmin ( sock , hr );
	bool isCollAdmin   = g_conf.isCollAdmin   ( sock , hr );
	if ( ! isMasterAdmin && ! isCollAdmin ) {
		g_errno = ENOPERM;
		return sendReply ( msg7 );
	}

	// sendReplyWrapper() is called when the inject completes
	if ( ! msg7->inject ( msg7 , sendReplyWrapper ) )
		return false;

	// it did not block, so reply right away
	return sendReply ( msg7 );
}
bool sendReply ( void * state ) {
// get the state properly
Msg7 * msg7 = ( Msg7 * ) state ;
2014-06-12 04:24:33 +04:00
GigablastRequest * gr = & msg7 - > m_gr ;
2013-08-03 00:12:24 +04:00
// extract info from state
2014-06-12 04:24:33 +04:00
TcpSocket * sock = gr - > m_socket ;
2013-08-03 00:12:24 +04:00
XmlDoc * xd = & msg7 - > m_xd ;
// log it
//if ( msg7->m_url[0] ) xd->logIt();
// msg7 has the docid for what we injected, iff g_errno is not set
2014-10-30 22:36:39 +03:00
//int64_t docId = msg7->m_msg7.m_docId;
2014-11-11 01:45:11 +03:00
//int32_t hostId = msg7->m_msg7.m_hostId;
2014-10-30 22:36:39 +03:00
int64_t docId = xd - > m_docId ;
2014-11-11 01:45:11 +03:00
int32_t hostId = 0 ; //msg7->m_msg7.m_hostId;
2014-07-07 01:13:00 +04:00
// set g_errno to index code
if ( xd - > m_indexCodeValid & & xd - > m_indexCode & & ! g_errno )
g_errno = xd - > m_indexCode ;
char format = gr - > m_hr . getReplyFormat ( ) ;
// no url parm?
if ( ! g_errno & & ! gr - > m_url & & format ! = FORMAT_HTML )
g_errno = EMISSINGINPUT ;
2014-07-23 00:11:21 +04:00
if ( g_errno & & g_errno ! = EDOCUNCHANGED ) {
2014-11-11 01:45:11 +03:00
int32_t save = g_errno ;
2014-07-07 01:13:00 +04:00
mdelete ( msg7 , sizeof ( Msg7 ) , " PageInject " ) ;
delete ( msg7 ) ;
g_errno = save ;
char * msg = mstrerror ( g_errno ) ;
return g_httpServer . sendErrorReply ( sock , save , msg , NULL ) ;
}
2014-07-23 00:11:21 +04:00
char abuf [ 320 ] ;
SafeBuf am ( abuf , 320 , 0 , false ) ;
2014-07-23 01:23:41 +04:00
am . setLabel ( " injbuf " ) ;
2014-07-23 00:11:21 +04:00
char * ct = NULL ;
2014-07-07 01:13:00 +04:00
// a success reply, include docid and url i guess
if ( format = = FORMAT_XML ) {
2014-07-23 00:11:21 +04:00
am . safePrintf ( " <response> \n " ) ;
2014-11-11 01:45:11 +03:00
am . safePrintf ( " \t <statusCode>% " INT32 " </statusCode> \n " ,
( int32_t ) g_errno ) ;
2014-07-23 00:11:21 +04:00
am . safePrintf ( " \t <statusMsg><![CDATA[ " ) ;
am . cdataEncode ( mstrerror ( g_errno ) ) ;
am . safePrintf ( " ]]></statusMsg> \n " ) ;
2014-11-11 01:45:11 +03:00
am . safePrintf ( " \t <docId>% " INT64 " </docId> \n " , xd - > m_docId ) ;
2014-07-23 00:11:21 +04:00
if ( gr - > m_getSections ) {
SafeBuf * secBuf = xd - > getInlineSectionVotingBuf ( ) ;
am . safePrintf ( " \t <htmlSrc><![CDATA[ " ) ;
if ( secBuf - > length ( ) )
am . cdataEncode ( secBuf - > getBufStart ( ) ) ;
am . safePrintf ( " ]]></htmlSrc> \n " ) ;
}
am . safePrintf ( " </response> \n " ) ;
ct = " text/xml " ;
2014-07-07 01:13:00 +04:00
}
if ( format = = FORMAT_JSON ) {
2014-07-23 00:11:21 +04:00
am . safePrintf ( " { \" response \" :{ \n " ) ;
2014-11-11 01:45:11 +03:00
am . safePrintf ( " \t \" statusCode \" :% " INT32 " , \n " , ( int32_t ) g_errno ) ;
2014-07-23 00:11:21 +04:00
am . safePrintf ( " \t \" statusMsg \" : \" " ) ;
am . jsonEncode ( mstrerror ( g_errno ) ) ;
am . safePrintf ( " \" , \n " ) ;
2014-11-11 01:45:11 +03:00
am . safePrintf ( " \t \" docId \" :% " INT64 " , \n " , xd - > m_docId ) ;
2014-07-23 00:11:21 +04:00
if ( gr - > m_getSections ) {
SafeBuf * secBuf = xd - > getInlineSectionVotingBuf ( ) ;
am . safePrintf ( " \t \" htmlSrc \" : \" " ) ;
if ( secBuf - > length ( ) )
am . jsonEncode ( secBuf - > getBufStart ( ) ) ;
am . safePrintf ( " \" , \n " ) ;
}
// subtract ",\n"
am . m_length - = 2 ;
am . safePrintf ( " \n } \n } \n " ) ;
ct = " application/json " ;
}
if ( format = = FORMAT_XML | | format = = FORMAT_JSON ) {
2014-07-07 05:53:05 +04:00
mdelete ( msg7 , sizeof ( Msg7 ) , " PageInject " ) ;
delete ( msg7 ) ;
2014-07-23 00:11:21 +04:00
return g_httpServer . sendDynamicPage ( sock ,
am . getBufStart ( ) ,
am . length ( ) ,
0 ,
false ,
ct ) ;
2014-07-07 01:13:00 +04:00
}
2013-08-03 00:12:24 +04:00
//
// debug
//
/*
// now get the meta list, in the process it will print out a
// bunch of junk into msg7->m_pbuf
if ( xd - > m_docId ) {
char * metalist = xd - > getMetaList ( 1 , 1 , 1 , 1 , 1 , 1 ) ;
if ( ! metalist | | metalist = = ( void * ) - 1 ) { char * xx = NULL ; * xx = 0 ; }
// print it out
SafeBuf * pbuf = & msg7 - > m_sbuf ;
xd - > printDoc ( pbuf ) ;
bool status = g_httpServer . sendDynamicPage ( msg7 - > m_socket ,
pbuf - > getBufStart ( ) ,
pbuf - > length ( ) ,
- 1 , //cachtime
false , //postreply?
NULL , //ctype
- 1 , //httpstatus
NULL , //cookie
" utf-8 " ) ;
// delete the state now
mdelete ( st , sizeof ( Msg7 ) , " PageInject " ) ;
delete ( st ) ;
// return the status
return status ;
}
*/
//
// end debug
//
2014-06-15 19:26:27 +04:00
char * url = gr - > m_url ;
2013-08-03 00:12:24 +04:00
// . if we're talking w/ a robot he doesn't care about this crap
// . send him back the error code (0 means success)
2014-12-04 19:29:17 +03:00
if ( url & & gr - > m_shortReply ) {
2014-01-10 09:13:41 +04:00
char buf [ 1024 * 32 ] ;
2013-08-03 00:12:24 +04:00
char * p = buf ;
// return docid and hostid
if ( ! g_errno ) p + = sprintf ( p ,
2014-11-11 01:45:11 +03:00
" 0,docId=% " INT64 " ,hostId=% " INT32 " , " ,
2013-08-03 00:12:24 +04:00
docId , hostId ) ;
// print error number here
2014-11-11 01:45:11 +03:00
else p + = sprintf ( p , " % " INT32 " ,0,0, " , ( int32_t ) g_errno ) ;
2013-08-03 00:12:24 +04:00
// print error msg out, too or "Success"
p + = sprintf ( p , " %s " , mstrerror ( g_errno ) ) ;
mdelete ( msg7 , sizeof ( Msg7 ) , " PageInject " ) ;
delete ( msg7 ) ;
2014-06-12 04:24:33 +04:00
return g_httpServer . sendDynamicPage ( sock , buf , gbstrlen ( buf ) ,
2013-08-03 00:12:24 +04:00
- 1 /*cachetime*/ ) ;
}
2014-01-10 09:13:41 +04:00
SafeBuf sb ;
2013-08-03 00:12:24 +04:00
// print admin bar
2014-06-12 04:24:33 +04:00
g_pages . printAdminTop ( & sb , sock , & gr - > m_hr ) ;
// print a response msg if rendering the page after a submission
if ( g_errno )
sb . safePrintf ( " <center>Error injecting url: <b>%s[%i]</b> "
" </center> " ,
mstrerror ( g_errno ) , g_errno ) ;
else if ( ( gr - > m_url & & gr - > m_url [ 0 ] ) | |
( gr - > m_queryToScrape & & gr - > m_queryToScrape [ 0 ] ) )
2014-06-15 19:26:27 +04:00
sb . safePrintf ( " <center><b>Sucessfully injected %s "
" </center><br> "
, xd - > m_firstUrl . m_url
) ;
2014-06-12 04:24:33 +04:00
// print the table of injection parms
g_parms . printParmTable ( & sb , sock , & gr - > m_hr ) ;
2013-08-03 00:12:24 +04:00
// clear g_errno, if any, so our reply send goes through
g_errno = 0 ;
// calculate buffer length
2014-11-11 01:45:11 +03:00
//int32_t bufLen = p - buf;
2013-08-03 00:12:24 +04:00
// nuke state
mdelete ( msg7 , sizeof ( Msg7 ) , " PageInject " ) ;
delete ( msg7 ) ;
// . send this page
// . encapsulates in html header and tail
// . make a Mime
// . i thought we need -2 for cacheTime, but i guess not
2014-06-12 04:24:33 +04:00
return g_httpServer . sendDynamicPage ( sock ,
2014-01-10 09:13:41 +04:00
sb . getBufStart ( ) ,
sb . length ( ) ,
- 1 /*cachetime*/ ) ;
2013-08-03 00:12:24 +04:00
}
2014-06-12 04:24:33 +04:00
//
// END HTML INJECTION PAGE CODE
//
2013-08-03 00:12:24 +04:00
// Constructs the injection state; all member setup is delegated to reset().
Msg7::Msg7 () { reset(); }
// Nothing to release explicitly here; the body is intentionally empty.
Msg7::~Msg7 () {}
2014-09-24 04:48:40 +04:00
//void Msg7::constructor () {
// reset();
//}
2014-09-21 07:12:28 +04:00
void Msg7 : : reset ( ) {
2013-08-03 00:12:24 +04:00
m_round = 0 ;
2014-06-15 20:10:00 +04:00
m_firstTime = true ;
m_fixMe = false ;
2014-06-16 01:57:38 +04:00
m_injectCount = 0 ;
2014-06-24 17:30:33 +04:00
m_start = NULL ;
2014-09-24 04:48:40 +04:00
m_sbuf . reset ( ) ;
2013-08-03 00:12:24 +04:00
}
2014-06-12 04:24:33 +04:00
// when XmlDoc::injectDoc() completes it calls this
void doneInjectingWrapper9 ( void * state ) {
2014-06-15 20:10:00 +04:00
2014-06-12 04:24:33 +04:00
Msg7 * msg7 = ( Msg7 * ) state ;
2014-06-15 20:10:00 +04:00
2014-09-21 07:12:28 +04:00
msg7 - > m_inUse = false ;
2014-11-11 01:45:11 +03:00
// int16_tcut
2014-07-23 00:11:21 +04:00
XmlDoc * xd = & msg7 - > m_xd ;
GigablastRequest * gr = & msg7 - > m_gr ;
if ( gr - > m_getSections & & ! gr - > m_gotSections ) {
// do not re-call
gr - > m_gotSections = true ;
// new callback now, same state
xd - > m_callback1 = doneInjectingWrapper9 ;
// and if it blocks internally, it will call
// getInlineSectionVotingBuf until it completes then it will
// call xd->m_callback
xd - > m_masterLoop = NULL ;
// get sections
SafeBuf * buf = xd - > getInlineSectionVotingBuf ( ) ;
// if it returns -1 wait for it to call wrapper10 when done
if ( buf = = ( void * ) - 1 ) return ;
// error?
if ( ! buf ) log ( " inject: error getting sections: %s " ,
mstrerror ( g_errno ) ) ;
}
2014-06-15 20:10:00 +04:00
loop :
// if we were injecting delimterized documents...
char * delim = gr - > m_contentDelim ;
if ( delim & & ! delim [ 0 ] ) delim = NULL ;
if ( delim & & msg7 - > m_start ) {
// do another injection. returns false if it blocks
if ( ! msg7 - > inject ( msg7 - > m_state , msg7 - > m_callback ) )
return ;
}
2014-06-26 16:43:03 +04:00
if ( msg7 - > m_start & & delim )
2014-06-15 20:54:08 +04:00
goto loop ;
2014-06-15 20:10:00 +04:00
// and we call the original caller
2014-06-12 04:24:33 +04:00
msg7 - > m_callback ( msg7 - > m_state ) ;
}
2013-08-03 00:12:24 +04:00
2014-07-09 23:25:23 +04:00
bool Msg7 : : inject ( char * coll ,
char * proxiedUrl ,
2014-11-11 01:45:11 +03:00
int32_t proxiedUrlLen ,
2014-07-09 23:25:23 +04:00
char * content ,
void * state ,
void ( * callback ) ( void * state ) ) {
GigablastRequest * gr = & m_gr ;
// reset THIS to defaults. use NULL for cr since mostly for SearchInput
g_parms . setToDefault ( ( char * ) gr , OBJ_GBREQUEST , NULL ) ;
// copy into safebufs in case the underlying data gets deleted.
gr - > m_tmpBuf1 . safeStrcpy ( coll ) ;
gr - > m_coll = gr - > m_tmpBuf1 . getBufStart ( ) ;
// copy into safebufs in case the underlying data gets deleted.
gr - > m_tmpBuf2 . safeMemcpy ( proxiedUrl , proxiedUrlLen ) ;
gr - > m_tmpBuf2 . nullTerm ( ) ;
gr - > m_url = gr - > m_tmpBuf2 . getBufStart ( ) ;
// copy into safebufs in case the underlying data gets deleted.
gr - > m_tmpBuf3 . safeStrcpy ( content ) ;
gr - > m_content = gr - > m_tmpBuf3 . getBufStart ( ) ;
gr - > m_hasMime = true ;
return inject ( state , callback ) ;
}
2013-09-27 21:04:46 +04:00
2014-09-24 04:48:40 +04:00
// returns false if would block
2014-09-24 23:40:39 +04:00
// bool Msg7::injectTitleRec ( void *state ,
// void (*callback)(void *state) ,
// CollectionRec *cr ) {
// Answers the udp request on "slot": an empty reply on success, or an
// error reply carrying g_errno if that is set.
static void sendReply ( UdpSlot *slot ) {
	if ( ! g_errno ) {
		g_udpServer.sendReply_ass ( NULL , 0 , NULL , 0 , slot );
		return;
	}
	g_udpServer.sendErrorReply ( slot , g_errno );
}
// when XmlDoc::indexDoc() completes it calls this
void doneInjectingWrapper10 ( void * state ) {
XmlDoc * xd = ( XmlDoc * ) state ;
UdpSlot * slot = ( UdpSlot * ) xd - > m_slot ;
2014-11-11 01:45:11 +03:00
int32_t err = g_errno ;
2014-09-24 23:40:39 +04:00
mdelete ( xd , sizeof ( XmlDoc ) , " PageInject " ) ;
delete ( xd ) ;
g_errno = err ;
sendReply ( slot ) ;
}
2014-11-11 01:45:11 +03:00
// Udp handler that imports one title rec.
// . the request buffer is a 4-byte collection number followed by the
//   title rec itself
// . always answers "slot" exactly once, either here or from
//   doneInjectingWrapper10() if indexing blocks
// . "netnice" is not used
void handleRequest7 ( UdpSlot *slot , int32_t netnice ) {

	char   *titleRec     = slot->m_readBuf;
	int32_t titleRecSize = slot->m_readBufSize;

	// we need at least the collnum prefix before we may read it
	if ( ! titleRec || titleRecSize < 4 ) {
		g_errno = EBADENGINEER;
		log("PageInject: import failed: request too small");
		sendReply ( slot );
		return;
	}

	int32_t collnum = *(int32_t *)titleRec;
	titleRec     += 4;
	titleRecSize -= 4;

	// the collnum came off the network so range check it before using
	// it as an index.
	// NOTE(review): assumes m_numRecs bounds m_recs[] - confirm
	CollectionRec *cr = NULL;
	if ( collnum >= 0 && collnum < g_collectiondb.m_numRecs )
		cr = g_collectiondb.m_recs[collnum];
	if ( ! cr ) {
		// the original sent a success reply here because g_errno
		// was never set
		g_errno = ENOCOLLREC;
		sendReply ( slot );
		return;
	}

	// only allocate once the request is known to be usable so the
	// error paths above have nothing to free
	XmlDoc *xd;
	try { xd = new (XmlDoc); }
	catch ( ... ) {
		g_errno = ENOMEM;
		log("PageInject: import failed: new(%i): %s",
		    (int)sizeof(XmlDoc),mstrerror(g_errno));
		sendReply ( slot );
		return;
	}
	mnew ( xd, sizeof(XmlDoc) , "PageInject" );

	// if injecting a titlerec from an import operation use set2()
	xd->set2 ( titleRec ,
		   titleRecSize ,
		   cr->m_coll ,
		   NULL , // pbuf
		   MAX_NICENESS ,
		   NULL ); // sreq

	// log it i guess
	log("inject: importing %s",xd->m_firstUrl.getUrl());

	// call this when done indexing. the doc is its own state.
	xd->m_state     = xd;
	xd->m_callback1 = doneInjectingWrapper10;
	xd->m_isImporting      = true;
	xd->m_isImportingValid = true;
	// hack: stash the slot so the callback can reply on it
	xd->m_slot = slot;

	// then index it. false means it blocked and the callback will
	// free the doc and reply.
	if ( ! xd->indexDoc ( ) )
		return;

	// it completed without blocking, so the callback will not run.
	// run the same cleanup by hand: free the doc (the original leaked
	// it here) and reply.
	doneInjectingWrapper10 ( xd );
}
// . returns false if blocked and callback will be called, true otherwise
// . sets g_errno on error
2014-06-12 04:24:33 +04:00
bool Msg7 : : inject ( void * state ,
void ( * callback ) ( void * state )
2014-11-11 01:45:11 +03:00
//int32_t spiderLinksDefault ,
2014-06-12 04:24:33 +04:00
//char *collOveride ) {
) {
2014-01-19 04:23:13 +04:00
2014-06-12 04:24:33 +04:00
GigablastRequest * gr = & m_gr ;
2014-01-19 04:23:13 +04:00
2014-06-12 04:24:33 +04:00
char * coll2 = gr - > m_coll ;
CollectionRec * cr = g_collectiondb . getRec ( coll2 ) ;
2014-01-19 04:23:13 +04:00
if ( ! cr ) {
g_errno = ENOCOLLREC ;
return true ;
}
2014-09-24 04:48:40 +04:00
m_state = state ;
m_callback = callback ;
2014-11-11 01:45:11 +03:00
// int16_tcut
2014-09-24 04:48:40 +04:00
XmlDoc * xd = & m_xd ;
2014-07-04 21:43:04 +04:00
if ( ! gr - > m_url ) {
log ( " inject: no url provied to inject " ) ;
g_errno = EBADURL ;
return true ;
}
2014-06-12 04:24:33 +04:00
//char *coll = cr->m_coll;
2013-08-03 00:12:24 +04:00
2014-05-16 02:10:57 +04:00
// test
//diffbotReply = "{\"request\":{\"pageUrl\":\"http://www.washingtonpost.com/2011/03/10/ABe7RaQ_moreresults.html\",\"api\":\"article\",\"version\":3},\"objects\":[{\"icon\":\"http://www.washingtonpost.com/favicon.ico\",\"text\":\"In Case You Missed It\nWeb Hostess Live: The latest from the Web (vForum, May 15, 2014; 3:05 PM)\nGot Plans: Advice from the Going Out Guide (vForum, May 15, 2014; 2:05 PM)\nWhat to Watch: TV chat with Hank Stuever (vForum, May 15, 2014; 1:10 PM)\nColor of Money Live (vForum, May 15, 2014; 1:05 PM)\nWeb Hostess Live: The latest from the Web (vForum, May 15, 2014; 12:25 PM)\nMichael Devine outdoor entertaining and design | Home Front (vForum, May 15, 2014; 12:20 PM)\nThe Answer Sheet: Education chat with Valerie Strauss (vForum, May 14, 2014; 2:00 PM)\nThe Reliable Source Live (vForum, May 14, 2014; 1:05 PM)\nAsk Tom: Rants, raves and questions on the DC dining scene (vForum, May 14, 2014; 12:15 PM)\nOn Parenting with Meghan Leahy (vForum, May 14, 2014; 12:10 PM)\nAsk Aaron: The week in politics (vForum, May 13, 2014; 3:05 PM)\nEugene Robinson Live (vForum, May 13, 2014; 2:05 PM)\nTuesdays with Moron: Chatological Humor Update (vForum, May 13, 2014; 12:00 PM)\nComPost Live with Alexandra Petri (vForum, May 13, 2014; 11:05 AM)\nAsk Boswell: Redskins, Nationals and Washington sports (vForum, May 12, 2014; 1:50 PM)\nAdvice from Slate's 'Dear Prudence' (vForum, May 12, 2014; 1:40 PM)\nDr. 
Gridlock (vForum, May 12, 2014; 1:35 PM)\nSwitchback: Talking Tech (vForum, May 9, 2014; 12:05 PM)\nThe Fix Live (vForum, May 9, 2014; 12:00 PM)\nWhat to Watch: TV chat with Hank Stuever (vForum, May 8, 2014; 1:10 PM)\nMore News\",\"title\":\"The Washington Post\",\"diffbotUri\":\"article|3|828850106\",\"pageUrl\":\"http://www.washingtonpost.com/2011/03/10/ABe7RaQ_moreresults.html\",\"humanLanguage\":\"en\",\"html\":\"<p>In Case You Missed It<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/web-hostess-140515-new.html\\\">Web Hostess Live: The latest from the Web<\\/a> <\\/p>\n<p>(vForum, May 15, 2014; 3:05 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/got-plans-05-15-2014.html\\\">Got Plans: Advice from the Going Out Guide<\\/a> <\\/p>\n<p>(vForum, May 15, 2014; 2:05 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/tv-chat-140515.html\\\">What to Watch: TV chat with Hank Stuever<\\/a> <\\/p>\n<p>(vForum, May 15, 2014; 1:10 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/color-of-money-live-20140515.html\\\">Color of Money Live<\\/a> <\\/p>\n<p>(vForum, May 15, 2014; 1:05 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/web-hostess-140515-new.html\\\">Web Hostess Live: The latest from the Web<\\/a> <\\/p>\n<p>(vForum, May 15, 2014; 12:25 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/home-front-0515.html\\\">Michael Devine outdoor entertaining and design | Home Front<\\/a> <\\/p>\n<p>(vForum, May 15, 2014; 12:20 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/the-answer-sheet-20140514.html\\\">The Answer Sheet: Education chat with Valerie Strauss<\\/a> <\\/p>\n<p>(vForum, May 14, 2014; 2:00 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/the-reliable-source-140514-new.html\\\">The Reliable Source Live<\\/a> <\\/p>\n<p>(vForum, May 14, 2014; 1:05 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/ask-tom-5-14-14.html\\\">Ask Tom: Rants, raves and questions on the DC dining scene 
<\\/a> <\\/p>\n<p>(vForum, May 14, 2014; 12:15 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/parenting-0514.html\\\">On Parenting with Meghan Leahy<\\/a> <\\/p>\n<p>(vForum, May 14, 2014; 12:10 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/post-politics-ask-aaron-051313.html\\\">Ask Aaron: The week in politics<\\/a> <\\/p>\n<p>(vForum, May 13, 2014; 3:05 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/opinion-focus-with-eugene-robinson-20140513.html\\\">Eugene Robinson Live<\\/a> <\\/p>\n<p>(vForum, May 13, 2014; 2:05 PM)<\\/p>\n<p> <a href=\\\"http://live.washingtonpost.com/gene-weingarten-140513.html
2013-08-03 00:12:24 +04:00
if ( g_repairMode ) { g_errno = EREPAIRING ; return true ; }
2014-06-16 01:57:38 +04:00
// this will be NULL if the "content" was empty or not given
2014-06-15 19:26:27 +04:00
char * content = gr - > m_content ;
2014-06-16 01:57:38 +04:00
// . try the uploaded file if nothing in the text area
// . this will be NULL if the "content" was empty or not given
2014-06-15 19:26:27 +04:00
if ( ! content ) content = gr - > m_contentFile ;
2014-06-15 20:10:00 +04:00
if ( m_firstTime ) {
m_firstTime = false ;
m_start = content ;
}
// save current start since we update it next
char * start = m_start ;
2014-06-15 20:54:08 +04:00
// if this is empty we are done
//if ( ! start )
// return true;
2014-06-15 20:10:00 +04:00
char * delim = gr - > m_contentDelim ;
if ( delim & & ! delim [ 0 ] ) delim = NULL ;
if ( m_fixMe ) {
// we had made the first delim char a \0 to index the
// previous document, now put it back to what it was
* m_start = * delim ;
// i guess unset this
m_fixMe = false ;
}
// if we had a delimeter...
if ( delim ) {
// we've saved m_start as "start" above,
// so find the next delimeter after it and set that to m_start
2014-06-15 20:54:08 +04:00
// add +1 to avoid infinite loop
2014-06-16 01:57:38 +04:00
m_start = strstr ( start + 1 , delim ) ;
2014-06-15 20:10:00 +04:00
// for injecting "start" set this to \0
if ( m_start ) {
// null term it
* m_start = ' \0 ' ;
// put back the original char on next round...?
m_fixMe = true ;
}
}
// this is the url of the injected content
m_injectUrlBuf . safeStrcpy ( gr - > m_url ) ;
bool modifiedUrl = false ;
// if we had a delimeter we must make a fake url
// if ( delim ) {
// // if user had a <url> or <doc> or <docid> field use that
// char *hint = strcasestr ( start , "<url>" );
// if ( hint ) {
// modifiedUrl = true;
// ...
// }
// }
// if we had a delimeter thus denoting multiple items/documents to
// be injected, we must create unique urls for each item.
if ( delim & & ! modifiedUrl ) {
// use hash of the content
2014-10-30 22:36:39 +03:00
int64_t ch64 = hash64n ( start , 0LL ) ;
2014-06-15 20:54:08 +04:00
// normalize it
Url u ; u . set ( gr - > m_url ) ;
// reset it
m_injectUrlBuf . reset ( ) ;
2014-06-16 01:57:38 +04:00
// by default append a -<ch64> to the provided url
2014-11-11 01:45:11 +03:00
m_injectUrlBuf . safePrintf ( " %s-% " UINT64 " " , u . getUrl ( ) , ch64 ) ;
2014-06-15 20:10:00 +04:00
}
2014-06-16 01:57:38 +04:00
// count them
m_injectCount + + ;
2014-09-21 07:12:28 +04:00
m_inUse = true ;
2014-06-15 20:10:00 +04:00
if ( ! xd - > injectDoc ( m_injectUrlBuf . getBufStart ( ) ,
2014-01-19 09:19:26 +04:00
cr ,
2014-06-15 20:10:00 +04:00
start , // content ,
2014-06-12 04:24:33 +04:00
gr - > m_diffbotReply ,
gr - > m_hasMime , // content starts with http mime?
gr - > m_hopCount ,
gr - > m_charset ,
gr - > m_deleteUrl ,
gr - > m_contentTypeStr , // text/html text/xml
gr - > m_spiderLinks ,
gr - > m_newOnly , // index iff new
this ,
2014-12-04 19:29:17 +03:00
doneInjectingWrapper9 ,
// extra shit
gr - > m_firstIndexed ,
gr - > m_lastSpidered ) )
2014-01-19 09:19:26 +04:00
// we blocked...
return false ;
2013-08-03 00:12:24 +04:00
2014-09-21 07:12:28 +04:00
m_inUse = false ;
2014-01-19 09:19:26 +04:00
return true ;
2013-08-03 00:12:24 +04:00
}
2014-01-19 09:19:26 +04:00
2013-08-03 00:12:24 +04:00
///////////////
//
// SCRAPE GOOGLE
//
// and inject the serps
//
///////////////
void doneInjectingLinksWrapper ( void * state ) {
Msg7 * msg7 = ( Msg7 * ) state ;
SafeBuf * sb = & msg7 - > m_sb ;
// copy the serps into ou rbuf
if ( ! g_errno ) {
// print header
if ( sb - > length ( ) = = 0 ) {
// print header of page
sb - > safePrintf ( " <?xml version= \" 1.0 \" "
" encoding= \" UTF-8 \" ?> \n "
" <response> \n " ) ;
}
// serp header
if ( msg7 - > m_round = = 1 )
sb - > safePrintf ( " \t <googleResults> \n " ) ;
else
sb - > safePrintf ( " \t <bingResults> \n " ) ;
// print results
sb - > safeMemcpy ( & msg7 - > m_xd . m_serpBuf ) ;
// end that
if ( msg7 - > m_round = = 1 )
sb - > safePrintf ( " \t </googleResults> \n " ) ;
else
sb - > safePrintf ( " \t </bingResults> \n " ) ;
}
// do bing now
if ( msg7 - > m_round = = 1 ) {
// return if it blocks
if ( ! msg7 - > scrapeQuery ( ) ) return ;
}
2014-07-30 06:51:41 +04:00
2013-08-03 00:12:24 +04:00
// otherwise, parse out the search results so steve can display them
if ( g_errno )
sb - > safePrintf ( " <error><![CDATA[%s]]></error> \n " ,
mstrerror ( g_errno ) ) ;
// print header of page
sb - > safePrintf ( " </response> \n " ) ;
// page is not more than 32k
//char buf[1024*32];
//char *p = buf;
// return docid and hostid
//p += sprintf ( p , "scraping status ");
// print error msg out, too or "Success"
//p += sprintf ( p , "%s", mstrerror(g_errno));
2014-07-30 06:51:41 +04:00
TcpSocket * sock = msg7 - > m_socket ;
g_httpServer . sendDynamicPage ( sock ,
2013-08-03 00:12:24 +04:00
sb - > getBufStart ( ) ,
sb - > length ( ) ,
- 1 /*cachetime*/ ) ;
// hopefully sb buffer is copied becaues this will free it:
mdelete ( msg7 , sizeof ( Msg7 ) , " PageInject " ) ;
delete ( msg7 ) ;
}
// . "uf" is printf url format to scrape with a %s for the query
// . example: uf="http://www.google.com/search?num=50&q=%s&scoring=d&filter=0";
bool Msg7 : : scrapeQuery ( ) {
// advance round now in case we return early
m_round + + ;
2014-06-12 04:24:33 +04:00
GigablastRequest * gr = & m_gr ;
2013-08-03 00:12:24 +04:00
// error?
2014-06-12 04:24:33 +04:00
char * qts = gr - > m_queryToScrape ;
if ( ! qts ) { char * xx = NULL ; * xx = 0 ; }
if ( gbstrlen ( qts ) > 500 ) {
2013-08-03 00:12:24 +04:00
g_errno = EQUERYTOOBIG ;
return true ;
}
// first encode the query
SafeBuf ebuf ;
2014-06-12 04:24:33 +04:00
ebuf . urlEncode ( qts ) ; // queryUNEncoded );
2014-07-30 06:51:41 +04:00
ebuf . nullTerm ( ) ;
2013-08-03 00:12:24 +04:00
char * uf ;
if ( m_round = = 1 )
// set to 1 for debugging
uf = " http://www.google.com/search?num=20& "
" q=%s&scoring=d&filter=0 " ;
//uf = "https://startpage.com/do/search?q=%s";
//uf = "http://www.google.com/"
// "/cse?cx=013269018370076798483%3A8eec3papwpi&"
// "ie=UTF-8&q=%s&"
// "num=20";
else
uf = " http://www.bing.com/search?q=%s " ;
// skip bing for now
//if ( m_round == 2 )
// return true;
//if ( m_round == 1 )
// return true;
// make the url we will download
char ubuf [ 2048 ] ;
sprintf ( ubuf , uf , ebuf . getBufStart ( ) ) ;
// log it
log ( " inject: SCRAPING %s " , ubuf ) ;
SpiderRequest sreq ;
sreq . reset ( ) ;
// set the SpiderRequest
strcpy ( sreq . m_url , ubuf ) ;
// . tell it to only add the hosts of each outlink for now!
// . that will be passed on to when XmlDoc calls Links::set() i guess
// . xd will not reschedule the scraped url into spiderdb either
sreq . m_isScraping = 1 ;
sreq . m_fakeFirstIp = 1 ;
2014-11-11 01:45:11 +03:00
int32_t firstIp = hash32n ( ubuf ) ;
2013-08-03 00:12:24 +04:00
if ( firstIp = = 0 | | firstIp = = - 1 ) firstIp = 1 ;
sreq . m_firstIp = firstIp ;
// parent docid is 0
sreq . setKey ( firstIp , 0LL , false ) ;
2014-06-12 04:24:33 +04:00
char * coll2 = gr - > m_coll ;
CollectionRec * cr = g_collectiondb . getRec ( coll2 ) ;
2013-08-03 00:12:24 +04:00
// forceDEl = false, niceness = 0
2014-06-12 04:24:33 +04:00
m_xd . set4 ( & sreq , NULL , cr - > m_coll , NULL , 0 ) ;
2013-08-03 00:12:24 +04:00
//m_xd.m_isScraping = true;
// download without throttling
//m_xd.m_throttleDownload = false;
// disregard this
m_xd . m_useRobotsTxt = false ;
// this will tell it to index ahrefs first before indexing
// the doc. but do NOT do this if we are from ahrefs.com
// ourselves to avoid recursive explosion!!
if ( m_useAhrefs )
m_xd . m_useAhrefs = true ;
2014-07-30 06:51:41 +04:00
m_xd . m_reallyInjectLinks = true ; //gr->m_injectLinks;
2013-08-03 00:12:24 +04:00
//
// rather than just add the links of the page to spiderdb,
// let's inject them!
//
m_xd . setCallback ( this , doneInjectingLinksWrapper ) ;
// niceness is 0
m_linkDedupTable . set ( 4 , 0 , 512 , NULL , 0 , false , 0 , " ldtab2 " ) ;
// do we actually inject the links, or just scrape?
if ( ! m_xd . injectLinks ( & m_linkDedupTable ,
NULL ,
this ,
doneInjectingLinksWrapper ) )
return false ;
// otherwise, just download the google/bing search results so we
// can display them in xml
//else if ( m_xd.getUtf8Content() == (char **)-1 )
// return false;
// print reply..
//printReply();
return true ;
}
2014-09-21 07:12:28 +04:00
///////////////////////////////////////
///////////////////////////////////////
// IMPORT CODE
///////////////////////////////////////
///////////////////////////////////////
//////
//
// BEGIN IMPORT TITLEDB FUNCTIONS
//
//////
// . injecting titledb files from other gb clusters into your collection
// . select the 'import' tab in the admin gui and enter the directory of
// the titledb files you want to import/inject.
// . it will scan that directory for all titledb files.
// . you can also set max simultaneous injections. set to auto so it
// will do 10 per host, up to like 100 max.
# define MAXINJECTSOUT 100
class ImportState {
public :
// available msg7s to use
2014-09-24 23:40:39 +04:00
class Multicast * m_ptrs ;
2014-11-11 01:45:11 +03:00
int32_t m_numPtrs ;
2014-09-21 07:12:28 +04:00
// collection we are importing INTO
collnum_t m_collnum ;
2014-10-30 22:36:39 +03:00
int64_t m_numIn ;
int64_t m_numOut ;
2014-09-21 07:12:28 +04:00
// bookmarking helpers
2014-10-30 22:36:39 +03:00
int64_t m_fileOffset ;
2014-11-11 01:45:11 +03:00
int32_t m_bfFileId ;
2014-09-21 07:12:28 +04:00
BigFile m_bf ;
bool m_loadedPlaceHolder ;
2014-10-30 22:36:39 +03:00
int64_t m_bfFileSize ;
2014-09-21 07:12:28 +04:00
2014-09-24 23:40:39 +04:00
class Multicast * getAvailMulticast ( ) ; // Msg7();
2014-09-21 07:12:28 +04:00
2014-09-24 04:48:40 +04:00
void saveFileBookMark ( ) ; //class Msg7 *msg7 );
2014-09-21 07:12:28 +04:00
2014-09-24 04:48:40 +04:00
bool setCurrentTitleFileAndOffset ( ) ;
2014-09-21 07:12:28 +04:00
ImportState ( ) ;
~ ImportState ( ) { reset ( ) ; }
bool importLoop ( ) ;
void reset ( ) ;
} ;
// . start with no multicast slots, nothing outstanding and no file open
// . every scalar member is given a defined value here:
//   m_loadedPlaceHolder is read by setCurrentTitleFileAndOffset() before
//   anything else writes it, so leaving it uninitialized made the
//   "load the bookmark" decision depend on garbage
ImportState::ImportState ( ) {
	m_numIn  = 0;
	m_numOut = 0;
	m_ptrs    = NULL;
	m_numPtrs = 0;
	// -1 means "no titledb file processed/opened yet"
	m_bfFileId   = -1;
	m_bfFileSize = -1;
	m_fileOffset = 0;
	// the bookmark file has not been consulted yet
	m_loadedPlaceHolder = false;
	// no collection assigned yet; resumeImports() sets the real one
	// right after construction
	m_collnum = -1;
}
void ImportState : : reset ( ) {
2014-11-11 01:45:11 +03:00
for ( int32_t i = 0 ; i < m_numPtrs ; i + + ) {
2014-09-24 23:40:39 +04:00
Multicast * mcast = & m_ptrs [ i ] ;
mcast - > destructor ( ) ;
2014-09-21 07:12:28 +04:00
//m_ptrs[i] = NULL;
}
2014-09-24 23:40:39 +04:00
mfree ( m_ptrs , MAXINJECTSOUT * sizeof ( Multicast ) , " ism7f " ) ;
2014-09-21 07:12:28 +04:00
m_ptrs = NULL ;
m_numPtrs = 0 ;
m_fileOffset = 0LL ;
m_bfFileId = - 2 ;
m_loadedPlaceHolder = false ;
}
2014-09-24 04:48:40 +04:00
// true once resumeImports() has done its scan of the collections;
// while it stays true resumeImports() returns immediately
static bool s_tried = false;

// . Parms.cpp calls this when the user clicks on "enable import loop"
//   for a collection
// . clearing the flag lets the next resumeImports() call scan the
//   collections again
void resetImportLoopFlag ( ) {
	s_tried = false;
}
2014-09-21 07:12:28 +04:00
// . call this when gb startsup
// . scan collections to see if any imports were active
2014-09-21 20:26:13 +04:00
// . returns false and sets g_errno on failure
2014-09-21 07:12:28 +04:00
bool resumeImports ( ) {
2014-09-24 04:48:40 +04:00
if ( s_tried ) return true ;
s_tried = true ;
2014-09-25 18:55:30 +04:00
if ( g_hostdb . m_hostId ! = 0 ) return true ;
2014-11-11 01:45:11 +03:00
for ( int32_t i = 0 ; i < g_collectiondb . m_numRecs ; i + + ) {
2014-09-21 07:12:28 +04:00
CollectionRec * cr = g_collectiondb . m_recs [ i ] ;
if ( ! cr ) continue ;
if ( ! cr - > m_importEnabled ) continue ;
// each import has its own state
// it contains a sequence of msg7s to do simulataneous
// injections
ImportState * is ;
try { is = new ( ImportState ) ; }
catch ( . . . ) {
g_errno = ENOMEM ;
2014-11-11 01:45:11 +03:00
log ( " PageInject: new(% " INT32 " ): %s " ,
( int32_t ) sizeof ( ImportState ) , mstrerror ( g_errno ) ) ;
2014-09-21 20:26:13 +04:00
return false ;
2014-09-21 07:12:28 +04:00
}
mnew ( is , sizeof ( ImportState ) , " isstate " ) ;
// assign to cr as well
2014-09-24 04:48:40 +04:00
cr - > m_importState = is ;
2014-09-21 07:12:28 +04:00
// and collnum
is - > m_collnum = cr - > m_collnum ;
// resume the import
is - > importLoop ( ) ;
}
return true ;
}
2014-09-24 04:48:40 +04:00
// . sets m_fileOffset and m_bf
// . returns false and sets g_errno on error
// . returns false if nothing to read too... but does not set g_errno
bool ImportState : : setCurrentTitleFileAndOffset ( ) {
// leave m_bf and m_fileOffset alone if there is more to read
if ( m_fileOffset < m_bfFileSize )
return true ;
2014-09-21 07:12:28 +04:00
2014-09-24 04:48:40 +04:00
CollectionRec * cr = g_collectiondb . getRec ( m_collnum ) ;
if ( ! cr ) return false ;
log ( " import: import finding next file " ) ;
// if ( m_offIsValid ) {
// //*off = m_fileOffset;
// return &m_bf;
// }
//m_offIsValid = true;
2014-09-21 07:12:28 +04:00
// look for titledb0001.dat etc. files in the
// workingDir/inject/ subdir
SafeBuf ddd ;
ddd . safePrintf ( " %sinject " , cr - > m_importDir . getBufStart ( ) ) ;
// now use the one provided. we should also provide the # of threads
if ( cr - > m_importDir . getBufStart ( ) & &
cr - > m_importDir . getBufStart ( ) [ 0 ] ) {
ddd . reset ( ) ;
ddd . safeStrcpy ( cr - > m_importDir . getBufStart ( ) ) ;
}
//
// assume we are the first filename
// set s_fileId to the minimum
//
Dir dir ;
dir . set ( ddd . getBufStart ( ) ) ;
2014-09-24 04:48:40 +04:00
if ( ! dir . open ( ) ) return false ;
// assume none
2014-11-11 01:45:11 +03:00
int32_t minFileId = - 1 ;
2014-09-24 04:48:40 +04:00
2014-09-21 07:12:28 +04:00
// getNextFilename() writes into this
2014-09-24 04:48:40 +04:00
char pattern [ 64 ] ; strcpy ( pattern , " titledb* " ) ;
2014-09-21 07:12:28 +04:00
char * filename ;
while ( ( filename = dir . getNextFilename ( pattern ) ) ) {
// filename must be a certain length
2014-11-11 01:45:11 +03:00
int32_t filenameLen = gbstrlen ( filename ) ;
2014-09-21 07:12:28 +04:00
// we need at least "titledb0001.dat"
if ( filenameLen < 15 ) continue ;
// ensure filename starts w/ our m_dbname
if ( strncmp ( filename , " titledb " , 7 ) ! = 0 )
continue ;
2014-09-24 04:48:40 +04:00
// skip if not .dat file
if ( ! strstr ( filename , " .dat " ) )
continue ;
2014-09-21 07:12:28 +04:00
// then a 4 digit number should follow
char * s = filename + 7 ;
if ( ! isdigit ( * ( s + 0 ) ) ) continue ;
if ( ! isdigit ( * ( s + 1 ) ) ) continue ;
if ( ! isdigit ( * ( s + 2 ) ) ) continue ;
if ( ! isdigit ( * ( s + 3 ) ) ) continue ;
// convert digit to id
2014-11-11 01:45:11 +03:00
int32_t id = atol ( s ) ;
2014-09-24 04:48:40 +04:00
// . do not accept files we've already processed
// . -1 means we haven't processed any yet
if ( m_bfFileId > = 0 & & id < = m_bfFileId ) continue ;
2014-09-21 07:12:28 +04:00
// the min of those we haven't yet processed/injected
2014-09-24 04:48:40 +04:00
if ( id < minFileId | | minFileId < 0 ) minFileId = id ;
2014-09-21 07:12:28 +04:00
}
// get where we left off
if ( ! m_loadedPlaceHolder ) {
// read where we left off from file if possible
char fname [ 256 ] ;
sprintf ( fname , " %slasttitledbinjectinfo.dat " , g_hostdb . m_dir ) ;
SafeBuf ff ;
ff . fillFromFile ( fname ) ;
if ( ff . length ( ) > 1 ) {
m_loadedPlaceHolder = true ;
// get the placeholder
sscanf ( ff . getBufStart ( )
2014-11-11 01:45:11 +03:00
, " % " UINT64 " ,% " INT32 " "
2014-09-21 07:12:28 +04:00
, & m_fileOffset
2014-09-24 04:48:40 +04:00
, & minFileId
2014-09-21 07:12:28 +04:00
) ;
}
}
2014-09-24 04:48:40 +04:00
// if no files! return false to indicate we are done
if ( minFileId = = - 1 ) return false ;
2014-09-21 07:12:28 +04:00
// set up s_bf then
2014-09-24 04:48:40 +04:00
//if ( m_bfFileId != minFileId ) {
SafeBuf tmp ;
2014-11-11 01:45:11 +03:00
tmp . safePrintf ( " titledb%04 " INT32 " -000.dat "
2014-09-24 04:48:40 +04:00
//,dir.getDirname()
, minFileId ) ;
m_bf . set ( dir . getDirname ( ) , tmp . getBufStart ( ) ) ;
if ( ! m_bf . open ( O_RDONLY ) ) {
log ( " inject: import: could not open %s%s for reading " ,
dir . getDirname ( ) , tmp . getBufStart ( ) ) ;
return false ;
2014-09-21 07:12:28 +04:00
}
2014-09-24 04:48:40 +04:00
m_bfFileId = minFileId ;
// reset ptr into file
//*off = 0;
// and set this
m_bfFileSize = m_bf . getFileSize ( ) ;
m_fileOffset = 0 ;
//}
2014-09-21 07:12:28 +04:00
2014-09-24 04:48:40 +04:00
log ( " import: importing from file %s " , m_bf . getFilename ( ) ) ;
2014-09-21 07:12:28 +04:00
2014-09-24 04:48:40 +04:00
return true ; //&m_bf;
2014-09-21 07:12:28 +04:00
}
2014-09-24 23:40:39 +04:00
// forward declaration: ImportState::importLoop() passes this to
// Multicast::send() as the reply callback. it is defined further down
// in this file.
void gotMulticastReplyWrapper ( void * state , void * state2 ) ;
2014-09-21 07:12:28 +04:00
//
// . ENTRY POINT FOR IMPORTING TITLEDB RECS FROM ANOTHER CLUSTER
// . when user clicks 'begin' in import page we come here..
// . so when that parm changes in Parms.cpp we sense that and call
// beginImport(CollectionRec *cr)
// . or on startup we call resumeImports to check each coll for
// an import in progress.
// . search for files named titledb*.dat
// . if none found just return
// . when msg7 inject competes it calls this
// . call this from sleep wrapper in Process.cpp
2014-09-24 04:48:40 +04:00
// . returns false if would block (outstanding injects), true otherwise
// . sets g_errno on error
2014-09-21 07:12:28 +04:00
bool ImportState : : importLoop ( ) {
CollectionRec * cr = g_collectiondb . getRec ( m_collnum ) ;
2014-09-25 18:55:30 +04:00
if ( ! cr | | g_hostdb . m_hostId ! = 0 ) {
2014-09-21 07:12:28 +04:00
// if coll was deleted!
2014-11-11 01:45:11 +03:00
log ( " import: collnum % " INT32 " deleted while importing into " ,
( int32_t ) m_collnum ) ;
2014-09-21 07:12:28 +04:00
//if ( m_numOut > m_numIn ) return true;
// delete the entire import state i guess
// what happens if we have a msg7 reply come back in?
// it should see the collrec is NULL and just fail.
mdelete ( this , sizeof ( ImportState ) , " impstate " ) ;
delete ( this ) ;
return true ;
}
INJECTLOOP :
2014-09-24 04:48:40 +04:00
// stop if waiting on outstanding injects
2014-10-30 22:36:39 +03:00
int64_t out = m_numOut - m_numIn ;
2014-09-24 04:48:40 +04:00
if ( out > = cr - > m_numImportInjects ) {
g_errno = 0 ;
return false ;
}
2014-09-26 07:33:42 +04:00
if ( ! cr - > m_importEnabled ) {
// wait for all to return
if ( out > 0 ) return false ;
// then delete it
2014-11-11 01:45:11 +03:00
log ( " import: collnum % " INT32 " import loop disabled " ,
( int32_t ) m_collnum ) ;
2014-09-26 07:33:42 +04:00
mdelete ( this , sizeof ( ImportState ) , " impstate " ) ;
delete ( this ) ;
return true ;
}
2014-09-21 07:12:28 +04:00
// scan each titledb file scanning titledb0001.dat first,
// titledb0003.dat second etc.
2014-10-30 22:36:39 +03:00
//int64_t offset = -1;
2014-09-24 04:48:40 +04:00
// . when offset is too big for current m_bigFile file then
// we go to the next and set offset to 0.
// . sets m_bf and m_fileOffset
if ( ! setCurrentTitleFileAndOffset ( ) ) { //cr , -1 );
log ( " import: import: no files to read " ) ;
//goto INJECTLOOP;
return true ;
}
2014-09-21 07:12:28 +04:00
// this is -1 if none remain!
2014-09-24 04:48:40 +04:00
if ( m_fileOffset = = - 1 ) {
log ( " import: import fileoffset is -1. done. " ) ;
return true ;
}
2014-09-21 07:12:28 +04:00
2014-10-30 22:36:39 +03:00
int64_t saved = m_fileOffset ;
2014-09-21 07:12:28 +04:00
2014-09-24 23:40:39 +04:00
//Msg7 *msg7;
2014-09-24 04:48:40 +04:00
//GigablastRequest *gr;
2014-09-24 23:40:39 +04:00
//SafeBuf *sbuf = NULL;
2014-09-21 07:12:28 +04:00
2014-11-11 01:45:11 +03:00
int32_t need = 12 ;
int32_t dataSize = - 1 ;
2014-09-24 23:40:39 +04:00
//XmlDoc xd;
key_t tkey ;
2014-09-24 04:48:40 +04:00
bool status ;
2014-09-24 23:40:39 +04:00
SafeBuf tmp ;
SafeBuf * sbuf = & tmp ;
2014-10-30 22:36:39 +03:00
int64_t docId ;
2014-11-11 01:45:11 +03:00
int32_t shardNum ;
int32_t key ;
2014-09-24 23:40:39 +04:00
Multicast * mcast ;
char * req ;
2014-11-11 01:45:11 +03:00
int32_t reqSize ;
2014-09-24 04:48:40 +04:00
if ( m_fileOffset > = m_bfFileSize ) {
2014-11-11 01:45:11 +03:00
log ( " inject: import: done processing file % " INT32 " %s " ,
2014-09-24 04:48:40 +04:00
m_bfFileId , m_bf . getFilename ( ) ) ;
goto nextFile ;
}
2014-09-21 07:12:28 +04:00
// read in title rec key and data size
2014-09-24 23:40:39 +04:00
status = m_bf . read ( & tkey , sizeof ( key_t ) , m_fileOffset ) ;
2014-09-21 07:12:28 +04:00
2014-09-24 04:48:40 +04:00
//if ( n != 12 ) goto nextFile;
if ( g_errno ) {
log ( " inject: import: reading file error: %s. advancing "
" to next file " , mstrerror ( g_errno ) ) ;
goto nextFile ;
}
m_fileOffset + = 12 ;
// if negative key, skip
if ( ( tkey . n0 & 0x01 ) = = 0 ) {
goto INJECTLOOP ;
}
2014-09-21 07:12:28 +04:00
// if non-negative then read in size
2014-09-24 04:48:40 +04:00
status = m_bf . read ( & dataSize , 4 , m_fileOffset ) ;
if ( g_errno ) {
log ( " main: failed to read in title rec "
" file. %s. Skipping file %s " ,
mstrerror ( g_errno ) , m_bf . getFilename ( ) ) ;
goto nextFile ;
}
m_fileOffset + = 4 ;
need + = 4 ;
need + = dataSize ;
2014-09-24 23:40:39 +04:00
need + = 4 ; // collnum, first 4 bytes
2014-09-24 04:48:40 +04:00
if ( dataSize < 0 | | dataSize > 500000000 ) {
log ( " main: could not scan in titledb rec of "
2014-11-11 01:45:11 +03:00
" corrupt dataSize of % " INT32 " . BAILING ENTIRE "
2014-09-24 04:48:40 +04:00
" SCAN of file %s " , dataSize , m_bf . getFilename ( ) ) ;
goto nextFile ;
2014-09-21 07:12:28 +04:00
}
2014-09-24 04:48:40 +04:00
//gr = &msg7->m_gr;
//XmlDoc *xd = getAvailXmlDoc();
2014-09-24 23:40:39 +04:00
//msg7 = getAvailMsg7();
mcast = getAvailMulticast ( ) ;
2014-09-24 04:48:40 +04:00
// if none, must have to wait for some to come back to us
2014-09-24 23:40:39 +04:00
if ( ! mcast ) {
2014-09-24 04:48:40 +04:00
// restore file offset
//m_fileOffset = saved;
// no, must have been a oom or something
2014-09-24 23:40:39 +04:00
log ( " import: import no mcast available " ) ;
2014-09-24 04:48:40 +04:00
return true ; //false;
}
// this is for holding a compressed titlerec
2014-09-24 23:40:39 +04:00
//sbuf = &mcast->m_sbuf;//&gr->m_sbuf;
2014-09-21 07:12:28 +04:00
// point to start of buf
2014-09-24 04:48:40 +04:00
sbuf - > reset ( ) ;
2014-09-21 07:12:28 +04:00
// ensure we have enough room
2014-09-24 04:48:40 +04:00
sbuf - > reserve ( need ) ;
2014-09-21 07:12:28 +04:00
2014-09-24 23:40:39 +04:00
// collnum first 4 bytes
2014-11-11 01:45:11 +03:00
sbuf - > pushLong ( ( int32_t ) m_collnum ) ;
2014-09-24 23:40:39 +04:00
2014-09-21 07:12:28 +04:00
// store title key
2014-09-24 04:48:40 +04:00
sbuf - > safeMemcpy ( & tkey , sizeof ( key_t ) ) ;
2014-09-21 07:12:28 +04:00
// then datasize if any. neg rec will have -1 datasize
if ( dataSize > = 0 )
2014-09-24 04:48:40 +04:00
sbuf - > pushLong ( dataSize ) ;
2014-09-21 07:12:28 +04:00
// then read data rec itself into it, compressed titlerec part
if ( dataSize > 0 ) {
// read in the titlerec after the key/datasize
2014-09-24 04:48:40 +04:00
status = m_bf . read ( sbuf - > getBuf ( ) ,
dataSize ,
m_fileOffset ) ;
if ( g_errno ) { // n != dataSize ) {
2014-09-21 07:12:28 +04:00
log ( " main: failed to read in title rec "
2014-09-24 04:48:40 +04:00
" file. %s. Skipping file %s " ,
mstrerror ( g_errno ) , m_bf . getFilename ( ) ) ;
// essentially free up this msg7 now
2014-09-24 23:40:39 +04:00
//msg7->m_inUse = false;
//msg7->reset();
2014-09-21 07:12:28 +04:00
goto nextFile ;
}
2014-09-24 04:48:40 +04:00
// advance
m_fileOffset + = dataSize ;
2014-09-21 07:12:28 +04:00
// it's good, count it
2014-09-24 04:48:40 +04:00
sbuf - > m_length + = dataSize ;
2014-09-21 07:12:28 +04:00
}
// set xmldoc from the title rec
//xd->set ( sbuf.getBufStart() );
//xd->m_masterState = NULL;
//xd->m_masterCallback ( titledbInjectLoop );
// we use this so we know where the doc we are injecting
// was in the foregien titledb file. so we can update our bookmark
// code.
2014-09-24 23:40:39 +04:00
mcast - > m_hackFileOff = saved ; //m_fileOffset;
mcast - > m_hackFileId = m_bfFileId ;
2014-09-21 07:12:28 +04:00
//
// inject a title rec buf this time, we are doing an import
// FROM A TITLEDB FILE!!!
//
//gr->m_titleRecBuf = &sbuf;
// break it down into gw
2014-09-24 04:48:40 +04:00
// xd.set2 ( sbuf.getBufStart() ,
// sbuf.length() , // max size
// cr->m_coll, // use our coll
// NULL , // pbuf for page parser
// 1 , // niceness
// NULL ); //sreq );
// // note it
// log("import: importing %s",xd.m_firstUrl.getUrl());
2014-09-21 07:12:28 +04:00
// now we can set gr for the injection
2014-09-24 04:48:40 +04:00
// TODO: inject the whole "sbuf" so we get sitenuminlinks etc
// all exactly the same...
// gr->m_url = xd.getFirstUrl()->getUrl();
// gr->m_queryToScrape = NULL;
// gr->m_contentDelim = 0;
// gr->m_contentTypeStr = g_contentTypeStrings [xd.m_contentType];
// gr->m_contentFile = NULL;
// gr->m_content = xd.ptr_utf8Content;
// gr->m_diffbotReply = NULL;
// gr->m_injectLinks = false;
// gr->m_spiderLinks = true;
2014-12-04 19:29:17 +03:00
// gr->m_shortReply = false;
2014-09-24 04:48:40 +04:00
// gr->m_newOnly = false;
// gr->m_deleteUrl = false;
// gr->m_recycle = true; // recycle content? or sitelinks?
// gr->m_dedup = false;
// gr->m_hasMime = false;
// gr->m_doConsistencyTesting = false;
// gr->m_getSections = false;
// gr->m_gotSections = false;
// gr->m_charset = xd.m_charset;
// gr->m_hopCount = xd.m_hopCount;
2014-09-21 07:12:28 +04:00
//
// point to next doc in the titledb file
//
2014-09-24 04:48:40 +04:00
//m_fileOffset += need;
2014-09-21 07:12:28 +04:00
2014-09-24 23:40:39 +04:00
// get docid from key
docId = g_titledb . getDocIdFromKey ( & tkey ) ;
// get shard that holds the titlerec for it
shardNum = g_hostdb . getShardNumFromDocId ( docId ) ;
// for selecting which host in the shard receives it
2014-11-11 01:45:11 +03:00
key = ( int32_t ) docId ;
2014-09-21 07:12:28 +04:00
m_numOut + + ;
// then index it. master callback will be called
//if ( ! xd->index() ) return false;
2014-09-24 23:40:39 +04:00
2014-09-21 07:12:28 +04:00
// TODO: make this forward the request to an appropriate host!!
2014-09-24 04:48:40 +04:00
// . gr->m_sbuf is set to the titlerec so this should handle that
// and use XmlDoc::set4() or whatever
2014-09-24 23:40:39 +04:00
// if ( msg7->injectTitleRec ( msg7 , // state
// gotMsg7ReplyWrapper , // callback
// cr )) {
// // it didn't block somehow...
// msg7->m_inUse = false;
// msg7->gotMsg7Reply();
// }
req = sbuf - > getBufStart ( ) ;
reqSize = sbuf - > length ( ) ;
if ( reqSize ! = need ) { char * xx = NULL ; * xx = 0 ; }
// do not free it, let multicast free it after sending it
sbuf - > detachBuf ( ) ;
if ( ! mcast - > send ( req ,
reqSize ,
0x07 ,
true , // ownmsg?
shardNum ,
false , // send to whole shard?
key , // for selecting host in shard
mcast , // state
NULL , // state2
gotMulticastReplyWrapper ,
999999 ) ) { // total timeout in seconds
log ( " import: import mcast had error: %s " , mstrerror ( g_errno ) ) ;
m_numIn + + ;
2014-09-24 04:48:40 +04:00
}
2014-09-21 07:12:28 +04:00
goto INJECTLOOP ;
nextFile :
// invalidate this flag
2014-09-24 04:48:40 +04:00
//m_offIsValid = false;
// . and call this function. we add one to m_bfFileId so we
// do not re-get the file we just injected.
// . sets m_bf and m_fileOffset
// . returns false if nothing to read
if ( ! setCurrentTitleFileAndOffset ( ) ) { //cr , m_bfFileId+1 );
log ( " import: import: no files left to read " ) ;
//goto INJECTLOOP;
return true ;
}
2014-09-21 07:12:28 +04:00
// if it returns NULL we are done!
log ( " main: titledb injection loop completed. waiting for "
" outstanding injects to return. " ) ;
if ( m_numOut > m_numIn )
return false ;
log ( " main: all injects have returned. DONE. " ) ;
// dummy return
return true ;
}
2014-09-24 23:40:39 +04:00
void gotMulticastReplyWrapper ( void * state , void * state2 ) {
2014-09-21 07:12:28 +04:00
2014-09-24 23:40:39 +04:00
Multicast * mcast = ( Multicast * ) state ;
//msg7->gotMsg7Reply();
2014-09-21 07:12:28 +04:00
2014-09-24 23:40:39 +04:00
ImportState * is = mcast - > m_importState ;
2014-09-24 04:48:40 +04:00
2014-09-24 23:40:39 +04:00
is - > m_numIn + + ;
2014-09-24 04:48:40 +04:00
2014-11-11 01:45:11 +03:00
log ( " import: imported % " INT64 " docs (off=% " INT64 " ) " ,
2014-09-24 23:40:39 +04:00
is - > m_numIn , is - > m_fileOffset ) ;
2014-09-24 04:48:40 +04:00
2014-09-24 23:40:39 +04:00
if ( ! is - > importLoop ( ) ) return ;
2014-09-24 04:48:40 +04:00
2014-09-24 23:40:39 +04:00
// we will be called again when this multicast reply comes in...
if ( is - > m_numIn < is - > m_numOut ) return ;
2014-09-24 04:48:40 +04:00
2014-09-24 23:40:39 +04:00
log ( " inject: import is done " ) ;
2014-09-21 07:12:28 +04:00
2014-09-24 23:40:39 +04:00
CollectionRec * cr = g_collectiondb . getRec ( is - > m_collnum ) ;
// signify to qa.cpp that we are done
if ( cr ) cr - > m_importState = NULL ;
2014-09-21 07:12:28 +04:00
2014-09-24 23:40:39 +04:00
mdelete ( is , sizeof ( ImportState ) , " impstate " ) ;
delete ( is ) ;
2014-09-21 07:12:28 +04:00
}
2014-09-24 04:48:40 +04:00
2014-09-21 07:12:28 +04:00
// . return NULL with g_errno set on error
// . importLoop() calls this to get a msg7 to inject a doc from the foreign
// titledb file into our local collection
2014-09-24 23:40:39 +04:00
Multicast * ImportState : : getAvailMulticast ( ) { // Msg7 ( ) {
2014-09-21 07:12:28 +04:00
//static XmlDoc **s_ptrs = NULL;
// this is legit because parent checks for it
CollectionRec * cr = g_collectiondb . getRec ( m_collnum ) ;
// each msg7 has an xmldoc doc in it
if ( ! m_ptrs ) {
2014-11-11 01:45:11 +03:00
int32_t max = ( int32_t ) MAXINJECTSOUT ;
2014-09-24 23:40:39 +04:00
m_ptrs = ( Multicast * ) mcalloc ( sizeof ( Multicast ) * max , " sxdp " ) ;
2014-09-21 07:12:28 +04:00
if ( ! m_ptrs ) return NULL ;
2014-11-11 01:45:11 +03:00
m_numPtrs = max ; //(int32_t)MAXINJECTSOUT;
for ( int32_t i = 0 ; i < m_numPtrs ; i + + )
2014-09-24 23:40:39 +04:00
m_ptrs [ i ] . constructor ( ) ;
2014-09-21 07:12:28 +04:00
}
// respect the user limit for this coll
2014-10-30 22:36:39 +03:00
int64_t out = m_numOut - m_numIn ;
2014-09-21 07:12:28 +04:00
if ( out > = cr - > m_numImportInjects ) {
g_errno = 0 ;
return NULL ;
}
// find one not in use and return it
2014-11-11 01:45:11 +03:00
for ( int32_t i = 0 ; i < m_numPtrs ; i + + ) {
2014-09-21 07:12:28 +04:00
// point to it
2014-09-24 23:40:39 +04:00
Multicast * mcast = & m_ptrs [ i ] ;
if ( mcast - > m_inUse ) continue ;
//m7->m_inUse = true;
mcast - > m_importState = this ;
return mcast ;
2014-09-21 07:12:28 +04:00
}
// none avail
g_errno = 0 ;
return NULL ;
}
2014-09-24 04:48:40 +04:00
void saveImportStates ( ) {
2014-09-26 07:48:34 +04:00
if ( g_hostdb . m_myHost - > m_hostId ! = 0 ) return ;
2014-11-11 01:45:11 +03:00
for ( int32_t i = 0 ; i < g_collectiondb . m_numRecs ; i + + ) {
2014-09-24 04:48:40 +04:00
CollectionRec * cr = g_collectiondb . m_recs [ i ] ;
if ( ! cr ) continue ;
if ( ! cr - > m_importEnabled ) continue ;
cr - > m_importState - > saveFileBookMark ( ) ;
}
}
2014-09-21 07:12:28 +04:00
// "xd" is the XmlDoc that just completed injecting
2014-09-24 04:48:40 +04:00
void ImportState : : saveFileBookMark ( ) { //Msg7 *msg7 ) {
2014-10-30 22:36:39 +03:00
int64_t minOff = - 1LL ;
2014-11-11 01:45:11 +03:00
int32_t minFileId = - 1 ;
2014-09-21 07:12:28 +04:00
2014-11-11 01:45:11 +03:00
//int32_t fileId = msg7->m_hackFileId;
2014-10-30 22:36:39 +03:00
//int64_t fileOff = msg7->m_hackFileOff;
2014-09-21 07:12:28 +04:00
// if there is one outstanding the preceeded us, we can't update
// the bookmark just yet.
2014-11-11 01:45:11 +03:00
for ( int32_t i = 0 ; i < m_numPtrs ; i + + ) {
2014-09-24 23:40:39 +04:00
Multicast * mcast = & m_ptrs [ i ] ;
if ( ! mcast - > m_inUse ) continue ;
2014-09-24 04:48:40 +04:00
if ( minOff = = - 1 ) {
2014-09-24 23:40:39 +04:00
minOff = mcast - > m_hackFileOff ;
minFileId = mcast - > m_hackFileId ;
2014-09-24 04:48:40 +04:00
continue ;
}
2014-09-24 23:40:39 +04:00
if ( mcast - > m_hackFileId > minFileId )
2014-09-24 04:48:40 +04:00
continue ;
2014-09-24 23:40:39 +04:00
if ( mcast - > m_hackFileId = = minFileId & &
mcast - > m_hackFileOff > minOff )
2014-09-24 04:48:40 +04:00
continue ;
2014-09-24 23:40:39 +04:00
minOff = mcast - > m_hackFileOff ;
minFileId = mcast - > m_hackFileId ;
2014-09-21 07:12:28 +04:00
}
char fname [ 256 ] ;
sprintf ( fname , " %slasttitledbinjectinfo.dat " , g_hostdb . m_dir ) ;
SafeBuf ff ;
2014-11-11 01:45:11 +03:00
ff . safePrintf ( " % " INT64 " ,% " INT32 " " , minOff , minFileId ) ; //_fileOffset,m_bfFileId);
2014-09-21 07:12:28 +04:00
ff . save ( fname ) ;
}