mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
now we pass both injection tests in qa.cpp
This commit is contained in:
parent
b55359a95d
commit
16b73a9bdd
@ -550,7 +550,7 @@ bool Msg7::inject2 ( void *state ,
|
||||
// shortcut
|
||||
XmlDoc *xd = &m_xd;
|
||||
|
||||
if ( ! gr->m_url ) {
|
||||
if ( ! gr->m_url || ! gr->m_url[0] ) {
|
||||
// if there is a record delimiter, we form a new fake url
|
||||
// for each record based on content hash
|
||||
//! gr->m_contentDelim ) {
|
||||
@ -692,6 +692,8 @@ bool Msg7::inject2 ( void *state ,
|
||||
cr ,
|
||||
content , // start , // content ,
|
||||
gr->m_diffbotReply,
|
||||
// if this doc is a 'container doc' then
|
||||
// hasMime applies to the SUBDOCS only!!
|
||||
gr->m_hasMime, // content starts with http mime?
|
||||
gr->m_hopCount,
|
||||
gr->m_charset,
|
||||
@ -714,7 +716,8 @@ bool Msg7::inject2 ( void *state ,
|
||||
gr->m_lastSpidered ,
|
||||
// the ip of the url being injected.
|
||||
// use 0 if unknown and it won't be valid.
|
||||
gr->m_injectDocIp
|
||||
gr->m_injectDocIp ,
|
||||
gr->m_contentDelim
|
||||
) )
|
||||
// we blocked...
|
||||
return false;
|
||||
|
113
XmlDoc.cpp
113
XmlDoc.cpp
@ -1154,7 +1154,8 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
||||
int32_t forcedIp ,
|
||||
uint8_t contentType ,
|
||||
uint32_t spideredTime ,
|
||||
bool contentHasMime ) {
|
||||
bool contentHasMimeArg ,
|
||||
char *contentDelim ) {
|
||||
|
||||
// sanity check
|
||||
if ( sreq->m_dataSize == 0 ) { char *xx=NULL;*xx=0; }
|
||||
@ -1179,6 +1180,21 @@ bool XmlDoc::set4 ( SpiderRequest *sreq ,
|
||||
if ( sreq->m_forceDelete )
|
||||
m_deleteFromIndex = true;
|
||||
|
||||
// if we are a container doc then we need the content delimiter,
|
||||
// unless if we are a warc or arc, then we know how those delimit
|
||||
// already.
|
||||
m_contentDelim = contentDelim;
|
||||
m_contentDelimValid = true;
|
||||
|
||||
bool contentHasMime = contentHasMimeArg;
|
||||
// but if we are a container doc then this parm applies to each subdoc
|
||||
// not to us, so turn it off for this part.
|
||||
if ( isContainerDoc() ) {
|
||||
contentHasMime = false;
|
||||
m_subDocsHaveMime = contentHasMimeArg;
|
||||
}
|
||||
|
||||
|
||||
char *utf8Content = utf8ContentArg;
|
||||
|
||||
if ( contentHasMime && utf8Content ) {
|
||||
@ -2016,7 +2032,7 @@ bool XmlDoc::injectDoc ( char *url ,
|
||||
CollectionRec *cr ,
|
||||
char *content ,
|
||||
char *diffbotReply, // usually null
|
||||
bool contentHasMime ,
|
||||
bool contentHasMimeArg ,
|
||||
int32_t hopCount,
|
||||
int32_t charset,
|
||||
|
||||
@ -2030,7 +2046,8 @@ bool XmlDoc::injectDoc ( char *url ,
|
||||
|
||||
uint32_t firstIndexed,
|
||||
uint32_t lastSpidered ,
|
||||
int32_t injectDocIp ) {
|
||||
int32_t injectDocIp ,
|
||||
char *contentDelim ) {
|
||||
|
||||
// wait until we are synced with host #0
|
||||
if ( ! isClockInSync() ) {
|
||||
@ -2098,7 +2115,8 @@ bool XmlDoc::injectDoc ( char *url ,
|
||||
injectDocIp, // 0,//forcedIp ,
|
||||
contentType ,
|
||||
lastSpidered,//lastSpidered overide
|
||||
contentHasMime )) {
|
||||
contentHasMimeArg ,
|
||||
contentDelim )) {
|
||||
// g_errno should be set if that returned false
|
||||
if ( ! g_errno ) { char *xx=NULL;*xx=0; }
|
||||
return true;
|
||||
@ -2726,7 +2744,8 @@ bool XmlDoc::indexDoc2 ( ) {
|
||||
// handle docs that consist of subdocs that need to be injected
|
||||
// or indexed individually.
|
||||
if ( m_firstUrlValid && m_firstUrl.isWarc() ) {
|
||||
// this returns false if it would block and callback will be called
|
||||
// this returns false if it would block and callback will be
|
||||
// called
|
||||
if ( ! indexWarc () )
|
||||
return false;
|
||||
// all done! no need to add the parent doc.
|
||||
@ -2734,7 +2753,8 @@ bool XmlDoc::indexDoc2 ( ) {
|
||||
}
|
||||
|
||||
if ( m_firstUrlValid && m_firstUrl.isArc() ) {
|
||||
// this returns false if it would block and callback will be called
|
||||
// this returns false if it would block and callback will be
|
||||
// called
|
||||
if ( ! indexArc () )
|
||||
return false;
|
||||
// all done! no need to add the parent doc.
|
||||
@ -3009,6 +3029,7 @@ bool isRobotsTxtFile ( char *u , int32_t ulen ) {
|
||||
// . returns true if this doc is a "container doc"
// . that is the case when the first url is valid and is a warc or an arc,
//   or when a content delimiter (m_contentDelim) has been set
// . returns false otherwise
bool XmlDoc::isContainerDoc ( ) {
|
||||
// warc check only applies once m_firstUrl has been marked valid
if ( m_firstUrlValid && m_firstUrl.isWarc() ) return true;
|
||||
// same for arc
if ( m_firstUrlValid && m_firstUrl.isArc () ) return true;
|
||||
// sanity check: m_contentDelim must have been marked valid before we
// consult it. this deliberately writes through a NULL pointer to crash
// if it was not.
if ( ! m_contentDelimValid ) { char *xx=NULL;*xx=0; }
|
||||
// a non-NULL delimiter means we are a container doc
if ( m_contentDelim ) return true;
|
||||
return false;
|
||||
}
|
||||
@ -3052,6 +3073,11 @@ bool XmlDoc::indexContainerDoc ( ) {
|
||||
if ( ! m_anyContentPtr ) {
|
||||
// init the content cursor to point to the first subdoc
|
||||
m_anyContentPtr = *cpp;
|
||||
// but skip over initial separator if there. that is a
|
||||
// faux pas
|
||||
int32_t dlen = gbstrlen(m_contentDelim);
|
||||
if ( strncmp(m_anyContentPtr,m_contentDelim,dlen) == 0 )
|
||||
m_anyContentPtr += dlen;
|
||||
// init the input parms
|
||||
memset ( gr , 0 , sizeof(GigablastRequest) );
|
||||
// reset it
|
||||
@ -3063,32 +3089,30 @@ bool XmlDoc::indexContainerDoc ( ) {
|
||||
// will this work on a content delimeterized doc?
|
||||
gr->m_deleteUrl = m_deleteFromIndex;
|
||||
// each subdoc will have a mime since it is an arc
|
||||
gr->m_hasMime = true;
|
||||
gr->m_hasMime = m_subDocsHaveMime;//true;
|
||||
}
|
||||
|
||||
subdocLoop:
|
||||
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
// EOF?
|
||||
if ( m_anyContentPtr == (char *)-1 )
|
||||
return true;
|
||||
|
||||
// we had \0 terminated the end of the previous record, so put back
|
||||
if ( m_savedChar && ! *m_anyContentPtr ) {
|
||||
*m_anyContentPtr = m_savedChar;
|
||||
m_anyContentPtr += gbstrlen(m_contentDelim);
|
||||
}
|
||||
|
||||
// EOF?
|
||||
if ( ! *m_anyContentPtr ) return true;
|
||||
|
||||
// . should have the url as well.
|
||||
// . the url, ip etc. are on a single \n terminated line for an arc!
|
||||
char *separator = strstr(m_anyContentPtr,m_contentDelim);
|
||||
|
||||
// index this subdoc
|
||||
gr->m_content = m_anyContentPtr;
|
||||
|
||||
// these are not defined. will be autoset in set4() i guess.
|
||||
gr->m_firstIndexed = 0;
|
||||
gr->m_lastSpidered = 0;
|
||||
// . should have the url as well.
|
||||
// . the url, ip etc. are on a single \n terminated line for an arc!
|
||||
char *separator = strstr(m_anyContentPtr,m_contentDelim);
|
||||
|
||||
if ( separator ) {
|
||||
m_savedChar = *separator;
|
||||
@ -3096,6 +3120,63 @@ bool XmlDoc::indexContainerDoc ( ) {
|
||||
*m_anyContentPtr = '\0';
|
||||
}
|
||||
|
||||
// if no separator found, this is our last injection
|
||||
if ( ! separator )
|
||||
m_anyContentPtr = (char *)-1;
|
||||
|
||||
// these are not defined. will be autoset in set4() i guess.
|
||||
gr->m_firstIndexed = 0;
|
||||
gr->m_lastSpidered = 0;
|
||||
|
||||
bool setUrl = false;
|
||||
|
||||
// HOWEVER, if hasMime is true and an http:// or https:// url follows
|
||||
// the delimiter then use that as the url...
|
||||
// this way we can specify our own urls.
|
||||
if ( gr->m_hasMime ) {
|
||||
char *du = gr->m_content;
|
||||
//du += gbstrlen(delim);
|
||||
if ( du && is_wspace_a ( *du ) ) du++;
|
||||
if ( du && is_wspace_a ( *du ) ) du++;
|
||||
if ( du && is_wspace_a ( *du ) ) du++;
|
||||
if ( gr->m_hasMime &&
|
||||
(strncasecmp( du,"http://",7) == 0 ||
|
||||
strncasecmp( du,"https://",8) == 0 ) ) {
|
||||
// flag it
|
||||
setUrl = true;
|
||||
// find end of it
|
||||
char *uend = du + 7;
|
||||
for ( ; *uend && ! is_wspace_a(*uend) ; uend++ );
|
||||
// inject that then
|
||||
m_injectUrlBuf.reset();
|
||||
m_injectUrlBuf.safeMemcpy ( du , uend - du );
|
||||
m_injectUrlBuf.nullTerm();
|
||||
// and point to the actual http mime then
|
||||
// well, skip that space, right
|
||||
gr->m_content = uend + 1;
|
||||
gr->m_url = m_injectUrlBuf.getBufStart();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
// make the url from parent url
|
||||
// use hash of the content
|
||||
int64_t ch64 = hash64n ( gr->m_content , 0LL );
|
||||
|
||||
QUICKPOLL ( m_niceness );
|
||||
|
||||
if ( ! setUrl ) {
|
||||
// reset it
|
||||
m_injectUrlBuf.reset();
|
||||
// by default append a -<ch64> to the provided url
|
||||
m_injectUrlBuf.safePrintf("%s-%"UINT64"",
|
||||
m_firstUrl.getUrl(),ch64);
|
||||
gr->m_url = m_injectUrlBuf.getBufStart();
|
||||
}
|
||||
|
||||
|
||||
if ( ! m_msg7->inject2 ( m_masterState , m_masterLoop ) )
|
||||
// it would block, callback will be called later
|
||||
return false;
|
||||
|
11
XmlDoc.h
11
XmlDoc.h
@ -473,7 +473,9 @@ class XmlDoc {
|
||||
int32_t forcedIp = 0 ,
|
||||
uint8_t contentType = CT_HTML ,
|
||||
uint32_t spideredTime = 0 , // time_t
|
||||
bool contentHasMime = false ) ;
|
||||
bool contentHasMime = false ,
|
||||
// for container docs, what is the separator of subdocs?
|
||||
char *contentDelim = NULL ) ;
|
||||
|
||||
// we now call this right away rather than at download time!
|
||||
int32_t getSpideredTime();
|
||||
@ -1057,6 +1059,8 @@ class XmlDoc {
|
||||
char *m_arcContentPtr;
|
||||
char *m_anyContentPtr;
|
||||
char *m_contentDelim;
|
||||
SafeBuf m_injectUrlBuf;
|
||||
bool m_subDocsHaveMime;
|
||||
|
||||
// . same thing, a little more complicated
|
||||
// . these classes are only set on demand
|
||||
@ -1181,6 +1185,7 @@ class XmlDoc {
|
||||
char m_addedSpiderReplySizeValid;
|
||||
char m_addedStatusDocSizeValid;
|
||||
char m_downloadStartTimeValid;
|
||||
char m_contentDelimValid;
|
||||
//char m_docQualityValid;
|
||||
char m_siteValid;
|
||||
char m_startTimeValid;
|
||||
@ -2418,7 +2423,9 @@ class XmlDoc {
|
||||
|
||||
uint32_t firstIndexedTime = 0,
|
||||
uint32_t lastSpideredDate = 0 ,
|
||||
int32_t injectDocIp = 0 );
|
||||
int32_t injectDocIp = 0 ,
|
||||
// for container docs consisting of subdocs to inject
|
||||
char *contentDelim = NULL );
|
||||
|
||||
|
||||
bool injectLinks ( HashTableX *linkDedupTable ,
|
||||
|
Loading…
Reference in New Issue
Block a user