mirror of
https://github.com/gigablast/open-source-search-engine.git
synced 2024-10-04 12:17:35 +03:00
fix geth1tag some more.
fixed bad comment tag detection. we were losing a good deal of the content of some pages because of that.
This commit is contained in:
parent
fed7b73b9f
commit
e22641997a
@ -3472,6 +3472,11 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
|
||||
sb->cdataEncode(mr->ptr_htag);
|
||||
sb->safePrintf("]]></h1Tag>\n");
|
||||
}
|
||||
if ( si->m_format == FORMAT_JSON ) {
|
||||
sb->safePrintf("\t\t\"h1Tag\":\"");
|
||||
sb->jsonEncode(mr->ptr_htag);
|
||||
sb->safePrintf("\",\n");
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -3490,6 +3495,8 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
|
||||
"]]>"
|
||||
"</contentType>\n",
|
||||
cs);
|
||||
else if ( si->m_format == FORMAT_JSON )
|
||||
sb->safePrintf("\t\t\"contentType\":\"%s\",\n",cs);
|
||||
else if ( si->m_format == FORMAT_HTML && ctype != CT_HTML ) {
|
||||
sb->safePrintf(" <b><font style=color:white;"
|
||||
"background-color:maroon;>");
|
||||
|
28
XmlDoc.cpp
28
XmlDoc.cpp
@ -29591,10 +29591,30 @@ char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) {
|
||||
|
||||
SafeBuf *XmlDoc::getHeaderTagBuf() {
|
||||
if ( m_htbValid ) return &m_htb;
|
||||
// get it. true = skip leading spaces
|
||||
long h1len = 0;
|
||||
char *h1 = m_xml.getTextForXmlTag ( 0, 999999,"h1",&h1len,true);
|
||||
if ( h1 && h1len ) m_htb.safeMemcpy(h1,h1len);
|
||||
|
||||
Sections *ss = getSections();
|
||||
if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss;
|
||||
|
||||
// scan sections
|
||||
Section *si = ss->m_rootSection;
|
||||
for ( ; si ; si = si->m_next ) {
|
||||
// breathe
|
||||
QUICKPOLL(m_niceness);
|
||||
if ( si->m_tagId == TAG_H1 ) break;
|
||||
}
|
||||
// if no h1 tag then make buf empty
|
||||
if ( ! si ) {
|
||||
m_htb.nullTerm();
|
||||
m_htbValid = true;
|
||||
return &m_htb;
|
||||
}
|
||||
// otherwise, set it
|
||||
char *a = m_words.m_words[si->m_firstWordPos];
|
||||
char *b = m_words.m_words[si->m_lastWordPos] ;
|
||||
b += m_words.m_wordLens[si->m_lastWordPos];
|
||||
|
||||
// copy it
|
||||
m_htb.safeMemcpy ( a , b - a );
|
||||
m_htb.nullTerm();
|
||||
m_htbValid = true;
|
||||
return &m_htb;
|
||||
|
@ -257,6 +257,8 @@ long XmlNode::setCommentNode2 ( char *node ) {
|
||||
// look for ending of ]> like for <![if gt IE 6]>
|
||||
if ( node[i] !='>' ) continue;
|
||||
if ( node[i-1] ==']' ) break;
|
||||
// look for ending of --> like for <![endif]-->
|
||||
if ( node[i-1] == '-' && node[i-2] == '-' ) break;
|
||||
}
|
||||
|
||||
// skip i over the >, if any (could be end of doc)
|
||||
|
41
qa.cpp
41
qa.cpp
@ -364,27 +364,19 @@ void checkCRC ( long needCRC ) {
|
||||
sprintf(cmd,"diff %s %s",fn1,fn2);
|
||||
fprintf(stderr,"%s\n",cmd);
|
||||
system(cmd);
|
||||
// if this is zero allow it to slide by. it is learning mode i guess.
|
||||
// so we can learn what crc we need to use.
|
||||
if ( needCRC == 0 ) return;
|
||||
// otherwise, stop right there for debugging
|
||||
exit(1);
|
||||
}
|
||||
|
||||
|
||||
|
||||
//static long s_rdbId1 = 0;
|
||||
//static long s_rdbId2 = 0;
|
||||
//static long s_rdbId3 = 0;
|
||||
|
||||
#undef usleep
|
||||
|
||||
// . run a series of tests to ensure that gb is functioning properly
|
||||
// . use s_urls[] array of urls for injecting and spider seeding
|
||||
// . contain an archive copy of all webpages in the injectme3 file and
|
||||
// in pagearchive1.txt file
|
||||
// . while initially spidering store pages in pagearchive1.txt so we can
|
||||
// replay later. store up to 100,000 pages in there.
|
||||
bool qatest ( ) {
|
||||
|
||||
// hack
|
||||
//goto checkdelim;
|
||||
//
|
||||
// the injection qa test suite
|
||||
//
|
||||
bool qainject () {
|
||||
|
||||
static bool s_x1 = false;
|
||||
if ( ! s_x1 ) {
|
||||
@ -413,9 +405,6 @@ bool qatest ( ) {
|
||||
checkCRC ( 238170006 );
|
||||
}
|
||||
|
||||
// hack
|
||||
//goto deliminject;
|
||||
|
||||
//
|
||||
// inject urls, return false if not done yet
|
||||
//
|
||||
@ -502,8 +491,6 @@ bool qatest ( ) {
|
||||
static bool s_y1 = false;
|
||||
if ( ! s_y1 ) { s_y1 = true; checkCRC ( -1672870556 ); }
|
||||
|
||||
// deliminject:
|
||||
|
||||
//
|
||||
// try delimiter based injecting
|
||||
//
|
||||
@ -527,8 +514,6 @@ bool qatest ( ) {
|
||||
static bool s_y3 = false;
|
||||
if ( ! s_y3 ) { s_y3 = true; checkCRC ( -1970198487 ); }
|
||||
|
||||
// checkdelim:
|
||||
|
||||
// now query check
|
||||
static bool s_y4 = false;
|
||||
if ( ! s_y4 ) {
|
||||
@ -597,3 +582,13 @@ bool qatest ( ) {
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// . run a series of tests to ensure that gb is functioning properly
|
||||
// . uses the ./qa subdirectory to hold archive pages, ips, spider dates to
|
||||
// ensure consistency between tests for exact replays
|
||||
bool qatest ( ) {
|
||||
|
||||
return qainject();
|
||||
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user