fix geth1tag some more.

fixed bad comment tag detection. we were losing a good deal of
the content of some pages because of that.
mwells 2014-07-07 08:20:21 -07:00
parent fed7b73b9f
commit e22641997a
4 changed files with 51 additions and 27 deletions

PageResults.cpp

@@ -3472,6 +3472,11 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
 			sb->cdataEncode(mr->ptr_htag);
 			sb->safePrintf("]]></h1Tag>\n");
 		}
+		if ( si->m_format == FORMAT_JSON ) {
+			sb->safePrintf("\t\t\"h1Tag\":\"");
+			sb->jsonEncode(mr->ptr_htag);
+			sb->safePrintf("\",\n");
+		}
 	}
@@ -3490,6 +3495,8 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
 			       "]]>"
 			       "</contentType>\n",
 			       cs);
+	else if ( si->m_format == FORMAT_JSON )
+		sb->safePrintf("\t\t\"contentType\":\"%s\",\n",cs);
 	else if ( si->m_format == FORMAT_HTML && ctype != CT_HTML ) {
 		sb->safePrintf(" <b><font style=color:white;"
 			       "background-color:maroon;>");

XmlDoc.cpp

@@ -29591,10 +29591,30 @@ char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) {
 SafeBuf *XmlDoc::getHeaderTagBuf() {
 	if ( m_htbValid ) return &m_htb;
-	// get it. true = skip leading spaces
-	long h1len = 0;
-	char *h1 = m_xml.getTextForXmlTag ( 0, 999999,"h1",&h1len,true);
-	if ( h1 && h1len ) m_htb.safeMemcpy(h1,h1len);
+	Sections *ss = getSections();
+	if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss;
+	// scan sections
+	Section *si = ss->m_rootSection;
+	for ( ; si ; si = si->m_next ) {
+		// breathe
+		QUICKPOLL(m_niceness);
+		if ( si->m_tagId == TAG_H1 ) break;
+	}
+	// if no h1 tag then make buf empty
+	if ( ! si ) {
+		m_htb.nullTerm();
+		m_htbValid = true;
+		return &m_htb;
+	}
+	// otherwise, set it
+	char *a = m_words.m_words[si->m_firstWordPos];
+	char *b = m_words.m_words[si->m_lastWordPos] ;
+	b += m_words.m_wordLens[si->m_lastWordPos];
+	// copy it
+	m_htb.safeMemcpy ( a , b - a );
 	m_htb.nullTerm();
 	m_htbValid = true;
 	return &m_htb;
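
The rewrite above stops pulling h1 text straight out of a raw XML tag scan and instead walks the parsed section tree for the first TAG_H1 section, then copies the byte span from the start of its first word to the end of its last word. A minimal sketch of that span copy, using hypothetical names in place of the Words class members:

#include <string>

// copy the text spanned by words [firstWord, lastWord] out of the
// original document buffer. wordPtrs[i] points at word i in place and
// wordLens[i] is its byte length, so the copy keeps any markup that
// sits between the two words, just like the m_words code above.
static std::string copyWordSpan ( char **wordPtrs , long *wordLens ,
                                  long firstWord , long lastWord ) {
	char *a = wordPtrs[firstWord];
	char *b = wordPtrs[lastWord] + wordLens[lastWord];
	return std::string ( a , b - a );
}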

XmlNode.cpp

@@ -257,6 +257,8 @@ long XmlNode::setCommentNode2 ( char *node ) {
 		// look for ending of ]> like for <![if gt IE 6]>
 		if ( node[i] !='>' ) continue;
 		if ( node[i-1] ==']' ) break;
+		// look for ending of --> like for <![endif]-->
+		if ( node[i-1] == '-' && node[i-2] == '-' ) break;
 	}
 	// skip i over the >, if any (could be end of doc)
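
This is the comment tag detection fix from the commit message. IE conditional comments can close either with ]> (as in <![if gt IE 6]>) or with --> (as in <![endif]-->); the loop previously only recognized the ]> form, so a --> closer was never matched and everything after it could be swallowed into one oversized comment node. A standalone sketch of the corrected scan, assuming a node buffer that begins with <! (a hypothetical helper, not the actual XmlNode method):

// return the length of a <![...]> style comment node, scanning for
// either the "]>" ending ( <![if gt IE 6]> ) or the "-->" ending
// ( <![endif]--> ). returns len if no terminator is found before
// the end of the buffer.
static long commentNodeLen ( char *node , long len ) {
	long i;
	for ( i = 2 ; i < len ; i++ ) {
		if ( node[i] != '>' ) continue;
		// the ]> ending, like <![if gt IE 6]>
		if ( node[i-1] == ']' ) break;
		// the --> ending, like <![endif]-->
		if ( node[i-1] == '-' && node[i-2] == '-' ) break;
	}
	// skip i over the >, if any (could be end of doc)
	if ( i < len ) i++;
	return i;
}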

qa.cpp

@@ -364,27 +364,19 @@ void checkCRC ( long needCRC ) {
 	sprintf(cmd,"diff %s %s",fn1,fn2);
 	fprintf(stderr,"%s\n",cmd);
 	system(cmd);
+	// if this is zero allow it to slide by. it is learning mode i guess.
+	// so we can learn what crc we need to use.
+	if ( needCRC == 0 ) return;
 	// otherwise, stop right there for debugging
 	exit(1);
 }
-//static long s_rdbId1 = 0;
-//static long s_rdbId2 = 0;
-//static long s_rdbId3 = 0;
 #undef usleep
-// . run a series of tests to ensure that gb is functioning properly
-// . use s_urls[] array of urls for injecting and spider seeding
-// . contain an archive copy of all webpages in the injectme3 file and
-//   in pagearchive1.txt file
-// . while initially spidering store pages in pagearchive1.txt so we can
-//   replay later. store up to 100,000 pages in there.
-bool qatest ( ) {
-	// hack
-	//goto checkdelim;
+//
+// the injection qa test suite
+//
+bool qainject () {
 	static bool s_x1 = false;
 	if ( ! s_x1 ) {
@@ -413,9 +405,6 @@ bool qatest ( ) {
 		checkCRC ( 238170006 );
 	}
-	// hack
-	//goto deliminject;
 	//
 	// inject urls, return false if not done yet
 	//
@@ -502,8 +491,6 @@ bool qatest ( ) {
 	static bool s_y1 = false;
 	if ( ! s_y1 ) { s_y1 = true; checkCRC ( -1672870556 ); }
-	// deliminject:
 	//
 	// try delimeter based injecting
 	//
@@ -527,8 +514,6 @@ bool qatest ( ) {
 	static bool s_y3 = false;
 	if ( ! s_y3 ) { s_y3 = true; checkCRC ( -1970198487 ); }
-	// checkdelim:
 	// now query check
 	static bool s_y4 = false;
 	if ( ! s_y4 ) {
@@ -597,3 +582,13 @@ bool qatest ( ) {
 	return true;
 }
+// . run a series of tests to ensure that gb is functioning properly
+// . uses the ./qa subdirectory to hold archive pages, ips, spider dates to
+//   ensure consistency between tests for exact replays
+bool qatest ( ) {
+	return qainject();
+}
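
For context, qa.cpp steps through its suite with function-local static flags: each step is guarded by its own flag and runs exactly once, and the whole function is simply called again after each asynchronous step completes, falling through the finished steps to the next pending one. A minimal sketch of that pattern with hypothetical step bodies:

// each step runs once; the function is re-entered after every
// asynchronous completion and resumes at the first pending step.
static bool qaSuiteSketch ( ) {
	static bool s_step1 = false;
	if ( ! s_step1 ) {
		s_step1 = true;
		// kick off step 1, e.g. inject a url; the completion
		// callback re-calls qaSuiteSketch() later
		return false;
	}
	static bool s_step2 = false;
	if ( ! s_step2 ) {
		s_step2 = true;
		// verify step 1's results, e.g. checkCRC ( expected )
	}
	// all steps completed
	return true;
}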