fix geth1tag some more.

fixed bad comment tag detection. we were losing a good deal of
the content of some pages because of that.
mwells 2014-07-07 08:20:21 -07:00
parent fed7b73b9f
commit e22641997a
4 changed files with 51 additions and 27 deletions

PageResults.cpp

@@ -3472,6 +3472,11 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
 			sb->cdataEncode(mr->ptr_htag);
 			sb->safePrintf("]]></h1Tag>\n");
 		}
+		if ( si->m_format == FORMAT_JSON ) {
+			sb->safePrintf("\t\t\"h1Tag\":\"");
+			sb->jsonEncode(mr->ptr_htag);
+			sb->safePrintf("\",\n");
+		}
 	}
@@ -3490,6 +3495,8 @@ bool printResult ( State0 *st, long ix , long *numPrintedSoFar ) {
 			       "]]>"
 			       "</contentType>\n",
 			       cs);
+	else if ( si->m_format == FORMAT_JSON )
+		sb->safePrintf("\t\t\"contentType\":\"%s\",\n",cs);
 	else if ( si->m_format == FORMAT_HTML && ctype != CT_HTML ) {
 		sb->safePrintf(" <b><font style=color:white;"
 			       "background-color:maroon;>");

XmlDoc.cpp

@@ -29591,10 +29591,30 @@ char *XmlDoc::getDescriptionBuf ( char *displayMetas , long *dlen ) {
 SafeBuf *XmlDoc::getHeaderTagBuf() {
 	if ( m_htbValid ) return &m_htb;
-	// get it. true = skip leading spaces
-	long h1len = 0;
-	char *h1 = m_xml.getTextForXmlTag ( 0, 999999,"h1",&h1len,true);
-	if ( h1 && h1len ) m_htb.safeMemcpy(h1,h1len);
+	Sections *ss = getSections();
+	if ( ! ss || ss == (void *)-1) return (SafeBuf *)ss;
+	// scan sections
+	Section *si = ss->m_rootSection;
+	for ( ; si ; si = si->m_next ) {
+		// breathe
+		QUICKPOLL(m_niceness);
+		if ( si->m_tagId == TAG_H1 ) break;
+	}
+	// if no h1 tag then make buf empty
+	if ( ! si ) {
+		m_htb.nullTerm();
+		m_htbValid = true;
+		return &m_htb;
+	}
+	// otherwise, set it
+	char *a = m_words.m_words[si->m_firstWordPos];
+	char *b = m_words.m_words[si->m_lastWordPos] ;
+	b += m_words.m_wordLens[si->m_lastWordPos];
+	// copy it
+	m_htb.safeMemcpy ( a , b - a );
 	m_htb.nullTerm();
 	m_htbValid = true;
 	return &m_htb;
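
The rewrite above stops pulling h1 text straight out of a raw XML tag scan and instead walks the parsed section tree for the first TAG_H1 section, then copies the byte span from the start of its first word to the end of its last word. A minimal sketch of that span copy, using hypothetical names in place of the Words class members:

#include <string>

// copy the text spanned by words [firstWord, lastWord] out of the
// original document buffer. wordPtrs[i] points at word i in place and
// wordLens[i] is its byte length, so the copy keeps any markup that
// sits between the two words, just like the m_words code above.
static std::string copyWordSpan ( char **wordPtrs , long *wordLens ,
                                  long firstWord , long lastWord ) {
	char *a = wordPtrs[firstWord];
	char *b = wordPtrs[lastWord] + wordLens[lastWord];
	return std::string ( a , b - a );
}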

XmlNode.cpp

@@ -257,6 +257,8 @@ long XmlNode::setCommentNode2 ( char *node ) {
 		// look for ending of ]> like for <![if gt IE 6]>
 		if ( node[i] !='>' ) continue;
 		if ( node[i-1] ==']' ) break;
+		// look for ending of --> like for <![endif]-->
+		if ( node[i-1] == '-' && node[i-2] == '-' ) break;
 	}
 	// skip i over the >, if any (could be end of doc)
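
This is the comment tag detection fix from the commit message. IE conditional comments can close either with ]> (as in <![if gt IE 6]>) or with --> (as in <![endif]-->); the loop previously only recognized the ]> form, so a --> closer was never matched and everything after it could be swallowed into one oversized comment node. A standalone sketch of the corrected scan, assuming a node buffer that begins with <! (a hypothetical helper, not the actual XmlNode method):

// return the length of a <![...]> style comment node, scanning for
// either the "]>" ending ( <![if gt IE 6]> ) or the "-->" ending
// ( <![endif]--> ). returns len if no terminator is found before
// the end of the buffer.
static long commentNodeLen ( char *node , long len ) {
	long i;
	for ( i = 2 ; i < len ; i++ ) {
		if ( node[i] != '>' ) continue;
		// the ]> ending, like <![if gt IE 6]>
		if ( node[i-1] == ']' ) break;
		// the --> ending, like <![endif]-->
		if ( node[i-1] == '-' && node[i-2] == '-' ) break;
	}
	// skip i over the >, if any (could be end of doc)
	if ( i < len ) i++;
	return i;
}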

qa.cpp

@@ -364,27 +364,19 @@ void checkCRC ( long needCRC ) {
 	sprintf(cmd,"diff %s %s",fn1,fn2);
 	fprintf(stderr,"%s\n",cmd);
 	system(cmd);
+	// if this is zero allow it to slide by. it is learning mode i guess.
+	// so we can learn what crc we need to use.
+	if ( needCRC == 0 ) return;
 	// otherwise, stop right there for debugging
 	exit(1);
 }
-//static long s_rdbId1 = 0;
-//static long s_rdbId2 = 0;
-//static long s_rdbId3 = 0;
 #undef usleep
-// . run a series of tests to ensure that gb is functioning properly
-// . use s_urls[] array of urls for injecting and spider seeding
-// . contain an archive copy of all webpages in the injectme3 file and
-//   in pagearchive1.txt file
-// . while initially spidering store pages in pagearchive1.txt so we can
-//   replay later. store up to 100,000 pages in there.
-bool qatest ( ) {
-	// hack
-	//goto checkdelim;
+//
+// the injection qa test suite
+//
+bool qainject () {
 	static bool s_x1 = false;
 	if ( ! s_x1 ) {
@@ -413,9 +405,6 @@ bool qatest ( ) {
 		checkCRC ( 238170006 );
 	}
-	// hack
-	//goto deliminject;
 	//
 	// inject urls, return false if not done yet
 	//
@@ -502,8 +491,6 @@ bool qatest ( ) {
 	static bool s_y1 = false;
 	if ( ! s_y1 ) { s_y1 = true; checkCRC ( -1672870556 ); }
-	// deliminject:
 	//
 	// try delimeter based injecting
 	//
@@ -527,8 +514,6 @@ bool qatest ( ) {
 	static bool s_y3 = false;
 	if ( ! s_y3 ) { s_y3 = true; checkCRC ( -1970198487 ); }
-	// checkdelim:
 	// now query check
 	static bool s_y4 = false;
 	if ( ! s_y4 ) {
@@ -597,3 +582,13 @@ bool qatest ( ) {
 	return true;
 }
+// . run a series of tests to ensure that gb is functioning properly
+// . uses the ./qa subdirectory to hold archive pages, ips, spider dates to
+//   ensure consistency between tests for exact replays
+bool qatest ( ) {
+	return qainject();
+}
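
For context, qa.cpp steps through its suite with function-local static flags: each step is guarded by its own flag and runs exactly once, and the whole function is simply called again after each asynchronous step completes, falling through the finished steps to the next pending one. A minimal sketch of that pattern with hypothetical step bodies:

// each step runs once; the function is re-entered after every
// asynchronous completion and resumes at the first pending step.
static bool qaSuiteSketch ( ) {
	static bool s_step1 = false;
	if ( ! s_step1 ) {
		s_step1 = true;
		// kick off step 1, e.g. inject a url; the completion
		// callback re-calls qaSuiteSketch() later
		return false;
	}
	static bool s_step2 = false;
	if ( ! s_step2 ) {
		s_step2 = true;
		// verify step 1's results, e.g. checkCRC ( expected )
	}
	// all steps completed
	return true;
}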