comments in <script> tags are a

convolution. deal with all four comment types and
their precedence issues. all of this is
to find the proper closing </script> tag, and
not a </script> or <script> string that is merely being
printed out by the javascript inside the <script> tag.
This commit is contained in:
Matt 2015-08-28 16:31:22 -06:00
parent a7222dcf3f
commit 8299197cca
2 changed files with 47 additions and 6 deletions

View File

@ -2300,7 +2300,8 @@ bool getTestDoc ( char *u , TcpSocket *ts , Msg13Request *r ) {
// log it for now
//if ( g_conf.m_logDebugSpider )
log("test: GOT doc in test cache: %s (%"UINT64")",u,h);
log("test: GOT doc in test cache: %s (qa/doc.%"UINT64".html)",
u,h);
//fprintf(stderr,"scp gk252:/e/test-spider/doc.%"UINT64".* /home/mwells/gigablast/test-parser/\n",h);

50
Xml.cpp
View File

@ -443,6 +443,7 @@ bool Xml::set ( char *s ,
bool inComment1 = false;
bool inComment2 = false;
bool inComment3 = false;
bool inComment4 = false;
bool escaped = false;
// bool foo = false;
// if ( m_xmlLen == 13257 ) { //pstart - m_xml == 88881 ) {
@ -456,7 +457,9 @@ bool Xml::set ( char *s ,
// adding these new quote checks may cause a few
// parsing inconsistencies for pages a hanful of pages
//
if ( p[0] =='\n' ) {
// windows-based html pages use 13 sometimes and no
// \n at all...
if ( p[0] =='\n' || p[0] == 13 ) { // ^m = 13 = CR
//newLine = true;
inComment1 = false;
}
@ -470,7 +473,8 @@ bool Xml::set ( char *s ,
p[2] == '-' && p[2] == '-' &&
! inSingles && ! inDoubles &&
! inComment1 &&
! inComment2 )
! inComment2 &&
! inComment4 )
inComment3 = true;
if ( p[0] == '-' && p[1] == '-' &&
p[2] == '>' &&
@ -478,14 +482,46 @@ bool Xml::set ( char *s ,
inComment3 = false;
if ( p[0] == '/' && p[1]=='/'&&
! inSingles && ! inDoubles &&
! inComment2 && ! inComment3 )
! inComment2 &&
! inComment3 &&
// allow for "//<![CDATA[..." to end in
// "//]]>" so ignore if inComment4 is true.
// i'd say these are the weaker of all 4
// comment types in that regard.
! inComment4 )
inComment1 = true;
// handle /* */ comments
if ( p[0] == '/' && p[1]=='*' &&
! inSingles && ! inDoubles &&
! inComment1 && ! inComment3 )
! inComment1 &&
! inComment3 &&
! inComment4 )
inComment2 = true;
if ( p[0] == '*' && p[1]=='/' )
// <![CDATA[...]]> "comments" in <script> tags
// are common. CDATA tags seem to prevail even if
// within another comment tag, like i am seeing
// "//<![CDATA[..." a lot.
if ( p[0] == '<' &&
p[1] == '!' &&
p[2] == '[' &&
p[3] == 'C' &&
p[4] == 'D' &&
p[5] == 'A' &&
p[6] == 'T' &&
p[7] == 'A' &&
p[8] == '['
//! inComment1 &&
//! inComment2 &&
//! inComment3 )
)
inComment4 = true;
if ( p[0] == ']' &&
p[1] == ']' &&
p[2] == '>' )
inComment4 = false;
if ( p[0] == '*' &&
p[1]=='/' &&
! inComment4 )
inComment2 = false;
// no longer the start of a newLine
//newLine = false;
@ -502,6 +538,10 @@ bool Xml::set ( char *s ,
escaped = false;
continue;
}
if ( inComment4 && newVersion ) {
escaped = false;
continue;
}
// if an unescaped double quote
if ( p[0] == '\"' && ! escaped && ! inSingles )
inDoubles = ! inDoubles;