fix(html/codegen): Handle HTML entities correctly (#4782)

This commit is contained in:
Alexander Akait 2022-05-27 11:39:30 +03:00 committed by GitHub
parent 58ed4a2723
commit d833057d79
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 357 additions and 52 deletions

View File

@ -655,11 +655,9 @@ where
attribute.push_str(&minifier);
} else {
let quote = if value.contains('"') { '\'' } else { '"' };
let normalized = normalize_attribute_value(value);
attribute.push(quote);
attribute.push_str(value);
attribute.push(quote);
attribute.push_str(&normalized);
}
}
@ -671,26 +669,16 @@ where
if self.ctx.skip_escape_text {
write_str!(self, n.span, &n.value);
} else {
let mut data = String::new();
let mut data = String::with_capacity(n.value.len());
if self.ctx.need_extra_newline_in_text && n.value.contains('\n') {
data.push('\n');
}
for c in n.value.chars() {
match c {
'&' => {
data.push_str(&String::from("&"));
}
'<' => {
data.push_str(&String::from("&lt;"));
}
'>' => {
data.push_str(&String::from("&gt;"));
}
'\u{00A0}' => data.push_str(&String::from("&nbsp;")),
_ => data.push(c),
}
if self.config.minify {
data.push_str(&minify_text(&n.value));
} else {
data.push_str(&escape_string(&n.value, false));
}
write_str!(self, n.span, &data);
@ -699,7 +687,7 @@ where
#[emitter]
fn emit_comment(&mut self, n: &Comment) -> Result {
let mut comment = String::new();
let mut comment = String::with_capacity(n.data.len() + 7);
comment.push_str("<!--");
comment.push_str(&n.data);
@ -1023,7 +1011,7 @@ fn minify_attribute_value(value: &str) -> String {
return "\"\"".to_string();
}
let mut minified = String::new();
let mut minified = String::with_capacity(value.len());
let mut unquoted = true;
let mut dq = 0;
@ -1031,6 +1019,11 @@ fn minify_attribute_value(value: &str) -> String {
for c in value.chars() {
match c {
'&' => {
minified.push_str("&amp;");
continue;
}
c if c.is_ascii_whitespace() => {
unquoted = false;
}
@ -1063,6 +1056,75 @@ fn minify_attribute_value(value: &str) -> String {
}
}
// Produces the serialized form of an attribute value: the value escaped in
// attribute mode and wrapped in double quotes. An empty value is emitted as
// a bare pair of double quotes.
fn normalize_attribute_value(value: &str) -> String {
    const QUOTE: char = '"';

    // Two extra bytes for the surrounding quotes.
    let mut quoted = String::with_capacity(value.len() + 2);

    quoted.push(QUOTE);

    // Nothing to escape for an empty value; the result is just `""`.
    if !value.is_empty() {
        let escaped = escape_string(value, true);

        quoted.push_str(&escaped);
    }

    quoted.push(QUOTE);

    quoted
}
// Escapes text content for minified output. Only `&` and `<` are rewritten
// (to `&amp;` and `&lt;`); every other character, including `>` and
// U+00A0 NO-BREAK SPACE, is copied through unchanged.
fn minify_text(value: &str) -> String {
    value
        .chars()
        .fold(String::with_capacity(value.len()), |mut out, ch| {
            if ch == '&' {
                out.push_str("&amp;");
            } else if ch == '<' {
                out.push_str("&lt;");
            } else {
                out.push(ch);
            }

            out
        })
}
// Escapes `value` for serialization, one character at a time:
//
// - "&" always becomes "&amp;".
//
// - U+00A0 NO-BREAK SPACE always becomes "&nbsp;".
//
// - In attribute mode (`is_attribute_mode == true`), the double quote
//   character becomes "&quot;"; "<" and ">" are left as they are.
//
// - Outside attribute mode, "<" becomes "&lt;" and ">" becomes "&gt;"; the
//   double quote character is left as it is.
//
// All other characters are copied through unchanged.
fn escape_string(value: &str, is_attribute_mode: bool) -> String {
    let mut escaped = String::with_capacity(value.len());

    for ch in value.chars() {
        // Pick the replacement entity, if any, for this character and mode.
        let entity = match (ch, is_attribute_mode) {
            ('&', _) => "&amp;",
            ('\u{00A0}', _) => "&nbsp;",
            ('"', true) => "&quot;",
            ('<', false) => "&lt;",
            ('>', false) => "&gt;",
            _ => {
                // No escaping needed; keep the character verbatim.
                escaped.push(ch);
                continue;
            }
        };

        escaped.push_str(entity);
    }

    escaped
}
fn is_html_tag_name(namespace: Namespace, tag_name: &str) -> bool {
if namespace != Namespace::HTML {
return false;

View File

@ -395,10 +395,19 @@ fn test_indent_type_option(input: PathBuf) {
);
}
// TODO minified verification
#[testing::fixture("../swc_html_parser/tests/fixture/**/*.html")]
fn parser_verify(input: PathBuf) {
verify_document(&input, None, None, None, false);
verify_document(
&input,
None,
None,
Some(CodegenConfig {
scripting_enabled: false,
minify: true,
}),
false,
);
}
#[testing::fixture(
@ -408,16 +417,48 @@ fn parser_verify(input: PathBuf) {
"document_type/wrong-name/input.html",
"text/cr-charref-novalid/input.html",
"element/foreign-context/input.html",
"element/a-4/input.html",
"element/b-3/input.html",
"element/template-1/input.html",
)
)]
fn parser_recovery_verify(input: PathBuf) {
verify_document(&input, None, None, None, true);
verify_document(
&input,
None,
None,
Some(CodegenConfig {
scripting_enabled: false,
minify: true,
}),
true,
);
}
// TODO - remove exclude when we implement `raw`, `context_element` and etc
// TODO - investigate, exclude some when we implement `raw`, `context_element`
#[testing::fixture(
"../swc_html_parser/tests/html5lib-tests-fixture/**/*.html",
exclude(
"adoption01_dat/5.html",
"adoption01_dat/6.html",
"adoption01_dat/7.html",
"adoption01_dat/8.html",
"adoption02_dat/0.html",
"comments01_dat/15.html",
"template_dat/68.html",
"tests1_dat/68.html",
"tests1_dat/69.html",
"tests1_dat/70.html",
"tests1_dat/71.html",
"tests1_dat/87.html",
"tests15_dat/0.html",
"tests15_dat/1.html",
"tests15_dat/4.html",
"tests15_dat/5.html",
"tests18_dat/33.html",
"tests19_dat/1.html",
"tests19_dat/39.html",
"tests19_dat/97.html",
"tests16_dat/131.html",
"plain-text-unsafe_dat/0.html",
"template_dat/107.html",
@ -452,16 +493,31 @@ fn parser_recovery_verify(input: PathBuf) {
"tests18_dat/7.html",
"tests18_dat/8.html",
"tests18_dat/9.html",
"tests19_dat/98.html",
"tests19_dat/103.html",
"tests25_dat/2.html",
"tests25_dat/3.html",
"tests25_dat/4.html",
"tests1_dat/103.html",
"tests1_dat/30.html",
"tests1_dat/77.html",
"tests1_dat/90.html",
"tests2_dat/45.html",
"tests2_dat/46.html",
"tests7_dat/0.html",
"tests7_dat/1.html",
"tricky01_dat/6.html",
"webkit01_dat/21.html",
"webkit01_dat/22.html",
"webkit01_dat/24.html",
"webkit01_dat/25.html",
"webkit01_dat/28.html",
"tests20_dat/41.html",
"tests26_dat/2.html",
"tests2_dat/12.html",
"tests4_dat/3.fragment.style.html",
"tests4_dat/4.fragment.plaintext.html",
"tests_innerHTML_1_dat/82.fragment.html.html",
)
)]
fn html5lib_tests_verify(input: PathBuf) {
@ -475,6 +531,10 @@ fn html5lib_tests_verify(input: PathBuf) {
minify: false,
scripting_enabled,
};
let minified_codegen_config = CodegenConfig {
minify: true,
scripting_enabled,
};
if file_stem.contains("fragment") {
let mut context_element_namespace = Namespace::HTML;
@ -515,12 +575,20 @@ fn html5lib_tests_verify(input: PathBuf) {
verify_document_fragment(
&input,
context_element,
context_element.clone(),
Some(parser_config),
None,
Some(codegen_config),
true,
);
verify_document_fragment(
&input,
context_element,
Some(parser_config),
None,
Some(minified_codegen_config),
true,
);
} else {
verify_document(
&input,
@ -529,5 +597,12 @@ fn html5lib_tests_verify(input: PathBuf) {
Some(codegen_config),
true,
);
verify_document(
&input,
Some(parser_config),
None,
Some(minified_codegen_config),
true,
);
}
}

View File

@ -95,5 +95,61 @@ foo
<div data-test='\foo' class='bar'>test</div>
<div data-test='\\foo' class='bar'>test</div>
<span title='test "with" &amp;quot;'>test</span>
<span title='test "with" &amp; quot'>test</span>
<span title='test "with" &amp;test'>test</span>
<span title='test "with" &amptest'>test</span>
<span title='test "with" &lt;'>test</span>
<span title='test "with" &gt;'>test</span>
<span title=foo>Test</span>
<span title=foo<bar>Test</span>
<span title="foo=bar">Test</span>
<span title="foo>bar">Test</span>
<span title='foo"bar'>Test</span>
<span title="foo'bar">Test</span>
<div>
&amp;quot;
</div>
<script>
let foo = "&amp;";
</script>
<style>
.class::before {
content: "&amp;";
}
</style>
<div>
foo & bar
foo&<i>bar</i>
foo&&& bar
</div>
<pre><code>Label current;
// Load effective address of current instruction into rcx.
__ leaq(rcx, Operand(&amp;current));
__ bind(&amp;current);
</code></pre>
<div>
&amp;xxx; &amp;xxx &amp;thorn; &amp;thorn &amp;curren;t &amp;current &amp;current; &amp;&amp;
&amp;gt
&amp;unknown;
&#x00026;current
&#x00026;current;
&#38;current
&#38;current;
&oslash &osLash &Oslash
&amp;&oslash; &amp;&osLash; &amp;&Oslash;
&amp&oslash; &amp&osLash; &amp&Oslash;
&amp;oslash; &amp;osLash; &amp;Oslash;
&amposlash; &amposLash; &ampOslash;
</div>
</body>
</html>

View File

@ -49,7 +49,7 @@
<a title="x" href=" ">foo</a>
<p id="" class="" title="">x</p>
<p x="x'">x</p>
<p x='x"'>x</p>
<p x="x&quot;">x</p>
<a href="#"><p>Click me</p></a>
<span><button>Hit me</button></span>
<!--TODO FIX ME -->
@ -68,20 +68,20 @@
<div data-test="foo" class="bar">test</div>
<div data-test="foo bar" class="bar">test</div>
<div data-test='foo " bar' class="bar">test</div>
<div data-test="foo &quot; bar" class="bar">test</div>
<div data-test="foo ' bar" class="bar">test</div>
<div data-test='foo " bar' class="bar">test</div>
<div data-test="foo &quot; bar" class="bar">test</div>
<div data-test="foo ' bar" class="bar">test</div>
<div data-test='foo """ bar' class="bar">test</div>
<div data-test="foo &quot;&quot;&quot; bar" class="bar">test</div>
<div data-test="foo ''' bar" class="bar">test</div>
<div data-test='"foo"' class="bar">test</div>
<div data-test='"foo"' class="bar">test</div>
<div data-test='" foo "' class="bar">test</div>
<div data-test='"foo"' class="bar">test</div>
<div data-test='"foo"' class="bar">test</div>
<div data-test='" foo "' class="bar">test</div>
<div data-test="&quot;foo&quot;" class="bar">test</div>
<div data-test="&quot;foo&quot;" class="bar">test</div>
<div data-test="&quot; foo &quot;" class="bar">test</div>
<div data-test="&quot;foo&quot;" class="bar">test</div>
<div data-test="&quot;foo&quot;" class="bar">test</div>
<div data-test="&quot; foo &quot;" class="bar">test</div>
<div data-test="'foo'" class="bar">test</div>
<div data-test="'foo'" class="bar">test</div>
<div data-test="
@ -91,6 +91,62 @@ foo
<div data-test="\foo" class="bar">test</div>
<div data-test="\\foo" class="bar">test</div>
<span title="test &quot;with&quot; &amp;quot;">test</span>
<span title="test &quot;with&quot; &amp; quot">test</span>
<span title="test &quot;with&quot; &amp;test">test</span>
<span title="test &quot;with&quot; &amp;amptest">test</span>
<span title="test &quot;with&quot; <">test</span>
<span title="test &quot;with&quot; >">test</span>
<span title="foo">Test</span>
<span title="foo<bar">Test</span>
<span title="foo=bar">Test</span>
<span title="foo>bar">Test</span>
<span title="foo&quot;bar">Test</span>
<span title="foo'bar">Test</span>
<div>
&amp;quot;
</div>
<script>
let foo = "&amp;";
</script>
<style>
.class::before {
content: "&amp;";
}
</style>
<div>
foo &amp; bar
foo&amp;<i>bar</i>
foo&amp;&amp;&amp; bar
</div>
<pre><code>Label current;
// Load effective address of current instruction into rcx.
__ leaq(rcx, Operand(&amp;current));
__ bind(&amp;current);
</code></pre>
<div>
&amp;xxx; &amp;xxx &amp;thorn; &amp;thorn &amp;curren;t &amp;current &amp;current; &amp;&amp;
&amp;gt
&amp;unknown;
&amp;current
&amp;current;
&amp;current
&amp;current;
ø &amp;osLash Ø
&amp;ø &amp;&amp;osLash; &amp;Ø
&amp;ø &amp;&amp;osLash; &amp;Ø
&amp;oslash; &amp;osLash; &amp;Oslash;
&amp;oslash; &amp;osLash; &amp;Oslash;
</div>
</body></html>

View File

@ -89,5 +89,61 @@ foo
<div data-test=\foo class=bar>test</div>
<div data-test=\\foo class=bar>test</div>
<span title='test "with" &amp;quot;'>test</span>
<span title='test "with" &amp; quot'>test</span>
<span title='test "with" &amp;test'>test</span>
<span title='test "with" &amp;amptest'>test</span>
<span title='test "with" <'>test</span>
<span title='test "with" >'>test</span>
<span title=foo>Test</span>
<span title="foo<bar">Test</span>
<span title="foo=bar">Test</span>
<span title="foo>bar">Test</span>
<span title='foo"bar'>Test</span>
<span title="foo'bar">Test</span>
<div>
&amp;quot;
</div>
<script>
let foo = "&amp;";
</script>
<style>
.class::before {
content: "&amp;";
}
</style>
<div>
foo &amp; bar
foo&amp;<i>bar</i>
foo&amp;&amp;&amp; bar
</div>
<pre><code>Label current;
// Load effective address of current instruction into rcx.
__ leaq(rcx, Operand(&amp;current));
__ bind(&amp;current);
</code></pre>
<div>
&amp;xxx; &amp;xxx &amp;thorn; &amp;thorn &amp;curren;t &amp;current &amp;current; &amp;&amp;
&amp;gt
&amp;unknown;
&amp;current
&amp;current;
&amp;current
&amp;current;
ø &amp;osLash Ø
&amp;ø &amp;&amp;osLash; &amp;Ø
&amp;ø &amp;&amp;osLash; &amp;Ø
&amp;oslash; &amp;osLash; &amp;Oslash;
&amp;oslash; &amp;osLash; &amp;Oslash;
</div>

View File

@ -30,8 +30,8 @@
<div>A space character: &amp;&amp;</div>
<div>I'm ∉ I tell you</div>
<a href="http://lmgtfy.com/?l=1&q=rick+roll">tired meme</a>
<a href="#" onclick="window.location='?l=1&q=rick+roll';return false">
<a href="http://lmgtfy.com/?l=1&amp;q=rick+roll">tired meme</a>
<a href="#" onclick="window.location='?l=1&amp;q=rick+roll';return false">
kablammo!
</a>

View File

@ -5,9 +5,9 @@
<div>A space character: &amp;</div>
<div>A space character: &amp;</div>
<div>A space character: &nbsp;</div>
<div>A space character:  </div>
<div>The less-than sign: &lt;</div>
<div>The greater-than sign: &gt;</div>
<div>The greater-than sign: ></div>
<div>The double quote sign: "</div>
<div>The single quote sign: '</div>
<div>The cent sign: ¢</div>
@ -28,8 +28,8 @@
<div>A space character: &amp;&amp;</div>
<div>I'm ∉ I tell you</div>
<a href="http://lmgtfy.com/?l=1&q=rick+roll">tired meme</a>
<a href=# onclick="window.location='?l=1&q=rick+roll';return false">
<a href="http://lmgtfy.com/?l=1&amp;q=rick+roll">tired meme</a>
<a href=# onclick="window.location='?l=1&amp;q=rick+roll';return false">
kablammo!
</a>

View File

@ -146,7 +146,7 @@
<col span=2 class=flash>
</colgroup>
<tr>
<td>&nbsp;</td>
<td> </td>
<th scope=col>Batman</th>
<th scope=col>Robin</th>
<th scope=col>The Flash</th>
@ -168,7 +168,7 @@
<col span=2 class=batman>
<col span=2 class=flash>
<tr>
<td>&nbsp;</td>
<td> </td>
<th scope=col>Batman</th>
<th scope=col>Robin</th>
<th scope=col>The Flash</th>
@ -184,7 +184,7 @@
<col span=2 class=flash>
</colgroup><!-- comment -->
<tr>
<td>&nbsp;</td>
<td> </td>
<th scope=col>Batman</th>
<th scope=col>Robin</th>
<th scope=col>The Flash</th>
@ -206,7 +206,7 @@
<col span=2 class=flash>
</colgroup> <tbody>
<tr>
<td>&nbsp;</td>
<td> </td>
<th scope=col>Batman</th>
<th scope=col>Robin</th>
<th scope=col>The Flash</th>
@ -218,7 +218,7 @@
<table>
<caption>He-Man and Skeletor facts</caption>
<tr>
<td>&nbsp;</td>
<td> </td>
<th scope=col class=heman>He-Man</th>
<th scope=col class=skeletor>Skeletor</th>
</tr>
@ -231,7 +231,7 @@
<table>
<caption>He-Man and Skeletor facts<tr>
<td>&nbsp;</td>
<td> </td>
<th scope=col class=heman>He-Man</th>
<th scope=col class=skeletor>Skeletor</th>
</tr>
@ -244,7 +244,7 @@
<table>
<caption>He-Man and Skeletor facts</caption><!-- comment --> <tr>
<td>&nbsp;</td>
<td> </td>
<th scope=col class=heman>He-Man</th>
<th scope=col class=skeletor>Skeletor</th>
</tr>

View File

@ -1,8 +1,8 @@
<!doctype html><html lang=en><title>Document</title></head>
<body>
<div id='John"&Harry'>Test</div>
<div id='John"&HarryOther'>Test</div>
<div id='John"&amp;Harry'>Test</div>
<div id='John"&amp;HarryOther'>Test</div>
<div id='John"<HarryOtherOther'>Test</div>
<div id=John HarryOtherOtherOther>Test</div>
<div id="John<test>">Test</div>
@ -18,9 +18,9 @@
<p> This Registered Trademark is a Script used in HTML document. </p>
<div>I want to display &lt;br&gt; tag</div>
<div>I want to display &lt;br> tag</div>
<div>I want to display &lt;i&gt; tag</div>
<div>I want to display &lt;i> tag</div>
<div>The cent sign: ¢</div>
<div>The cent sign: ¢</div>