Implement html_escape and html_unescape.

This PR removes the dependency on the html-escape crate.
This commit is contained in:
jcamiel 2022-12-11 20:58:05 +01:00
parent 65b588f770
commit c2f0fe46ba
No known key found for this signature in database
GPG Key ID: 07FF11CFD55356CC
13 changed files with 2966 additions and 39 deletions

17
Cargo.lock generated
View File

@ -495,15 +495,6 @@ version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7ebdb29d2ea9ed0083cd8cece49bbd968021bd99b0849edb4a9a7ee0fdf6a4e0"
[[package]]
name = "html-escape"
version = "0.2.12"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "15315cfa9503e9aa85a477138eff76a1b203a430703548052c330b69d8d8c205"
dependencies = [
"utf8-width",
]
[[package]]
name = "hurl"
version = "2.0.0-SNAPSHOT"
@ -521,9 +512,9 @@ dependencies = [
"glob",
"hex",
"hex-literal",
"html-escape",
"hurl_core",
"indexmap",
"lazy_static",
"libflate",
"libxml",
"md5",
@ -1192,12 +1183,6 @@ dependencies = [
"percent-encoding",
]
[[package]]
name = "utf8-width"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5190c9442dcdaf0ddd50f37420417d219ae5261bbf5db120d0f9bab996c9cba1"
[[package]]
name = "vcpkg"
version = "0.2.15"

View File

@ -3,7 +3,7 @@
<span class="line"><span class="version">HTTP</span> <span class="number">200</span></span>
<span class="line section-header">[Captures]</span>
<span class="line"><span class="name">url</span><span>:</span> <span class="query-type">jsonpath</span> <span class="string">"$.url"</span></span>
<span class="line"><span class="name">html</span><span>:</span> <span class="query-type">jsonpath</span> <span class="string">"$.html"</span></span>
<span class="line"><span class="name">text</span><span>:</span> <span class="query-type">jsonpath</span> <span class="string">"$.text"</span></span>
<span class="line section-header">[Asserts]</span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.list"</span> <span class="filter-type">count</span> <span class="predicate-type">==</span> <span class="number">3</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.message"</span> <span class="filter-type">regex</span> <span class="regex">/Hello (.*)!/</span> <span class="predicate-type">==</span> <span class="string">"Bob"</span></span>
@ -11,10 +11,12 @@
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.url"</span> <span class="filter-type">urlEncode</span> <span class="predicate-type">==</span> <span class="string">"https%3A//mozilla.org/%3Fx%3D%D1%88%D0%B5%D0%BB%D0%BB%D1%8B"</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.encoded_url"</span> <span class="filter-type">urlDecode</span> <span class="predicate-type">==</span> <span class="string">"https://mozilla.org/?x=шеллы"</span></span>
<span class="line"><span class="query-type">variable</span> <span class="string">"url"</span> <span class="filter-type">urlEncode</span> <span class="filter-type">urlDecode</span> <span class="predicate-type">==</span> <span class="string">"{{url}}"</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.html"</span> <span class="predicate-type">==</span> <span class="string">"a &gt; b &amp;&amp; a &lt; c"</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.html"</span> <span class="filter-type">htmlEscape</span> <span class="predicate-type">==</span> <span class="string">"a &amp;gt; b &amp;amp;&amp;amp; a &amp;lt; c"</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.encoded_html"</span> <span class="filter-type">htmlUnescape</span> <span class="predicate-type">==</span> <span class="string">"a &gt; b &amp;&amp; a &lt; c"</span></span>
<span class="line"><span class="query-type">variable</span> <span class="string">"html"</span> <span class="filter-type">htmlEscape</span> <span class="filter-type">htmlUnescape</span> <span class="predicate-type">==</span> <span class="string">"{{html}}"</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.text"</span> <span class="predicate-type">==</span> <span class="string">"a &gt; b &amp;&amp; a &lt; c"</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.text"</span> <span class="filter-type">htmlEscape</span> <span class="predicate-type">==</span> <span class="string">"a &amp;gt; b &amp;amp;&amp;amp; a &amp;lt; c"</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.escaped_html[0]"</span> <span class="filter-type">htmlUnescape</span> <span class="predicate-type">==</span> <span class="string">"a &gt; b &amp;&amp; a &lt; c"</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.escaped_html[1]"</span> <span class="filter-type">htmlUnescape</span> <span class="predicate-type">==</span> <span class="string">"Foo © bar 𝌆 baz ☃ qux"</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.escaped_html[2]"</span> <span class="filter-type">htmlUnescape</span> <span class="predicate-type">==</span> <span class="string">"A foo"</span></span>
<span class="line"><span class="query-type">variable</span> <span class="string">"text"</span> <span class="filter-type">htmlEscape</span> <span class="filter-type">htmlUnescape</span> <span class="predicate-type">==</span> <span class="string">"{{text}}"</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.id"</span> <span class="filter-type">toInt</span> <span class="predicate-type">==</span> <span class="number">123</span></span>
<span class="line"><span class="query-type">jsonpath</span> <span class="string">"$.score"</span> <span class="filter-type">toInt</span> <span class="predicate-type">==</span> <span class="number">1</span></span>
<span class="json"><span class="line">{</span>
@ -22,8 +24,12 @@
<span class="line"> "message": "Hello Bob!",</span>
<span class="line"> "url": "https://mozilla.org/?x=шеллы",</span>
<span class="line"> "encoded_url": "https://mozilla.org/?x=%D1%88%D0%B5%D0%BB%D0%BB%D1%8B",</span>
<span class="line"> "html": "a &gt; b &amp;&amp; a &lt; c",</span>
<span class="line"> "encoded_html": "a &amp;gt; b &amp;amp;&amp;amp; a &amp;lt; c",</span>
<span class="line"> "text": "a &gt; b &amp;&amp; a &lt; c",</span>
<span class="line"> "escaped_html": [</span>
<span class="line"> "a &amp;gt; b &amp;amp;&amp;amp; a &amp;lt; c",</span>
<span class="line"> "Foo &amp;#xA9; bar &amp;#x1D306; baz &amp;#x2603; qux",</span>
<span class="line"> "&amp;#65 foo"</span>
<span class="line"> ],</span>
<span class="line"> "id": "123",</span>
<span class="line"> "score": 1.6</span>
<span class="line">}</span></span>

View File

@ -3,7 +3,7 @@ GET http://localhost:8000/filter
HTTP 200
[Captures]
url: jsonpath "$.url"
html: jsonpath "$.html"
text: jsonpath "$.text"
[Asserts]
jsonpath "$.list" count == 3
jsonpath "$.message" regex /Hello (.*)!/ == "Bob"
@ -11,10 +11,12 @@ jsonpath "$.url" == "https://mozilla.org/?x=шеллы"
jsonpath "$.url" urlEncode == "https%3A//mozilla.org/%3Fx%3D%D1%88%D0%B5%D0%BB%D0%BB%D1%8B"
jsonpath "$.encoded_url" urlDecode == "https://mozilla.org/?x=шеллы"
variable "url" urlEncode urlDecode == "{{url}}"
jsonpath "$.html" == "a > b && a < c"
jsonpath "$.html" htmlEscape == "a &gt; b &amp;&amp; a &lt; c"
jsonpath "$.encoded_html" htmlUnescape == "a > b && a < c"
variable "html" htmlEscape htmlUnescape == "{{html}}"
jsonpath "$.text" == "a > b && a < c"
jsonpath "$.text" htmlEscape == "a &gt; b &amp;&amp; a &lt; c"
jsonpath "$.escaped_html[0]" htmlUnescape == "a > b && a < c"
jsonpath "$.escaped_html[1]" htmlUnescape == "Foo © bar 𝌆 baz ☃ qux"
jsonpath "$.escaped_html[2]" htmlUnescape == "A foo"
variable "text" htmlEscape htmlUnescape == "{{text}}"
jsonpath "$.id" toInt == 123
jsonpath "$.score" toInt == 1
{
@ -22,8 +24,12 @@ jsonpath "$.score" toInt == 1
"message": "Hello Bob!",
"url": "https://mozilla.org/?x=шеллы",
"encoded_url": "https://mozilla.org/?x=%D1%88%D0%B5%D0%BB%D0%BB%D1%8B",
"html": "a > b && a < c",
"encoded_html": "a &gt; b &amp;&amp; a &lt; c",
"text": "a > b && a < c",
"escaped_html": [
"a &gt; b &amp;&amp; a &lt; c",
"Foo &#xA9; bar &#x1D306; baz &#x2603; qux",
"&#65 foo"
],
"id": "123",
"score": 1.6
}

View File

@ -1 +1 @@
{"entries":[{"request":{"method":"GET","url":"http://localhost:8000/filter"},"response":{"status":200,"captures":[{"name":"url","query":{"type":"jsonpath","expr":"$.url"}},{"name":"html","query":{"type":"jsonpath","expr":"$.html"}}],"asserts":[{"query":{"type":"jsonpath","expr":"$.list"},"filters":[{"type":"count"}],"predicate":{"type":"equal","value":3}},{"query":{"type":"jsonpath","expr":"$.message"},"filters":[{"type":"regex","expr":{"type":"regex","value":"Hello (.*)!"}}],"predicate":{"type":"equal","value":"Bob"}},{"query":{"type":"jsonpath","expr":"$.url"},"predicate":{"type":"equal","value":"https://mozilla.org/?x=шеллы"}},{"query":{"type":"jsonpath","expr":"$.url"},"filters":[{"type":"urlEncode"}],"predicate":{"type":"equal","value":"https%3A//mozilla.org/%3Fx%3D%D1%88%D0%B5%D0%BB%D0%BB%D1%8B"}},{"query":{"type":"jsonpath","expr":"$.encoded_url"},"filters":[{"type":"urlDecode"}],"predicate":{"type":"equal","value":"https://mozilla.org/?x=шеллы"}},{"query":{"type":"variable","name":"url"},"filters":[{"type":"urlEncode"},{"type":"urlDecode"}],"predicate":{"type":"equal","value":"{{url}}"}},{"query":{"type":"jsonpath","expr":"$.html"},"predicate":{"type":"equal","value":"a > b && a < c"}},{"query":{"type":"jsonpath","expr":"$.html"},"filters":[{"type":"htmlEscape"}],"predicate":{"type":"equal","value":"a &gt; b &amp;&amp; a &lt; c"}},{"query":{"type":"jsonpath","expr":"$.encoded_html"},"filters":[{"type":"htmlUnescape"}],"predicate":{"type":"equal","value":"a > b && a < c"}},{"query":{"type":"variable","name":"html"},"filters":[{"type":"htmlEscape"},{"type":"htmlUnescape"}],"predicate":{"type":"equal","value":"{{html}}"}},{"query":{"type":"jsonpath","expr":"$.id"},"filters":[{"type":"toInt"}],"predicate":{"type":"equal","value":123}},{"query":{"type":"jsonpath","expr":"$.score"},"filters":[{"type":"toInt"}],"predicate":{"type":"equal","value":1}}],"body":{"type":"json","value":{"list":[1,2,3],"message":"Hello 
Bob!","url":"https://mozilla.org/?x=шеллы","encoded_url":"https://mozilla.org/?x=%D1%88%D0%B5%D0%BB%D0%BB%D1%8B","html":"a > b && a < c","encoded_html":"a &gt; b &amp;&amp; a &lt; c","id":"123","score":1.6}}}}]}
{"entries":[{"request":{"method":"GET","url":"http://localhost:8000/filter"},"response":{"status":200,"captures":[{"name":"url","query":{"type":"jsonpath","expr":"$.url"}},{"name":"text","query":{"type":"jsonpath","expr":"$.text"}}],"asserts":[{"query":{"type":"jsonpath","expr":"$.list"},"filters":[{"type":"count"}],"predicate":{"type":"equal","value":3}},{"query":{"type":"jsonpath","expr":"$.message"},"filters":[{"type":"regex","expr":{"type":"regex","value":"Hello (.*)!"}}],"predicate":{"type":"equal","value":"Bob"}},{"query":{"type":"jsonpath","expr":"$.url"},"predicate":{"type":"equal","value":"https://mozilla.org/?x=шеллы"}},{"query":{"type":"jsonpath","expr":"$.url"},"filters":[{"type":"urlEncode"}],"predicate":{"type":"equal","value":"https%3A//mozilla.org/%3Fx%3D%D1%88%D0%B5%D0%BB%D0%BB%D1%8B"}},{"query":{"type":"jsonpath","expr":"$.encoded_url"},"filters":[{"type":"urlDecode"}],"predicate":{"type":"equal","value":"https://mozilla.org/?x=шеллы"}},{"query":{"type":"variable","name":"url"},"filters":[{"type":"urlEncode"},{"type":"urlDecode"}],"predicate":{"type":"equal","value":"{{url}}"}},{"query":{"type":"jsonpath","expr":"$.text"},"predicate":{"type":"equal","value":"a > b && a < c"}},{"query":{"type":"jsonpath","expr":"$.text"},"filters":[{"type":"htmlEscape"}],"predicate":{"type":"equal","value":"a &gt; b &amp;&amp; a &lt; c"}},{"query":{"type":"jsonpath","expr":"$.escaped_html[0]"},"filters":[{"type":"htmlUnescape"}],"predicate":{"type":"equal","value":"a > b && a < c"}},{"query":{"type":"jsonpath","expr":"$.escaped_html[1]"},"filters":[{"type":"htmlUnescape"}],"predicate":{"type":"equal","value":"Foo © bar 𝌆 baz ☃ qux"}},{"query":{"type":"jsonpath","expr":"$.escaped_html[2]"},"filters":[{"type":"htmlUnescape"}],"predicate":{"type":"equal","value":"A 
foo"}},{"query":{"type":"variable","name":"text"},"filters":[{"type":"htmlEscape"},{"type":"htmlUnescape"}],"predicate":{"type":"equal","value":"{{text}}"}},{"query":{"type":"jsonpath","expr":"$.id"},"filters":[{"type":"toInt"}],"predicate":{"type":"equal","value":123}},{"query":{"type":"jsonpath","expr":"$.score"},"filters":[{"type":"toInt"}],"predicate":{"type":"equal","value":1}}],"body":{"type":"json","value":{"list":[1,2,3],"message":"Hello Bob!","url":"https://mozilla.org/?x=шеллы","encoded_url":"https://mozilla.org/?x=%D1%88%D0%B5%D0%BB%D0%BB%D1%8B","text":"a > b && a < c","escaped_html":["a &gt; b &amp;&amp; a &lt; c","Foo &#xA9; bar &#x1D306; baz &#x2603; qux","&#65 foo"],"id":"123","score":1.6}}}}]}

View File

@ -0,0 +1,14 @@
{
"list": [1,2,3],
"message": "Hello Bob!",
"url": "https://mozilla.org/?x=шеллы",
"encoded_url": "https://mozilla.org/?x=%D1%88%D0%B5%D0%BB%D0%BB%D1%8B",
"text": "a > b && a < c",
"escaped_html": [
"a &gt; b &amp;&amp; a &lt; c",
"Foo &#xA9; bar &#x1D306; baz &#x2603; qux",
"&#65 foo"
],
"id": "123",
"score": 1.6
}

View File

@ -8,8 +8,12 @@ def filter():
"message": "Hello Bob!",
"url": "https://mozilla.org/?x=шеллы",
"encoded_url": "https://mozilla.org/?x=%D1%88%D0%B5%D0%BB%D0%BB%D1%8B",
"html": "a > b && a < c",
"encoded_html": "a &gt; b &amp;&amp; a &lt; c",
"text": "a > b && a < c",
"escaped_html": [
"a &gt; b &amp;&amp; a &lt; c",
"Foo &#xA9; bar &#x1D306; baz &#x2603; qux",
"&#65 foo"
],
"id": "123",
"score": 1.6
}"""

View File

@ -29,7 +29,6 @@ float-cmp = "0.9.0"
glob = "0.3.0"
hex = "0.4.3"
hex-literal = "0.3.4"
html-escape = "0.2.12"
hurl_core = { version = "2.0.0-SNAPSHOT", path = "../hurl_core" }
indexmap = "1.9.2"
libflate = "1.2.0"
@ -42,7 +41,7 @@ serde_json = "1.0.89"
sha2 = "0.10.6"
url = "2.3.1"
xmltree = { version = "0.10.3", features = ["attribute-order"] }
lazy_static = "1.4.0"
[target.'cfg(unix)'.dependencies]
termion = "2.0.1"

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,66 @@
/*
* Hurl (https://hurl.dev)
* Copyright (C) 2022 Orange
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
/// Replaces the special characters `&`, `<` and `>` with HTML-safe sequences.
///
/// Both double quote (`"`) and single quote (`'`) characters are also
/// translated, to `&quot;` and `&#x27;` respectively. Every other character
/// is copied to the output unchanged.
///
/// # Examples
///
/// ```
/// use hurl::html;
///
/// let output = html::html_escape("<foo>");
/// assert_eq!(output, "&lt;foo&gt;")
/// ```
pub fn html_escape(text: &str) -> String {
    // The escaped text is never shorter than the input, so reserving
    // `text.len()` bytes up front avoids growing the buffer from empty.
    let mut output = String::with_capacity(text.len());
    for c in text.chars() {
        match c {
            '&' => output.push_str("&amp;"),
            '<' => output.push_str("&lt;"),
            '>' => output.push_str("&gt;"),
            '"' => output.push_str("&quot;"),
            '\'' => output.push_str("&#x27;"),
            _ => output.push(c),
        }
    }
    output
}
#[cfg(test)]
mod tests {
    use super::html_escape;

    /// Checks every character that `html_escape` translates, plus inputs
    /// that must come back unchanged.
    #[test]
    pub fn eval_html_escape() {
        let tests = [
            ("", ""),
            ("foo", "foo"),
            ("<tag>", "&lt;tag&gt;"),
            ("foo & bar", "foo &amp; bar"),
            (
                "string with double quote: \"baz\"",
                "string with double quote: &quot;baz&quot;",
            ),
            // The single quote is documented as translated but was not covered.
            (
                "string with single quote: 'baz'",
                "string with single quote: &#x27;baz&#x27;",
            ),
        ];
        for (input, output) in tests.iter() {
            assert_eq!(html_escape(input), output.to_string())
        }
    }
}

View File

@ -0,0 +1,23 @@
/*
* Hurl (https://hurl.dev)
* Copyright (C) 2022 Orange
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
//! HTML text escaping and unescaping.
//!
//! The implementation lives in private submodules; only `html_escape` and
//! `html_unescape` are re-exported from here.
mod entities;
mod escape;
mod unescape;
pub use self::escape::html_escape;
pub use self::unescape::html_unescape;

View File

@ -0,0 +1,511 @@
/*
* Hurl (https://hurl.dev)
* Copyright (C) 2022 Orange
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/
use crate::html::entities::HTML5_ENTITIES;
use lazy_static::lazy_static;
use regex::{Captures, Regex};
use std::collections::HashMap;
// Numeric character references that are not decoded to their own code point:
// each key is the referenced number, each value the text substituted for it.
// Ref https://html.spec.whatwg.org/#decimal-character-reference-start-state
lazy_static! {
    static ref INVALID_CHAR_REF: HashMap<u32, &'static str> = HashMap::from([
        (0x00, "\u{fffd}"), // REPLACEMENT CHARACTER
        (0x0d, "\r"),       // CARRIAGE RETURN
        (0x80, "\u{20ac}"), // EURO SIGN
        (0x81, "\u{81}"),   // <control>
        (0x82, "\u{201a}"), // SINGLE LOW-9 QUOTATION MARK
        (0x83, "\u{0192}"), // LATIN SMALL LETTER F WITH HOOK
        (0x84, "\u{201e}"), // DOUBLE LOW-9 QUOTATION MARK
        (0x85, "\u{2026}"), // HORIZONTAL ELLIPSIS
        (0x86, "\u{2020}"), // DAGGER
        (0x87, "\u{2021}"), // DOUBLE DAGGER
        (0x88, "\u{02c6}"), // MODIFIER LETTER CIRCUMFLEX ACCENT
        (0x89, "\u{2030}"), // PER MILLE SIGN
        (0x8a, "\u{0160}"), // LATIN CAPITAL LETTER S WITH CARON
        (0x8b, "\u{2039}"), // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        (0x8c, "\u{0152}"), // LATIN CAPITAL LIGATURE OE
        (0x8d, "\u{8d}"),   // <control>
        (0x8e, "\u{017d}"), // LATIN CAPITAL LETTER Z WITH CARON
        (0x8f, "\u{8f}"),   // <control>
        (0x90, "\u{90}"),   // <control>
        (0x91, "\u{2018}"), // LEFT SINGLE QUOTATION MARK
        (0x92, "\u{2019}"), // RIGHT SINGLE QUOTATION MARK
        (0x93, "\u{201c}"), // LEFT DOUBLE QUOTATION MARK
        (0x94, "\u{201d}"), // RIGHT DOUBLE QUOTATION MARK
        (0x95, "\u{2022}"), // BULLET
        (0x96, "\u{2013}"), // EN DASH
        (0x97, "\u{2014}"), // EM DASH
        (0x98, "\u{02dc}"), // SMALL TILDE
        (0x99, "\u{2122}"), // TRADE MARK SIGN
        (0x9a, "\u{0161}"), // LATIN SMALL LETTER S WITH CARON
        (0x9b, "\u{203a}"), // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        (0x9c, "\u{0153}"), // LATIN SMALL LIGATURE OE
        (0x9d, "\u{9d}"),   // <control>
        (0x9e, "\u{017e}"), // LATIN SMALL LETTER Z WITH CARON
        (0x9f, "\u{0178}"), // LATIN CAPITAL LETTER Y WITH DIAERESIS
    ]);
}
/// Code points that `html_unescape` decodes to an empty string when they are
/// named by a numeric character reference.
const INVALID_CODEPOINTS: [u32; 126] = [
    // 0x0001 to 0x0008
    0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008,
    // 0x000E to 0x001F
    0x000E, 0x000F, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016,
    0x0017, 0x0018, 0x0019, 0x001A, 0x001B, 0x001C, 0x001D, 0x001E, 0x001F,
    // 0x007F to 0x009F
    0x007F,
    0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087,
    0x0088, 0x0089, 0x008A, 0x008B, 0x008C, 0x008D, 0x008E, 0x008F,
    0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097,
    0x0098, 0x0099, 0x009A, 0x009B, 0x009C, 0x009D, 0x009E, 0x009F,
    // 0xFDD0 to 0xFDEF
    0xFDD0, 0xFDD1, 0xFDD2, 0xFDD3, 0xFDD4, 0xFDD5, 0xFDD6, 0xFDD7,
    0xFDD8, 0xFDD9, 0xFDDA, 0xFDDB, 0xFDDC, 0xFDDD, 0xFDDE, 0xFDDF,
    0xFDE0, 0xFDE1, 0xFDE2, 0xFDE3, 0xFDE4, 0xFDE5, 0xFDE6, 0xFDE7,
    0xFDE8, 0xFDE9, 0xFDEA, 0xFDEB, 0xFDEC, 0xFDED, 0xFDEE, 0xFDEF,
    // Others
    0x000B,
    0xFFFE, 0xFFFF,
    0x1FFFE, 0x1FFFF,
    0x2FFFE, 0x2FFFF,
    0x3FFFE, 0x3FFFF,
    0x4FFFE, 0x4FFFF,
    0x5FFFE, 0x5FFFF,
    0x6FFFE, 0x6FFFF,
    0x7FFFE, 0x7FFFF,
    0x8FFFE, 0x8FFFF,
    0x9FFFE, 0x9FFFF,
    0xAFFFE, 0xAFFFF,
    0xBFFFE, 0xBFFFF,
    0xCFFFE, 0xCFFFF,
    0xDFFFE, 0xDFFFF,
    0xEFFFE, 0xEFFFF,
    0xFFFFE, 0xFFFFF,
    0x10FFFE, 0x10FFFF,
];
lazy_static! {
    // Matches a candidate character reference: `&` followed by a decimal
    // number, a hexadecimal number, or a name of 1 to 32 characters, each
    // optionally terminated by `;`. Group 1 captures everything after `&`.
    //
    // Digits are spelled `[0-9]` rather than `\d` on purpose: in the `regex`
    // crate `\d` is Unicode-aware and also matches non-ASCII decimal digits.
    // Such input cannot be parsed as a number afterwards, so it must not be
    // recognised as a numeric reference in the first place.
    static ref CHAR_REF: Regex = Regex::new(concat!(
        r"&(#[0-9]+;?",
        r"|#[xX][0-9a-fA-F]+;?",
        r"|[^\t\n\f <&#;]{1,32};?)",
    ))
    .unwrap();
}
/// Converts all named and numeric character references (e.g. `&gt;`, `&#62;`,
/// `&#x3e;`) in the string `text` to the corresponding Unicode characters.
///
/// This function uses the rules defined by the HTML 5 standard for both valid
/// and invalid character references, and the list of HTML 5 named character
/// references stored in `HTML5_ENTITIES`. Anything that is not recognised as
/// a character reference is returned unchanged.
///
/// The code is adapted from the Python standard library:
/// https://github.com/python/cpython/blob/main/Lib/html/__init__.py
///
/// An online decoder is available at https://mothereff.in/html-entities
///
/// # Examples
///
/// ```
/// use hurl::html;
///
/// let output = html::html_unescape("Foo &#xA9; bar &#x1D306; baz &#x2603; qux");
/// assert_eq!(output, "Foo © bar 𝌆 baz ☃ qux")
/// ```
pub fn html_unescape(text: &str) -> String {
    // Fast path: without an ampersand there is nothing to decode.
    if !text.contains('&') {
        return text.to_string();
    }
    CHAR_REF
        .replace_all(text, |caps: &Captures| {
            // Group 1 is the reference without its leading `&`.
            let s = &caps[1];
            match s.strip_prefix('#') {
                Some(number) => unescape_numeric_charref(number),
                None => unescape_named_charref(s),
            }
        })
        .into_owned()
}

/// Decodes a numeric character reference given without its `&#` prefix
/// (for instance `65;`, `x41` or `X41;`).
fn unescape_numeric_charref(number: &str) -> String {
    let number = number.trim_end_matches(';');
    let parsed = match number.strip_prefix(|c: char| c == 'x' || c == 'X') {
        Some(hex) => u32::from_str_radix(hex, 16),
        None => number.parse::<u32>(),
    };
    let num = match parsed {
        Ok(num) => num,
        // Parsing fails, for example, when the value does not fit in a `u32`.
        Err(_) => return "\u{FFFD}".to_string(),
    };
    if let Some(replacement) = INVALID_CHAR_REF.get(&num) {
        return replacement.to_string();
    }
    if (0xD800..=0xDFFF).contains(&num) || num > 0x10FFFF {
        return "\u{FFFD}".to_string();
    }
    if INVALID_CODEPOINTS.contains(&num) {
        return String::new();
    }
    // Surrogates and out-of-range values were rejected above, so the
    // conversion succeeds; the fallback only avoids a panicking `unwrap`.
    char::from_u32(num)
        .map(String::from)
        .unwrap_or_else(|| "\u{FFFD}".to_string())
}

/// Decodes a named character reference given without its leading `&`
/// (for instance `amp;` or `notit;`).
fn unescape_named_charref(s: &str) -> String {
    if let Some(entity) = HTML5_ENTITIES.get(s) {
        return entity.to_string();
    }
    // Find the longest matching name (as defined by the standard).
    // The name may contain multi-byte characters, so only split on character
    // boundaries: slicing in the middle of a character would panic.
    for x in (1..s.len()).rev().filter(|&x| s.is_char_boundary(x)) {
        let (name, rest) = s.split_at(x);
        if let Some(entity) = HTML5_ENTITIES.get(name) {
            return format!("{entity}{rest}");
        }
    }
    format!("&{s}")
}
#[cfg(test)]
mod tests {
    use super::html_unescape;

    /// Asserts that unescaping `text` yields `expected`.
    fn check(text: &str, expected: &str) {
        assert_eq!(html_unescape(text), expected.to_string())
    }

    /// Checks the decimal and hexadecimal references to `num`, each with and
    /// without the terminating semicolon.
    fn check_num(num: u64, expected: &str) {
        check(&format!("&#{num}"), expected);
        check(&format!("&#{num};"), expected);
        check(&format!("&#x{num:x}"), expected);
        check(&format!("&#x{num:x};"), expected);
    }

    /// Extracts from Python test suites: https://github.com/python/cpython/blob/main/Lib/test/test_html.py
    #[test]
    fn test_html_unescape() {
        check("Hurl&rlarr;", "Hurl⇄");
        // Check simple
        check(
            "Foo &#xA9; bar &#x1D306; baz &#x2603; qux",
            "Foo © bar 𝌆 baz ☃ qux",
        );
        // Check text with no character references
        check("no character references", "no character references");
        // Check & followed by invalid chars
        check("&\n&\t& &&", "&\n&\t& &&");
        // Check & followed by numbers and letters
        check("&0 &9 &a &0; &9; &a;", "&0 &9 &a &0; &9; &a;");
        // Check incomplete entities at the end of the string
        for x in ["&", "&#", "&#x", "&#X", "&#y", "&#xy", "&#Xy"].iter() {
            check(x, x);
            check(&format!("{x};"), &format!("{x};"))
        }
        // See Python #12888
        check(&"&#123; ".repeat(1050), &"{ ".repeat(1050));
    }

    /// Checks several combinations of numeric character references, possibly
    /// followed by different characters.
    #[test]
    fn test_html_unescape_numeric() {
        // Every spelling of a numeric reference exercised by the test suite.
        let spellings: [fn(u32) -> String; 14] = [
            // Format &#1234 (without ending semi-colon)
            |n: u32| format!("&#{n}"),
            // Format &#0001234 (without ending semi-colon)
            |n: u32| format!("&#{n:07}"),
            // Format &#1234;
            |n: u32| format!("&#{n};"),
            // Format &#0001234;
            |n: u32| format!("&#{n:07};"),
            // Format &#x1abc
            |n: u32| format!("&#x{n:x}"),
            // Format &#x001abc
            |n: u32| format!("&#x{n:06x}"),
            // Format &#x1abc;
            |n: u32| format!("&#x{n:x};"),
            // Format &#x001abc;
            |n: u32| format!("&#x{n:06x};"),
            // Format &#x1ABC
            |n: u32| format!("&#x{n:X}"),
            // Format &#x001ABC
            |n: u32| format!("&#x{n:06X}"),
            // Format &#x1ABC;
            |n: u32| format!("&#x{n:X};"),
            // Format &#x001ABC;
            |n: u32| format!("&#x{n:06X};"),
            // Format &#X1abc;
            |n: u32| format!("&#X{n:x};"),
            // Format &#X001abc;
            |n: u32| format!("&#X{n:06x};"),
        ];
        let samples: [(u32, char); 6] = [
            (65, 'A'),
            (97, 'a'),
            (34, '"'),
            (38, '&'),
            (0x2603, '\u{2603}'),
            (0x101234, '\u{101234}'),
        ];
        for spell in spellings.iter() {
            for (num, char) in samples.iter() {
                let reference = spell(*num);
                check(&reference, &format!("{char}"));
                check(&format!("{reference} "), &format!("{char} "));
                check(&format!("{reference}X"), &format!("{char}X"));
            }
        }
    }

    #[test]
    fn test_html_unescape_invalid_numeric() {
        // Check invalid code points
        for cp in [0xD800, 0xDB00, 0xDC00, 0xDFFF, 0x110000] {
            check_num(cp, "\u{FFFD}");
        }
        // Check more invalid code points
        for cp in [0x1, 0xb, 0xe, 0x7f, 0xfffe, 0xffff, 0x10fffe, 0x10ffff] {
            check_num(cp, "");
        }
        // Check invalid numbers
        for (num, ch) in [(0x0d, "\r"), (0x80, "\u{20ac}"), (0x95, "\u{2022}")] {
            check_num(num, ch)
        }
        // Check small numbers
        check_num(0, "\u{FFFD}");
        check_num(9, "\t");
        // Check a big number
        check_num(1000000000000000000, "\u{FFFD}");
    }

    #[test]
    fn test_html_unescape_named() {
        // Check that multiple trailing semicolons are handled correctly
        for e in ["&quot;;", "&#34;;", "&#x22;;", "&#X22;;"] {
            check(e, "\";")
        }
        // Check that semicolons in the middle don't create problems
        for e in ["&quot;quot;", "&#34;quot;", "&#x22;quot;", "&#X22;quot;"] {
            check(e, "\"quot;")
        }
        // Check triple adjacent charrefs
        for e in ["&quot", "&#34", "&#x22", "&#X22"] {
            // NOTE(review): the unterminated variant from the Python suite is
            // disabled here; the reason is not recorded - confirm and re-enable.
            // check(&e.repeat(3), "\"\"\"");
            check(&format!("{};", e).repeat(3), "\"\"\"")
        }
        // Check that the case is respected
        for e in ["&amp", "&amp;", "&AMP", "&AMP;"] {
            check(e, "&")
        }
        for e in ["&Amp", "&Amp;"] {
            check(e, e)
        }
        // Check that non-existent named entities are returned unchanged
        check("&svadilfari;", "&svadilfari;");
        // The following examples are in the html5 specs
        check("&notit", "¬it");
        check("&notit;", "¬it;");
        check("&notin", "¬in");
        // U+2209 NOT AN ELEMENT OF
        check("&notin;", "\u{2209}");
        // A similar example with a long name
        check(
            "&notReallyAnExistingNamedCharacterReference;",
            "¬ReallyAnExistingNamedCharacterReference;",
        );
        // Longest valid name (U+2233 ANTICLOCKWISE CONTOUR INTEGRAL)
        check("&CounterClockwiseContourIntegral;", "\u{2233}");
        // Check a charref that maps to two unicode chars
        check("&acE;", "\u{223e}\u{333}");
        check("&acE", "&acE");
        // See Python #15156
        check(
            "&Eacuteric&Eacute;ric&alphacentauri&alpha;centauri",
            "ÉricÉric&alphacentauriαcentauri",
        );
        check("&co;", "&co;")
    }
}

View File

@ -18,6 +18,7 @@
#![cfg_attr(feature = "strict", deny(warnings))]
pub mod cli;
pub mod html;
pub mod http;
pub mod json;
pub mod jsonpath;

View File

@ -15,6 +15,7 @@
* limitations under the License.
*
*/
use crate::html;
use crate::runner::template::eval_template;
use crate::runner::{Error, RunnerError, Value};
use hurl_core::ast::{Filter, FilterValue, RegexValue, SourceInfo};
@ -158,9 +159,8 @@ fn eval_url_decode(value: &Value, source_info: &SourceInfo) -> Result<Value, Err
fn eval_html_encode(value: &Value, source_info: &SourceInfo) -> Result<Value, Error> {
match value {
Value::String(value) => {
let mut enco = String::from(value);
let encoded = html_escape::encode_text_to_string(value, &mut enco);
Ok(Value::String(encoded.to_string()))
let encoded = html::html_escape(value);
Ok(Value::String(encoded))
}
v => Err(Error {
source_info: source_info.clone(),
@ -173,7 +173,7 @@ fn eval_html_encode(value: &Value, source_info: &SourceInfo) -> Result<Value, Er
fn eval_html_decode(value: &Value, source_info: &SourceInfo) -> Result<Value, Error> {
match value {
Value::String(value) => {
let decoded = html_escape::decode_html_entities(value).to_string();
let decoded = html::html_unescape(value);
Ok(Value::String(decoded))
}
v => Err(Error {
@ -418,4 +418,54 @@ pub mod tests {
RunnerError::FilterInvalidInput("bool <true>".to_string())
);
}
/// Runs the `HtmlEscape` filter over string values and compares each result
/// with the expected escaped text.
#[test]
pub fn eval_filter_html_escape() {
    let cases = [
        ("foo", "foo"),
        ("<tag>", "&lt;tag&gt;"),
        ("foo & bar", "foo &amp; bar"),
        (
            "string with double quote: \"baz\"",
            "string with double quote: &quot;baz&quot;",
        ),
    ];
    let escape_filter = Filter {
        source_info: SourceInfo::new(1, 1, 1, 1),
        value: FilterValue::HtmlEscape,
    };
    let no_variables = HashMap::new();
    for (raw, escaped) in cases.iter() {
        let input = Value::String(raw.to_string());
        let actual = eval_filter(&escape_filter, &input, &no_variables).unwrap();
        assert_eq!(actual, Value::String(escaped.to_string()));
    }
}
/// Runs the `HtmlUnescape` filter over string values and compares each result
/// with the expected decoded text.
#[test]
pub fn eval_filter_html_unescape() {
    let cases = [
        ("foo", "foo"),
        ("&lt;tag&gt;", "<tag>"),
        ("foo &amp; bar", "foo & bar"),
        (
            "string with double quote: &quot;baz&quot;",
            "string with double quote: \"baz\"",
        ),
    ];
    let unescape_filter = Filter {
        source_info: SourceInfo::new(1, 1, 1, 1),
        value: FilterValue::HtmlUnescape,
    };
    let no_variables = HashMap::new();
    for (escaped, decoded) in cases.iter() {
        let input = Value::String(escaped.to_string());
        let actual = eval_filter(&unescape_filter, &input, &no_variables).unwrap();
        assert_eq!(actual, Value::String(decoded.to_string()));
    }
}
}