Initial commit

This commit is contained in:
Dan Neumann 2022-05-16 20:28:13 -05:00
commit f953cc6ee4
8 changed files with 6923 additions and 0 deletions

3
.prettierrc.json Normal file
View File

@ -0,0 +1,3 @@
{
"tabWidth": 4
}

52
README.md Normal file
View File

@ -0,0 +1,52 @@
# elm-html-parser
A lenient html5 parser implemented with [Elm](https://elm-lang.org).
A lenient alternative to [hecrj/html-parser](https://package.elm-lang.org/packages/hecrj/html-parser/latest/).
## Goals
- **Leniency**
- Avoids validating while parsing
- Prefers to imitate browser parsing behavior rather than follow the html5 spec.
- Prefers to use the html5 spec only to handle ambiguous cases rather than to prohibit invalid html5
- Prefers to fall back to text nodes than short-circuit with parse errors
- **Handle user-written html**
- Users don't write character entities like `&amp;` and `&lt;`. This parser should strive to handle cases like `<p><:</p>` -> `Element "p" [] [ Text "<:" ]`.
## Features / Quirks
- Characters don't need to be escaped into entities.
e.g. `<div><:</div>` will parse correctly and doesn't need to be rewritten into `<div>&lt;:</div>`.
- Tags that should not nest are autoclosed.
e.g. `<p>a<p>b` -> `<p>a</p><p>b</p>`.
- Closing tags that have no matching open tags are ignored.
e.g. `</a><div></div></div></b>` -> `<div></div>`
- Ignores comments in whitespace positions:
e.g. `<div <!--comment-->/>` -> `<div/>`
- Parses comments in text node positions:
e.g. `<div><!--comment--></div>` ->
`Element "div" [] [ Comment "comment" ]`
## Differences from existing packages
Currently, there is only one html parser published to Elm packages: [hecrj/html-parser](https://package.elm-lang.org/packages/hecrj/html-parser/latest/).
@hecrj has said that following the html5 spec is a goal of their parser, so their parser is stricter by design and rejects invalid html5.
## Development
`git clone` and `npm install`.
- `npm test` to run tests
- `npm run docs` to preview docs locally
## Special thanks
- @hecrj and their contributors.
- @ymtszw for their work on the Javascript `<script>` parser.

20
elm.json Normal file
View File

@ -0,0 +1,20 @@
{
"type": "package",
"name": "danneu/html-parser",
"summary": "Leniently parse html5 documents and fragments",
"license": "MIT",
"version": "1.0.0",
"exposed-modules": [
"Html.Parser"
],
"elm-version": "0.19.1 <= v < 0.20.0",
"dependencies": {
"elm/core": "1.0.5 <= v < 2.0.0",
"elm/html": "1.0.0 <= v < 2.0.0",
"elm/parser": "1.1.0 <= v < 2.0.0",
"rtfeldman/elm-hex": "1.0.0 <= v < 2.0.0"
},
"test-dependencies": {
"elm-explorations/test": "1.2.2 <= v < 2.0.0"
}
}

3047
package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

11
package.json Normal file
View File

@ -0,0 +1,11 @@
{
"private": true,
"devDependencies": {
"elm-doc-preview": "^5.0.5",
"elm-test": "^0.19.1-revision7"
},
"scripts": {
"test": "elm-test",
"docs": "elm-doc-preview"
}
}

2140
src/Html/CharRefs.elm Normal file

File diff suppressed because it is too large Load Diff

957
src/Html/Parser.elm Normal file
View File

@ -0,0 +1,957 @@
module Html.Parser exposing
( Node(..), Document
, run, runDocument
, nodeToHtml, nodesToHtml, nodeToString, nodesToString, nodeToPrettyString, nodesToPrettyString, documentToString, documentToPrettyString
)
{-| Leniently parse html5 documents and fragments and then render them
into strings or Elm's virtual dom nodes.
# Definition
@docs Node, Document
# Parsing
@docs run, runDocument
# Render
@docs nodeToHtml, nodesToHtml, nodeToString, nodesToString, nodeToPrettyString, nodesToPrettyString, documentToString, documentToPrettyString
-}
import Hex
import Html
import Html.Attributes
import Html.CharRefs
import Parser exposing (..)
{-| An html node is a tree of text, comments, and element nodes.

  - `Text` holds the content of a text node.
  - `Comment` holds the text found between `<!--` and `-->`.
  - `Element` holds the tag name, a list of `( name, value )` attribute pairs, and the child nodes.

An element (e.g. `<div foo="bar">hello</div>`) can have attributes and child nodes.

-}
type Node
    = Text String
    | Comment String
    | Element String (List ( String, String )) (List Node)
{-| Parse an html fragment into a list of html nodes.

The fragment may contain any number of top-level nodes.

    run "<div>hi</div><div>bye</div>"
        == Ok
            [ Element "div" [] [ Text "hi" ]
            , Element "div" [] [ Text "bye" ]
            ]

-}
run : String -> Result (List DeadEnd) (List Node)
run input =
    input
        |> Parser.run parseAll
{-| An html document has a `<!doctype>` and then a root html node.

  - `legacyCompat` is the flag produced by the doctype parser: `True` when the
    doctype carries the `SYSTEM "about:legacy-compat"` string, `False` otherwise.
  - `root` is the single root node of the document.

-}
type alias Document =
    { legacyCompat : Bool
    , root : Node
    }
{-| Like `Parser.token`, but every character of the token is compared
after lowercasing both sides, so `"DOCTYPE"` also matches `"doctype"`.
-}
caseInsensitiveToken : String -> Parser ()
caseInsensitiveToken string =
    let
        matchChar : Char -> Parser ()
        matchChar expected =
            oneOf
                [ chompIf (\actual -> Char.toLower actual == Char.toLower expected)
                , problem ("expected case-insensitive char '" ++ String.fromChar expected ++ "'")
                ]

        -- Extend the parser built so far with a matcher for the next character.
        appendChar : Char -> Parser () -> Parser ()
        appendChar expected soFar =
            soFar
                |> andThen (\_ -> matchChar expected)
    in
    String.foldl appendChar (succeed ()) string
{-| Parse the legacy part of a doctype: whitespace, `SYSTEM`, whitespace and
then `about:legacy-compat` wrapped in matching single or double quotes.
Always yields `True` on success.
-}
doctypeLegacy : Parser Bool
doctypeLegacy =
    -- https://html.spec.whatwg.org/multipage/syntax.html#doctype-legacy-string
    let
        openingQuote : Parser String
        openingQuote =
            getChompedString
                (oneOf
                    [ token "\""
                    , token "'"
                    ]
                )

        systemThenQuote : Parser String
        systemThenQuote =
            succeed identity
                |. chompOneOrMore isSpace
                |. caseInsensitiveToken "SYSTEM"
                |. chompOneOrMore isSpace
                |= openingQuote

        -- The closing quote must be the same character as the opening one.
        uriThenClosingQuote : String -> Parser ()
        uriThenClosingQuote quote =
            succeed ()
                |. token "about:legacy-compat"
                |. token quote
    in
    systemThenQuote
        |> andThen uriThenClosingQuote
        |> map (always True)
{-| Parse `<!DOCTYPE html>` (case-insensitive), optionally followed by the
legacy-compat string. The result is `True` only when the legacy string was present.
-}
doctype : Parser Bool
doctype =
    -- https://html.spec.whatwg.org/multipage/syntax.html#the-doctype
    let
        legacyCompat : Parser Bool
        legacyCompat =
            oneOf
                [ backtrackable doctypeLegacy
                , succeed False
                ]
    in
    succeed identity
        |. token "<!"
        |. caseInsensitiveToken "DOCTYPE"
        |. chompOneOrMore isSpace
        |. caseInsensitiveToken "html"
        |= legacyCompat
        |. chompWhile isSpace
        |. token ">"
{-| Parse `<!doctype html>` followed by any html nodes.

The result always has a single root node: the nodes are wrapped in a root
`<html>` node when one is not already present.

**Caveat**: If there are multiple top-level nodes and one of them is `<html>`, then this
function will wrap them all in another `<html>` node.

-}
runDocument : String -> Result (List DeadEnd) Document
runDocument input =
    input
        |> Parser.run document
{-| Parse a doctype, optional whitespace/comments, and then every remaining node.

The nodes are collected with `parseAll`, so adjacent text nodes are merged
exactly as they are for `run`. A lone top-level `<html>` element becomes the
root as-is; anything else (no nodes, a single other node, or several nodes) is
wrapped in a fresh attribute-less `<html>` element.

-}
document : Parser Document
document =
    let
        toRoot : List Node -> Node
        toRoot nodes =
            case nodes of
                [ (Element "html" _ _) as root ] ->
                    root

                _ ->
                    Element "html" [] nodes
    in
    succeed Document
        |= doctype
        |. ws
        |= map toRoot parseAll
{-| Parse nodes until `node` no longer succeeds, merging adjacent text nodes
along the way. The accumulator is kept in reverse order and flipped at the end.
-}
parseAll : Parser (List Node)
parseAll =
    let
        step : List Node -> Parser (Step (List Node) (List Node))
        step reversed =
            oneOf
                [ map (\next -> Loop (mergeText next reversed)) node
                , succeed (Done (List.reverse reversed))
                ]
    in
    loop [] step
{-| Push a node onto a reversed accumulator. When both the new node and the
head of the accumulator are text nodes, they are combined into one text node
(head text first, new text second) instead of being stored separately.
-}
mergeText : Node -> List Node -> List Node
mergeText n nodes =
    case nodes of
        (Text prev) :: rest ->
            case n of
                Text s ->
                    Text (prev ++ s) :: rest

                _ ->
                    n :: nodes

        _ ->
            n :: nodes
{-| Chomps zero or more space characters or html comments.

The loop is driven by `ifProgress`, so it stops as soon as an iteration
consumes nothing. It therefore always succeeds, possibly without chomping.

-}
ws : Parser ()
ws =
    loop 0 <|
        ifProgress <|
            oneOf
                [ multiComment "<!--" "-->" Nestable
                , chompWhile isSpace
                ]
{-| True for the characters this parser treats as whitespace: space, tab,
line feed, form feed, carriage return, and the non-breaking space U+00A0.
-}
isSpace : Char -> Bool
isSpace c =
    c == ' ' || c == '\t' || c == '\n' || c == '\u{000C}' || c == '\u{000D}' || c == '\u{00A0}'
-- ATTRIBUTES
{-| Parse an unquoted attribute value such as the `bar` in `<div foo=bar>`.

The value is one or more chunks, each either a run of ordinary characters or
a character reference. Whitespace and the characters in `forbidden` end the value.

-}
attributeValueUnquoted : Parser String
attributeValueUnquoted =
    let
        forbidden : List Char
        forbidden =
            [ '"', '\'', '=', '<', '>', '`', '&' ]

        isUnquotedValueChar : Char -> Bool
        isUnquotedValueChar c =
            not (isSpace c || List.member c forbidden)

        plainChunk : Parser String
        plainChunk =
            getChompedString (chompOneOrMore isUnquotedValueChar)
    in
    oneOrMore "attribute value" (oneOf [ plainChunk, characterReference ])
        |> map String.concat
{-| Parse an attribute value wrapped in the given quote character.

The content between the quotes may be empty. It is made of runs of ordinary
characters and character references, concatenated into a single string.

-}
attributeValueQuoted : Char -> Parser String
attributeValueQuoted quote =
    let
        isQuote : Char -> Bool
        isQuote c =
            c == quote

        plainChunk : Parser String
        plainChunk =
            getChompedString (chompOneOrMore (\c -> not (isQuote c || c == '&')))

        contents : Parser String
        contents =
            zeroOrMore (oneOf [ plainChunk, characterReference ])
                |> map String.concat
    in
    succeed identity
        |. chompIf isQuote
        |= contents
        |. chompIf isQuote
{-| Parse an attribute name.

A single leading `/` is consumed and discarded. A single leading `=` is kept as
the first character of the returned name. The remainder is one or more
characters that are not whitespace, a quote, `>`, `/` or `=`, and it is
lowercased before being returned.

-}
attributeKey : Parser String
attributeKey =
    let
        isKeyChar c =
            not (isSpace c) && c /= '"' && c /= '\'' && c /= '>' && c /= '/' && c /= '='
    in
    succeed (++)
        -- Attribute can start with '/' but it's ignored
        |. oneOf
            [ -- backtrackable because open tag can end with "/>"
              backtrackable (chompIf ((==) '/'))
            , succeed ()
            ]
        -- Attribute name can start with '=': https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
        -- e.g. <a =empty />
        |= oneOf
            [ chompIf ((==) '=') |> map (\_ -> "=")
            , succeed ""
            ]
        |= (chompOneOrMore isKeyChar
                |> getChompedString
                |> map String.toLower
           )
{-| Parse one attribute into a `( name, value )` pair.

The value may be unquoted, double-quoted or single-quoted. An attribute with
no `=` gets the empty string as its value. Whitespace/comments (see `ws`) are
allowed around the `=` and are consumed after the attribute.

-}
attribute : Parser ( String, String )
attribute =
    succeed Tuple.pair
        |= attributeKey
        |. ws
        |= oneOf
            [ succeed identity
                |. symbol "="
                |. ws
                |= oneOf
                    [ attributeValueUnquoted -- <div foo=bar>
                    , attributeValueQuoted '"' -- <div foo="bar">
                    , attributeValueQuoted '\'' -- <div foo='bar'>
                    ]
            , succeed "" -- <div foo>
            ]
        -- Reminder: Consume trailing whitespace so that following parsers don't need to consume whitespace
        -- and then need to backtrack
        |. ws
{-| Parse a tag name: one or more alphanumeric characters or hyphens,
returned in lowercase.
-}
tagName : Parser String
tagName =
    let
        isTagChar : Char -> Bool
        isTagChar c =
            c == '-' || Char.isAlphaNum c
    in
    map String.toLower (getChompedString (chompOneOrMore isTagChar))
{-| Parse a closing tag and require its (lowercased) name to equal `expectedTag`.
A closing tag with any other name is reported as a problem.
-}
closeTag : String -> Parser ()
closeTag expectedTag =
    let
        closingTagName : Parser String
        closingTagName =
            succeed identity
                |. token "</"
                |= tagName
                |. ws
                |. token ">"

        verify : String -> Parser ()
        verify tag =
            if tag /= expectedTag then
                problem ("found closing tag </" ++ tag ++ "> but expected </" ++ expectedTag ++ ">")

            else
                succeed ()
    in
    andThen verify closingTagName
{-| How an opening tag was terminated: `NoClose` for a plain `>`,
`SelfClose` for `/>`.
-}
type OpenTagEnd
    = NoClose
    | SelfClose
{-| Parse a closing tag with any name and discard it.
-}
anyCloseTag : Parser ()
anyCloseTag =
    token "</"
        |. tagName
        |. ws
        |. token ">"
{-| Parse a single node.

Any closing tags at the current position are skipped first. Then text, a
comment and an element are tried in that order; if none of them match, the
next single character is returned as a text node.

-}
node : Parser Node
node =
    succeed identity
        -- HACK: Ignore unmatched close tags like the browser does
        |. zeroOrMore (backtrackable anyCloseTag)
        |= oneOf
            [ text
            , comment
            , backtrackable element

            -- Fallback so that stray characters such as a lone "<" become text
            , justOneChar |> map Text
            ]
{-| Parse `<!--...-->` into a `Comment` holding everything between the delimiters.
-}
comment : Parser Node
comment =
    let
        body : Parser String
        body =
            getChompedString (chompUntil "-->")
    in
    map Comment
        (succeed identity
            |. symbol "<!--"
            |= body
            |. symbol "-->"
        )
{-| Parse a text node: either one character reference, or a run of one or more
characters containing neither `<` nor `&`.
-}
text : Parser Node
text =
    let
        plainText : Parser String
        plainText =
            getChompedString (chompOneOrMore (\c -> not (c == '<' || c == '&')))
    in
    oneOf
        [ map Text (backtrackable characterReference)
        , map Text plainText
        ]
{-| Parse any node unless it's one of the given tags.

An element is only parsed after `lookAhead openTag` has confirmed that an
opening tag follows and that its name is not in `tags`. Otherwise text and
comments are tried.

-}
notNode : List String -> Parser Node
notNode tags =
    oneOf
        [ lookAhead
            (openTag
                |> andThen
                    (\( tag, _, _ ) ->
                        if List.member tag tags then
                            -- Excluded tag: make the lookahead fail so no element is parsed here
                            problem ""

                        else
                            succeed ()
                    )
            )
            |> andThen (\_ -> element)
        , text
        , comment
        ]
{-| Parse an opening tag into its name, its attributes and the way it ended
(`>` or `/>`).
-}
openTag : Parser ( String, List ( String, String ), OpenTagEnd )
openTag =
    let
        tagEnd : Parser OpenTagEnd
        tagEnd =
            oneOf
                [ map (always NoClose) (symbol ">")
                , map (always SelfClose) (symbol "/>")
                ]
    in
    succeed (\name attrs end -> ( name, attrs, end ))
        |. symbol "<"
        |. ws
        |= tagName
        |. ws
        |= zeroOrMore attribute
        |. ws
        |= tagEnd
{-| Parse an opening tag and then, depending on the tag, its children and closing tag.

  - A self-closed tag (`/>`) or a void tag (see `isVoidTag`) yields an element with no children.
  - `script` delegates its content to `consumeJavascriptUntilClosingTag`.
  - An autoclosing tag (see `isAutoclosingTag`) collects children with `notNode`
    and treats its own closing tag as optional.
  - Any other tag collects nodes until its closing tag, or until nothing more parses.

-}
element : Parser Node
element =
    openTag
        |> andThen
            (\( tag, attrs, end ) ->
                case end of
                    SelfClose ->
                        succeed (Element tag attrs [])

                    NoClose ->
                        if tag == "script" then
                            succeed (Element tag attrs)
                                |= consumeJavascriptUntilClosingTag

                        else if isVoidTag tag then
                            -- Void element expects no closing tag
                            succeed (Element tag attrs [])

                        else if isAutoclosingTag tag then
                            -- Autoclosing tag is automatically closed by an opening tag of the same name
                            succeed (Element tag attrs)
                                |= oneOf
                                    [ succeed identity
                                        |= zeroOrMore
                                            -- An unclosed <head> is additionally ended by <body>
                                            (if tag == "head" then
                                                notNode [ tag, "body" ]

                                             else
                                                notNode [ tag ]
                                            )
                                        |. oneOf
                                            [ backtrackable (closeTag tag)
                                            , succeed ()
                                            ]
                                    ]

                        else
                            -- Normal elements parse all nodes as children until their closing tag
                            succeed (Element tag attrs)
                                |= (loop [] <|
                                        \acc ->
                                            oneOf
                                                [ backtrackable (closeTag tag) |> map (\_ -> Done (List.reverse acc))
                                                , succeed (\n -> Loop (mergeText n acc))
                                                    |= backtrackable node

                                                -- Neither the closing tag nor another node parsed: end the element here
                                                , succeed () |> map (\_ -> Done (List.reverse acc))
                                                ]
                                   )
            )
-- CHARACTER REFERENCE
{-| Parse one or more hexadecimal digits (either case) into an integer.
A conversion error from `Hex.fromString` is reported as a parser problem.
-}
base16 : Parser Int
base16 =
    let
        toInt : String -> Parser Int
        toInt hex =
            case Hex.fromString (String.toLower hex) of
                Err msg ->
                    problem msg

                Ok num ->
                    succeed num
    in
    getChompedString (chompOneOrMore Char.isHexDigit)
        |> andThen toInt
{-| Parse one or more decimal digits into an integer, reporting "bad number"
when `String.toInt` cannot convert them.
-}
base10 : Parser Int
base10 =
    let
        toInt : String -> Parser Int
        toInt digits =
            case String.toInt digits of
                Just num ->
                    succeed num

                Nothing ->
                    problem "bad number"
    in
    getChompedString (chompOneOrMore Char.isDigit)
        |> andThen toInt
{-| Parse the part of a numeric character reference that follows `&`:
a `#`, then either `x`/`X` and hexadecimal digits, or decimal digits.
The terminating `;` is not consumed here.

The code point 0 and the surrogate range 0xD800-0xDFFF are replaced with the
Unicode replacement character U+FFFD. Every other code point goes through
`Char.fromCode`.

-}
numericCharacterReference : Parser String
numericCharacterReference =
    let
        -- Written as an escape so the literal survives any re-encoding of this file.
        replacementChar : Char
        replacementChar =
            '\u{FFFD}'

        codepoint : Parser Int
        codepoint =
            oneOf
                [ succeed identity
                    |. chompIf (\c -> c == 'x' || c == 'X')
                    |= base16
                , base10
                ]

        -- https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
        toChar : Int -> Char
        toChar code =
            if code == 0 || (0xD800 <= code && code <= 0xDFFF) then
                replacementChar

            else
                Char.fromCode code
    in
    succeed identity
        |. chompIf ((==) '#')
        |= map (toChar >> String.fromChar) codepoint
{-| Parse the alphabetic name of a character reference (the part after `&`)
and decode it with `Html.CharRefs.decode`. A name that does not decode is
returned as `&name;`.
-}
namedCharacterReference : Parser String
namedCharacterReference =
    let
        decodeOrKeep : String -> String
        decodeOrKeep ref =
            case Html.CharRefs.decode ref of
                Just decoded ->
                    decoded

                Nothing ->
                    "&" ++ ref ++ ";"
    in
    map decodeOrKeep (getChompedString (chompOneOrMore Char.isAlpha))
{-| Parse a character reference starting at `&`.

A numeric reference is tried first, then a named reference; each must end with
`;`. When neither matches, only the `&` is consumed and `"&"` is returned.

-}
characterReference : Parser String
characterReference =
    succeed identity
        |. chompIf ((==) '&')
        |= oneOf
            [ backtrackable numericCharacterReference
                |. chompIf ((==) ';')
            , backtrackable namedCharacterReference
                |. chompIf ((==) ';')

            -- Not a reference after all: treat the ampersand as literal text
            , succeed "&"
            ]
-- SPECIAL ELEMENTS
{-| True when the tag name is listed in `voidTags`.
-}
isVoidTag : String -> Bool
isVoidTag tag =
    List.any (\candidate -> candidate == tag) voidTags
{-| Lowercase names of the tags treated as void elements.
-}
voidTags : List String
voidTags =
    [ "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "source", "track", "wbr" ]
{-| True when the tag name is listed in `autoclosingTags`.
-}
isAutoclosingTag : String -> Bool
isAutoclosingTag tag =
    List.any (\candidate -> candidate == tag) autoclosingTags
{-| Lowercase names of the tags treated as autoclosing elements.
-}
autoclosingTags : List String
autoclosingTags =
    [ "body", "colgroup", "dd", "dt", "head", "html", "li", "option", "p", "tbody", "td", "tfoot", "th", "thead", "tr" ]
-- HELPERS
{-| Chomp at least one character satisfying the predicate, then as many more
as follow. Fails when the first character does not satisfy it.
-}
chompOneOrMore : (Char -> Bool) -> Parser ()
chompOneOrMore predicate =
    succeed ()
        |. chompIf predicate
        |. chompWhile predicate
{-| Loop step that keeps going only while the parser consumes input.

Parsers such as `spaces` and `chompWhile` succeed without consuming anything,
so looping on them directly would never terminate. This step compares the
offset before and after running the parser and finishes when it is unchanged.

-}
ifProgress : Parser a -> Int -> Parser (Step Int ())
ifProgress parser offset =
    let
        decide : Int -> Step Int ()
        decide newOffset =
            if newOffset /= offset then
                Loop newOffset

            else
                Done ()
    in
    map decide
        (succeed identity
            |. parser
            |= getOffset
        )
{-| Run the parser repeatedly until it stops matching and collect the results
in order. Succeeds with an empty list when it never matches.
-}
zeroOrMore : Parser a -> Parser (List a)
zeroOrMore parser =
    let
        step : List a -> Parser (Step (List a) (List a))
        step reversed =
            oneOf
                [ map (\val -> Loop (val :: reversed)) parser
                , succeed (Done (List.reverse reversed))
                ]
    in
    loop [] step
{-| Like `zeroOrMore`, but reports a problem mentioning `name` when the parser
does not match even once.
-}
oneOrMore : String -> Parser a -> Parser (List a)
oneOrMore name parser =
    let
        finish : List a -> Parser (Step (List a) (List a))
        finish reversed =
            case reversed of
                [] ->
                    problem ("expecting at least one " ++ name)

                _ ->
                    succeed (Done (List.reverse reversed))

        step : List a -> Parser (Step (List a) (List a))
        step reversed =
            oneOf
                [ map (\val -> Loop (val :: reversed)) parser
                , finish reversed
                ]
    in
    loop [] step
{-| Create a parser that backtracks on success.

The nested `oneOf` yields a parser as its value, which `andThen identity` then runs:

  - If `parser` matches, the first inner branch commits and then fails. That
    stops the inner `oneOf`, and the surrounding `backtrackable` lets the outer
    `oneOf` fall through to `succeed (succeed ())`, i.e. success with nothing consumed.
  - If `parser` does not match, the second inner branch hands back `parser`
    itself, which is run again and fails.

-}
lookAhead : Parser a -> Parser ()
lookAhead parser =
    oneOf
        [ oneOf
            [ parser
                |> backtrackable
                |> andThen (\_ -> commit ())
                |> andThen (\_ -> problem "")
            , succeed
                (parser
                    |> backtrackable
                    |> map (\_ -> ())
                )
            ]
            |> backtrackable
        , succeed (succeed ())
        ]
        |> andThen identity
-- JAVASCRIPT / <script>
{-| Chomp inside a <script> tag until the next </script>.

This can't be implemented as `chompUntil "</script>"` because
the Javascript inside the script tag may contain the string "</script>".

For example: "<script>alert('</script>')</script>"

The content is returned as a list of nodes: html comments become `Comment`
nodes and everything else is text, with consecutive text chunks merged.

-}
consumeJavascriptUntilClosingTag : Parser (List Node)
consumeJavascriptUntilClosingTag =
    Parser.loop [] <|
        \acc ->
            let
                accumulate newNode =
                    case ( acc, newNode ) of
                        ( [], first ) ->
                            Loop [ first ]

                        ( (Text accChunk) :: tail, Text newChunk ) ->
                            -- Merge top-most text node unless HTML comment nodes are interleaved
                            Loop (Text (accChunk ++ newChunk) :: tail)

                        ( previous :: tail, _ ) ->
                            Loop (newNode :: previous :: tail)
            in
            Parser.oneOf
                [ -- HTML comments are, albeit considered a bad practice recently,
                  -- allowed inside <script> to hide scripts from really ancient web browser
                  comment
                    |> map accumulate
                , lineComment "//"
                    |> Parser.getChompedString
                    |> Parser.map (Text >> accumulate)
                , Parser.multiComment "/*" "*/" Parser.NotNestable
                    |> Parser.getChompedString
                    |> Parser.map (Text >> accumulate)
                , javaScriptStringLike '"'
                    |> Parser.map (Text >> accumulate)
                , javaScriptStringLike '\''
                    |> Parser.map (Text >> accumulate)
                , javaScriptStringLike '`'
                    |> Parser.map (Text >> accumulate)

                -- backtrackable: a closing tag other than </script> (e.g. "</div>" in an
                -- inline template) must not abort the whole script; it falls through
                -- to the single-character branch below and is kept as text.
                , backtrackable (closeTag "script")
                    |> Parser.map (\() -> Done (List.reverse acc))
                , Parser.chompIf (always True)
                    |> Parser.getChompedString
                    |> Parser.map (Text >> accumulate)
                ]
{-| Parse a Javascript string-like literal delimited by `terminatorChar` and
return it with its delimiters put back on both ends.
-}
javaScriptStringLike : Char -> Parser String
javaScriptStringLike terminatorChar =
    let
        terminatorStr : String
        terminatorStr =
            String.fromChar terminatorChar

        -- The loop drops both delimiters, so restore the original shape.
        requote : String -> String
        requote chunk =
            terminatorStr ++ chunk ++ terminatorStr
    in
    succeed identity
        |. token terminatorStr
        |= loop "" (stringHelp terminatorChar terminatorStr)
        |> map requote
{-| One loop step for the body of a string-like literal.

A backslash plus the character after it is appended to the accumulator as-is,
the terminator ends the loop, and any other run of characters is appended.

-}
stringHelp : Char -> String -> String -> Parser (Parser.Step String String)
stringHelp terminatorChar terminatorStr acc =
    Parser.oneOf
        [ Parser.succeed (\char -> Parser.Loop (acc ++ "\\" ++ char))
            |. Parser.token "\\"
            |= justOneChar
        , Parser.token terminatorStr
            |> Parser.map (\_ -> Parser.Done acc)

        -- Orig code caused infinite loop with single terminator char <script>'</script>
        -- , Parser.chompWhile (\char -> char /= '\\' && char /= terminatorChar)
        , chompOneOrMore (\char -> char /= '\\' && char /= terminatorChar)
            |> Parser.getChompedString
            |> Parser.map (\chunk -> Parser.Loop (acc ++ chunk))
        ]
{-| Chomp exactly one character, whatever it is, and return it as a string.
-}
justOneChar : Parser String
justOneChar =
    getChompedString (chompIf (\_ -> True))
-- RENDER
{-| Render an opening tag such as `<a href="x" hidden>`.

The tag name and each attribute are separated by a single space. An attribute
with an empty value is rendered as its bare name. In non-empty values `&` and
`"` are escaped as `&amp;` and `&quot;` so that the value cannot end the
surrounding double quotes.

-}
openTagToString : String -> List ( String, String ) -> String
openTagToString tag attrs =
    let
        -- "&" must be replaced first so the "&" of "&quot;" is not escaped again.
        escapeValue : String -> String
        escapeValue value =
            value
                |> String.replace "&" "&amp;"
                |> String.replace "\"" "&quot;"

        attrToString : ( String, String ) -> String
        attrToString ( k, v ) =
            if String.isEmpty v then
                k

            else
                k ++ "=\"" ++ escapeValue v ++ "\""
    in
    String.join " " (("<" ++ tag) :: List.map attrToString attrs) ++ ">"
{-| Convert an html node into a non-pretty string.

    nodeToString (Element "a" [] [ Text "hi" ])
        == "<a>hi</a>"

Text is emitted as-is, comments are wrapped in `<!--` and `-->`, and a void
element without children is rendered as its opening tag only.

-}
nodeToString : Node -> String
nodeToString node_ =
    case node_ of
        Text s ->
            s

        Comment s ->
            "<!--" ++ s ++ "-->"

        Element tag attrs [] ->
            if isVoidTag tag then
                openTagToString tag attrs

            else
                openTagToString tag attrs ++ "</" ++ tag ++ ">"

        Element tag attrs kids ->
            String.concat
                [ openTagToString tag attrs
                , String.concat (List.map nodeToString kids)
                , "</"
                , tag
                , ">"
                ]
{-| Convert multiple html nodes into a non-pretty string by rendering each
with `nodeToString` and concatenating the results.

    nodesToString
        [ Element "a" [] [ Text "hi" ]
        , Element "div" [] [ Element "span" [] [] ]
        ]
        == "<a>hi</a><div><span></span></div>"

-}
nodesToString : List Node -> String
nodesToString nodes =
    String.concat (List.map nodeToString nodes)
{-| Turn a single node into an Elm html node that Elm can render.

Comments are rendered as empty text nodes.

-}
nodeToHtml : Node -> Html.Html msg
nodeToHtml node_ =
    let
        toAttribute : ( String, String ) -> Html.Attribute msg
        toAttribute ( k, v ) =
            Html.Attributes.attribute k v
    in
    case node_ of
        Element tag attrs kids ->
            Html.node tag
                (List.map toAttribute attrs)
                (List.map nodeToHtml kids)

        Text s ->
            Html.text s

        Comment _ ->
            Html.text ""
{-| Turn multiple html nodes into Elm html that Elm can render.

    view : Html Msg
    view =
        Html.div
            []
            ("<p>hello world</p>"
                |> Html.Parser.run
                |> Result.map Html.Parser.nodesToHtml
                |> Result.withDefault [ Html.text "parse error" ]
            )

-}
nodesToHtml : List Node -> List (Html.Html msg)
nodesToHtml nodes =
    List.map nodeToHtml nodes
{-| Generate a pretty string for a single html node, starting at indentation level 0.
-}
nodeToPrettyString : Node -> String
nodeToPrettyString node_ =
    node_
        |> prettyNode_ 0
{-| Render a node at the given indentation level (4 spaces per level).

Every rendered line is preceded by a newline and the padding for its level.
Whitespace-only text and empty comments render as the empty string.

-}
prettyNode_ : Int -> Node -> String
prettyNode_ indent node_ =
    let
        pad : String
        pad =
            String.repeat (indent * 4) " "

        isBlank : String -> Bool
        isBlank line =
            String.isEmpty (String.trim line)

        onOwnLine : String -> String
        onOwnLine line =
            "\n" ++ pad ++ line
    in
    case node_ of
        Text s ->
            -- Blank lines are dropped; whitespace-only text leaves no lines at all.
            String.split "\n" s
                |> List.filter (\line -> not (isBlank line))
                |> List.map onOwnLine
                |> String.concat

        Comment s ->
            if String.isEmpty s then
                ""

            else
                onOwnLine ("<!--" ++ s ++ "-->")

        Element tag attrs kids ->
            let
                opening : String
                opening =
                    onOwnLine (openTagToString tag attrs)

                closing : String
                closing =
                    "</" ++ tag ++ ">"
            in
            if List.isEmpty kids then
                if isVoidTag tag then
                    opening

                else
                    opening ++ closing

            else
                opening
                    ++ String.concat (List.map (prettyNode_ (indent + 1)) kids)
                    ++ "\n"
                    ++ pad
                    ++ closing
{-| Turn a list of nodes into a pretty-printed, indented html string.

Each node is rendered with `nodeToPrettyString` and the results are
concatenated in order. Every rendered line is preceded by a newline, so the
output of a non-empty element starts with `"\n"`.

    nodesToPrettyString [ Element "a" [] [ Text "hello" ] ]
        == "\n<a>\n    hello\n</a>"

-}
nodesToPrettyString : List Node -> String
nodesToPrettyString nodes =
    String.concat (List.map nodeToPrettyString nodes)
{-| Render the doctype, including the legacy-compat string when the flag is set.
-}
doctypeToString : Bool -> String
doctypeToString legacyCompat =
    case legacyCompat of
        True ->
            "<!DOCTYPE html SYSTEM \"about:legacy-compat\">"

        False ->
            "<!DOCTYPE html>"
{-| Convert a document into a string: the doctype, a newline, and then the
root node rendered with `nodeToString`.
-}
documentToString : Document -> String
documentToString doc =
    String.join "\n"
        [ doctypeToString doc.legacyCompat
        , nodeToString doc.root
        ]
{-| Convert a document into a pretty, indented string: the doctype, a newline,
and then the root node rendered with `nodeToPrettyString`.
-}
documentToPrettyString : Document -> String
documentToPrettyString doc =
    String.join "\n"
        [ doctypeToString doc.legacyCompat
        , nodeToPrettyString doc.root
        ]

693
tests/ParserTests.elm Normal file
View File

@ -0,0 +1,693 @@
module ParserTests exposing (..)
import Expect exposing (Expectation)
import Fuzz exposing (Fuzzer, int, list, string)
import Html.Parser exposing (Document, Node(..))
import Parser exposing (DeadEnd)
import Test exposing (..)
-- Build one test per ( name, html, expected ) case using `Html.Parser.runDocument`.
-- When an error is expected, any parse error passes; otherwise the result must equal `expected`.
testDoc : List ( String, String, Result (List DeadEnd) Document ) -> List Test
testDoc cases =
    let
        toTest ( name, html, expected ) =
            test (name ++ ": " ++ html) <|
                \_ ->
                    let
                        actual =
                            Html.Parser.runDocument html
                    in
                    case ( expected, actual ) of
                        ( Err _, Err _ ) ->
                            Expect.pass

                        _ ->
                            Expect.equal actual expected
    in
    List.map toTest cases
-- Build one test per case: parse the html with `Html.Parser.run`, render it back
-- with `nodesToString`, and compare with `expected`. An expected error matches any parse error.
testStringRoundtrip : List ( String, String, Result (List DeadEnd) String ) -> List Test
testStringRoundtrip cases =
    let
        toTest ( name, html, expected ) =
            test (name ++ " \"" ++ html ++ "\"") <|
                \_ ->
                    let
                        actual =
                            Html.Parser.run html
                                |> Result.map Html.Parser.nodesToString
                    in
                    case ( expected, actual ) of
                        ( Err _, Err _ ) ->
                            Expect.pass

                        _ ->
                            Expect.equal actual expected
    in
    List.map toTest cases
-- Build one test per case: parse the html with `Html.Parser.run` and compare the
-- nodes with `expected`. An expected error matches any parse error.
testAll : List ( String, String, Result (List DeadEnd) (List Node) ) -> List Test
testAll cases =
    let
        toTest ( name, html, expected ) =
            test (name ++ " \"" ++ html ++ "\"") <|
                \_ ->
                    let
                        actual =
                            Html.Parser.run html
                    in
                    case ( expected, actual ) of
                        ( Err _, Err _ ) ->
                            Expect.pass

                        _ ->
                            Expect.equal actual expected
    in
    List.map toTest cases
-- Parse a fragment and render it back to a string; tag names come back lowercased.
renderStringTests : Test
renderStringTests =
    describe "stringify tests" <|
        testStringRoundtrip
            [ ( "basic", "<a></a>", Ok "<a></a>" )
            , ( "basic", "<a>foo</a>", Ok "<a>foo</a>" )
            , ( "basic", "<a> foo </a>", Ok "<a> foo </a>" )
            , ( "basic", "<a><b><c>foo</c></b></a>", Ok "<a><b><c>foo</c></b></a>" )
            , ( "basic", "<A><B><C>foo</C></B></A>", Ok "<a><b><c>foo</c></b></a>" )
            , ( "basic", "<a><!--a-->b<!--c--></a>", Ok "<a><!--a-->b<!--c--></a>" )
            ]
-- Stray "<" and ">" characters that do not form a tag must end up in text nodes.
ambiguousTextTests : Test
ambiguousTextTests =
    describe "ambiguous text node parsing" <|
        testAll
            [ ( "basic1", "<div>:></div>", Ok [ Element "div" [] [ Text ":>" ] ] )
            , ( "basic2", "<div><:</div>", Ok [ Element "div" [] [ Text "<:" ] ] )
            , ( "basic3", "<:", Ok [ Text "<:" ] )
            , ( "basic4", ":>", Ok [ Text ":>" ] )
            ]
-- Void elements never take children; a stray closing tag for them is ignored.
voidTests : Test
voidTests =
    describe "void nodes" <|
        testAll
            [ ( "without closing tag", "<hr>a", Ok [ Element "hr" [] [], Text "a" ] )
            , ( "with closing tag", "<hr>a</hr>", Ok [ Element "hr" [] [], Text "a" ] )
            ]
-- Doctype handling (case-insensitivity, legacy-compat string) and root <html> wrapping.
documentTests : Test
documentTests =
    describe "document parsing" <|
        testDoc
            [ ( "", "<!doctype html>", Ok (Document False (Element "html" [] [])) )
            , ( "", "<!DOCTYPE HTML>", Ok (Document False (Element "html" [] [])) )
            , ( "", "<!doctype htmlSYSTEM \"about:legacy-compat\">", Err [] )
            , ( "", "<!doctype html SYSTEM \"about:legacy-compat\">", Ok (Document True (Element "html" [] [])) )
            , ( "", "<!doctype html sYsTem 'about:legacy-compat'>", Ok (Document True (Element "html" [] [])) )
            , ( ""
              , "<!doctype html><head>a<body>b"
              , Ok
                    (Document False
                        (Element "html"
                            []
                            [ Element "head" [] [ Text "a" ]
                            , Element "body" [] [ Text "b" ]
                            ]
                        )
                    )
              )
            , ( ""
              , "<!doctype html><meta charset=\"utf-8\">"
              , Ok
                    (Document False
                        (Element "html"
                            []
                            [ Element "meta" [ ( "charset", "utf-8" ) ] []
                            ]
                        )
                    )
              )
            ]
-- Comments at the top level and inside elements.
basicCommentTests : Test
basicCommentTests =
    describe "basic comment parsing" <|
        testAll
            [ ( "basic1", "<!---->", Ok [ Comment "" ] )
            , ( "basic2", "<!-- -->", Ok [ Comment " " ] )
            , ( "basic3", "<!--x-->", Ok [ Comment "x" ] )
            , ( "basic4", "<a><!--x--></a>", Ok [ Element "a" [] [ Comment "x" ] ] )
            , ( "basic5", "<!--a--><a><!--b--></a><!--c-->", Ok [ Comment "a", Element "a" [] [ Comment "b" ], Comment "c" ] )
            , ( "basic6", "<!---->-->", Ok [ Comment "", Text "-->" ] )
            ]
-- Simple elements: case-insensitive tag names, whitespace in closing tags, surrounding text.
basicElementTests : Test
basicElementTests =
    describe "basic element parsing" <|
        testAll
            [ ( "my-basic1", "<a>:></a>", Ok [ Element "a" [] [ Text ":>" ] ] )

            -- , ( "my-basic2", "<a><:</a>", Ok [ Element "a" [] [ Text "<:" ] ] )
            -- Tests from hecrj/elm-html-parser
            , ( "basic1", "<a></a>", Ok [ Element "a" [] [] ] )
            , ( "basic2", "<a></a >", Ok [ Element "a" [] [] ] )
            , ( "basic3", "<A></A >", Ok [ Element "a" [] [] ] )
            , ( "basic4", " <a></a> ", Ok [ Text " ", Element "a" [] [], Text " " ] )
            , ( "basic5", "a<a></a>b", Ok [ Text "a", Element "a" [] [], Text "b" ] )
            , ( "basic6", "<A></A>", Ok [ Element "a" [] [] ] )
            ]
-- Unquoted, single-quoted, double-quoted and value-less attributes.
basicAttributeTests : Test
basicAttributeTests =
    describe "basic attribute parsing" <|
        testAll
            [ ( "unquoted1", "<div a=b/></div>", Ok [ Element "div" [ ( "a", "b/" ) ] [] ] )
            , ( "unquoted2", "<div a=b />", Ok [ Element "div" [ ( "a", "b" ) ] [] ] )
            , ( "single-quoted", "<div a='b'/>", Ok [ Element "div" [ ( "a", "b" ) ] [] ] )
            , ( "double-quoted", "<div a=\"b\"/>", Ok [ Element "div" [ ( "a", "b" ) ] [] ] )
            , ( "key-only1", "<div a></div>", Ok [ Element "div" [ ( "a", "" ) ] [] ] )
            , ( "key-only2", "<div a/>", Ok [ Element "div" [ ( "a", "" ) ] [] ] )
            , ( "everything"
              , "<div a=b c='d' e=\"f\" g/>"
              , Ok
                    [ Element "div"
                        [ ( "a", "b" )
                        , ( "c", "d" )
                        , ( "e", "f" )
                        , ( "g", "" )
                        ]
                        []
                    ]
              )
            ]
-- Elements such as <p>, <li> and <head> that are closed implicitly by a following opening tag.
autoclosingTests : Test
autoclosingTests =
    describe "autoclosing elements" <|
        testAll
            [ ( "p-basic1", "<p>a<p>b", Ok [ Element "p" [] [ Text "a" ], Element "p" [] [ Text "b" ] ] )
            , ( "li-basic", "<li><li>", Ok [ Element "li" [] [], Element "li" [] [] ] )
            , ( "li-basic", "<ul><li><li></ul>", Ok [ Element "ul" [] [ Element "li" [] [], Element "li" [] [] ] ] )
            , ( "li-basic", "<li>a<li>b</li>", Ok [ Element "li" [] [ Text "a" ], Element "li" [] [ Text "b" ] ] )
            , ( "li-comment"
              , "<li>a<!--c--><li>b<!--d--></li>"
              , Ok
                    [ Element "li" [] [ Text "a", Comment "c" ]
                    , Element "li" [] [ Text "b", Comment "d" ]
                    ]
              )

            -- TODO
            -- , ( "li-comment-backtrack"
            --   , "<li>a<!-c<li>b<!-d</li>"
            --   , Ok
            --         [ Element "li" [] [ Text "a<!-c" ]
            --         , Element "li" [] [ Text "b<!-d" ]
            --         ]
            --   )
            , ( "li-basic", "<li>a</li><li>b", Ok [ Element "li" [] [ Text "a" ], Element "li" [] [ Text "b" ] ] )
            , ( "li-basic1", "<li>a</li><li>b</li>", Ok [ Element "li" [] [ Text "a" ], Element "li" [] [ Text "b" ] ] )
            , ( "li-basic2"
              , "<li>a<li>b</li>c</li>"
              , Ok
                    [ Element "li" [] [ Text "a" ]
                    , Element "li" [] [ Text "b" ]
                    , Text "c"
                    ]
              )
            , ( "li-basic3"
              , "<li>a<ul><li>b</li></ul>c</li>"
              , Ok
                    [ Element "li"
                        []
                        [ Text "a"
                        , Element "ul"
                            []
                            [ Element "li" [] [ Text "b" ]
                            ]
                        , Text "c"
                        ]
                    ]
              )

            -- Not valid html, but the parser should still parse it.
            , ( "head1"
              , "<head>a<head>b"
              , Ok
                    [ Element "head" [] [ Text "a" ]
                    , Element "head" [] [ Text "b" ]
                    ]
              )

            -- Unlike the previous test, here's an example of where the parser must invoke the html5
            -- spec only to disambiguate where <body> should be a child vs. sibling
            -- of the unended <head> element.
            , ( "head2"
              , "<head><title>hello</title><body>"
              , Ok
                    [ Element "head"
                        []
                        [ Element "title" [] [ Text "hello" ] ]
                    , Element "body" [] []
                    ]
              )
            ]
-- Nested elements, including nested lists with and without explicit </li> tags.
basicNestingTests : Test
basicNestingTests =
    describe "nested elements" <|
        testAll
            [ ( "abc"
              , "<a><b><c></c></b></a>"
              , Ok
                    [ Element "a"
                        []
                        [ Element "b"
                            []
                            [ Element "c" [] []
                            ]
                        ]
                    ]
              )
            , ( "nested <ul> where all <li> are closed </li>"
              , """<ul><li>a</li><li>b<ul><li>x</li><li>y</li></ul></li><li>c</li></ul>"""
              , Ok
                    [ Element "ul"
                        []
                        [ Element "li" [] [ Text "a" ]
                        , Element "li"
                            []
                            [ Text "b"
                            , Element "ul"
                                []
                                [ Element "li" [] [ Text "x" ]
                                , Element "li" [] [ Text "y" ]
                                ]
                            ]
                        , Element "li" [] [ Text "c" ]
                        ]
                    ]
              )
            , ( "nested <ul> where zero <li> are closed with </li>"
              , """<ul><li>a<li>b<ul><li>x<li>y</ul><li>c</ul>"""
              , Ok
                    [ Element "ul"
                        []
                        [ Element "li" [] [ Text "a" ]
                        , Element "li"
                            []
                            [ Text "b"
                            , Element "ul"
                                []
                                [ Element "li" [] [ Text "x" ]
                                , Element "li" [] [ Text "y" ]
                                ]
                            ]
                        , Element "li" [] [ Text "c" ]
                        ]
                    ]
              )
            ]
-- A void element parses to the same childless node whether it is written
-- bare, self-closed, or followed by a (technically invalid) closing tag.
voidElementTests : Test
voidElementTests =
    describe "void elements" <|
        testAll
            [ ( "invalid", "<hr></hr>", Ok [ Element "hr" [] [] ] )
            , ( "valid1", "<hr>", Ok [ Element "hr" [] [] ] )
            , ( "valid2", "<hr/>", Ok [ Element "hr" [] [] ] )
            ]
-- Plain text and character references (named, decimal, hexadecimal), including
-- a lone "&" and references inside attribute values.
textNodeTests : Test
textNodeTests =
    describe "text node parsing" <|
        testAll
            [ ( "empty", "", Ok [] )
            , ( "space", " ", Ok [ Text " " ] )
            , ( "basic1", "1", Ok [ Text "1" ] )
            , ( "basic2", "a", Ok [ Text "a" ] )
            , ( "basic3", "1a", Ok [ Text "1a" ] )
            , ( "basic4", "^", Ok [ Text "^" ] )
            , ( "decode1", "&", Ok [ Text "&" ] )
            , ( "decode2", "&amp;", Ok [ Text "&" ] )
            , ( "decode3", "&lt;", Ok [ Text "<" ] )
            , ( "decode4", "&gt;", Ok [ Text ">" ] )
            , ( "decode5", "&apos;", Ok [ Text "'" ] )
            , ( "decode6", "&#38;", Ok [ Text "&" ] )
            , ( "decode7", "&#x26;", Ok [ Text "&" ] )
            , ( "decode8", "&#x3E;", Ok [ Text ">" ] )
            , ( "decode9", "&#383;", Ok [ Text "ſ" ] )
            , ( "decodeA", "&nbsp;", Ok [ Text "\u{00A0}" ] )
            , ( "decodeB", "&nbsp;&nbsp;", Ok [ Text "\u{00A0}\u{00A0}" ] )
            , ( "decodeC", "a&nbsp;b", Ok [ Text "a\u{00A0}b" ] )
            , ( "decodeD", "a&nbsp;&nbsp;b", Ok [ Text "a\u{00A0}\u{00A0}b" ] )
            , ( "decodeE", """<img alt="&lt;">""", Ok [ Element "img" [ ( "alt", "<" ) ] [] ] )
            , ( "decodeF", "&#0038;", Ok [ Text "&" ] )
            ]
-- <script> content is kept as text, and a "</script>" inside a Javascript
-- string or comment must not end the element.
scriptTests : Test
scriptTests =
    describe "<script> node" <|
        testAll
            [ ( "basic1", "<script></script>", Ok [ Element "script" [] [] ] )
            , ( "basic2", "<script>foo</script>", Ok [ Element "script" [] [ Text "foo" ] ] )

            -- Copy browser behavior
            , ( "basic3", "<script></script></script>", Ok [ Element "script" [] [] ] )
            , ( "basic4", "<script><script></script>", Ok [ Element "script" [] [ Text "<script>" ] ] )
            , ( "attrs1", "<script src=index.js></script>", Ok [ Element "script" [ ( "src", "index.js" ) ] [] ] )
            , ( "js1", "<script>'</script>'</script>", Ok [ Element "script" [] [ Text "'</script>'" ] ] )
            , ( "js2", "<script>\"</script>\"</script>", Ok [ Element "script" [] [ Text "\"</script>\"" ] ] )
            , ( "js3", "<script>`</script>`</script>", Ok [ Element "script" [] [ Text "`</script>`" ] ] )
            , ( "js4", "<script>x < 42 || x > 42</script>", Ok [ Element "script" [] [ Text "x < 42 || x > 42" ] ] )
            , ( "comment1", "<script>\n//</script>\n</script>", Ok [ Element "script" [] [ Text "\n//</script>\n" ] ] )
            , ( "comment2", "<script>\n/*\n</script>\n*/\n</script>", Ok [ Element "script" [] [ Text "\n/*\n</script>\n*/\n" ] ] )
            ]
-- TESTS FROM hecrj/elm-html-parser
-- Expect the input to parse into exactly the given list of nodes.
testParseAll : String -> List Node -> (() -> Expectation)
testParseAll s astList _ =
    Html.Parser.run s
        |> Expect.equal (Ok astList)
-- Expect the input to parse into exactly one node equal to `expected`.
-- A parse error fails the test with the dead ends rendered as a string.
testParse : String -> Node -> (() -> Expectation)
testParse input expected _ =
    case Html.Parser.run input of
        Ok actual ->
            Expect.equal (Ok actual) (Ok [ expected ])

        Err message ->
            Expect.fail (Parser.deadEndsToString message)
-- Element parsing cases: nesting, surrounding whitespace, start-only (void) tags,
-- self-closing tags and hyphenated custom element names.
hecrjNodeTests : Test
hecrjNodeTests =
    describe "Node"
        [ test "basic1" (testParse "<a></a>" (Element "a" [] []))
        , test "basic2" (testParse "<a></a >" (Element "a" [] []))
        , test "basic3" (testParse "<A></A >" (Element "a" [] []))
        , test "basic4" (testParseAll " <a></a> " [ Text " ", Element "a" [] [], Text " " ])
        , test "basic5" (testParseAll "a<a></a>b" [ Text "a", Element "a" [] [], Text "b" ])
        , test "basic6" (testParse "<A></A>" (Element "a" [] []))
        , test "basic7" (testParse "<a>a</a>" (Element "a" [] [ Text "a" ]))
        , test "basic8" (testParse "<a> a </a>" (Element "a" [] [ Text " a " ]))
        , test "basic10" (testParse "<br>" (Element "br" [] []))
        , test "basic11" (testParse "<a><a></a></a>" (Element "a" [] [ Element "a" [] [] ]))
        , test "basic12" (testParse "<a> <a> </a> </a>" (Element "a" [] [ Text " ", Element "a" [] [ Text " " ], Text " " ]))
        , test "basic13" (testParse "<a> <br> </a>" (Element "a" [] [ Text " ", Element "br" [] [], Text " " ]))
        , test "basic14" (testParse "<a><a></a><a></a></a>" (Element "a" [] [ Element "a" [] [], Element "a" [] [] ]))
        , test "basic15" (testParse "<a><a><a></a></a></a>" (Element "a" [] [ Element "a" [] [ Element "a" [] [] ] ]))
        , test "basic16" (testParse "<a><a></a><b></b></a>" (Element "a" [] [ Element "a" [] [], Element "b" [] [] ]))
        , test "basic17" (testParse "<h1></h1>" (Element "h1" [] []))
        , test "start-only-tag1" (testParse "<br>" (Element "br" [] []))
        , test "start-only-tag2" (testParse "<BR>" (Element "br" [] []))
        , test "start-only-tag3" (testParse "<br >" (Element "br" [] []))
        , test "start-only-tag4" (testParse "<BR >" (Element "br" [] []))
        , test "start-only-tag5" (testParse "<a> <br> </a>" (Element "a" [] [ Text " ", Element "br" [] [], Text " " ]))
        , test "start-only-tag6" (testParse "<a><br><br></a>" (Element "a" [] [ Element "br" [] [], Element "br" [] [] ]))
        , test "start-only-tag7" (testParse "<a><br><img><hr><meta></a>" (Element "a" [] [ Element "br" [] [], Element "img" [] [], Element "hr" [] [], Element "meta" [] [] ]))
        , test "start-only-tag8" (testParse "<a>foo<br>bar</a>" (Element "a" [] [ Text "foo", Element "br" [] [], Text "bar" ]))
        , test "self-closing-tag1" (testParse "<br/>" (Element "br" [] []))
        , test "self-closing-tag2" (testParse "<br />" (Element "br" [] []))
        , test "self-closing-tag3" (testParse "<link href=\"something\" rel=\"something else\"/>" (Element "link" [ ( "href", "something" ), ( "rel", "something else" ) ] []))
        , test "web-component-tag" (testParse "<a-web-component></a-web-component>" (Element "a-web-component" [] []))
        ]
-- Attribute-parsing cases: quoted, single-quoted and bare values, upper-case
-- names, entities inside values, valueless attributes and namespaced names.
hecrjAttributeTests : Test
hecrjAttributeTests =
    let
        -- Every case here expects one childless element, so only the tag
        -- name and the attribute list vary between cases.
        expectAttrs name input tag attrs =
            test name (testParse input (Element tag attrs []))
    in
    describe "Attribute"
        [ expectAttrs "basic1" """<a href="example.com"></a>""" "a" [ ( "href", "example.com" ) ]
        , expectAttrs "basic2" """<a href='example.com'></a>""" "a" [ ( "href", "example.com" ) ]
        , expectAttrs "basic3" """<a href=example.com></a>""" "a" [ ( "href", "example.com" ) ]
        , expectAttrs "basic4" """<a HREF=example.com></a>""" "a" [ ( "href", "example.com" ) ]
        , expectAttrs "basic5" """<a href=bare></a>""" "a" [ ( "href", "bare" ) ]
        , expectAttrs "basic6" """<a href="example.com?a=b&amp;c=d"></a>""" "a" [ ( "href", "example.com?a=b&c=d" ) ]
        , expectAttrs "basic7" """<a href="example.com?a=b&c=d"></a>""" "a" [ ( "href", "example.com?a=b&c=d" ) ]
        , expectAttrs "basic8" """<input max=100 min = 10.5>""" "input" [ ( "max", "100" ), ( "min", "10.5" ) ]
        , expectAttrs "basic9" """<input disabled>""" "input" [ ( "disabled", "" ) ]
        , expectAttrs "basic10" """<input DISABLED>""" "input" [ ( "disabled", "" ) ]
        , expectAttrs "basic11" """<meta http-equiv=Content-Type>""" "meta" [ ( "http-equiv", "Content-Type" ) ]
        , expectAttrs "basic12" """<input data-foo2="a">""" "input" [ ( "data-foo2", "a" ) ]
        , expectAttrs "basic13" """<html xmlns:v="urn:schemas-microsoft-com:vml"></html>""" "html" [ ( "xmlns:v", "urn:schemas-microsoft-com:vml" ) ]
        , expectAttrs "basic14" """<link rel=stylesheet
href="">""" "link" [ ( "rel", "stylesheet" ), ( "href", "" ) ]

        -- Invalid attribute names shouldn't be parsed: https://github.com/elm/html/issues/46
        , expectAttrs "invalid character" """<p\u{00A0} ></p>""" "p" []
        ]
-- <script>/<style> cases, written as ( name, input, expected node ) rows that
-- are each turned into a single-node `testParse` test.
hecrjScriptTests : Test
hecrjScriptTests =
    [ ( "script1", """<script></script>""", Element "script" [] [] )
    , ( "script2", """<SCRIPT></SCRIPT>""", Element "script" [] [] )
    , ( "script3", """<script src="script.js">foo</script>""", Element "script" [ ( "src", "script.js" ) ] [ Text "foo" ] )
    , ( "script4", """<script>var a = 0 < 1; b = 1 > 0;</script>""", Element "script" [] [ Text "var a = 0 < 1; b = 1 > 0;" ] )
    , ( "script5", """<script><!----></script>""", Element "script" [] [ Comment "" ] )
    , ( "script6", """<script>a<!--</script><script>-->b</script>""", Element "script" [] [ Text "a", Comment "</script><script>", Text "b" ] )
    , ( "style", """<style>a<!--</style><style>-->b</style>""", Element "style" [] [ Text "a", Comment "</style><style>", Text "b" ] )
    ]
        |> List.map (\( name, input, node ) -> test name (testParse input node))
        |> describe "Script"
-- Comment-parsing cases: empty comments, markup inside comments, a comment
-- nested in an element, and whitespace preservation inside a comment.
hecrjCommentTests : Test
hecrjCommentTests =
    describe "Comment"
        [ test "basic1" (testParse """<!---->""" (Comment ""))
        , test "basic2" (testParse """<!--<div></div>-->""" (Comment "<div></div>"))
        , test "basic3" (testParse """<div><!--</div>--></div>""" (Element "div" [] [ Comment "</div>" ]))
        , test "basic4" (testParse """<!--<!---->""" (Comment "<!--"))

        -- The tab, carriage return, newline and trailing space are written as
        -- explicit escapes in both the input and the expected value, so this
        -- case no longer depends on invisible leading whitespace of a
        -- continuation line inside a multi-line literal.
        , test "basic5" (testParse "<!--foo\t\u{000D}\n -->" (Comment "foo\t\u{000D}\n "))
        ]
-- Checks that a self-closed `<path ... />` inside `<svg>` parses as a
-- childless element and that attribute names come back lower-cased
-- (`viewBox` is expected as `viewbox`).
svgTests : Test
svgTests =
    test "self-closing svg path" <|
        testParse
            """<svg viewBox="0 0 20 20" fill="currentColor" aria-hidden="true"><path fill-rule="evenodd" d="1 2 3" clip-rule="evenodd" /></svg>"""
            (Element "svg"
                [ ( "viewbox", "0 0 20 20" )
                , ( "fill", "currentColor" )
                , ( "aria-hidden", "true" )
                ]
                [ Element "path"
                    [ ( "fill-rule", "evenodd" )
                    , ( "d", "1 2 3" )
                    , ( "clip-rule", "evenodd" )
                    ]
                    []
                ]
            )
-- https://github.com/taoqf/node-html-parser/blob/main/test/tests/html.js
-- Cases in the form ( test name, input html, expected parse result ), handed
-- to `testAll` and grouped under one `describe`. They cover nested and
-- unclosed tags, mixed-case tag names, comments, attribute lists and
-- <script>/<style> text content.
nodeHtmlParserTests : Test
nodeHtmlParserTests =
    describe "taoqf/node-html-parser tests" <|
        testAll
            [ ( "test1"
              , "<p id=\"id\"><a class='cls'>Hello</a><ul><li><li></ul><span></span></p>"
              , Ok
                    [ Element "p"
                        [ ( "id", "id" ) ]
                        [ Element "a"
                            [ ( "class", "cls" ) ]
                            [ Text "Hello"
                            ]
                        , Element "ul"
                            []
                            [ Element "li" [] []
                            , Element "li" [] []
                            ]
                        , Element "span" [] []
                        ]
                    ]
              )
            , ( "test2"
              , "<DIV><a><img/></A><p></P></div>"
              , Ok
                    [ Element "div"
                        []
                        [ Element "a"
                            []
                            [ Element "img" [] []
                            ]
                        , Element "p" [] []
                        ]
                    ]
              )
            , ( "test3"
              , "<div><a><img/></a><p></p></div>"
              , Ok
                    [ Element "div"
                        []
                        [ Element "a"
                            []
                            [ Element "img" [] []
                            ]
                        , Element "p" [] []
                        ]
                    ]
              )
            , ( "test4"
              , "<div><a><!-- my comment --></a></div>"
              , Ok
                    [ Element "div"
                        []
                        [ Element "a"
                            []
                            [ Comment " my comment "
                            ]
                        ]
                    ]
              )
            , ( "test5"
              , "<div><!--<a></a>--></div>"
              , Ok
                    [ Element "div"
                        []
                        [ Comment "<a></a>"
                        ]
                    ]
              )
            , ( "test6"
              , "<picture><source srcset=\"/images/example-1.jpg 1200w, /images/example-2.jpg 1600w\" sizes=\"100vw\"><img src=\"/images/example.jpg\" alt=\"Example\"/></picture>"
              , Ok
                    [ Element "picture"
                        []
                        [ Element "source"
                            [ ( "srcset", "/images/example-1.jpg 1200w, /images/example-2.jpg 1600w" )
                            , ( "sizes", "100vw" )
                            ]
                            []
                        , Element "img" [ ( "src", "/images/example.jpg" ), ( "alt", "Example" ) ] []
                        ]
                    ]
              )
            , ( "test7"
              , "<script>1</script><style>2&amp;</style>"
              , Ok
                    [ Element "script" [] [ Text "1" ]
                    , Element "style" [] [ Text "2&" ]
                    ]
              )
            ]
-- JSOUP TESTS
-- https://github.com/jhy/jsoup/blob/master/src/test/java/org/jsoup/parser/AttributeParseTest.java
-- Attribute-parsing cases in the form ( test name, input html, expected parse
-- result ), handed to `testAll`. They cover loosely formatted attribute
-- strings, CR/LF between tokens, names starting with '=', entity handling
-- inside values, and stray slashes inside a tag.
jsoupAttributeTests : Test
jsoupAttributeTests =
    describe "(from jsoup) attributes" <|
        testAll
            [ ( "parses rough attribute string"
              , "<a id=\"123\" class=\"baz = 'bar'\" style = 'border: 2px'qux zim foo = 12 mux=18 />"
              , Ok
                    [ Element "a"
                        [ ( "id", "123" )
                        , ( "class", "baz = 'bar'" )
                        , ( "style", "border: 2px" )
                        , ( "qux", "" )
                        , ( "zim", "" )
                        , ( "foo", "12" )
                        , ( "mux", "18" )
                        ]
                        []
                    ]
              )
            , ( "handles newlines and returns"
              , -- Same input as "<a\r\nfoo='bar\r\nqux'\r\nbar\r\n=\r\ntwo>One</a>",
                -- with each carriage return written as \u{000D}.
                "<a\u{000D}\nfoo='bar\u{000D}\nqux'\u{000D}\nbar\u{000D}\n=\u{000D}\ntwo>One</a>"
              , Ok
                    [ Element "a"
                        [ ( "foo", "bar\u{000D}\nqux" )
                        , ( "bar", "two" )
                        ]
                        [ Text "One" ]
                    ]
              )
            , ( "parses empty string", "<a />", Ok [ Element "a" [] [] ] )

            -- https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
            , ( "can start with '='"
              , "<a =empty />"
              , Ok [ Element "a" [ ( "=empty", "" ) ] [] ]
              )
            , ( "strict attribute unescapes"
              , -- Only the terminated `&lt;` is expected to decode to `<`;
                -- `&mid`, `&lt=` and `&lg` are expected to stay literal.
                "<a id=1 href='?foo=bar&mid&lt=true'>One</a> <a id=2 href='?foo=bar&lt;qux&lg=1'>Two</a>"
              , Ok
                    [ Element "a"
                        [ ( "id", "1" )
                        , ( "href", "?foo=bar&mid&lt=true" )
                        ]
                        [ Text "One" ]
                    , Text " "
                    , Element "a"
                        [ ( "id", "2" )
                        , ( "href", "?foo=bar<qux&lg=1" )
                        ]
                        [ Text "Two" ]
                    ]
              )
            , ( "more attribute unescapes"
              , "<a href='&wr_id=123&mid-size=true&ok=&wr'>Check</a>"
              , Ok
                    [ Element "a"
                        [ ( "href", "&wr_id=123&mid-size=true&ok=&wr" )
                        ]
                        [ Text "Check" ]
                    ]
              )
            , ( "drops slash from attribute"
              , "<img /onerror='doMyJob' /a /=b/>"
              , Ok
                    [ Element "img"
                        [ ( "onerror", "doMyJob" )
                        , ( "a", "" )
                        , ( "=b", "" )
                        ]
                        []
                    ]
              )
            ]
-- TODO: https://github.com/jhy/jsoup/blob/master/src/test/java/org/jsoup/parser/HtmlParserTest.java