mirror of
https://github.com/danneu/html-parser.git
synced 2024-11-22 02:44:04 +03:00
Initial commit
This commit is contained in:
commit
f953cc6ee4
3
.prettierrc.json
Normal file
3
.prettierrc.json
Normal file
@ -0,0 +1,3 @@
|
||||
{
|
||||
"tabWidth": 4
|
||||
}
|
52
README.md
Normal file
52
README.md
Normal file
@ -0,0 +1,52 @@
|
||||
# elm-html-parser
|
||||
|
||||
A lenient html5 parser implemented with [Elm](https://elm-lang.org).
|
||||
|
||||
A lenient alternative to [hecrj/html-parser](https://package.elm-lang.org/packages/hecrj/html-parser/latest/).
|
||||
|
||||
## Goals
|
||||
|
||||
- **Leniency**
|
||||
- Avoids validating while parsing
|
||||
- Prefers to imitate browser parsing behavior rather than the html5 spec.
|
||||
- Prefers to use the html5 spec only to handle ambiguous cases rather than to prohibit invalid html5
|
||||
- Prefers to fall back to text nodes than short-circuit with parse errors
|
||||
- **Handle user-written html**
|
||||
- Users don't write character entities like `&amp;` and `&lt;`. This parser should strive to handle cases like `<p><:</p>` -> `Element "p" [] [ Text "<:" ]`.
|
||||
|
||||
## Features / Quirks
|
||||
|
||||
- Characters don't need to be escaped into entities.
|
||||
|
||||
e.g. `<div><:</div>` will parse correctly and doesn't need to be rewritten into `<div>&lt;:</div>`.
|
||||
- Tags that should not nest are autoclosed.
|
||||
|
||||
e.g. `<p>a<p>b` -> `<p>a</p><p>b</p>`.
|
||||
- Closing tags that have no matching open tags are ignored.
|
||||
|
||||
e.g. `</a><div></div></div></b>` -> `<div></div>`
|
||||
- Ignores comments in whitespace positions:
|
||||
|
||||
e.g. `<div <!--comment-->/>` -> `<div/>`
|
||||
- Parses comments in text node positions:
|
||||
|
||||
e.g. `<div><!--comment--></div>` ->
|
||||
`Element "div" [] [ Comment "comment" ]`
|
||||
|
||||
## Differences from existing packages
|
||||
|
||||
Currently, there is only one html parser published to Elm packages: [hecrj/html-parser](https://package.elm-lang.org/packages/hecrj/html-parser/latest/).
|
||||
|
||||
@hecrj has said that following the html5 spec is a goal of their parser, so their parser is stricter by design and rejects invalid html5.
|
||||
|
||||
## Development
|
||||
|
||||
`git clone` and `npm install`.
|
||||
|
||||
- `npm test` to run tests
|
||||
- `npm run docs` to preview docs locally
|
||||
|
||||
## Special thanks
|
||||
|
||||
- @hecrj and their contributors.
|
||||
- @ymtszw for their work on the Javascript `<script>` parser.
|
20
elm.json
Normal file
20
elm.json
Normal file
@ -0,0 +1,20 @@
|
||||
{
|
||||
"type": "package",
|
||||
"name": "danneu/html-parser",
|
||||
"summary": "TODO",
|
||||
"license": "MIT",
|
||||
"version": "1.0.0",
|
||||
"exposed-modules": [
|
||||
"Html.Parser"
|
||||
],
|
||||
"elm-version": "0.19.1 <= v < 0.20.0",
|
||||
"dependencies": {
|
||||
"elm/core": "1.0.5 <= v < 2.0.0",
|
||||
"elm/html": "1.0.0 <= v < 2.0.0",
|
||||
"elm/parser": "1.1.0 <= v < 2.0.0",
|
||||
"rtfeldman/elm-hex": "1.0.0 <= v < 2.0.0"
|
||||
},
|
||||
"test-dependencies": {
|
||||
"elm-explorations/test": "1.2.2 <= v < 2.0.0"
|
||||
}
|
||||
}
|
3047
package-lock.json
generated
Normal file
3047
package-lock.json
generated
Normal file
File diff suppressed because it is too large
Load Diff
11
package.json
Normal file
11
package.json
Normal file
@ -0,0 +1,11 @@
|
||||
{
|
||||
"private": true,
|
||||
"devDependencies": {
|
||||
"elm-doc-preview": "^5.0.5",
|
||||
"elm-test": "^0.19.1-revision7"
|
||||
},
|
||||
"scripts": {
|
||||
"test": "elm-test",
|
||||
"docs": "elm-doc-preview"
|
||||
}
|
||||
}
|
2140
src/Html/CharRefs.elm
Normal file
2140
src/Html/CharRefs.elm
Normal file
File diff suppressed because it is too large
Load Diff
957
src/Html/Parser.elm
Normal file
957
src/Html/Parser.elm
Normal file
@ -0,0 +1,957 @@
|
||||
module Html.Parser exposing
|
||||
( Node(..), Document
|
||||
, run, runDocument
|
||||
, nodeToHtml, nodesToHtml, nodeToString, nodesToString, nodeToPrettyString, nodesToPrettyString, documentToString, documentToPrettyString
|
||||
)
|
||||
|
||||
{-| Leniently parse html5 documents and fragments and then render them
|
||||
into strings or Elm's virtual dom nodes.
|
||||
|
||||
|
||||
# Definition
|
||||
|
||||
@docs Node, Document
|
||||
|
||||
|
||||
# Parsing
|
||||
|
||||
@docs run, runDocument
|
||||
|
||||
|
||||
# Render
|
||||
|
||||
@docs nodeToHtml, nodesToHtml, nodeToString, nodesToString, nodeToPrettyString, nodesToPrettyString, documentToString, documentToPrettyString
|
||||
|
||||
-}
|
||||
|
||||
import Hex
|
||||
import Html
|
||||
import Html.Attributes
|
||||
import Html.CharRefs
|
||||
import Parser exposing (..)
|
||||
|
||||
|
||||
{-| An html node is a tree of text, comments, and element nodes.

An element (e.g. `<div foo="bar">hello</div>`) can have attributes and child nodes.

  - `Text` carries the text content.
  - `Comment` carries the text found between `<!--` and `-->`.
  - `Element` carries the tag name, the attributes as `( key, value )` pairs,
    and the list of child nodes.

-}
type Node
    = Text String
    | Comment String
    | Element String (List ( String, String )) (List Node)
|
||||
|
||||
|
||||
{-| Parse an html fragment into a list of html nodes.

The fragment may contain any number of top-level nodes.

    run "<div>hi</div><div>bye</div>"
        == Ok
            [ Element "div" [] [ Text "hi" ]
            , Element "div" [] [ Text "bye" ]
            ]

-}
run : String -> Result (List DeadEnd) (List Node)
run =
    Parser.run parseAll
|
||||
|
||||
|
||||
{-| An html document has a `<!doctype>` and then a root html node.

  - `legacyCompat`: the `Bool` produced by the doctype parser; it is `True` when
    the doctype carried the `SYSTEM "about:legacy-compat"` legacy string.
  - `root`: the single root node of the document.

-}
type alias Document =
    { legacyCompat : Bool
    , root : Node
    }
|
||||
|
||||
|
||||
{-| Like `Parser.token`, but every character of the token is compared
case-insensitively (both sides are lowercased before comparing).

An empty string yields a parser that succeeds without consuming anything.

-}
caseInsensitiveToken : String -> Parser ()
caseInsensitiveToken string =
    let
        matchChar : Char -> Parser ()
        matchChar expected =
            oneOf
                [ chompIf (\actual -> Char.toLower actual == Char.toLower expected)
                , problem ("expected case-insensitive char '" ++ String.fromChar expected ++ "'")
                ]

        -- Chain one more single-character matcher after everything parsed so far.
        thenMatch : Char -> Parser () -> Parser ()
        thenMatch expected soFar =
            soFar |> Parser.andThen (\_ -> matchChar expected)
    in
    String.foldl thenMatch (succeed ()) string
|
||||
|
||||
|
||||
{-| Parse the optional legacy part of a doctype:
whitespace, `SYSTEM`, whitespace, then `about:legacy-compat` wrapped in
matching single or double quotes. Always yields `True` on success.
-}
doctypeLegacy : Parser Bool
doctypeLegacy =
    -- https://html.spec.whatwg.org/multipage/syntax.html#doctype-legacy-string
    let
        openingQuote : Parser String
        openingQuote =
            getChompedString (oneOf [ token "\"", token "'" ])

        -- The closing quote has to be the same character as the opening one.
        legacyStringUntil : String -> Parser ()
        legacyStringUntil quote =
            succeed ()
                |. token "about:legacy-compat"
                |. token quote
    in
    succeed identity
        |. chompOneOrMore isSpace
        |. caseInsensitiveToken "SYSTEM"
        |. chompOneOrMore isSpace
        |= openingQuote
        |> andThen legacyStringUntil
        |> map (\_ -> True)
|
||||
|
||||
|
||||
{-| Parse `<!DOCTYPE html>` (keywords matched case-insensitively).

Yields `True` when the legacy-compat string is present and `False` otherwise.

-}
doctype : Parser Bool
doctype =
    -- https://html.spec.whatwg.org/multipage/syntax.html#the-doctype
    let
        -- Backtrackable so a partial legacy match falls through to `False`.
        legacyFlag : Parser Bool
        legacyFlag =
            oneOf
                [ backtrackable doctypeLegacy
                , succeed False
                ]
    in
    succeed identity
        |. token "<!"
        |. caseInsensitiveToken "DOCTYPE"
        |. chompOneOrMore isSpace
        |. caseInsensitiveToken "html"
        |= legacyFlag
        |. chompWhile isSpace
        |. token ">"
|
||||
|
||||
|
||||
{-| Parse a `<!doctype html>` declaration followed by any html nodes.

The result always has exactly one root node: when the parsed nodes are not a
single `<html>` element, they are wrapped in a new `<html>` element.

**Caveat**: If there are multiple top-level nodes and one of them is `<html>`,
they are all wrapped together in another `<html>` node.

-}
runDocument : String -> Result (List DeadEnd) Document
runDocument =
    Parser.run document
|
||||
|
||||
|
||||
{-| Parser behind `runDocument`: a doctype, optional whitespace/comments, and
then zero or more nodes collapsed into a single root node.
-}
document : Parser Document
document =
    let
        -- A lone <html> element is used as-is; anything else (including no
        -- nodes at all) becomes the children of a fresh <html> element.
        toRoot : List Node -> Node
        toRoot nodes =
            case nodes of
                [ ((Element "html" _ _) as root) ] ->
                    root

                _ ->
                    Element "html" [] nodes
    in
    succeed Document
        |= doctype
        |. ws
        |= map toRoot (zeroOrMore node)
|
||||
|
||||
|
||||
{-| Parse nodes until `node` no longer matches, merging adjacent text nodes.
-}
parseAll : Parser (List Node)
parseAll =
    let
        -- `reversed` holds the nodes parsed so far, newest first.
        step : List Node -> Parser (Step (List Node) (List Node))
        step reversed =
            oneOf
                [ map (\parsed -> Loop (mergeText parsed reversed)) node
                , succeed (Done (List.reverse reversed))
                ]
    in
    Parser.loop [] step
|
||||
|
||||
|
||||
{-| Push `n` onto the front of `nodes`. When both `n` and the current head are
`Text`, they are fused into one `Text` (head text first, then `n`'s text).
-}
mergeText : Node -> List Node -> List Node
mergeText n nodes =
    case nodes of
        (Text prev) :: rest ->
            case n of
                Text s ->
                    Text (prev ++ s) :: rest

                _ ->
                    n :: nodes

        _ ->
            n :: nodes
|
||||
|
||||
|
||||
{-| Skip any run of space characters and html comments (possibly none).
-}
ws : Parser ()
ws =
    let
        spaceOrComment : Parser ()
        spaceOrComment =
            oneOf
                [ multiComment "<!--" "-->" Nestable
                , chompWhile isSpace
                ]
    in
    -- `ifProgress` ends the loop once an iteration consumes nothing.
    loop 0 (ifProgress spaceOrComment)
|
||||
|
||||
|
||||
{-| True for space, line feed, carriage return, tab, form feed and
no-break space (U+00A0).
-}
isSpace : Char -> Bool
isSpace c =
    List.member c [ ' ', '\n', '\u{000D}', '\t', '\u{000C}', '\u{00A0}' ]
|
||||
|
||||
|
||||
|
||||
-- ATTRIBUTES
|
||||
|
||||
|
||||
{-| Parse an unquoted attribute value such as the `bar` in `<div foo=bar>`.

The value is one or more chunks, each either a run of allowed characters or a
character reference; the chunks are concatenated.

-}
attributeValueUnquoted : Parser String
attributeValueUnquoted =
    let
        forbidden : List Char
        forbidden =
            [ '"', '\'', '=', '<', '>', '`', '&' ]

        isUnquotedValueChar : Char -> Bool
        isUnquotedValueChar c =
            not (isSpace c || List.member c forbidden)

        chunk : Parser String
        chunk =
            oneOf
                [ getChompedString (chompOneOrMore isUnquotedValueChar)
                , characterReference
                ]
    in
    oneOrMore "attribute value" chunk
        |> map String.concat
|
||||
|
||||
|
||||
{-| Parse an attribute value delimited on both sides by `quote`.

The content is zero or more chunks (plain runs or character references),
concatenated; an empty value like `""` is accepted.

-}
attributeValueQuoted : Char -> Parser String
attributeValueQuoted quote =
    let
        quoteMark : Parser ()
        quoteMark =
            chompIf (\c -> c == quote)

        plainChunk : Parser String
        plainChunk =
            getChompedString (chompOneOrMore (\c -> not (c == quote || c == '&')))

        content : Parser String
        content =
            zeroOrMore (oneOf [ plainChunk, characterReference ])
                |> map String.concat
    in
    Parser.succeed identity
        |. quoteMark
        |= content
        |. quoteMark
|
||||
|
||||
|
||||
{-| Parse an attribute name and lowercase it.

A single leading `/` is consumed and dropped. A leading `=` is kept as part of
the name (but not lowercased, which makes no difference for `=`).

-}
attributeKey : Parser String
attributeKey =
    let
        isKeyChar : Char -> Bool
        isKeyChar c =
            not (isSpace c || List.member c [ '"', '\'', '>', '/', '=' ])

        -- Attribute can start with '/' but it's ignored.
        -- Backtrackable because the open tag can end with "/>".
        ignoredSlash : Parser ()
        ignoredSlash =
            oneOf
                [ backtrackable (chompIf (\c -> c == '/'))
                , succeed ()
                ]

        -- Attribute name can start with '=':
        -- https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
        -- e.g. <a =empty />
        leadingEquals : Parser String
        leadingEquals =
            oneOf
                [ map (\_ -> "=") (chompIf (\c -> c == '='))
                , succeed ""
                ]

        name : Parser String
        name =
            map String.toLower (getChompedString (chompOneOrMore isKeyChar))
    in
    succeed (\prefix rest -> prefix ++ rest)
        |. ignoredSlash
        |= leadingEquals
        |= name
|
||||
|
||||
|
||||
{-| Parse one attribute as a `( key, value )` pair.

A key without `=` (e.g. `<div foo>`) gets the empty string as its value.

-}
attribute : Parser ( String, String )
attribute =
    let
        value : Parser String
        value =
            oneOf
                [ attributeValueUnquoted -- <div foo=bar>
                , attributeValueQuoted '"' -- <div foo="bar">
                , attributeValueQuoted '\'' -- <div foo='bar'>
                ]

        assignedValue : Parser String
        assignedValue =
            succeed identity
                |. symbol "="
                |. ws
                |= value
    in
    succeed (\key val -> ( key, val ))
        |= attributeKey
        |. ws
        |= oneOf
            [ assignedValue
            , succeed "" -- <div foo>
            ]
        -- Reminder: Consume trailing whitespace so that following parsers
        -- don't need to consume whitespace and then need to backtrack
        |. ws
|
||||
|
||||
|
||||
{-| Parse one or more alphanumeric or `-` characters and lowercase them.
-}
tagName : Parser String
tagName =
    let
        isTagChar : Char -> Bool
        isTagChar c =
            c == '-' || Char.isAlphaNum c
    in
    map String.toLower (getChompedString (chompOneOrMore isTagChar))
|
||||
|
||||
|
||||
{-| Parse a closing tag `</name>` and require that its lowercased name equals
`expectedTag`; any other name is reported as a problem.
-}
closeTag : String -> Parser ()
closeTag expectedTag =
    let
        check : String -> Parser ()
        check found =
            if found /= expectedTag then
                problem ("found closing tag </" ++ found ++ "> but expected </" ++ expectedTag ++ ">")

            else
                succeed ()
    in
    succeed identity
        |. token "</"
        |= tagName
        |. ws
        |. token ">"
        |> andThen check
|
||||
|
||||
|
||||
{-| How an opening tag was terminated, as reported by `openTag`:
`NoClose` for a plain `>` and `SelfClose` for `/>`.
-}
type OpenTagEnd
    = NoClose
    | SelfClose
|
||||
|
||||
|
||||
{-| Parse a closing tag with any name, e.g. `</b>` or `</div >`.
-}
anyCloseTag : Parser ()
anyCloseTag =
    token "</"
        |. tagName
        |. ws
        |. token ">"
|
||||
|
||||
|
||||
{-| Parse a single node: text, comment, or element. When none of those match,
the next single character is returned as a `Text` node.
-}
node : Parser Node
node =
    let
        -- HACK: Ignore unmatched close tags like the browser does
        strayCloseTags : Parser (List ())
        strayCloseTags =
            zeroOrMore (backtrackable anyCloseTag)

        content : Parser Node
        content =
            oneOf
                [ text
                , comment
                , backtrackable element
                , map Text justOneChar
                ]
    in
    succeed identity
        |. strayCloseTags
        |= content
|
||||
|
||||
|
||||
{-| Parse `<!-- ... -->` into a `Comment` holding the text between the markers.
-}
comment : Parser Node
comment =
    let
        body : Parser String
        body =
            getChompedString (chompUntil "-->")
    in
    succeed Comment
        |. symbol "<!--"
        |= body
        |. symbol "-->"
|
||||
|
||||
|
||||
{-| Parse a `Text` node: either one character reference, or a non-empty run of
characters containing neither `<` nor `&`.
-}
text : Parser Node
text =
    let
        plainRun : Parser String
        plainRun =
            getChompedString (chompOneOrMore (\c -> not (c == '<' || c == '&')))
    in
    map Text
        (oneOf
            [ backtrackable characterReference
            , plainRun
            ]
        )
|
||||
|
||||
|
||||
{-| Parse an element, text, or comment node, refusing any element whose tag
name is listed in `tags`.
-}
notNode : List String -> Parser Node
notNode tags =
    let
        rejectListed : ( String, List ( String, String ), OpenTagEnd ) -> Parser ()
        rejectListed ( tag, _, _ ) =
            if List.member tag tags then
                problem ""

            else
                succeed ()

        -- Peek at the open tag first; only when it is acceptable is the
        -- element actually parsed.
        allowedElement : Parser Node
        allowedElement =
            lookAhead (andThen rejectListed openTag)
                |> andThen (\_ -> element)
    in
    oneOf
        [ allowedElement
        , text
        , comment
        ]
|
||||
|
||||
|
||||
{-| Parse an opening tag into `( tag name, attributes, how the tag ended )`.
-}
openTag : Parser ( String, List ( String, String ), OpenTagEnd )
openTag =
    let
        tagEnd : Parser OpenTagEnd
        tagEnd =
            oneOf
                [ map (\_ -> NoClose) (symbol ">")
                , map (\_ -> SelfClose) (symbol "/>")
                ]
    in
    succeed (\tag attrs end -> ( tag, attrs, end ))
        |. symbol "<"
        |. ws
        |= tagName
        |. ws
        |= zeroOrMore attribute
        |. ws
        |= tagEnd
|
||||
|
||||
|
||||
{-| Parse an element: an open tag followed, depending on the tag, by its
children and closing tag.

  - `/>` ends the element immediately with no children.
  - `<script>` content is read by `consumeJavascriptUntilClosingTag`.
  - Void tags have no children and no closing tag.
  - Autoclosing tags take children until a sibling that must not nest appears;
    their closing tag is optional.
  - All other tags take children until their closing tag, or until no further
    node can be parsed.

-}
element : Parser Node
element =
    let
        -- Autoclosing tag is automatically closed by an opening tag of the same name
        autoclosedChildren : String -> Parser (List Node)
        autoclosedChildren tag =
            let
                closers =
                    if tag == "head" then
                        [ tag, "body" ]

                    else
                        [ tag ]
            in
            succeed identity
                |= zeroOrMore (notNode closers)
                |. oneOf
                    [ backtrackable (closeTag tag)
                    , succeed ()
                    ]

        -- Normal elements parse all nodes as children until their closing tag
        normalChildren : String -> Parser (List Node)
        normalChildren tag =
            loop [] <|
                \reversed ->
                    oneOf
                        [ backtrackable (closeTag tag)
                            |> map (\_ -> Done (List.reverse reversed))
                        , map (\child -> Loop (mergeText child reversed)) (backtrackable node)
                        , succeed (Done (List.reverse reversed))
                        ]

        childrenOf : String -> Parser (List Node)
        childrenOf tag =
            if tag == "script" then
                consumeJavascriptUntilClosingTag

            else if isVoidTag tag then
                -- Void element expects no closing tag
                succeed []

            else if isAutoclosingTag tag then
                autoclosedChildren tag

            else
                normalChildren tag
    in
    openTag
        |> andThen
            (\( tag, attrs, end ) ->
                case end of
                    SelfClose ->
                        succeed (Element tag attrs [])

                    NoClose ->
                        map (Element tag attrs) (childrenOf tag)
            )
|
||||
|
||||
|
||||
|
||||
-- CHARACTER REFERENCE
|
||||
|
||||
|
||||
{-| Parse one or more hexadecimal digits (either case) into an integer.
-}
base16 : Parser Int
base16 =
    let
        toInt : String -> Parser Int
        toInt hex =
            case Hex.fromString (String.toLower hex) of
                Ok num ->
                    succeed num

                Err msg ->
                    problem msg
    in
    getChompedString (chompOneOrMore Char.isHexDigit)
        |> andThen toInt
|
||||
|
||||
|
||||
{-| Parse one or more decimal digits (0-9) into an integer.
-}
base10 : Parser Int
base10 =
    let
        toInt : String -> Parser Int
        toInt digits =
            case String.toInt digits of
                Just num ->
                    succeed num

                Nothing ->
                    problem "bad number"
    in
    getChompedString (chompOneOrMore Char.isDigit)
        |> andThen toInt
|
||||
|
||||
|
||||
{-| Parse the part of a numeric character reference that follows `&`:
`#` then decimal digits, or `#x` / `#X` then hexadecimal digits. The trailing
`;` is left for the caller.

Code point 0 and surrogate code points (U+D800 to U+DFFF) are replaced by
U+FFFD REPLACEMENT CHARACTER.

-}
numericCharacterReference : Parser String
numericCharacterReference =
    let
        codepoint : Parser Int
        codepoint =
            oneOf
                [ succeed identity
                    |. chompIf (\c -> c == 'x' || c == 'X')
                    |= base16
                , base10
                ]

        -- https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
        -- The replacement character is written as an escape so the literal
        -- cannot be corrupted by an encoding round-trip of this file.
        toChar : Int -> Char
        toChar code =
            if code == 0 then
                '\u{FFFD}'

            else if 0xD800 <= code && code <= 0xDFFF then
                -- Is surrogate
                '\u{FFFD}'

            else
                Char.fromCode code
    in
    succeed identity
        |. chompIf ((==) '#')
        |= map (toChar >> String.fromChar) codepoint
|
||||
|
||||
|
||||
{-| Parse the alphabetic name of a character reference (the part after `&`)
and decode it with `Html.CharRefs.decode`. An unknown name is returned
re-wrapped as `&name;`.
-}
namedCharacterReference : Parser String
namedCharacterReference =
    let
        decodeOrKeep : String -> String
        decodeOrKeep ref =
            case Html.CharRefs.decode ref of
                Just decoded ->
                    decoded

                Nothing ->
                    "&" ++ ref ++ ";"
    in
    map decodeOrKeep (getChompedString (chompOneOrMore Char.isAlpha))
|
||||
|
||||
|
||||
{-| Parse `&` followed by an optional numeric or named reference ending in `;`.

When no complete reference follows, only the `&` is consumed and returned.

-}
characterReference : Parser String
characterReference =
    let
        -- A reference only counts when it is terminated by ';'.
        terminated : Parser String -> Parser String
        terminated reference =
            backtrackable reference
                |. chompIf (\c -> c == ';')
    in
    succeed identity
        |. chompIf (\c -> c == '&')
        |= oneOf
            [ terminated numericCharacterReference
            , terminated namedCharacterReference
            , succeed "&"
            ]
|
||||
|
||||
|
||||
|
||||
-- SPECIAL ELEMENTS
|
||||
|
||||
|
||||
{-| True when `tag` is listed in `voidTags`.
-}
isVoidTag : String -> Bool
isVoidTag tag =
    List.any (\void -> void == tag) voidTags
|
||||
|
||||
|
||||
{-| Tag names treated as void elements (no children, no closing tag).
-}
voidTags : List String
voidTags =
    [ "area"
    , "base"
    , "br"
    , "col"
    , "embed"
    , "hr"
    , "img"
    , "input"
    , "link"
    , "meta"
    , "source"
    , "track"
    , "wbr"
    ]
|
||||
|
||||
|
||||
{-| True when `tag` is listed in `autoclosingTags`.
-}
isAutoclosingTag : String -> Bool
isAutoclosingTag tag =
    List.any (\autoclosing -> autoclosing == tag) autoclosingTags
|
||||
|
||||
|
||||
{-| Tag names whose closing tag is optional when parsing elements.
-}
autoclosingTags : List String
autoclosingTags =
    [ "body"
    , "colgroup"
    , "dd"
    , "dt"
    , "head"
    , "html"
    , "li"
    , "option"
    , "p"
    , "tbody"
    , "td"
    , "tfoot"
    , "th"
    , "thead"
    , "tr"
    ]
|
||||
|
||||
|
||||
|
||||
-- HELPERS
|
||||
|
||||
|
||||
{-| Chomp at least one character satisfying `predicate`, then as many more
as follow.
-}
chompOneOrMore : (Char -> Bool) -> Parser ()
chompOneOrMore predicate =
    succeed ()
        |. Parser.chompIf predicate
        |. Parser.chompWhile predicate
|
||||
|
||||
|
||||
{-| Loop step that keeps going only while `parser` consumes input.

Parsers like `spaces` and `chompWhile` succeed on zero characters, so looping
them naively never terminates. This step runs `parser`, compares the resulting
offset with the offset carried in the loop state, and stops when they match.

-}
ifProgress : Parser a -> Int -> Parser (Step Int ())
ifProgress parser offset =
    let
        decide : Int -> Step Int ()
        decide newOffset =
            if newOffset /= offset then
                Loop newOffset

            else
                Done ()

        offsetAfter : Parser Int
        offsetAfter =
            succeed identity
                |. parser
                |= getOffset
    in
    map decide offsetAfter
|
||||
|
||||
|
||||
{-| Run `parser` repeatedly, collecting results in order, until it no longer
matches. Yields an empty list when it never matches.
-}
zeroOrMore : Parser a -> Parser (List a)
zeroOrMore parser =
    let
        -- `reversed` holds the items collected so far, newest first.
        step reversed =
            oneOf
                [ map (\item -> Loop (item :: reversed)) parser
                , succeed (Done (List.reverse reversed))
                ]
    in
    Parser.loop [] step
|
||||
|
||||
|
||||
{-| Like `zeroOrMore`, but reports a problem mentioning `name` when `parser`
does not match even once.
-}
oneOrMore : String -> Parser a -> Parser (List a)
oneOrMore name parser =
    let
        finish reversed =
            case reversed of
                [] ->
                    problem ("expecting at least one " ++ name)

                _ ->
                    succeed (Done (List.reverse reversed))

        step reversed =
            oneOf
                [ map (\item -> Loop (item :: reversed)) parser
                , finish reversed
                ]
    in
    Parser.loop [] step
|
||||
|
||||
|
||||
{-| Create a parser that backtracks on success.

The outer `oneOf` picks the parser that `andThen identity` then runs:

  - If `parser` matches, the first inner alternative commits and fails. The
    surrounding `backtrackable` undoes that, so the outer `oneOf` falls through
    to `succeed ()`, which consumes nothing.
  - If `parser` does not match, the second inner alternative hands back
    `parser` itself, which is run again so the failure is reported.

-}
lookAhead : Parser a -> Parser ()
lookAhead parser =
    let
        attempt =
            backtrackable parser

        -- Turns a match of `parser` into a committed failure.
        failWhenMatched =
            attempt
                |> andThen (\_ -> commit ())
                |> andThen (\_ -> problem "")

        -- Reached only when `parser` did not match: hand it back to be re-run.
        rerun =
            succeed (map (\_ -> ()) attempt)
    in
    oneOf
        [ backtrackable (oneOf [ failWhenMatched, rerun ])
        , succeed (succeed ())
        ]
        |> andThen identity
|
||||
|
||||
|
||||
|
||||
-- JAVASCRIPT / <script>
|
||||
|
||||
|
||||
{-| Chomp inside a <script> tag until the next </script>.

This can't be implemented as `chompUntil "</script>"` because
the Javascript inside the script tag may contain the string "</script>".

For example: "<script>alert('</script>')</script>"

Comments and string-like literals are consumed as whole units, so a
"</script>" inside them does not end the script. Consecutive text chunks are
merged into a single `Text` node.

-}
consumeJavascriptUntilClosingTag : Parser (List Node)
consumeJavascriptUntilClosingTag =
    let
        -- Merge top-most text node unless HTML comment nodes are interleaved
        push : List Node -> Node -> Step (List Node) (List Node)
        push acc newNode =
            case ( acc, newNode ) of
                ( (Text accChunk) :: tail, Text newChunk ) ->
                    Loop (Text (accChunk ++ newChunk) :: tail)

                _ ->
                    Loop (newNode :: acc)

        step : List Node -> Parser (Step (List Node) (List Node))
        step acc =
            let
                asText =
                    Parser.map (Text >> push acc)
            in
            Parser.oneOf
                [ -- HTML comments are, albeit considered a bad practice recently,
                  -- allowed inside <script> to hide scripts from really ancient web browser
                  Parser.map (push acc) comment
                , asText (Parser.getChompedString (lineComment "//"))
                , asText (Parser.getChompedString (Parser.multiComment "/*" "*/" Parser.NotNestable))
                , asText (javaScriptStringLike '"')
                , asText (javaScriptStringLike '\'')
                , asText (javaScriptStringLike '`')
                , Parser.map (\() -> Done (List.reverse acc)) (closeTag "script")
                , asText (Parser.getChompedString (Parser.chompIf (always True)))
                ]
    in
    Parser.loop [] step
|
||||
|
||||
|
||||
{-| Parse a Javascript string-like literal delimited by `terminatorChar` and
return it including both delimiters.
-}
javaScriptStringLike : Char -> Parser String
javaScriptStringLike terminatorChar =
    let
        terminatorStr : String
        terminatorStr =
            String.fromChar terminatorChar

        -- Restoring original shape
        requote : String -> String
        requote chunk =
            terminatorStr ++ chunk ++ terminatorStr

        unquoted : Parser String
        unquoted =
            Parser.succeed identity
                |. Parser.token terminatorStr
                |= Parser.loop "" (stringHelp terminatorChar terminatorStr)
    in
    Parser.map requote unquoted
|
||||
|
||||
|
||||
{-| One loop step for the inside of a string-like literal: a backslash escape
(kept verbatim), the terminator (ends the loop), or a non-empty run of other
characters.
-}
stringHelp : Char -> String -> String -> Parser (Parser.Step String String)
stringHelp terminatorChar terminatorStr acc =
    let
        escaped =
            Parser.succeed (\char -> Parser.Loop (acc ++ "\\" ++ char))
                |. Parser.token "\\"
                |= justOneChar

        closing =
            Parser.map (\_ -> Parser.Done acc) (Parser.token terminatorStr)

        -- Orig code caused infinite loop with single terminator char <script>'</script>
        -- (it used `Parser.chompWhile`, which succeeds without consuming anything)
        plain =
            chompOneOrMore (\c -> not (c == '\\' || c == terminatorChar))
                |> Parser.getChompedString
                |> Parser.map (\chunk -> Parser.Loop (acc ++ chunk))
    in
    Parser.oneOf [ escaped, closing, plain ]
|
||||
]
|
||||
|
||||
|
||||
{-| Consume exactly one character, whatever it is, and return it as a string.
-}
justOneChar : Parser String
justOneChar =
    getChompedString (chompIf (\_ -> True))
|
||||
|
||||
|
||||
|
||||
-- RENDER
|
||||
|
||||
|
||||
{-| Render an opening tag such as `<a href="x">`.

The tag name and each attribute are separated by a single space. An attribute
with an empty value is rendered as the bare key (`<input disabled>`).
Attribute values are emitted verbatim inside double quotes, without escaping.

-}
openTagToString : String -> List ( String, String ) -> String
openTagToString tag attrs =
    let
        attrToString : ( String, String ) -> String
        attrToString ( k, v ) =
            if String.isEmpty v then
                k

            else
                k ++ "=\"" ++ v ++ "\""
    in
    -- Joining the tag name together with the attributes guarantees the
    -- separating space after the tag name, which was previously missing.
    String.join " " (("<" ++ tag) :: List.map attrToString attrs) ++ ">"
|
||||
|
||||
|
||||
{-| Convert an html node into a non-pretty string.

    nodeToString (Element "a" [] [ Text "hi" ])
        == "<a>hi</a>"

A void element without children is rendered as its opening tag only.

-}
nodeToString : Node -> String
nodeToString node_ =
    case node_ of
        Text s ->
            s

        Comment s ->
            "<!--" ++ s ++ "-->"

        Element tag attrs kids ->
            let
                opening =
                    openTagToString tag attrs
            in
            if isVoidTag tag && List.isEmpty kids then
                opening

            else
                String.concat
                    [ opening
                    , String.concat (List.map nodeToString kids)
                    , "</"
                    , tag
                    , ">"
                    ]
|
||||
|
||||
|
||||
{-| Convert multiple html nodes into a non-pretty string.

    nodesToString
        [ Element "a" [] [ Text "hi" ]
        , Element "div" [] [ Element "span" [] [] ]
        ]
        == "<a>hi</a><div><span></span></div>"

-}
nodesToString : List Node -> String
nodesToString nodes =
    String.concat (List.map nodeToString nodes)
|
||||
|
||||
|
||||
{-| Turn a single node into an Elm html node that Elm can render.

Comments are rendered as an empty text node.

-}
nodeToHtml : Node -> Html.Html msg
nodeToHtml node_ =
    case node_ of
        Text s ->
            Html.text s

        Comment _ ->
            Html.text ""

        Element tag attrs kids ->
            let
                toAttribute ( k, v ) =
                    Html.Attributes.attribute k v
            in
            Html.node tag
                (List.map toAttribute attrs)
                (List.map nodeToHtml kids)
|
||||
|
||||
|
||||
{-| Turn multiple html nodes into Elm html that Elm can render.

    view : Html Msg
    view =
        Html.div
            []
            ("<p>hello world</p>"
                |> Html.Parser.run
                |> Result.map Html.Parser.nodesToHtml
                |> Result.withDefault [ Html.text "parse error" ]
            )

-}
nodesToHtml : List Node -> List (Html.Html msg)
nodesToHtml nodes =
    List.map nodeToHtml nodes
|
||||
|
||||
|
||||
{-| Generate a pretty string for a single html node, starting at indent level 0.
-}
nodeToPrettyString : Node -> String
nodeToPrettyString =
    prettyNode_ 0
|
||||
|
||||
|
||||
{-| Pretty-print `node_` at the given indent level (four spaces per level).

Every non-empty result begins with a newline followed by the padding.
Whitespace-only text and empty comments render as the empty string.

-}
prettyNode_ : Int -> Node -> String
prettyNode_ indent node_ =
    let
        pad =
            String.repeat (indent * 4) " "
    in
    case node_ of
        Text s ->
            if String.isEmpty (String.trim s) then
                ""

            else
                let
                    -- Blank lines are dropped; remaining lines are re-indented.
                    nonBlankLines =
                        String.split "\n" s
                            |> List.filter (\line -> not (String.isEmpty (String.trim line)))
                in
                "\n" ++ pad ++ String.join ("\n" ++ pad) nonBlankLines

        Comment s ->
            if String.isEmpty s then
                ""

            else
                "\n" ++ pad ++ "<!--" ++ s ++ "-->"

        Element tag attrs kids ->
            let
                opening =
                    "\n" ++ pad ++ openTagToString tag attrs

                children =
                    String.concat (List.map (prettyNode_ (indent + 1)) kids)

                closing =
                    if not (List.isEmpty kids) then
                        -- Closing tag goes on its own padded line.
                        "\n" ++ pad ++ "</" ++ tag ++ ">"

                    else if isVoidTag tag then
                        ""

                    else
                        "</" ++ tag ++ ">"
            in
            opening ++ children ++ closing
|
||||
|
||||
|
||||
{-| Turn a node tree into a pretty-printed, indented html string.

Each node is rendered with `nodeToPrettyString` and the results are
concatenated in order. Every rendered element starts with a newline:

    nodesToPrettyString [ Element "a" [] [ Text "hi" ] ]
        == "\n<a>\n    hi\n</a>"

-}
nodesToPrettyString : List Node -> String
nodesToPrettyString nodes =
    String.concat (List.map nodeToPrettyString nodes)
|
||||
|
||||
|
||||
{-| Render the doctype declaration, with or without the legacy-compat string.
-}
doctypeToString : Bool -> String
doctypeToString legacyCompat =
    case legacyCompat of
        True ->
            "<!DOCTYPE html SYSTEM \"about:legacy-compat\">"

        False ->
            "<!DOCTYPE html>"
|
||||
|
||||
|
||||
{-| Convert a document into a string: the doctype declaration, a newline, and
then the root html node rendered with `nodeToString`.
-}
documentToString : Document -> String
documentToString doc =
    String.join "\n"
        [ doctypeToString doc.legacyCompat
        , nodeToString doc.root
        ]
|
||||
|
||||
|
||||
{-| Convert a document into a pretty, indented string: the doctype
declaration, a newline, and then the root node rendered with
`nodeToPrettyString`.
-}
documentToPrettyString : Document -> String
documentToPrettyString doc =
    String.join "\n"
        [ doctypeToString doc.legacyCompat
        , nodeToPrettyString doc.root
        ]
|
693
tests/ParserTests.elm
Normal file
693
tests/ParserTests.elm
Normal file
@ -0,0 +1,693 @@
|
||||
module ParserTests exposing (..)
|
||||
|
||||
import Expect exposing (Expectation)
|
||||
import Fuzz exposing (Fuzzer, int, list, string)
|
||||
import Html.Parser exposing (Document, Node(..))
|
||||
import Parser exposing (DeadEnd)
|
||||
import Test exposing (..)
|
||||
|
||||
|
||||
{-| Build one test per `( name, html, expected )` case by running
`Html.Parser.runDocument` on `html`.

  - When `expected` is an `Err`, any parse error counts as a pass; the
    contents of the error list are not compared.
  - Otherwise the parse result must equal `expected`.

The test description is `name ++ ": " ++ html`.

-}
testDoc : List ( String, String, Result (List DeadEnd) Document ) -> List Test
testDoc cases =
    let
        toTest ( name, html, expected ) =
            test (name ++ ": " ++ html) <|
                \_ ->
                    case ( expected, Html.Parser.runDocument html ) of
                        ( Err _, Err _ ) ->
                            Expect.pass

                        ( _, actual ) ->
                            -- `Expect.equal` takes the expected value first;
                            -- piping keeps failure reports the right way up.
                            actual |> Expect.equal expected
    in
    List.map toTest cases
|
||||
|
||||
|
||||
{-| Build one test per `( name, html, expected )` case: parse `html` with
`Html.Parser.run`, render the nodes back with `Html.Parser.nodesToString`,
and compare the rendered string against `expected`.

  - When `expected` is an `Err`, any parse error counts as a pass; the
    contents of the error list are not compared.
  - Otherwise the rendered result must equal `expected`.

The test description is the name followed by the quoted html.

-}
testStringRoundtrip : List ( String, String, Result (List DeadEnd) String ) -> List Test
testStringRoundtrip cases =
    let
        toTest ( name, html, expected ) =
            test (name ++ " \"" ++ html ++ "\"") <|
                \_ ->
                    let
                        actual =
                            Html.Parser.run html
                                |> Result.map Html.Parser.nodesToString
                    in
                    case ( expected, actual ) of
                        ( Err _, Err _ ) ->
                            Expect.pass

                        _ ->
                            -- `Expect.equal` takes the expected value first;
                            -- piping keeps failure reports the right way up.
                            actual |> Expect.equal expected
    in
    List.map toTest cases
|
||||
|
||||
|
||||
{-| Build one test per `( name, html, expected )` case by running
`Html.Parser.run` on `html`.

  - When `expected` is an `Err`, any parse error counts as a pass; the
    contents of the error list are not compared.
  - Otherwise the parsed node list must equal `expected`.

The test description is the name followed by the quoted html.

-}
testAll : List ( String, String, Result (List DeadEnd) (List Node) ) -> List Test
testAll cases =
    let
        toTest ( name, html, expected ) =
            test (name ++ " \"" ++ html ++ "\"") <|
                \_ ->
                    case ( expected, Html.Parser.run html ) of
                        ( Err _, Err _ ) ->
                            Expect.pass

                        ( _, actual ) ->
                            -- `Expect.equal` takes the expected value first;
                            -- piping keeps failure reports the right way up.
                            actual |> Expect.equal expected
    in
    List.map toTest cases
|
||||
|
||||
|
||||
{-| Parse-then-stringify cases: each input is parsed and rendered back to a
string, which must match the expected html (tag names come back lower-cased).
-}
renderStringTests : Test
renderStringTests =
    describe "stringify tests" <|
        testStringRoundtrip
            [ ( "basic", "<a></a>", Ok "<a></a>" )
            , ( "basic", "<a>foo</a>", Ok "<a>foo</a>" )
            , ( "basic", "<a> foo </a>", Ok "<a> foo </a>" )
            , ( "basic", "<a><b><c>foo</c></b></a>", Ok "<a><b><c>foo</c></b></a>" )
            , ( "basic", "<A><B><C>foo</C></B></A>", Ok "<a><b><c>foo</c></b></a>" )
            , ( "basic", "<a><!--a-->b<!--c--></a>", Ok "<a><!--a-->b<!--c--></a>" )
            ]
|
||||
|
||||
|
||||
{-| Text containing unescaped `<` / `>` that does not form a tag must be
kept as a plain text node.
-}
ambiguousTextTests : Test
ambiguousTextTests =
    describe "ambiguous text node parsing" <|
        testAll
            [ ( "basic1", "<div>:></div>", Ok [ Element "div" [] [ Text ":>" ] ] )
            , ( "basic2", "<div><:</div>", Ok [ Element "div" [] [ Text "<:" ] ] )
            , ( "basic3", "<:", Ok [ Text "<:" ] )
            , ( "basic4", ":>", Ok [ Text ":>" ] )
            ]
|
||||
|
||||
|
||||
{-| A void element (`<hr>`) never takes children: following text becomes a
sibling, and a stray `</hr>` closing tag is dropped.
-}
voidTests : Test
voidTests =
    describe "void nodes" <|
        testAll
            [ ( "without closing tag", "<hr>a", Ok [ Element "hr" [] [], Text "a" ] )
            , ( "with closing tag", "<hr>a</hr>", Ok [ Element "hr" [] [], Text "a" ] )
            ]
|
||||
|
||||
|
||||
{-| Whole-document parsing: doctype variants (including the
`about:legacy-compat` form) and the implicit `<html>` root element.
-}
documentTests : Test
documentTests =
    describe "document parsing" <|
        testDoc
            [ ( "", "<!doctype html>", Ok (Document False (Element "html" [] [])) )
            , ( "", "<!DOCTYPE HTML>", Ok (Document False (Element "html" [] [])) )
            , ( "", "<!doctype htmlSYSTEM \"about:legacy-compat\">", Err [] )
            , ( "", "<!doctype html SYSTEM \"about:legacy-compat\">", Ok (Document True (Element "html" [] [])) )
            , ( "", "<!doctype html sYsTem 'about:legacy-compat'>", Ok (Document True (Element "html" [] [])) )
            , ( ""
              , "<!doctype html><head>a<body>b"
              , Ok
                    (Document False
                        (Element "html"
                            []
                            [ Element "head" [] [ Text "a" ]
                            , Element "body" [] [ Text "b" ]
                            ]
                        )
                    )
              )
            , ( ""
              , "<!doctype html><meta charset=\"utf-8\">"
              , Ok
                    (Document False
                        (Element "html"
                            []
                            [ Element "meta" [ ( "charset", "utf-8" ) ] []
                            ]
                        )
                    )
              )
            ]
|
||||
|
||||
|
||||
{-| Comment nodes at top level and inside elements, including text that
trails a closed comment.
-}
basicCommentTests : Test
basicCommentTests =
    describe "basic comment parsing" <|
        testAll
            [ ( "basic1", "<!---->", Ok [ Comment "" ] )
            , ( "basic2", "<!-- -->", Ok [ Comment " " ] )
            , ( "basic3", "<!--x-->", Ok [ Comment "x" ] )
            , ( "basic4", "<a><!--x--></a>", Ok [ Element "a" [] [ Comment "x" ] ] )
            , ( "basic5", "<!--a--><a><!--b--></a><!--c-->", Ok [ Comment "a", Element "a" [] [ Comment "b" ], Comment "c" ] )
            , ( "basic6", "<!---->-->", Ok [ Comment "", Text "-->" ] )
            ]
|
||||
|
||||
|
||||
{-| Basic element parsing: case-insensitive tag names, whitespace inside
closing tags, and text surrounding elements.
-}
basicElementTests : Test
basicElementTests =
    describe "basic element parsing" <|
        testAll
            [ ( "my-basic1", "<a>:></a>", Ok [ Element "a" [] [ Text ":>" ] ] )

            -- , ( "my-basic2", "<a><:</a>", Ok [ Element "a" [] [ Text "<:" ] ] )
            -- Tests from hecrj/elm-html-parser
            , ( "basic1", "<a></a>", Ok [ Element "a" [] [] ] )
            , ( "basic2", "<a></a >", Ok [ Element "a" [] [] ] )
            , ( "basic3", "<A></A >", Ok [ Element "a" [] [] ] )
            , ( "basic4", " <a></a> ", Ok [ Text " ", Element "a" [] [], Text " " ] )
            , ( "basic5", "a<a></a>b", Ok [ Text "a", Element "a" [] [], Text "b" ] )
            , ( "basic6", "<A></A>", Ok [ Element "a" [] [] ] )
            ]
|
||||
|
||||
|
||||
{-| Attribute syntax variants: unquoted, single-quoted, double-quoted and
key-only attributes. Note that in `a=b/>` the slash belongs to the unquoted
value.
-}
basicAttributeTests : Test
basicAttributeTests =
    describe "basic attribute parsing" <|
        testAll
            [ ( "unquoted1", "<div a=b/></div>", Ok [ Element "div" [ ( "a", "b/" ) ] [] ] )
            , ( "unquoted2", "<div a=b />", Ok [ Element "div" [ ( "a", "b" ) ] [] ] )
            , ( "single-quoted", "<div a='b'/>", Ok [ Element "div" [ ( "a", "b" ) ] [] ] )
            , ( "double-quoted", "<div a=\"b\"/>", Ok [ Element "div" [ ( "a", "b" ) ] [] ] )
            , ( "key-only1", "<div a></div>", Ok [ Element "div" [ ( "a", "" ) ] [] ] )
            , ( "key-only2", "<div a/>", Ok [ Element "div" [ ( "a", "" ) ] [] ] )
            , ( "everything"
              , "<div a=b c='d' e=\"f\" g/>"
              , Ok
                    [ Element "div"
                        [ ( "a", "b" )
                        , ( "c", "d" )
                        , ( "e", "f" )
                        , ( "g", "" )
                        ]
                        []
                    ]
              )
            ]
|
||||
|
||||
|
||||
{-| Elements that must not nest inside themselves (`<p>`, `<li>`, `<head>`)
are closed automatically when a sibling start tag is encountered.
-}
autoclosingTests : Test
autoclosingTests =
    describe "autoclosing elements" <|
        testAll
            [ ( "p-basic1", "<p>a<p>b", Ok [ Element "p" [] [ Text "a" ], Element "p" [] [ Text "b" ] ] )
            , ( "li-basic", "<li><li>", Ok [ Element "li" [] [], Element "li" [] [] ] )
            , ( "li-basic", "<ul><li><li></ul>", Ok [ Element "ul" [] [ Element "li" [] [], Element "li" [] [] ] ] )
            , ( "li-basic", "<li>a<li>b</li>", Ok [ Element "li" [] [ Text "a" ], Element "li" [] [ Text "b" ] ] )
            , ( "li-comment"
              , "<li>a<!--c--><li>b<!--d--></li>"
              , Ok
                    [ Element "li" [] [ Text "a", Comment "c" ]
                    , Element "li" [] [ Text "b", Comment "d" ]
                    ]
              )

            -- TODO
            -- , ( "li-comment-backtrack"
            --   , "<li>a<!-c<li>b<!-d</li>"
            --   , Ok
            --         [ Element "li" [] [ Text "a<!-c" ]
            --         , Element "li" [] [ Text "b<!-d" ]
            --         ]
            --   )
            , ( "li-basic", "<li>a</li><li>b", Ok [ Element "li" [] [ Text "a" ], Element "li" [] [ Text "b" ] ] )
            , ( "li-basic1", "<li>a</li><li>b</li>", Ok [ Element "li" [] [ Text "a" ], Element "li" [] [ Text "b" ] ] )
            , ( "li-basic2"
              , "<li>a<li>b</li>c</li>"
              , Ok
                    [ Element "li" [] [ Text "a" ]
                    , Element "li" [] [ Text "b" ]
                    , Text "c"
                    ]
              )
            , ( "li-basic3"
              , "<li>a<ul><li>b</li></ul>c</li>"
              , Ok
                    [ Element "li"
                        []
                        [ Text "a"
                        , Element "ul"
                            []
                            [ Element "li" [] [ Text "b" ]
                            ]
                        , Text "c"
                        ]
                    ]
              )

            -- Not valid html, but the parser should still parse it.
            , ( "head1"
              , "<head>a<head>b"
              , Ok
                    [ Element "head" [] [ Text "a" ]
                    , Element "head" [] [ Text "b" ]
                    ]
              )

            -- Unlike the previous test, here's an example of where the parser must invoke the html5
            -- spec only to disambiguate where <body> should be a child vs. sibling
            -- of the unended <head> element.
            , ( "head2"
              , "<head><title>hello</title><body>"
              , Ok
                    [ Element "head"
                        []
                        [ Element "title" [] [ Text "hello" ] ]
                    , Element "body" [] []
                    ]
              )
            ]
|
||||
|
||||
|
||||
{-| Nested element structures, including nested `<ul>` lists written both
with and without explicit `</li>` closing tags (both must give the same tree).
-}
basicNestingTests : Test
basicNestingTests =
    describe "nested elements" <|
        testAll
            [ ( "abc"
              , "<a><b><c></c></b></a>"
              , Ok
                    [ Element "a"
                        []
                        [ Element "b"
                            []
                            [ Element "c" [] []
                            ]
                        ]
                    ]
              )
            , ( "nested <ul> where all <li> are closed </li>"
              , """<ul><li>a</li><li>b<ul><li>x</li><li>y</li></ul></li><li>c</li></ul>"""
              , Ok
                    [ Element "ul"
                        []
                        [ Element "li" [] [ Text "a" ]
                        , Element "li"
                            []
                            [ Text "b"
                            , Element "ul"
                                []
                                [ Element "li" [] [ Text "x" ]
                                , Element "li" [] [ Text "y" ]
                                ]
                            ]
                        , Element "li" [] [ Text "c" ]
                        ]
                    ]
              )
            , ( "nested <ul> where zero <li> are closed with </li>"
              , """<ul><li>a<li>b<ul><li>x<li>y</ul><li>c</ul>"""
              , Ok
                    [ Element "ul"
                        []
                        [ Element "li" [] [ Text "a" ]
                        , Element "li"
                            []
                            [ Text "b"
                            , Element "ul"
                                []
                                [ Element "li" [] [ Text "x" ]
                                , Element "li" [] [ Text "y" ]
                                ]
                            ]
                        , Element "li" [] [ Text "c" ]
                        ]
                    ]
              )
            ]
|
||||
|
||||
|
||||
{-| `<hr>` written with a closing tag, bare, and self-closed must all parse
to the same childless element.
-}
voidElementTests : Test
voidElementTests =
    let
        hr =
            Element "hr" [] []
    in
    describe "void elements"
        (testAll
            [ ( "invalid", "<hr></hr>", Ok [ hr ] )
            , ( "valid1", "<hr>", Ok [ hr ] )
            , ( "valid2", "<hr/>", Ok [ hr ] )
            ]
        )
|
||||
|
||||
|
||||
{-| Text node parsing, including decoding of named, decimal and hexadecimal
character references.

The `decode*` inputs are written as character references (`&amp;`, `&#38;`,
`&nbsp;`, ...) so that the decoding path is actually exercised; feeding the
already-decoded character would make several cases identical to `decode1`.

-}
textNodeTests : Test
textNodeTests =
    describe "text node parsing" <|
        testAll
            [ ( "empty", "", Ok [] )
            , ( "space", " ", Ok [ Text " " ] )
            , ( "basic1", "1", Ok [ Text "1" ] )
            , ( "basic2", "a", Ok [ Text "a" ] )
            , ( "basic3", "1a", Ok [ Text "1a" ] )
            , ( "basic4", "^", Ok [ Text "^" ] )

            -- A bare ampersand is left untouched.
            , ( "decode1", "&", Ok [ Text "&" ] )
            , ( "decode2", "&amp;", Ok [ Text "&" ] )
            , ( "decode3", "&lt;", Ok [ Text "<" ] )
            , ( "decode4", "&gt;", Ok [ Text ">" ] )
            , ( "decode5", "&apos;", Ok [ Text "'" ] )
            , ( "decode6", "&#38;", Ok [ Text "&" ] )
            , ( "decode7", "&#x26;", Ok [ Text "&" ] )
            , ( "decode8", "&#x3E;", Ok [ Text ">" ] )
            , ( "decode9", "&#383;", Ok [ Text "ſ" ] )
            , ( "decodeA", "&nbsp;", Ok [ Text "\u{00A0}" ] )
            , ( "decodeB", "&nbsp;&nbsp;", Ok [ Text "\u{00A0}\u{00A0}" ] )
            , ( "decodeC", "a&nbsp;b", Ok [ Text "a\u{00A0}b" ] )
            , ( "decodeD", "a&nbsp;&nbsp;b", Ok [ Text "a\u{00A0}\u{00A0}b" ] )
            , ( "decodeE", """<img alt="&lt;">""", Ok [ Element "img" [ ( "alt", "<" ) ] [] ] )
            , ( "decodeF", "&#0038;", Ok [ Text "&" ] )
            ]
|
||||
|
||||
|
||||
{-| `<script>` content handling. The expected values encode browser-like
behaviour: a nested `<script>` start tag stays text, while `</script>` inside
a JS string literal or JS comment is kept as part of the script text.
-}
scriptTests : Test
scriptTests =
    let
        script attrs kids =
            Ok [ Element "script" attrs kids ]
    in
    describe "<script> node"
        (testAll
            [ ( "basic1", "<script></script>", script [] [] )
            , ( "basic2", "<script>foo</script>", script [] [ Text "foo" ] )

            -- Copy browser behavior
            , ( "basic3", "<script></script></script>", script [] [] )
            , ( "basic4", "<script><script></script>", script [] [ Text "<script>" ] )
            , ( "attrs1", "<script src=index.js></script>", script [ ( "src", "index.js" ) ] [] )
            , ( "js1", "<script>'</script>'</script>", script [] [ Text "'</script>'" ] )
            , ( "js2", "<script>\"</script>\"</script>", script [] [ Text "\"</script>\"" ] )
            , ( "js3", "<script>`</script>`</script>", script [] [ Text "`</script>`" ] )
            , ( "js4", "<script>x < 42 || x > 42</script>", script [] [ Text "x < 42 || x > 42" ] )
            , ( "comment1", "<script>\n//</script>\n</script>", script [] [ Text "\n//</script>\n" ] )
            , ( "comment2", "<script>\n/*\n</script>\n*/\n</script>", script [] [ Text "\n/*\n</script>\n*/\n" ] )
            ]
        )
|
||||
|
||||
|
||||
|
||||
-- TESTS FROM hecrj/elm-html-parser
|
||||
|
||||
|
||||
{-| Test body asserting that `Html.Parser.run s` succeeds with exactly the
node list `astList`.
-}
testParseAll : String -> List Node -> (() -> Expectation)
testParseAll s astList _ =
    Html.Parser.run s
        |> Expect.equal (Ok astList)
|
||||
|
||||
|
||||
{-| Test body asserting that `input` parses to exactly one node equal to
`expected`. A parse error fails the test and reports the dead ends.
-}
testParse : String -> Node -> (() -> Expectation)
testParse input expected =
    \_ ->
        case Html.Parser.run input of
            Err deadEnds ->
                -- NOTE(review): `Parser.deadEndsToString` in elm/parser appears
                -- to return only a fixed placeholder string, so dump the dead
                -- ends themselves to keep the failure message informative.
                Expect.fail (Debug.toString deadEnds)

            Ok actual ->
                -- `Expect.equal` takes the expected value first.
                actual |> Expect.equal [ expected ]
|
||||
|
||||
|
||||
{-| Element parsing cases: plain, nested, start-only (void) and self-closing
tags, plus a custom-element tag name.
-}
hecrjNodeTests : Test
hecrjNodeTests =
    describe "Node"
        [ test "basic1" <| testParse "<a></a>" (Element "a" [] [])
        , test "basic2" <| testParse "<a></a >" (Element "a" [] [])
        , test "basic3" <| testParse "<A></A >" (Element "a" [] [])
        , test "basic4" <| testParseAll " <a></a> " [ Text " ", Element "a" [] [], Text " " ]
        , test "basic5" <| testParseAll "a<a></a>b" [ Text "a", Element "a" [] [], Text "b" ]
        , test "basic6" <| testParse "<A></A>" (Element "a" [] [])
        , test "basic7" <| testParse "<a>a</a>" (Element "a" [] [ Text "a" ])
        , test "basic8" <| testParse "<a> a </a>" (Element "a" [] [ Text " a " ])
        , test "basic10" <| testParse "<br>" (Element "br" [] [])
        , test "basic11" <| testParse "<a><a></a></a>" (Element "a" [] [ Element "a" [] [] ])
        , test "basic12" <| testParse "<a> <a> </a> </a>" (Element "a" [] [ Text " ", Element "a" [] [ Text " " ], Text " " ])
        , test "basic13" <| testParse "<a> <br> </a>" (Element "a" [] [ Text " ", Element "br" [] [], Text " " ])
        , test "basic14" <| testParse "<a><a></a><a></a></a>" (Element "a" [] [ Element "a" [] [], Element "a" [] [] ])
        , test "basic15" <| testParse "<a><a><a></a></a></a>" (Element "a" [] [ Element "a" [] [ Element "a" [] [] ] ])
        , test "basic16" <| testParse "<a><a></a><b></b></a>" (Element "a" [] [ Element "a" [] [], Element "b" [] [] ])
        , test "basic17" <| testParse "<h1></h1>" (Element "h1" [] [])
        , test "start-only-tag1" <| testParse "<br>" (Element "br" [] [])
        , test "start-only-tag2" <| testParse "<BR>" (Element "br" [] [])
        , test "start-only-tag3" <| testParse "<br >" (Element "br" [] [])
        , test "start-only-tag4" <| testParse "<BR >" (Element "br" [] [])
        , test "start-only-tag5" <| testParse "<a> <br> </a>" (Element "a" [] [ Text " ", Element "br" [] [], Text " " ])
        , test "start-only-tag6" <| testParse "<a><br><br></a>" (Element "a" [] [ Element "br" [] [], Element "br" [] [] ])
        , test "start-only-tag7" <| testParse "<a><br><img><hr><meta></a>" (Element "a" [] [ Element "br" [] [], Element "img" [] [], Element "hr" [] [], Element "meta" [] [] ])
        , test "start-only-tag8" <| testParse "<a>foo<br>bar</a>" (Element "a" [] [ Text "foo", Element "br" [] [], Text "bar" ])
        , test "self-closing-tag1" <| testParse "<br/>" (Element "br" [] [])
        , test "self-closing-tag2" <| testParse "<br />" (Element "br" [] [])
        , test "self-closing-tag3" <| testParse "<link href=\"something\" rel=\"something else\"/>" (Element "link" [ ( "href", "something" ), ( "rel", "something else" ) ] [])
        , test "web-component-tag" <| testParse "<a-web-component></a-web-component>" (Element "a-web-component" [] [])
        ]
|
||||
|
||||
|
||||
{-| Attribute parsing cases: quoting styles, case-insensitive names, valueless
attributes, namespaced names and attributes split across lines.

`basic6` uses a bare `&` inside the value while `basic7` uses `&amp;`; both
must yield the same decoded value.

-}
hecrjAttributeTests : Test
hecrjAttributeTests =
    describe "Attribute"
        [ test "basic1" (testParse """<a href="example.com"></a>""" (Element "a" [ ( "href", "example.com" ) ] []))
        , test "basic2" (testParse """<a href='example.com'></a>""" (Element "a" [ ( "href", "example.com" ) ] []))
        , test "basic3" (testParse """<a href=example.com></a>""" (Element "a" [ ( "href", "example.com" ) ] []))
        , test "basic4" (testParse """<a HREF=example.com></a>""" (Element "a" [ ( "href", "example.com" ) ] []))
        , test "basic5" (testParse """<a href=bare></a>""" (Element "a" [ ( "href", "bare" ) ] []))
        , test "basic6" (testParse """<a href="example.com?a=b&c=d"></a>""" (Element "a" [ ( "href", "example.com?a=b&c=d" ) ] []))
        , test "basic7" (testParse """<a href="example.com?a=b&amp;c=d"></a>""" (Element "a" [ ( "href", "example.com?a=b&c=d" ) ] []))
        , test "basic8" (testParse """<input max=100 min = 10.5>""" (Element "input" [ ( "max", "100" ), ( "min", "10.5" ) ] []))
        , test "basic9" (testParse """<input disabled>""" (Element "input" [ ( "disabled", "" ) ] []))
        , test "basic10" (testParse """<input DISABLED>""" (Element "input" [ ( "disabled", "" ) ] []))
        , test "basic11" (testParse """<meta http-equiv=Content-Type>""" (Element "meta" [ ( "http-equiv", "Content-Type" ) ] []))
        , test "basic12" (testParse """<input data-foo2="a">""" (Element "input" [ ( "data-foo2", "a" ) ] []))
        , test "basic13" (testParse """<html xmlns:v="urn:schemas-microsoft-com:vml"></html>""" (Element "html" [ ( "xmlns:v", "urn:schemas-microsoft-com:vml" ) ] []))
        , test "basic14" (testParse """<link rel=stylesheet
href="">""" (Element "link" [ ( "rel", "stylesheet" ), ( "href", "" ) ] []))

        -- Invalid attribute names shouldn't be parsed: https://github.com/elm/html/issues/46
        , test "invalid character" (testParse """<p\u{00A0} ></p>""" (Element "p" [] []))
        ]
|
||||
|
||||
|
||||
{-| `<script>` / `<style>` cases: case-insensitive tag names, attributes,
raw `<` / `>` in the body, and html comments inside the body.
-}
hecrjScriptTests : Test
hecrjScriptTests =
    describe "Script"
        [ test "script1" <| testParse """<script></script>""" (Element "script" [] [])
        , test "script2" <| testParse """<SCRIPT></SCRIPT>""" (Element "script" [] [])
        , test "script3" <| testParse """<script src="script.js">foo</script>""" (Element "script" [ ( "src", "script.js" ) ] [ Text "foo" ])
        , test "script4" <| testParse """<script>var a = 0 < 1; b = 1 > 0;</script>""" (Element "script" [] [ Text "var a = 0 < 1; b = 1 > 0;" ])
        , test "script5" <| testParse """<script><!----></script>""" (Element "script" [] [ Comment "" ])
        , test "script6" <| testParse """<script>a<!--</script><script>-->b</script>""" (Element "script" [] [ Text "a", Comment "</script><script>", Text "b" ])
        , test "style" <| testParse """<style>a<!--</style><style>-->b</style>""" (Element "style" [] [ Text "a", Comment "</style><style>", Text "b" ])
        ]
|
||||
|
||||
|
||||
{-| Comment parsing cases: empty comments, markup inside comments, a nested
comment opener, and whitespace (tab, carriage return, newline) in the body.
-}
hecrjCommentTests : Test
hecrjCommentTests =
    describe "Comment"
        [ test "basic1" <| testParse """<!---->""" (Comment "")
        , test "basic2" <| testParse """<!--<div></div>-->""" (Comment "<div></div>")
        , test "basic3" <| testParse """<div><!--</div>--></div>""" (Element "div" [] [ Comment "</div>" ])
        , test "basic4" <| testParse """<!--<!---->""" (Comment "<!--")

        -- The continuation line starts with exactly one space, matching the
        -- "\n " at the end of the expected comment text.
        , test "basic5" <| testParse """<!--foo\t\u{000D}
 -->""" (Comment "foo\t\u{000D}\n ")
        ]
|
||||
|
||||
|
||||
{-| A self-closing `<path />` inside `<svg>` must become a childless child
element. Attribute names come back lower-cased (`viewBox` -> `viewbox`).
-}
svgTests : Test
svgTests =
    test "self-closing svg path"
        (testParse
            """<svg viewBox="0 0 20 20" fill="currentColor" aria-hidden="true"><path fill-rule="evenodd" d="1 2 3" clip-rule="evenodd" /></svg>"""
            (Element "svg"
                [ ( "viewbox", "0 0 20 20" )
                , ( "fill", "currentColor" )
                , ( "aria-hidden", "true" )
                ]
                [ Element "path"
                    [ ( "fill-rule", "evenodd" )
                    , ( "d", "1 2 3" )
                    , ( "clip-rule", "evenodd" )
                    ]
                    []
                ]
            )
        )
|
||||
|
||||
|
||||
|
||||
-- https://github.com/taoqf/node-html-parser/blob/main/test/tests/html.js
|
||||
|
||||
|
||||
{-| Cases adapted from the taoqf/node-html-parser test suite: mixed-case
tags, void elements, comments, and script/style bodies.
-}
nodeHtmlParserTests : Test
nodeHtmlParserTests =
    describe "taoqf/node-html-parser tests" <|
        testAll
            [ ( "test1"
              , "<p id=\"id\"><a class='cls'>Hello</a><ul><li><li></ul><span></span></p>"
              , Ok
                    [ Element "p"
                        [ ( "id", "id" ) ]
                        [ Element "a"
                            [ ( "class", "cls" ) ]
                            [ Text "Hello"
                            ]
                        , Element "ul"
                            []
                            [ Element "li" [] []
                            , Element "li" [] []
                            ]
                        , Element "span" [] []
                        ]
                    ]
              )
            , ( "test2"
              , "<DIV><a><img/></A><p></P></div>"
              , Ok
                    [ Element "div"
                        []
                        [ Element "a"
                            []
                            [ Element "img" [] []
                            ]
                        , Element "p" [] []
                        ]
                    ]
              )
            , ( "test3"
              , "<div><a><img/></a><p></p></div>"
              , Ok
                    [ Element "div"
                        []
                        [ Element "a"
                            []
                            [ Element "img" [] []
                            ]
                        , Element "p" [] []
                        ]
                    ]
              )
            , ( "test4"
              , "<div><a><!-- my comment --></a></div>"
              , Ok
                    [ Element "div"
                        []
                        [ Element "a"
                            []
                            [ Comment " my comment "
                            ]
                        ]
                    ]
              )
            , ( "test5"
              , "<div><!--<a></a>--></div>"
              , Ok
                    [ Element "div"
                        []
                        [ Comment "<a></a>"
                        ]
                    ]
              )
            , ( "test6"
              , "<picture><source srcset=\"/images/example-1.jpg 1200w, /images/example-2.jpg 1600w\" sizes=\"100vw\"><img src=\"/images/example.jpg\" alt=\"Example\"/></picture>"
              , Ok
                    [ Element "picture"
                        []
                        [ Element "source"
                            [ ( "srcset", "/images/example-1.jpg 1200w, /images/example-2.jpg 1600w" )
                            , ( "sizes", "100vw" )
                            ]
                            []
                        , Element "img" [ ( "src", "/images/example.jpg" ), ( "alt", "Example" ) ] []
                        ]
                    ]
              )
            , ( "test7"
              , "<script>1</script><style>2&</style>"
              , Ok
                    [ Element "script" [] [ Text "1" ]
                    , Element "style" [] [ Text "2&" ]
                    ]
              )
            ]
|
||||
|
||||
|
||||
|
||||
-- JSOUP TESTS
|
||||
-- https://github.com/jhy/jsoup/blob/master/src/test/java/org/jsoup/parser/AttributeParseTest.java
|
||||
|
||||
|
||||
{-| Attribute-parsing cases adapted from jsoup's `AttributeParseTest`:
irregular spacing, CR/LF between tokens, names starting with `=`, ampersands
inside values, and stray slashes between attributes.
-}
jsoupAttributeTests : Test
jsoupAttributeTests =
    describe "(from jsoup) attributes" <|
        testAll
            [ ( "parses rough attribute string"
              , "<a id=\"123\" class=\"baz = 'bar'\" style = 'border: 2px'qux zim foo = 12 mux=18 />"
              , Ok
                    [ Element "a"
                        [ ( "id", "123" )
                        , ( "class", "baz = 'bar'" )
                        , ( "style", "border: 2px" )
                        , ( "qux", "" )
                        , ( "zim", "" )
                        , ( "foo", "12" )
                        , ( "mux", "18" )
                        ]
                        []
                    ]
              )
            , ( "handles newlines and returns"
              , -- "<a\r\nfoo='bar\r\nqux'\r\nbar\r\n=\r\ntwo>One</a>"
                "<a\u{000D}\nfoo='bar\u{000D}\nqux'\u{000D}\nbar\u{000D}\n=\u{000D}\ntwo>One</a>"
              , Ok
                    [ Element "a"
                        [ ( "foo", "bar\u{000D}\nqux" )
                        , ( "bar", "two" )
                        ]
                        [ Text "One" ]
                    ]
              )
            , ( "parses empty string", "<a />", Ok [ Element "a" [] [] ] )

            -- https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
            , ( "can start with '='"
              , "<a =empty />"
              , Ok [ Element "a" [ ( "=empty", "" ) ] [] ]
              )
            , ( "strict attribute unescapes"
              , -- "<a id=1 href='?foo=bar&mid<=true'>One</a> <a id=2 href='?foo=bar<qux&lg=1'>Two</a>"
                "<a id=1 href='?foo=bar&mid<=true'>One</a> <a id=2 href='?foo=bar<qux&lg=1'>Two</a>"
              , Ok
                    [ Element "a"
                        [ ( "id", "1" )
                        , ( "href", "?foo=bar&mid<=true" )
                        ]
                        [ Text "One" ]
                    , Text " "
                    , Element "a"
                        [ ( "id", "2" )
                        , ( "href", "?foo=bar<qux&lg=1" )
                        ]
                        [ Text "Two" ]
                    ]
              )
            , ( "more attribute unescapes"
              , "<a href='&wr_id=123&mid-size=true&ok=&wr'>Check</a>"
              , Ok
                    [ Element "a"
                        [ ( "href", "&wr_id=123&mid-size=true&ok=&wr" )
                        ]
                        [ Text "Check" ]
                    ]
              )
            , ( "drops slash from attribute"
              , "<img /onerror='doMyJob' /a /=b/>"
              , Ok
                    [ Element "img"
                        [ ( "onerror", "doMyJob" )
                        , ( "a", "" )
                        , ( "=b", "" )
                        ]
                        []
                    ]
              )
            ]
|
||||
|
||||
|
||||
|
||||
-- TODO: https://github.com/jhy/jsoup/blob/master/src/test/java/org/jsoup/parser/HtmlParserTest.java
|
Loading…
Reference in New Issue
Block a user