Impl Config / optional char ref decoding

This commit is contained in:
Dan Neumann 2022-05-22 16:04:14 -05:00
parent 3126b1965f
commit e4926b7720
4 changed files with 117 additions and 69 deletions

View File

@ -1,15 +1,10 @@
module Html.CharRefs exposing (decode)
module Html.CharRefs exposing (default)
import Dict exposing (Dict)
decode : String -> Maybe String
decode name =
Dict.get name dict
dict : Dict String String
dict =
default : Dict String String
default =
-- Source: https://www.w3.org/TR/html5/syntax.html#named-character-references
[ ( "Aacute", "Á" )
, ( "aacute", "á" )

View File

@ -1,5 +1,6 @@
module Html.Parser exposing
( Node(..), Document
( Node(..), Document, Config
, configWithCharRefs, configWithoutCharRefs
, run, runElement, runDocument
, nodeToHtml, nodesToHtml, nodeToString, nodesToString, nodeToPrettyString, nodesToPrettyString, documentToString, documentToPrettyString
)
@ -10,7 +11,12 @@ into strings or Elm's virtual dom nodes.
# Definition
@docs Node, Document
@docs Node, Document, Config
# Config
@docs configWithCharRefs, configWithoutCharRefs
# Parse
@ -24,6 +30,7 @@ into strings or Elm's virtual dom nodes.
-}
import Dict exposing (Dict)
import Hex
import Html
import Html.Attributes
@ -42,6 +49,42 @@ type Node
| Element String (List ( String, String )) (List Node)
{-| The config object tells the parser if it should decode named character references.
-}
type Config
= Config
{ charRefs : Dict String String
}
{-| A config with char reference decoding turned on.
This will add ~40kb to your bundle, but it is necessary to decode
entities like "Δ" into "Δ".
run configWithoutCharRefs "abcΔdef"
== Ok [ text "abcΔdef" ]
-}
configWithCharRefs : Config
configWithCharRefs =
Config { charRefs = Html.CharRefs.default }
{-| A config with char reference decoding turned off.
If you know that the html you are parsing never has named character references,
or if it's sufficient to just consume them as undecoded text, then turning this off will shrink your bundle size.
run configWithoutCharRefs "abcΔdef"
== Ok [ text "abcΔdef" ]
-}
configWithoutCharRefs : Config
configWithoutCharRefs =
Config { charRefs = Dict.empty }
{-| Parse an html fragment into a list of html nodes.
The html fragment can have multiple top-level nodes.
@ -53,17 +96,17 @@ The html fragment can have multiple top-level nodes.
]
-}
run : String -> Result (List DeadEnd) (List Node)
run input =
Parser.run parseAll input
run : Config -> String -> Result (List DeadEnd) (List Node)
run cfg input =
Parser.run (parseAll cfg) input
{-| Like `run` except it only succeeds when the html input is a
single top-level element, and it always returns a single node.
-}
runElement : String -> Result (List DeadEnd) Node
runElement input =
Parser.run element input
runElement : Config -> String -> Result (List DeadEnd) Node
runElement cfg input =
Parser.run (element cfg) input
{-| An html document has a `<!doctype>` and then a root html node.
@ -146,17 +189,17 @@ Always returns a single root node. Wraps nodes in a root `<html>` node if one is
function will wrap them all in another `<html>` node.
-}
runDocument : String -> Result (List DeadEnd) Document
runDocument input =
Parser.run document input
runDocument : Config -> String -> Result (List DeadEnd) Document
runDocument cfg input =
Parser.run (document cfg) input
document : Parser Document
document =
document : Config -> Parser Document
document cfg =
succeed Document
|= doctype
|. ws
|= (zeroOrMore node
|= (zeroOrMore (node cfg)
|> map
(\nodes ->
case nodes of
@ -175,12 +218,12 @@ document =
)
parseAll : Parser (List Node)
parseAll =
parseAll : Config -> Parser (List Node)
parseAll cfg =
Parser.loop [] <|
\acc ->
oneOf
[ node |> map (\n -> Loop (mergeText n acc))
[ node cfg |> map (\n -> Loop (mergeText n acc))
, succeed () |> map (\_ -> Done (List.reverse acc))
]
@ -215,8 +258,8 @@ isSpace c =
-- ATTRIBUTES
attributeValueUnquoted : Parser String
attributeValueUnquoted =
attributeValueUnquoted : Config -> Parser String
attributeValueUnquoted cfg =
let
isUnquotedValueChar c =
not (isSpace c) && c /= '"' && c /= '\'' && c /= '=' && c /= '<' && c /= '>' && c /= '`' && c /= '&'
@ -224,14 +267,14 @@ attributeValueUnquoted =
oneOf
[ chompOneOrMore isUnquotedValueChar
|> getChompedString
, characterReference
, characterReference cfg
]
|> oneOrMore "attribute value"
|> map (String.join "")
attributeValueQuoted : Char -> Parser String
attributeValueQuoted quote =
attributeValueQuoted : Config -> Char -> Parser String
attributeValueQuoted cfg quote =
let
isQuotedValueChar c =
c /= quote && c /= '&'
@ -241,7 +284,7 @@ attributeValueQuoted quote =
|= (oneOf
[ chompOneOrMore isQuotedValueChar
|> getChompedString
, characterReference
, characterReference cfg
]
|> zeroOrMore
|> map (String.join "")
@ -274,8 +317,8 @@ attributeKey =
)
attribute : Parser ( String, String )
attribute =
attribute : Config -> Parser ( String, String )
attribute cfg =
succeed Tuple.pair
|= attributeKey
|. ws
@ -284,9 +327,9 @@ attribute =
|. symbol "="
|. ws
|= oneOf
[ attributeValueUnquoted -- <div foo=bar>
, attributeValueQuoted '"' -- <div foo="bar">
, attributeValueQuoted '\'' -- <div foo='bar'>
[ attributeValueUnquoted cfg -- <div foo=bar>
, attributeValueQuoted cfg '"' -- <div foo="bar">
, attributeValueQuoted cfg '\'' -- <div foo='bar'>
]
, succeed "" -- <div foo>
]
@ -334,15 +377,15 @@ anyCloseTag =
|. token ">"
node : Parser Node
node =
node : Config -> Parser Node
node cfg =
succeed identity
-- HACK: Ignore unmatched close tags like the browser does
|. zeroOrMore (backtrackable anyCloseTag)
|= oneOf
[ text
[ text cfg
, comment
, backtrackable element
, backtrackable (element cfg)
, justOneChar |> map Text
]
@ -355,11 +398,11 @@ comment =
|. symbol "-->"
text : Parser Node
text =
text : Config -> Parser Node
text cfg =
oneOf
[ succeed Text
|= backtrackable characterReference
|= backtrackable (characterReference cfg)
, succeed Text
|= (chompOneOrMore (\c -> c /= '<' && c /= '&') |> getChompedString)
]
@ -367,11 +410,11 @@ text =
{-| Parse any node unless it's one of the given tags.
-}
notNode : List String -> Parser Node
notNode tags =
notNode : Config -> List String -> Parser Node
notNode cfg tags =
oneOf
[ lookAhead
(openTag
(openTag cfg
|> andThen
(\( tag, _, _ ) ->
if List.member tag tags then
@ -381,20 +424,20 @@ notNode tags =
succeed ()
)
)
|> andThen (\_ -> element)
, text
|> andThen (\_ -> element cfg)
, text cfg
, comment
]
openTag : Parser ( String, List ( String, String ), OpenTagEnd )
openTag =
openTag : Config -> Parser ( String, List ( String, String ), OpenTagEnd )
openTag cfg =
succeed (\a b c -> ( a, b, c ))
|. symbol "<"
|. ws
|= tagName
|. ws
|= zeroOrMore attribute
|= zeroOrMore (attribute cfg)
|. ws
|= oneOf
[ succeed NoClose
@ -412,9 +455,9 @@ they always have a closing `</tag>`.
The element parser is useful when the html input will only have one top-level element.
-}
element : Parser Node
element =
openTag
element : Config -> Parser Node
element cfg =
openTag cfg
|> andThen
(\( tag, attrs, end ) ->
case end of
@ -437,10 +480,10 @@ element =
[ succeed identity
|= zeroOrMore
(if tag == "head" then
notNode [ tag, "body" ]
notNode cfg [ tag, "body" ]
else
notNode [ tag ]
notNode cfg [ tag ]
)
|. oneOf
[ backtrackable (closeTag tag)
@ -456,7 +499,7 @@ element =
oneOf
[ backtrackable (closeTag tag) |> map (\_ -> Done (List.reverse acc))
, succeed (\n -> Loop (mergeText n acc))
|= backtrackable node
|= backtrackable (node cfg)
, succeed () |> map (\_ -> Done (List.reverse acc))
]
)
@ -530,25 +573,25 @@ numericCharacterReference =
)
namedCharacterReference : Parser String
namedCharacterReference =
namedCharacterReference : Config -> Parser String
namedCharacterReference (Config cfg) =
chompOneOrMore Char.isAlpha
|> getChompedString
|> map
(\ref ->
Html.CharRefs.decode ref
Dict.get ref cfg.charRefs
|> Maybe.withDefault ("&" ++ ref ++ ";")
)
characterReference : Parser String
characterReference =
characterReference : Config -> Parser String
characterReference cfg =
succeed identity
|. chompIf ((==) '&')
|= oneOf
[ backtrackable numericCharacterReference
|. chompIf ((==) ';')
, backtrackable namedCharacterReference
, backtrackable (namedCharacterReference cfg)
|. chompIf ((==) ';')
, succeed "&"
]

View File

@ -2,12 +2,17 @@ module LocTests exposing (..)
import Expect exposing (Expectation)
import Fuzz exposing (Fuzzer, int, list, string)
import Html.CharRefs
import Html.Loc as Q exposing (Loc)
import Html.Parser exposing (Document, Node(..))
import Parser exposing (DeadEnd)
import Test exposing (..)
config =
Html.Parser.configWithCharRefs
type alias DirCase =
{ name : String
, html : String
@ -55,7 +60,7 @@ testTraversals cases =
)
actual =
Html.Parser.run x.html
Html.Parser.run config x.html
|> Result.toMaybe
|> Maybe.andThen List.head
|> Maybe.map Q.toLoc

View File

@ -2,11 +2,16 @@ module ParserTests exposing (..)
import Expect exposing (Expectation)
import Fuzz exposing (Fuzzer, int, list, string)
import Html.CharRefs
import Html.Parser exposing (Document, Node(..))
import Parser exposing (DeadEnd)
import Test exposing (..)
config =
Html.Parser.configWithCharRefs
testDoc : List ( String, String, Result (List DeadEnd) Document ) -> List Test
testDoc cases =
List.map
@ -15,7 +20,7 @@ testDoc cases =
(\_ ->
let
actual =
Html.Parser.runDocument html
Html.Parser.runDocument config html
in
case expected of
Ok _ ->
@ -41,7 +46,7 @@ testStringRoundtrip cases =
(\_ ->
let
actual =
Html.Parser.run html
Html.Parser.run config html
|> Result.map Html.Parser.nodesToString
in
case expected of
@ -68,7 +73,7 @@ testAll cases =
(\_ ->
let
actual =
Html.Parser.run html
Html.Parser.run config html
in
case expected of
Err _ ->
@ -395,13 +400,13 @@ scriptTests =
testParseAll : String -> List Node -> (() -> Expectation)
testParseAll s astList =
\_ ->
Expect.equal (Ok astList) (Html.Parser.run s)
Expect.equal (Ok astList) (Html.Parser.run config s)
testParse : String -> Node -> (() -> Expectation)
testParse input expected =
\_ ->
case Html.Parser.run input of
case Html.Parser.run config input of
Err message ->
Expect.fail (Parser.deadEndsToString message)