From e4926b77206b9ea2eecdc83537bbfa167024c45d Mon Sep 17 00:00:00 2001 From: Dan Neumann Date: Sun, 22 May 2022 16:04:14 -0500 Subject: [PATCH] Impl Config / optional char ref decoding --- src/Html/CharRefs.elm | 11 +-- src/Html/Parser.elm | 153 +++++++++++++++++++++++++++--------------- tests/LocTests.elm | 7 +- tests/ParserTests.elm | 15 +++-- 4 files changed, 117 insertions(+), 69 deletions(-) diff --git a/src/Html/CharRefs.elm b/src/Html/CharRefs.elm index af7b40b..cc27a30 100644 --- a/src/Html/CharRefs.elm +++ b/src/Html/CharRefs.elm @@ -1,15 +1,10 @@ -module Html.CharRefs exposing (decode) +module Html.CharRefs exposing (default) import Dict exposing (Dict) -decode : String -> Maybe String -decode name = - Dict.get name dict - - -dict : Dict String String -dict = +default : Dict String String +default = -- Source: https://www.w3.org/TR/html5/syntax.html#named-character-references [ ( "Aacute", "Á" ) , ( "aacute", "á" ) diff --git a/src/Html/Parser.elm b/src/Html/Parser.elm index 9563ac7..d7b7a52 100644 --- a/src/Html/Parser.elm +++ b/src/Html/Parser.elm @@ -1,5 +1,6 @@ module Html.Parser exposing - ( Node(..), Document + ( Node(..), Document, Config + , configWithCharRefs, configWithoutCharRefs , run, runElement, runDocument , nodeToHtml, nodesToHtml, nodeToString, nodesToString, nodeToPrettyString, nodesToPrettyString, documentToString, documentToPrettyString ) @@ -10,7 +11,12 @@ into strings or Elm's virtual dom nodes. # Definition -@docs Node, Document +@docs Node, Document, Config + + +# Config + +@docs configWithCharRefs, configWithoutCharRefs # Parse @@ -24,6 +30,7 @@ into strings or Elm's virtual dom nodes. -} +import Dict exposing (Dict) import Hex import Html import Html.Attributes @@ -42,6 +49,42 @@ type Node | Element String (List ( String, String )) (List Node) +{-| The config object tells the parser if it should decode named character references. +-} +type Config + = Config + { charRefs : Dict String String + } + + +{-| A config with char reference decoding turned on. + +This will add ~40kb to your bundle, but it is necessary to decode +entities like "Δ" into "Δ". + + run configWithoutCharRefs "abcΔdef" + == Ok [ text "abcΔdef" ] + +-} +configWithCharRefs : Config +configWithCharRefs = + Config { charRefs = Html.CharRefs.default } + + +{-| A config with char reference decoding turned off. + +If you know that the html you are parsing never has named character references, +or if it's sufficient to just consume them as undecoded text, then turning this off will shrink your bundle size. + + run configWithoutCharRefs "abcΔdef" + == Ok [ text "abcΔdef" ] + +-} +configWithoutCharRefs : Config +configWithoutCharRefs = + Config { charRefs = Dict.empty } + + {-| Parse an html fragment into a list of html nodes. The html fragment can have multiple top-level nodes. @@ -53,17 +96,17 @@ The html fragment can have multiple top-level nodes. ] -} -run : String -> Result (List DeadEnd) (List Node) -run input = - Parser.run parseAll input +run : Config -> String -> Result (List DeadEnd) (List Node) +run cfg input = + Parser.run (parseAll cfg) input {-| Like `run` except it only succeeds when the html input is a single top-level element, and it always returns a single node. -} -runElement : String -> Result (List DeadEnd) Node -runElement input = - Parser.run element input +runElement : Config -> String -> Result (List DeadEnd) Node +runElement cfg input = + Parser.run (element cfg) input {-| An html document has a `` and then a root html node. @@ -146,17 +189,17 @@ Always returns a single root node. Wraps nodes in a root `` node if one is function will wrap them all in another `` node. -} -runDocument : String -> Result (List DeadEnd) Document -runDocument input = - Parser.run document input +runDocument : Config -> String -> Result (List DeadEnd) Document +runDocument cfg input = + Parser.run (document cfg) input -document : Parser Document -document = +document : Config -> Parser Document +document cfg = succeed Document |= doctype |. ws - |= (zeroOrMore node + |= (zeroOrMore (node cfg) |> map (\nodes -> case nodes of @@ -175,12 +218,12 @@ document = ) -parseAll : Parser (List Node) -parseAll = +parseAll : Config -> Parser (List Node) +parseAll cfg = Parser.loop [] <| \acc -> oneOf - [ node |> map (\n -> Loop (mergeText n acc)) + [ node cfg |> map (\n -> Loop (mergeText n acc)) , succeed () |> map (\_ -> Done (List.reverse acc)) ] @@ -215,8 +258,8 @@ isSpace c = -- ATTRIBUTES -attributeValueUnquoted : Parser String -attributeValueUnquoted = +attributeValueUnquoted : Config -> Parser String +attributeValueUnquoted cfg = let isUnquotedValueChar c = not (isSpace c) && c /= '"' && c /= '\'' && c /= '=' && c /= '<' && c /= '>' && c /= '`' && c /= '&' @@ -224,14 +267,14 @@ attributeValueUnquoted = oneOf [ chompOneOrMore isUnquotedValueChar |> getChompedString - , characterReference + , characterReference cfg ] |> oneOrMore "attribute value" |> map (String.join "") -attributeValueQuoted : Char -> Parser String -attributeValueQuoted quote = +attributeValueQuoted : Config -> Char -> Parser String +attributeValueQuoted cfg quote = let isQuotedValueChar c = c /= quote && c /= '&' @@ -241,7 +284,7 @@ attributeValueQuoted quote = |= (oneOf [ chompOneOrMore isQuotedValueChar |> getChompedString - , characterReference + , characterReference cfg ] |> zeroOrMore |> map (String.join "") @@ -274,8 +317,8 @@ attributeKey = ) -attribute : Parser ( String, String ) -attribute = +attribute : Config -> Parser ( String, String ) +attribute cfg = succeed Tuple.pair |= attributeKey |. ws @@ -284,9 +327,9 @@ attribute = |. symbol "=" |. ws |= oneOf - [ attributeValueUnquoted --
- , attributeValueQuoted '"' --
- , attributeValueQuoted '\'' --
+ [ attributeValueUnquoted cfg --
+ , attributeValueQuoted cfg '"' --
+ , attributeValueQuoted cfg '\'' --
] , succeed "" --
] @@ -334,15 +377,15 @@ anyCloseTag = |. token ">" -node : Parser Node -node = +node : Config -> Parser Node +node cfg = succeed identity -- HACK: Ignore unmatched close tags like the browser does |. zeroOrMore (backtrackable anyCloseTag) |= oneOf - [ text + [ text cfg , comment - , backtrackable element + , backtrackable (element cfg) , justOneChar |> map Text ] @@ -355,11 +398,11 @@ comment = |. symbol "-->" -text : Parser Node -text = +text : Config -> Parser Node +text cfg = oneOf [ succeed Text - |= backtrackable characterReference + |= backtrackable (characterReference cfg) , succeed Text |= (chompOneOrMore (\c -> c /= '<' && c /= '&') |> getChompedString) ] @@ -367,11 +410,11 @@ text = {-| Parse any node unless it's one of the given tags. -} -notNode : List String -> Parser Node -notNode tags = +notNode : Config -> List String -> Parser Node +notNode cfg tags = oneOf [ lookAhead - (openTag + (openTag cfg |> andThen (\( tag, _, _ ) -> if List.member tag tags then @@ -381,20 +424,20 @@ notNode tags = succeed () ) ) - |> andThen (\_ -> element) - , text + |> andThen (\_ -> element cfg) + , text cfg , comment ] -openTag : Parser ( String, List ( String, String ), OpenTagEnd ) -openTag = +openTag : Config -> Parser ( String, List ( String, String ), OpenTagEnd ) +openTag cfg = succeed (\a b c -> ( a, b, c )) |. symbol "<" |. ws |= tagName |. ws - |= zeroOrMore attribute + |= zeroOrMore (attribute cfg) |. ws |= oneOf [ succeed NoClose @@ -412,9 +455,9 @@ they always have a closing ``. The element parser is useful when the html input will only have one top-level element. -} -element : Parser Node -element = - openTag +element : Config -> Parser Node +element cfg = + openTag cfg |> andThen (\( tag, attrs, end ) -> case end of @@ -437,10 +480,10 @@ element = [ succeed identity |= zeroOrMore (if tag == "head" then - notNode [ tag, "body" ] + notNode cfg [ tag, "body" ] else - notNode [ tag ] + notNode cfg [ tag ] ) |. oneOf [ backtrackable (closeTag tag) @@ -456,7 +499,7 @@ element = oneOf [ backtrackable (closeTag tag) |> map (\_ -> Done (List.reverse acc)) , succeed (\n -> Loop (mergeText n acc)) - |= backtrackable node + |= backtrackable (node cfg) , succeed () |> map (\_ -> Done (List.reverse acc)) ] ) @@ -530,25 +573,25 @@ numericCharacterReference = ) -namedCharacterReference : Parser String -namedCharacterReference = +namedCharacterReference : Config -> Parser String +namedCharacterReference (Config cfg) = chompOneOrMore Char.isAlpha |> getChompedString |> map (\ref -> - Html.CharRefs.decode ref + Dict.get ref cfg.charRefs |> Maybe.withDefault ("&" ++ ref ++ ";") ) -characterReference : Parser String -characterReference = +characterReference : Config -> Parser String +characterReference cfg = succeed identity |. chompIf ((==) '&') |= oneOf [ backtrackable numericCharacterReference |. chompIf ((==) ';') - , backtrackable namedCharacterReference + , backtrackable (namedCharacterReference cfg) |. chompIf ((==) ';') , succeed "&" ] diff --git a/tests/LocTests.elm b/tests/LocTests.elm index 0546868..52a8321 100644 --- a/tests/LocTests.elm +++ b/tests/LocTests.elm @@ -2,12 +2,17 @@ module LocTests exposing (..) import Expect exposing (Expectation) import Fuzz exposing (Fuzzer, int, list, string) +import Html.CharRefs import Html.Loc as Q exposing (Loc) import Html.Parser exposing (Document, Node(..)) import Parser exposing (DeadEnd) import Test exposing (..) +config = + Html.Parser.configWithCharRefs + + type alias DirCase = { name : String , html : String @@ -55,7 +60,7 @@ testTraversals cases = ) actual = - Html.Parser.run x.html + Html.Parser.run config x.html |> Result.toMaybe |> Maybe.andThen List.head |> Maybe.map Q.toLoc diff --git a/tests/ParserTests.elm b/tests/ParserTests.elm index 16780bd..eaf4f22 100644 --- a/tests/ParserTests.elm +++ b/tests/ParserTests.elm @@ -2,11 +2,16 @@ module ParserTests exposing (..) import Expect exposing (Expectation) import Fuzz exposing (Fuzzer, int, list, string) +import Html.CharRefs import Html.Parser exposing (Document, Node(..)) import Parser exposing (DeadEnd) import Test exposing (..) +config = + Html.Parser.configWithCharRefs + + testDoc : List ( String, String, Result (List DeadEnd) Document ) -> List Test testDoc cases = List.map @@ -15,7 +20,7 @@ testDoc cases = (\_ -> let actual = - Html.Parser.runDocument html + Html.Parser.runDocument config html in case expected of Ok _ -> @@ -41,7 +46,7 @@ testStringRoundtrip cases = (\_ -> let actual = - Html.Parser.run html + Html.Parser.run config html |> Result.map Html.Parser.nodesToString in case expected of @@ -68,7 +73,7 @@ testAll cases = (\_ -> let actual = - Html.Parser.run html + Html.Parser.run config html in case expected of Err _ -> @@ -395,13 +400,13 @@ scriptTests = testParseAll : String -> List Node -> (() -> Expectation) testParseAll s astList = \_ -> - Expect.equal (Ok astList) (Html.Parser.run s) + Expect.equal (Ok astList) (Html.Parser.run config s) testParse : String -> Node -> (() -> Expectation) testParse input expected = \_ -> - case Html.Parser.run input of + case Html.Parser.run config input of Err message -> Expect.fail (Parser.deadEndsToString message)