Impl Config / optional char ref decoding

2024-11-25 08:19:17 +03:00 · 2022-05-22 16:04:14 -05:00 · 2022-05-22 16:04:14 -05:00 · e4926b7720
commit e4926b7720
parent 3126b1965f
4 changed files with 117 additions and 69 deletions
--- a/src/Html/CharRefs.elm
+++ b/src/Html/CharRefs.elm
@ -1,15 +1,10 @@
-module Html.CharRefs exposing (decode)
+module Html.CharRefs exposing (default)

 import Dict exposing (Dict)


-decode : String -> Maybe String
-decode name =
-    Dict.get name dict
-
-
-dict : Dict String String
-dict =
+default : Dict String String
+default =
    -- Source: https://www.w3.org/TR/html5/syntax.html#named-character-references
    [ ( "Aacute", "Á" )
    , ( "aacute", "á" )
--- a/src/Html/Parser.elm
+++ b/src/Html/Parser.elm
@ -1,5 +1,6 @@
 module Html.Parser exposing
-    ( Node(..), Document
+    ( Node(..), Document, Config
+    , configWithCharRefs, configWithoutCharRefs
    , run, runElement, runDocument
    , nodeToHtml, nodesToHtml, nodeToString, nodesToString, nodeToPrettyString, nodesToPrettyString, documentToString, documentToPrettyString
    )
@ -10,7 +11,12 @@ into strings or Elm's virtual dom nodes.

 # Definition

-@docs Node, Document
+@docs Node, Document, Config
+
+
+# Config
+
+@docs configWithCharRefs, configWithoutCharRefs


 # Parse
@ -24,6 +30,7 @@ into strings or Elm's virtual dom nodes.

 -}

+import Dict exposing (Dict)
 import Hex
 import Html
 import Html.Attributes
@ -42,6 +49,42 @@ type Node
    | Element String (List ( String, String )) (List Node)


+{-| The config object tells the parser if it should decode named character references.
+-}
+type Config
+    = Config
+        { charRefs : Dict String String
+        }
+
+
+{-| A config with char reference decoding turned on.
+
+This will add ~40kb to your bundle, but it is necessary to decode
+entities like "&Delta;" into "Δ".
+
+    run configWithoutCharRefs "abc&Delta;def"
+        == Ok [ text "abcΔdef" ]
+
+-}
+configWithCharRefs : Config
+configWithCharRefs =
+    Config { charRefs = Html.CharRefs.default }
+
+
+{-| A config with char reference decoding turned off.
+
+If you know that the html you are parsing never has named character references,
+or if it's sufficient to just consume them as undecoded text, then turning this off will shrink your bundle size.
+
+    run configWithoutCharRefs "abc&Delta;def"
+        == Ok [ text "abc&Delta;def" ]
+
+-}
+configWithoutCharRefs : Config
+configWithoutCharRefs =
+    Config { charRefs = Dict.empty }
+
+
 {-| Parse an html fragment into a list of html nodes.

 The html fragment can have multiple top-level nodes.
@ -53,17 +96,17 @@ The html fragment can have multiple top-level nodes.
            ]

 -}
-run : String -> Result (List DeadEnd) (List Node)
-run input =
-    Parser.run parseAll input
+run : Config -> String -> Result (List DeadEnd) (List Node)
+run cfg input =
+    Parser.run (parseAll cfg) input


 {-| Like `run` except it only succeeds when the html input is a
 single top-level element, and it always returns a single node.
 -}
-runElement : String -> Result (List DeadEnd) Node
-runElement input =
-    Parser.run element input
+runElement : Config -> String -> Result (List DeadEnd) Node
+runElement cfg input =
+    Parser.run (element cfg) input


 {-| An html document has a `<!doctype>` and then a root html node.
@ -146,17 +189,17 @@ Always returns a single root node. Wraps nodes in a root `<html>` node if one is
 function will wrap them all in another `<html>` node.

 -}
-runDocument : String -> Result (List DeadEnd) Document
-runDocument input =
-    Parser.run document input
+runDocument : Config -> String -> Result (List DeadEnd) Document
+runDocument cfg input =
+    Parser.run (document cfg) input


-document : Parser Document
-document =
+document : Config -> Parser Document
+document cfg =
    succeed Document
        |= doctype
        |. ws
-        |= (zeroOrMore node
+        |= (zeroOrMore (node cfg)
                |> map
                    (\nodes ->
                        case nodes of
@ -175,12 +218,12 @@ document =
           )


-parseAll : Parser (List Node)
-parseAll =
+parseAll : Config -> Parser (List Node)
+parseAll cfg =
    Parser.loop [] <|
        \acc ->
            oneOf
-                [ node |> map (\n -> Loop (mergeText n acc))
+                [ node cfg |> map (\n -> Loop (mergeText n acc))
                , succeed () |> map (\_ -> Done (List.reverse acc))
                ]

@ -215,8 +258,8 @@ isSpace c =
 -- ATTRIBUTES


-attributeValueUnquoted : Parser String
-attributeValueUnquoted =
+attributeValueUnquoted : Config -> Parser String
+attributeValueUnquoted cfg =
    let
        isUnquotedValueChar c =
            not (isSpace c) && c /= '"' && c /= '\'' && c /= '=' && c /= '<' && c /= '>' && c /= '`' && c /= '&'
@ -224,14 +267,14 @@ attributeValueUnquoted =
    oneOf
        [ chompOneOrMore isUnquotedValueChar
            |> getChompedString
-        , characterReference
+        , characterReference cfg
        ]
        |> oneOrMore "attribute value"
        |> map (String.join "")


-attributeValueQuoted : Char -> Parser String
-attributeValueQuoted quote =
+attributeValueQuoted : Config -> Char -> Parser String
+attributeValueQuoted cfg quote =
    let
        isQuotedValueChar c =
            c /= quote && c /= '&'
@ -241,7 +284,7 @@ attributeValueQuoted quote =
        |= (oneOf
                [ chompOneOrMore isQuotedValueChar
                    |> getChompedString
-                , characterReference
+                , characterReference cfg
                ]
                |> zeroOrMore
                |> map (String.join "")
@ -274,8 +317,8 @@ attributeKey =
           )


-attribute : Parser ( String, String )
-attribute =
+attribute : Config -> Parser ( String, String )
+attribute cfg =
    succeed Tuple.pair
        |= attributeKey
        |. ws
@ -284,9 +327,9 @@ attribute =
                |. symbol "="
                |. ws
                |= oneOf
-                    [ attributeValueUnquoted -- <div foo=bar>
-                    , attributeValueQuoted '"' -- <div foo="bar">
-                    , attributeValueQuoted '\'' -- <div foo='bar'>
+                    [ attributeValueUnquoted cfg -- <div foo=bar>
+                    , attributeValueQuoted cfg '"' -- <div foo="bar">
+                    , attributeValueQuoted cfg '\'' -- <div foo='bar'>
                    ]
            , succeed "" -- <div foo>
            ]
@ -334,15 +377,15 @@ anyCloseTag =
        |. token ">"


-node : Parser Node
-node =
+node : Config -> Parser Node
+node cfg =
    succeed identity
        -- HACK: Ignore unmatched close tags like the browser does
        |. zeroOrMore (backtrackable anyCloseTag)
        |= oneOf
-            [ text
+            [ text cfg
            , comment
-            , backtrackable element
+            , backtrackable (element cfg)
            , justOneChar |> map Text
            ]

@ -355,11 +398,11 @@ comment =
        |. symbol "-->"


-text : Parser Node
-text =
+text : Config -> Parser Node
+text cfg =
    oneOf
        [ succeed Text
-            |= backtrackable characterReference
+            |= backtrackable (characterReference cfg)
        , succeed Text
            |= (chompOneOrMore (\c -> c /= '<' && c /= '&') |> getChompedString)
        ]
@ -367,11 +410,11 @@ text =

 {-| Parse any node unless it's one of the given tags.
 -}
-notNode : List String -> Parser Node
-notNode tags =
+notNode : Config -> List String -> Parser Node
+notNode cfg tags =
    oneOf
        [ lookAhead
-            (openTag
+            (openTag cfg
                |> andThen
                    (\( tag, _, _ ) ->
                        if List.member tag tags then
@ -381,20 +424,20 @@ notNode tags =
                            succeed ()
                    )
            )
-            |> andThen (\_ -> element)
-        , text
+            |> andThen (\_ -> element cfg)
+        , text cfg
        , comment
        ]


-openTag : Parser ( String, List ( String, String ), OpenTagEnd )
-openTag =
+openTag : Config -> Parser ( String, List ( String, String ), OpenTagEnd )
+openTag cfg =
    succeed (\a b c -> ( a, b, c ))
        |. symbol "<"
        |. ws
        |= tagName
        |. ws
-        |= zeroOrMore attribute
+        |= zeroOrMore (attribute cfg)
        |. ws
        |= oneOf
            [ succeed NoClose
@ -412,9 +455,9 @@ they always have a closing `</tag>`.
 The element parser is useful when the html input will only have one top-level element.

 -}
-element : Parser Node
-element =
-    openTag
+element : Config -> Parser Node
+element cfg =
+    openTag cfg
        |> andThen
            (\( tag, attrs, end ) ->
                case end of
@ -437,10 +480,10 @@ element =
                                    [ succeed identity
                                        |= zeroOrMore
                                            (if tag == "head" then
-                                                notNode [ tag, "body" ]
+                                                notNode cfg [ tag, "body" ]

                                             else
-                                                notNode [ tag ]
+                                                notNode cfg [ tag ]
                                            )
                                        |. oneOf
                                            [ backtrackable (closeTag tag)
@ -456,7 +499,7 @@ element =
                                            oneOf
                                                [ backtrackable (closeTag tag) |> map (\_ -> Done (List.reverse acc))
                                                , succeed (\n -> Loop (mergeText n acc))
-                                                    |= backtrackable node
+                                                    |= backtrackable (node cfg)
                                                , succeed () |> map (\_ -> Done (List.reverse acc))
                                                ]
                                   )
@ -530,25 +573,25 @@ numericCharacterReference =
           )


-namedCharacterReference : Parser String
-namedCharacterReference =
+namedCharacterReference : Config -> Parser String
+namedCharacterReference (Config cfg) =
    chompOneOrMore Char.isAlpha
        |> getChompedString
        |> map
            (\ref ->
-                Html.CharRefs.decode ref
+                Dict.get ref cfg.charRefs
                    |> Maybe.withDefault ("&" ++ ref ++ ";")
            )


-characterReference : Parser String
-characterReference =
+characterReference : Config -> Parser String
+characterReference cfg =
    succeed identity
        |. chompIf ((==) '&')
        |= oneOf
            [ backtrackable numericCharacterReference
                |. chompIf ((==) ';')
-            , backtrackable namedCharacterReference
+            , backtrackable (namedCharacterReference cfg)
                |. chompIf ((==) ';')
            , succeed "&"
            ]
--- a/tests/LocTests.elm
+++ b/tests/LocTests.elm
@ -2,12 +2,17 @@ module LocTests exposing (..)

 import Expect exposing (Expectation)
 import Fuzz exposing (Fuzzer, int, list, string)
+import Html.CharRefs
 import Html.Loc as Q exposing (Loc)
 import Html.Parser exposing (Document, Node(..))
 import Parser exposing (DeadEnd)
 import Test exposing (..)


+config =
+    Html.Parser.configWithCharRefs
+
+
 type alias DirCase =
    { name : String
    , html : String
@ -55,7 +60,7 @@ testTraversals cases =
                                    )

                        actual =
-                            Html.Parser.run x.html
+                            Html.Parser.run config x.html
                                |> Result.toMaybe
                                |> Maybe.andThen List.head
                                |> Maybe.map Q.toLoc
--- a/tests/ParserTests.elm
+++ b/tests/ParserTests.elm
@ -2,11 +2,16 @@ module ParserTests exposing (..)

 import Expect exposing (Expectation)
 import Fuzz exposing (Fuzzer, int, list, string)
+import Html.CharRefs
 import Html.Parser exposing (Document, Node(..))
 import Parser exposing (DeadEnd)
 import Test exposing (..)


+config =
+    Html.Parser.configWithCharRefs
+
+
 testDoc : List ( String, String, Result (List DeadEnd) Document ) -> List Test
 testDoc cases =
    List.map
@ -15,7 +20,7 @@ testDoc cases =
                (\_ ->
                    let
                        actual =
-                            Html.Parser.runDocument html
+                            Html.Parser.runDocument config html
                    in
                    case expected of
                        Ok _ ->
@ -41,7 +46,7 @@ testStringRoundtrip cases =
                (\_ ->
                    let
                        actual =
-                            Html.Parser.run html
+                            Html.Parser.run config html
                                |> Result.map Html.Parser.nodesToString
                    in
                    case expected of
@ -68,7 +73,7 @@ testAll cases =
                (\_ ->
                    let
                        actual =
-                            Html.Parser.run html
+                            Html.Parser.run config html
                    in
                    case expected of
                        Err _ ->
@ -395,13 +400,13 @@ scriptTests =
 testParseAll : String -> List Node -> (() -> Expectation)
 testParseAll s astList =
    \_ ->
-        Expect.equal (Ok astList) (Html.Parser.run s)
+        Expect.equal (Ok astList) (Html.Parser.run config s)


 testParse : String -> Node -> (() -> Expectation)
 testParse input expected =
    \_ ->
-        case Html.Parser.run input of
+        case Html.Parser.run config input of
            Err message ->
                Expect.fail (Parser.deadEndsToString message)