mirror of
https://github.com/danneu/html-parser.git
synced 2024-11-28 22:32:00 +03:00
Impl Config / optional char ref decoding
This commit is contained in:
parent
3126b1965f
commit
e4926b7720
@ -1,15 +1,10 @@
|
||||
module Html.CharRefs exposing (decode)
|
||||
module Html.CharRefs exposing (default)
|
||||
|
||||
import Dict exposing (Dict)
|
||||
|
||||
|
||||
decode : String -> Maybe String
|
||||
decode name =
|
||||
Dict.get name dict
|
||||
|
||||
|
||||
dict : Dict String String
|
||||
dict =
|
||||
default : Dict String String
|
||||
default =
|
||||
-- Source: https://www.w3.org/TR/html5/syntax.html#named-character-references
|
||||
[ ( "Aacute", "Á" )
|
||||
, ( "aacute", "á" )
|
||||
|
@ -1,5 +1,6 @@
|
||||
module Html.Parser exposing
|
||||
( Node(..), Document
|
||||
( Node(..), Document, Config
|
||||
, configWithCharRefs, configWithoutCharRefs
|
||||
, run, runElement, runDocument
|
||||
, nodeToHtml, nodesToHtml, nodeToString, nodesToString, nodeToPrettyString, nodesToPrettyString, documentToString, documentToPrettyString
|
||||
)
|
||||
@ -10,7 +11,12 @@ into strings or Elm's virtual dom nodes.
|
||||
|
||||
# Definition
|
||||
|
||||
@docs Node, Document
|
||||
@docs Node, Document, Config
|
||||
|
||||
|
||||
# Config
|
||||
|
||||
@docs configWithCharRefs, configWithoutCharRefs
|
||||
|
||||
|
||||
# Parse
|
||||
@ -24,6 +30,7 @@ into strings or Elm's virtual dom nodes.
|
||||
|
||||
-}
|
||||
|
||||
import Dict exposing (Dict)
|
||||
import Hex
|
||||
import Html
|
||||
import Html.Attributes
|
||||
@ -42,6 +49,42 @@ type Node
|
||||
| Element String (List ( String, String )) (List Node)
|
||||
|
||||
|
||||
{-| The config object tells the parser if it should decode named character references.
|
||||
-}
|
||||
type Config
|
||||
= Config
|
||||
{ charRefs : Dict String String
|
||||
}
|
||||
|
||||
|
||||
{-| A config with char reference decoding turned on.
|
||||
|
||||
This will add ~40kb to your bundle, but it is necessary to decode
|
||||
entities like "Δ" into "Δ".
|
||||
|
||||
run configWithoutCharRefs "abcΔdef"
|
||||
== Ok [ text "abcΔdef" ]
|
||||
|
||||
-}
|
||||
configWithCharRefs : Config
|
||||
configWithCharRefs =
|
||||
Config { charRefs = Html.CharRefs.default }
|
||||
|
||||
|
||||
{-| A config with char reference decoding turned off.
|
||||
|
||||
If you know that the html you are parsing never has named character references,
|
||||
or if it's sufficient to just consume them as undecoded text, then turning this off will shrink your bundle size.
|
||||
|
||||
run configWithoutCharRefs "abcΔdef"
|
||||
== Ok [ text "abcΔdef" ]
|
||||
|
||||
-}
|
||||
configWithoutCharRefs : Config
|
||||
configWithoutCharRefs =
|
||||
Config { charRefs = Dict.empty }
|
||||
|
||||
|
||||
{-| Parse an html fragment into a list of html nodes.
|
||||
|
||||
The html fragment can have multiple top-level nodes.
|
||||
@ -53,17 +96,17 @@ The html fragment can have multiple top-level nodes.
|
||||
]
|
||||
|
||||
-}
|
||||
run : String -> Result (List DeadEnd) (List Node)
|
||||
run input =
|
||||
Parser.run parseAll input
|
||||
run : Config -> String -> Result (List DeadEnd) (List Node)
|
||||
run cfg input =
|
||||
Parser.run (parseAll cfg) input
|
||||
|
||||
|
||||
{-| Like `run` except it only succeeds when the html input is a
|
||||
single top-level element, and it always returns a single node.
|
||||
-}
|
||||
runElement : String -> Result (List DeadEnd) Node
|
||||
runElement input =
|
||||
Parser.run element input
|
||||
runElement : Config -> String -> Result (List DeadEnd) Node
|
||||
runElement cfg input =
|
||||
Parser.run (element cfg) input
|
||||
|
||||
|
||||
{-| An html document has a `<!doctype>` and then a root html node.
|
||||
@ -146,17 +189,17 @@ Always returns a single root node. Wraps nodes in a root `<html>` node if one is
|
||||
function will wrap them all in another `<html>` node.
|
||||
|
||||
-}
|
||||
runDocument : String -> Result (List DeadEnd) Document
|
||||
runDocument input =
|
||||
Parser.run document input
|
||||
runDocument : Config -> String -> Result (List DeadEnd) Document
|
||||
runDocument cfg input =
|
||||
Parser.run (document cfg) input
|
||||
|
||||
|
||||
document : Parser Document
|
||||
document =
|
||||
document : Config -> Parser Document
|
||||
document cfg =
|
||||
succeed Document
|
||||
|= doctype
|
||||
|. ws
|
||||
|= (zeroOrMore node
|
||||
|= (zeroOrMore (node cfg)
|
||||
|> map
|
||||
(\nodes ->
|
||||
case nodes of
|
||||
@ -175,12 +218,12 @@ document =
|
||||
)
|
||||
|
||||
|
||||
parseAll : Parser (List Node)
|
||||
parseAll =
|
||||
parseAll : Config -> Parser (List Node)
|
||||
parseAll cfg =
|
||||
Parser.loop [] <|
|
||||
\acc ->
|
||||
oneOf
|
||||
[ node |> map (\n -> Loop (mergeText n acc))
|
||||
[ node cfg |> map (\n -> Loop (mergeText n acc))
|
||||
, succeed () |> map (\_ -> Done (List.reverse acc))
|
||||
]
|
||||
|
||||
@ -215,8 +258,8 @@ isSpace c =
|
||||
-- ATTRIBUTES
|
||||
|
||||
|
||||
attributeValueUnquoted : Parser String
|
||||
attributeValueUnquoted =
|
||||
attributeValueUnquoted : Config -> Parser String
|
||||
attributeValueUnquoted cfg =
|
||||
let
|
||||
isUnquotedValueChar c =
|
||||
not (isSpace c) && c /= '"' && c /= '\'' && c /= '=' && c /= '<' && c /= '>' && c /= '`' && c /= '&'
|
||||
@ -224,14 +267,14 @@ attributeValueUnquoted =
|
||||
oneOf
|
||||
[ chompOneOrMore isUnquotedValueChar
|
||||
|> getChompedString
|
||||
, characterReference
|
||||
, characterReference cfg
|
||||
]
|
||||
|> oneOrMore "attribute value"
|
||||
|> map (String.join "")
|
||||
|
||||
|
||||
attributeValueQuoted : Char -> Parser String
|
||||
attributeValueQuoted quote =
|
||||
attributeValueQuoted : Config -> Char -> Parser String
|
||||
attributeValueQuoted cfg quote =
|
||||
let
|
||||
isQuotedValueChar c =
|
||||
c /= quote && c /= '&'
|
||||
@ -241,7 +284,7 @@ attributeValueQuoted quote =
|
||||
|= (oneOf
|
||||
[ chompOneOrMore isQuotedValueChar
|
||||
|> getChompedString
|
||||
, characterReference
|
||||
, characterReference cfg
|
||||
]
|
||||
|> zeroOrMore
|
||||
|> map (String.join "")
|
||||
@ -274,8 +317,8 @@ attributeKey =
|
||||
)
|
||||
|
||||
|
||||
attribute : Parser ( String, String )
|
||||
attribute =
|
||||
attribute : Config -> Parser ( String, String )
|
||||
attribute cfg =
|
||||
succeed Tuple.pair
|
||||
|= attributeKey
|
||||
|. ws
|
||||
@ -284,9 +327,9 @@ attribute =
|
||||
|. symbol "="
|
||||
|. ws
|
||||
|= oneOf
|
||||
[ attributeValueUnquoted -- <div foo=bar>
|
||||
, attributeValueQuoted '"' -- <div foo="bar">
|
||||
, attributeValueQuoted '\'' -- <div foo='bar'>
|
||||
[ attributeValueUnquoted cfg -- <div foo=bar>
|
||||
, attributeValueQuoted cfg '"' -- <div foo="bar">
|
||||
, attributeValueQuoted cfg '\'' -- <div foo='bar'>
|
||||
]
|
||||
, succeed "" -- <div foo>
|
||||
]
|
||||
@ -334,15 +377,15 @@ anyCloseTag =
|
||||
|. token ">"
|
||||
|
||||
|
||||
node : Parser Node
|
||||
node =
|
||||
node : Config -> Parser Node
|
||||
node cfg =
|
||||
succeed identity
|
||||
-- HACK: Ignore unmatched close tags like the browser does
|
||||
|. zeroOrMore (backtrackable anyCloseTag)
|
||||
|= oneOf
|
||||
[ text
|
||||
[ text cfg
|
||||
, comment
|
||||
, backtrackable element
|
||||
, backtrackable (element cfg)
|
||||
, justOneChar |> map Text
|
||||
]
|
||||
|
||||
@ -355,11 +398,11 @@ comment =
|
||||
|. symbol "-->"
|
||||
|
||||
|
||||
text : Parser Node
|
||||
text =
|
||||
text : Config -> Parser Node
|
||||
text cfg =
|
||||
oneOf
|
||||
[ succeed Text
|
||||
|= backtrackable characterReference
|
||||
|= backtrackable (characterReference cfg)
|
||||
, succeed Text
|
||||
|= (chompOneOrMore (\c -> c /= '<' && c /= '&') |> getChompedString)
|
||||
]
|
||||
@ -367,11 +410,11 @@ text =
|
||||
|
||||
{-| Parse any node unless it's one of the given tags.
|
||||
-}
|
||||
notNode : List String -> Parser Node
|
||||
notNode tags =
|
||||
notNode : Config -> List String -> Parser Node
|
||||
notNode cfg tags =
|
||||
oneOf
|
||||
[ lookAhead
|
||||
(openTag
|
||||
(openTag cfg
|
||||
|> andThen
|
||||
(\( tag, _, _ ) ->
|
||||
if List.member tag tags then
|
||||
@ -381,20 +424,20 @@ notNode tags =
|
||||
succeed ()
|
||||
)
|
||||
)
|
||||
|> andThen (\_ -> element)
|
||||
, text
|
||||
|> andThen (\_ -> element cfg)
|
||||
, text cfg
|
||||
, comment
|
||||
]
|
||||
|
||||
|
||||
openTag : Parser ( String, List ( String, String ), OpenTagEnd )
|
||||
openTag =
|
||||
openTag : Config -> Parser ( String, List ( String, String ), OpenTagEnd )
|
||||
openTag cfg =
|
||||
succeed (\a b c -> ( a, b, c ))
|
||||
|. symbol "<"
|
||||
|. ws
|
||||
|= tagName
|
||||
|. ws
|
||||
|= zeroOrMore attribute
|
||||
|= zeroOrMore (attribute cfg)
|
||||
|. ws
|
||||
|= oneOf
|
||||
[ succeed NoClose
|
||||
@ -412,9 +455,9 @@ they always have a closing `</tag>`.
|
||||
The element parser is useful when the html input will only have one top-level element.
|
||||
|
||||
-}
|
||||
element : Parser Node
|
||||
element =
|
||||
openTag
|
||||
element : Config -> Parser Node
|
||||
element cfg =
|
||||
openTag cfg
|
||||
|> andThen
|
||||
(\( tag, attrs, end ) ->
|
||||
case end of
|
||||
@ -437,10 +480,10 @@ element =
|
||||
[ succeed identity
|
||||
|= zeroOrMore
|
||||
(if tag == "head" then
|
||||
notNode [ tag, "body" ]
|
||||
notNode cfg [ tag, "body" ]
|
||||
|
||||
else
|
||||
notNode [ tag ]
|
||||
notNode cfg [ tag ]
|
||||
)
|
||||
|. oneOf
|
||||
[ backtrackable (closeTag tag)
|
||||
@ -456,7 +499,7 @@ element =
|
||||
oneOf
|
||||
[ backtrackable (closeTag tag) |> map (\_ -> Done (List.reverse acc))
|
||||
, succeed (\n -> Loop (mergeText n acc))
|
||||
|= backtrackable node
|
||||
|= backtrackable (node cfg)
|
||||
, succeed () |> map (\_ -> Done (List.reverse acc))
|
||||
]
|
||||
)
|
||||
@ -530,25 +573,25 @@ numericCharacterReference =
|
||||
)
|
||||
|
||||
|
||||
namedCharacterReference : Parser String
|
||||
namedCharacterReference =
|
||||
namedCharacterReference : Config -> Parser String
|
||||
namedCharacterReference (Config cfg) =
|
||||
chompOneOrMore Char.isAlpha
|
||||
|> getChompedString
|
||||
|> map
|
||||
(\ref ->
|
||||
Html.CharRefs.decode ref
|
||||
Dict.get ref cfg.charRefs
|
||||
|> Maybe.withDefault ("&" ++ ref ++ ";")
|
||||
)
|
||||
|
||||
|
||||
characterReference : Parser String
|
||||
characterReference =
|
||||
characterReference : Config -> Parser String
|
||||
characterReference cfg =
|
||||
succeed identity
|
||||
|. chompIf ((==) '&')
|
||||
|= oneOf
|
||||
[ backtrackable numericCharacterReference
|
||||
|. chompIf ((==) ';')
|
||||
, backtrackable namedCharacterReference
|
||||
, backtrackable (namedCharacterReference cfg)
|
||||
|. chompIf ((==) ';')
|
||||
, succeed "&"
|
||||
]
|
||||
|
@ -2,12 +2,17 @@ module LocTests exposing (..)
|
||||
|
||||
import Expect exposing (Expectation)
|
||||
import Fuzz exposing (Fuzzer, int, list, string)
|
||||
import Html.CharRefs
|
||||
import Html.Loc as Q exposing (Loc)
|
||||
import Html.Parser exposing (Document, Node(..))
|
||||
import Parser exposing (DeadEnd)
|
||||
import Test exposing (..)
|
||||
|
||||
|
||||
config =
|
||||
Html.Parser.configWithCharRefs
|
||||
|
||||
|
||||
type alias DirCase =
|
||||
{ name : String
|
||||
, html : String
|
||||
@ -55,7 +60,7 @@ testTraversals cases =
|
||||
)
|
||||
|
||||
actual =
|
||||
Html.Parser.run x.html
|
||||
Html.Parser.run config x.html
|
||||
|> Result.toMaybe
|
||||
|> Maybe.andThen List.head
|
||||
|> Maybe.map Q.toLoc
|
||||
|
@ -2,11 +2,16 @@ module ParserTests exposing (..)
|
||||
|
||||
import Expect exposing (Expectation)
|
||||
import Fuzz exposing (Fuzzer, int, list, string)
|
||||
import Html.CharRefs
|
||||
import Html.Parser exposing (Document, Node(..))
|
||||
import Parser exposing (DeadEnd)
|
||||
import Test exposing (..)
|
||||
|
||||
|
||||
config =
|
||||
Html.Parser.configWithCharRefs
|
||||
|
||||
|
||||
testDoc : List ( String, String, Result (List DeadEnd) Document ) -> List Test
|
||||
testDoc cases =
|
||||
List.map
|
||||
@ -15,7 +20,7 @@ testDoc cases =
|
||||
(\_ ->
|
||||
let
|
||||
actual =
|
||||
Html.Parser.runDocument html
|
||||
Html.Parser.runDocument config html
|
||||
in
|
||||
case expected of
|
||||
Ok _ ->
|
||||
@ -41,7 +46,7 @@ testStringRoundtrip cases =
|
||||
(\_ ->
|
||||
let
|
||||
actual =
|
||||
Html.Parser.run html
|
||||
Html.Parser.run config html
|
||||
|> Result.map Html.Parser.nodesToString
|
||||
in
|
||||
case expected of
|
||||
@ -68,7 +73,7 @@ testAll cases =
|
||||
(\_ ->
|
||||
let
|
||||
actual =
|
||||
Html.Parser.run html
|
||||
Html.Parser.run config html
|
||||
in
|
||||
case expected of
|
||||
Err _ ->
|
||||
@ -395,13 +400,13 @@ scriptTests =
|
||||
testParseAll : String -> List Node -> (() -> Expectation)
|
||||
testParseAll s astList =
|
||||
\_ ->
|
||||
Expect.equal (Ok astList) (Html.Parser.run s)
|
||||
Expect.equal (Ok astList) (Html.Parser.run config s)
|
||||
|
||||
|
||||
testParse : String -> Node -> (() -> Expectation)
|
||||
testParse input expected =
|
||||
\_ ->
|
||||
case Html.Parser.run input of
|
||||
case Html.Parser.run config input of
|
||||
Err message ->
|
||||
Expect.fail (Parser.deadEndsToString message)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user