improve the DOM module's correctness

This commit is contained in:
Alp Mestanogullari 2014-06-12 14:13:05 +02:00
parent 995796047e
commit af8e70bdd6
5 changed files with 144 additions and 36 deletions

View File

@ -9,15 +9,22 @@ import qualified Data.Text.Lazy.IO as T
main :: IO ()
main = do
args <- getArgs
case length args of
1 -> taggy (head args)
case args of
["--help"] -> usage
[opt, filename] | opt == "-d" || opt == "--dom" -> dom filename
[opt, filename] | opt == "-t" || opt == "--t" -> taggy filename
[filename] -> taggy filename
_ -> usage
usage :: IO ()
usage = do
putStrLn "taggy - simple and fast HTML parser"
putStrLn ""
putStrLn "Usage:\t taggy <HTML file name>"
putStrLn "Usage:\t taggy [OPTION] <HTML file name>"
putStrLn "\n"
putStrLn "Options are:"
putStrLn "\t -d/--dom\t Parse as a DOM tree. This isn't the default."
putStrLn "\t -t/--tags\t Parse as a list of opening/closing/text/comment/script/style tags"
taggy :: FilePath -> IO ()
taggy fp = do
@ -25,3 +32,8 @@ taggy fp = do
either (\s -> putStrLn $ "couldn't parse: " ++ s)
(mapM_ print)
(eitherResult $ run content)
dom :: FilePath -> IO ()
dom fp = do
content <- T.readFile fp
mapM_ print . g' $ tagsIn content

View File

@ -2,45 +2,128 @@
module Text.Taggy.DOM where
import Data.HashMap.Strict (HashMap)
import Data.Monoid ((<>))
import Data.Text (Text)
import Text.Taggy.Types
-- TODO: provide lenses for this
data Tree = Branch !Text [Attribute] [Tree]
| Leaf !Tag
import qualified Data.HashMap.Strict as HM
type AttrName = Text
type AttrValue = Text
data Element =
Element { eltName :: !Text
, eltAttrs :: !(HashMap AttrName AttrValue)
, eltChildren :: [Node]
}
deriving (Eq, Show)
domify :: [Tag] -> [Tree]
data Node =
NodeElement Element
| NodeContent Text
deriving (Eq, Show)
nodeChildren :: Node -> [Node]
nodeChildren (NodeContent _) = []
nodeChildren (NodeElement e) = eltChildren e
domify :: [Tag] -> [Node]
domify [] = []
domify xs = go xs
domify (TagOpen name attribs True : tags)
= NodeElement (Element name as []) : domify tags
where go [] = []
go ts = a ++ map Leaf (take 1 b) ++ go (drop 1 b)
where (a, b) = f ts
where as = HM.fromListWith (\v1 v2 -> v1 <> " " <> v2)
. map attrToPair $ attribs
f (TagScript tago scr tagc : rest) =
f $ [tago, TagText scr, tagc] ++ rest
attrToPair (Attribute k v) = (k, v)
f (TagStyle tago sty tagc : rest) =
f $ [tago, TagText sty, tagc] ++ rest
domify (TagText txt : tags)
= NodeContent txt : domify tags
domify (TagOpen name attribs False : tags)
= NodeElement (Element name as cs) : domify unusedTags
f (TagOpen name as autocl : rest) =
case f rest of
(inner, []) -> ( Leaf (TagOpen name as autocl) : inner
, []
)
(inner, TagClose x : ts)
| x == name ->
let (a, b) = f ts in
(Branch name as inner : a, b)
| otherwise ->
( Leaf (TagOpen name as autocl) : inner
, TagClose x : ts
)
_ -> error "Text.Taggy.DOM.domify: shouldn't happen"
where (cs, unusedTags) = untilClosed name ([], tags)
as = HM.fromListWith (\v1 v2 -> v1 <> " " <> v2)
. map attrToPair $ attribs
attrToPair (Attribute k v) = (k, v)
domify (TagClose _ : tags) = domify tags
domify (TagComment _ : tags) = domify tags
domify (TagScript tago scr tagc : tags) =
domify $ [tago, TagText scr, tagc] ++ tags
domify (TagStyle tago sty tagc : tags) =
domify $ [tago, TagText sty, tagc] ++ tags
untilClosed :: Text -> ([Node], [Tag]) -> ([Node], [Tag])
untilClosed name (cousins, TagClose n : ts)
| n == name = (cousins, ts)
| otherwise = untilClosed name ( cousins
, TagOpen n [] False
: TagClose n
: ts )
untilClosed name (cousins, TagText t : ts)
= let (cousins', ts') = untilClosed name (cousins, ts)
cousins'' = convertText t : cousins'
in (cousins++cousins'', ts')
untilClosed name (cousins, TagComment _ : ts)
= untilClosed name (cousins, ts)
untilClosed name (cousins, TagOpen n as True : ts)
= let (cousins', ts') = untilClosed name (cousins, ts)
elt = Element n as' []
cousins'' = NodeElement elt : cousins'
in (cousins++cousins'', ts')
where as' = HM.fromListWith (\v1 v2 -> v1 <> " " <> v2)
. map attrToPair $ as
attrToPair (Attribute k v) = (k, v)
untilClosed name (cousins, TagOpen n as False : ts)
= let (insideNew, ts') = untilClosed n ([], ts)
(cousins', ts'') = untilClosed name (cousins, ts')
elt = Element n as' insideNew
cousins'' = NodeElement elt : cousins'
in (cousins'', ts'')
where as' = HM.fromListWith (\v1 v2 -> v1 <> " " <> v2)
. map attrToPair $ as
attrToPair (Attribute k v) = (k, v)
untilClosed name (cousins, TagScript tago scr _ : ts)
= let (TagOpen n at _) = tago
(cousins', ts') = untilClosed name (cousins, ts)
cousins'' = NodeElement (Element n (at' at) [NodeContent scr]) : cousins'
in (cousins++cousins'', ts')
where at' at = HM.fromListWith (\v1 v2 -> v1 <> " " <> v2)
. map attrToPair $ at
attrToPair (Attribute k v) = (k, v)
untilClosed name (cousins, TagStyle tago sty _ : ts)
= let (TagOpen n at _) = tago
(cousins', ts') = untilClosed name (cousins, ts)
cousins'' = NodeElement (Element n (at' at) [NodeContent sty]) : cousins'
in (cousins++cousins'', ts')
where at' at = HM.fromListWith (\v1 v2 -> v1 <> " " <> v2)
. map attrToPair $ at
attrToPair (Attribute k v) = (k, v)
untilClosed _ (cs, []) = (cs, [])
convertText :: Text -> Node
convertText t = NodeContent t
f (TagClose x:ts) = ([], TagClose x : ts)
f (x:ts) = (Leaf x : a, b)
where (a,b) = f ts
f [] = ([], [])

View File

@ -144,7 +144,11 @@ html = go
(t:) `fmap` go
tag :: Parser Tag
tag = tagcomment
tag = skipSpace >> tag'
tag' :: Parser Tag
tag' =
tagcomment
<|> tagscript
<|> tagstyle
<|> tagopen

View File

@ -1,3 +1,4 @@
{-# LANGUAGE OverloadedStrings #-}
-- |
-- Module : Text.Taggy.Types
-- Copyright : (c) 2014 Alp Mestanogullari
@ -31,6 +32,14 @@ data Tag = TagOpen !Text [Attribute] !Bool -- is it a self-closing tag?
| TagStyle !Tag !Text !Tag
deriving (Show, Eq)
tname :: Tag -> Text
tname (TagOpen n _ _) = n
tname (TagClose n) = n
tname (TagText _) = ""
tname (TagComment _) = "<!-- -->"
tname (TagScript _ _ _) = "script"
tname (TagStyle _ _ _) = "style"
isTagOpen :: Tag -> Bool
isTagOpen (TagOpen _ _ _) = True
isTagOpen _ = False

View File

@ -22,8 +22,8 @@ library
build-depends: base >=4.5 && <5,
text >=1 && <1.2,
attoparsec >=0.11 && <0.13,
vector,
tagsoup
vector >=0.7,
unordered-containers >= 0.2 && <0.3
hs-source-dirs: src
default-language: Haskell2010
ghc-options: -Wall -O2 -fno-warn-unused-do-bind -funbox-strict-fields