From c586d9c6328b2e726bf4ac70db752278dedf250e Mon Sep 17 00:00:00 2001 From: Alp Mestanogullari Date: Tue, 1 Jul 2014 12:14:27 +0200 Subject: [PATCH] add a lot of documentation and also a 'parseDOM' function to directly go from Text to Nodes and Elements --- src/Text/Taggy.hs | 47 +++++++++++++++++++------- src/Text/Taggy/Combinators.hs | 21 +++++++++++- src/Text/Taggy/DOM.hs | 50 ++++++++++++++++++++++++--- src/Text/Taggy/Entities.hs | 6 +++- src/Text/Taggy/Parser.hs | 10 ++++-- src/Text/Taggy/Renderer.hs | 16 ++++++--- src/Text/Taggy/Types.hs | 63 ++++++++++++++++++++++++++++++++--- taggy.cabal | 36 ++++++++++++++++++-- 8 files changed, 217 insertions(+), 32 deletions(-) diff --git a/src/Text/Taggy.hs b/src/Text/Taggy.hs index 303990f..05fec76 100644 --- a/src/Text/Taggy.hs +++ b/src/Text/Taggy.hs @@ -1,31 +1,54 @@ {-# LANGUAGE OverloadedStrings #-} -- | -- Module : Text.Taggy --- Copyright : (c) 2014 Alp Mestanogullari +-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma -- License : BSD3 -- Maintainer : alpmestan@gmail.com -- Stability : experimental -- --- ??? +-- /taggy/ is a simple package for parsing HTML (and should work with XML) +-- written on top of the +-- library, which makes it one of the most efficient (space and time consumption wise) +-- on hackage. +-- +-- This is the root module of /taggy/. It reexports everything +-- from the package. See each module's docs for details about +-- the functions and types involved in /taggy/. +-- +-- While we've been testing the parser on /many/ pages, it may still +-- be a bit rough around the edges. Let us know on +-- if you have any problem. +-- +-- If you like to look at your HTML through +-- various optical instruments, feel free to take a look at +-- the companion +-- package we've put up together. 
+-- +-- * If you want to parse a document as a list of tags +-- and go through it as some kind of stream by just picking +-- what you need, head to "Text.Taggy.Parser" and take +-- a look at 'Text.Taggy.Parser.taggyWith' and +-- 'Text.Taggy.Parser.run'. +-- * If you want to parse the document as a DOM tree and +-- traverse it to find the information you need, +-- use 'Text.Taggy.DOM.parseDOM'. This is especially useful +-- when combined with the helpful combinators from +-- "Text.Taggy.Combinators". +-- * If you build some HTML manually +-- or just transform some existing DOM tree +-- and want to turn it into a 'Data.Text.Lazy.Text' +-- head to "Text.Taggy.Renderer" and look at 'Text.Taggy.Renderer.render'. module Text.Taggy - ( linksIn - , module Text.Taggy.Types + ( -- * Exported modules + module Text.Taggy.Types , module Text.Taggy.Parser , module Text.Taggy.DOM , module Text.Taggy.Combinators , module Text.Taggy.Renderer ) where -import Data.Text (Text) import Text.Taggy.Types import Text.Taggy.Parser import Text.Taggy.DOM import Text.Taggy.Combinators import Text.Taggy.Renderer - -linksIn :: [Tag] -> [Text] -linksIn = map attrValue - . filter ((=="href") . attrKey) - . concat - . map attrs - . tagsNamed "a" diff --git a/src/Text/Taggy/Combinators.hs b/src/Text/Taggy/Combinators.hs index cbb0522..e2cfb90 100644 --- a/src/Text/Taggy/Combinators.hs +++ b/src/Text/Taggy/Combinators.hs @@ -1,5 +1,13 @@ {-# LANGUAGE LambdaCase #-} - +-- | +-- Module : Text.Taggy.Combinators +-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma +-- License : BSD3 +-- Maintainer : alpmestan@gmail.com +-- Stability : experimental +-- +-- Many useful combinators for querying 'Element's +-- of a DOM tree.
module Text.Taggy.Combinators (hasName, hasAttr, getAttr, innerText, (//), (/&), (/*), trees, subtrees) where import Prelude hiding (lookup) @@ -9,15 +17,26 @@ import Data.Text (Text) import Text.Taggy.DOM (Element(..), Node(..), AttrName, AttrValue) import Data.HashMap.Strict (lookup, keys) +-- | Does the given 'Element' have +-- the given name? hasName :: Element -> Text -> Bool hasName = (==) . eltName +-- | Does the given element have +-- an attribute with the given name (or /key/) hasAttr :: Element -> AttrName -> Bool hasAttr = flip elem . keys . eltAttrs +-- | Get the value for the given attribute name +-- in the given 'Element'. Returns 'Nothing' if +-- the provided 'Element' doesn't have an attribute +-- with that name. getAttr :: Element -> AttrName -> Maybe AttrValue getAttr = flip lookup . eltAttrs +-- | Get all the bits of raw text present +-- everywhere below the given 'Element' +-- in the DOM tree. innerText :: Element -> Text innerText = mconcat . map getContent . eltChildren where getContent = \case { NodeElement e -> innerText e; NodeContent x -> x } diff --git a/src/Text/Taggy/DOM.hs b/src/Text/Taggy/DOM.hs index 4c46fc6..231611a 100644 --- a/src/Text/Taggy/DOM.hs +++ b/src/Text/Taggy/DOM.hs @@ -1,33 +1,75 @@ {-# LANGUAGE OverloadedStrings #-} - +-- | +-- Module : Text.Taggy.DOM +-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma +-- License : BSD3 +-- Maintainer : alpmestan@gmail.com +-- Stability : experimental +-- +-- This module will help you represent +-- an HTML or XML document as a tree +-- and let you traverse it in whatever +-- way you like. +-- +-- This is especially useful when used in +-- conjunction with . 
module Text.Taggy.DOM where import Data.HashMap.Strict (HashMap) import Data.Monoid ((<>)) import Data.Text (Text) +import Text.Taggy.Parser (taggyWith) import Text.Taggy.Types import qualified Data.HashMap.Strict as HM +import qualified Data.Text.Lazy as LT +-- | An attribute name is just a 'Text' value type AttrName = Text +-- | An attribute value is just a 'Text' value type AttrValue = Text +-- | An 'Element' here refers to a tag name, the attributes +-- specified within that tag, and all the children nodes +-- of that element. An 'Element' is basically anything but +-- \"raw\" content. data Element = - Element { eltName :: !Text - , eltAttrs :: !(HashMap AttrName AttrValue) - , eltChildren :: [Node] + Element { eltName :: !Text -- ^ name of the element, e.g. "a" for an @\<a\>@ tag + , eltAttrs :: !(HashMap AttrName AttrValue) -- ^ a (hash)map from attribute names to attribute values + , eltChildren :: [Node] -- ^ children 'Node's } deriving (Eq, Show) +-- | A 'Node' is either an 'Element' or some raw text. data Node = NodeElement Element | NodeContent Text deriving (Eq, Show) +-- | Get the children of a node. +-- +-- If called on some raw text, this function returns @[]@. nodeChildren :: Node -> [Node] nodeChildren (NodeContent _) = [] nodeChildren (NodeElement e) = eltChildren e +-- | Parse an HTML or XML document +-- as a DOM tree. +-- +-- The 'Bool' argument lets you specify +-- whether you want to convert HTML entities +-- to their corresponding unicode characters, +-- just like in "Text.Taggy.Parser". +-- +-- > parseDOM cventities = domify . taggyWith cventities +parseDOM :: Bool -> LT.Text -> [Node] +parseDOM cventities = + domify . taggyWith cventities + +-- | Transform a list of tags (produced with 'taggyWith') +-- into a list of toplevel nodes. If the document you're working +-- on is valid, there should only be one toplevel node, but let's +-- not assume we're living in an ideal world.
domify :: [Tag] -> [Node] domify [] = [] domify (TagOpen name attribs True : tags) diff --git a/src/Text/Taggy/Entities.hs b/src/Text/Taggy/Entities.hs index f4e4514..fcd036d 100644 --- a/src/Text/Taggy/Entities.hs +++ b/src/Text/Taggy/Entities.hs @@ -1,6 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} -module Text.Taggy.Entities where +module Text.Taggy.Entities + (convertEntities) where import Control.Applicative import Control.Monad @@ -9,6 +10,9 @@ import qualified Data.HashMap.Strict as HM import qualified Data.Text as T import qualified Data.Attoparsec.Text as Atto +-- | Convert all the (currently supported) +-- HTML entities to their corresponding +-- unicode characters. convertEntities :: T.Text -> T.Text convertEntities t = either (const t) T.concat diff --git a/src/Text/Taggy/Parser.hs b/src/Text/Taggy/Parser.hs index 3c4f943..e43fe99 100644 --- a/src/Text/Taggy/Parser.hs +++ b/src/Text/Taggy/Parser.hs @@ -6,8 +6,12 @@ -- Maintainer : alpmestan@gmail.com -- Stability : experimental -- --- ??? -module Text.Taggy.Parser where +-- Parse an HTML or XML document as a list of 'Tag's +-- with 'taggyWith' or 'run'. +module Text.Taggy.Parser + ( taggyWith + , run + ) where import Control.Applicative import Data.Attoparsec.Combinator as Atto @@ -132,7 +136,7 @@ attributes cventities = postProcess `fmap` go emptyL char '>' return True - postProcess (l, b) = (toList l, b) + postProcess (l, b) = (toListL l, b) attribute :: Bool -> Parser Attribute attribute cventities = do diff --git a/src/Text/Taggy/Renderer.hs b/src/Text/Taggy/Renderer.hs index da13c0b..7263bbd 100644 --- a/src/Text/Taggy/Renderer.hs +++ b/src/Text/Taggy/Renderer.hs @@ -1,8 +1,14 @@ {-# LANGUAGE LambdaCase, RecordWildCards, FlexibleInstances, UndecidableInstances, OverloadedStrings #-} - -module Text.Taggy.Renderer ( - Renderable(..) 
-) where +-- | +-- Module : Text.Taggy.Renderer +-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma +-- License : BSD3 +-- Maintainer : alpmestan@gmail.com +-- Stability : experimental +-- +-- Render a DOM tree (from "Text.Taggy.DOM") +-- using the excellent blaze markup rendering library. +module Text.Taggy.Renderer where import Data.Foldable (Foldable(foldMap)) import Data.HashMap.Strict (HashMap, foldlWithKey') @@ -24,11 +30,13 @@ class AsMarkup a where -- conversion. toMarkup :: Bool -> a -> Markup +-- | A 'Node' is convertible to 'Markup' instance AsMarkup Node where toMarkup convertEntities = \case NodeContent text -> Content $ if convertEntities then Text text else PreEscaped (Text text) NodeElement elmt -> toMarkup convertEntities elmt +-- | An 'Element' is convertible to 'Markup' instance AsMarkup Element where toMarkup convertEntities Element{..} = eltAttrs `toAttribute` Parent tag begin end kids where tag = toStatic eltName diff --git a/src/Text/Taggy/Types.hs b/src/Text/Taggy/Types.hs index d77cff6..0071f95 100644 --- a/src/Text/Taggy/Types.hs +++ b/src/Text/Taggy/Types.hs @@ -6,24 +6,67 @@ -- Maintainer : alpmestan@gmail.com -- Stability : experimental -- --- ??? -module Text.Taggy.Types where +-- Core types of /taggy/. +module Text.Taggy.Types + ( -- * 'Tag' type + Tag(..) + , tname + , isTagOpen + , isTagClose + , isTagText + , isTagComment + , isTagScript + , isTagStyle + , tagsNamed + + , -- * 'Attribute's + Attribute(..) + , attrs + , attrKey + , attrValue + + , -- * A small difference list implementation + L + , emptyL + , appL + , insertL + , singletonL + , toListL + ) where import Data.Text (Text, toCaseFold) +-- | An attribute is just an attribute name +-- and an attribute value. data Attribute = Attribute !Text !Text deriving (Show, Eq) +-- | Get the attributes of a 'Tag'. attrs :: Tag -> [Attribute] attrs (TagOpen _ as _) = as attrs _ = [] +-- | Get the name of an 'Attribute'. 
attrKey :: Attribute -> Text attrKey (Attribute k _) = k +-- | Get the value of an 'Attribute'. attrValue :: Attribute -> Text attrValue (Attribute _ v) = v +-- | A 'Tag' can be one of the following types of tags: +-- +-- * an opening tag that has a name, a list of attributes, and whether +-- it is a self-closing tag or not +-- * a closing tag with the name of the tag +-- * some raw 'Text' +-- * an HTML comment tag +-- * a @\<script\>@ tag +-- * a @\<style\>@ tag +-- +-- The latter two are useful to consider +-- separately in the parser, which also lets you +-- collect these bits quite easily. data Tag = TagOpen !Text [Attribute] !Bool -- is it a self-closing tag? | TagClose !Text | TagText !Text @@ -32,6 +75,9 @@ data Tag = TagOpen !Text [Attribute] !Bool -- is it a self-closing tag? | TagStyle !Tag !Text !Tag deriving (Show, Eq) +-- | Name of a 'Tag'. +-- +-- > tname (TagClose "a") == "a" tname :: Tag -> Text tname (TagOpen n _ _) = n tname (TagClose n) = n @@ -40,30 +86,37 @@ tname (TagComment _) = "" tname (TagScript _ _ _) = "script" tname (TagStyle _ _ _) = "style" +-- | Is this 'Tag' an opening tag? isTagOpen :: Tag -> Bool isTagOpen (TagOpen _ _ _) = True isTagOpen _ = False +-- | Is this 'Tag' a closing tag? isTagClose :: Tag -> Bool isTagClose (TagClose _) = True isTagClose _ = False +-- | Is this 'Tag' just some flat text? isTagText :: Tag -> Bool isTagText (TagText _) = True isTagText _ = False +-- | Is this 'Tag' an HTML comment tag? isTagComment :: Tag -> Bool isTagComment (TagComment _) = True isTagComment _ = False +-- | Is this 'Tag' a @\<script\>@ tag? isTagScript :: Tag -> Bool isTagScript (TagScript _ _ _) = True isTagScript _ = False +-- | Is this 'Tag' a @\<style\>@ tag? isTagStyle :: Tag -> Bool isTagStyle (TagStyle _ _ _) = True isTagStyle _ = False +-- | Get all the (opening) tags with the given name tagsNamed :: Text -> [Tag] -> [Tag] tagsNamed nam = filter (named nam) @@ -81,8 +134,8 @@ appL (L l1) (L l2) = L $ l1 .
l2 singletonL :: a -> L a singletonL x = L (x:) -toList :: L a -> [a] -toList (L f) = f [] +toListL :: L a -> [a] +toListL (L f) = f [] insertL :: a -> L a -> L a -insertL x (L f) = L $ (x:) . f \ No newline at end of file +insertL x (L f) = L $ (x:) . f diff --git a/taggy.cabal b/taggy.cabal index 19714cb..d531013 100644 --- a/taggy.cabal +++ b/taggy.cabal @@ -1,7 +1,39 @@ name: taggy version: 0.1.0.0 -synopsis: HTML parsing à la tagsoup using attoparsec -description: ??? +synopsis: Efficient and simple HTML/XML parsing library +description: + /taggy/ is a simple package for parsing HTML (and should work with XML) + written on top of the + library, which makes it one of the most efficient (space and time consumption wise) + on hackage. + . + This is the root module of /taggy/. It reexports everything + from the package. See each module's docs for details about + the functions and types involved in /taggy/. + . + While we've been testing the parser on /many/ pages, it may still + be a bit rough around the edges. Let us know on + if you have any problem. + . + If you like to look at your HTML through + various optical instruments, feel free to take a look at + the companion + package we've put up together. + . + * If you want to parse a document as list of tags + and go through it as some kind of stream by just picking + what you need, head to "Text.Taggy.Parser" and take + a look at 'Text.Taggy.Parser.taggyWith' and + 'Text.Taggy.Parser.run'. + * If you want to parse the document as a DOM tree and + traverse it to find the information you need, + use 'Text.Taggy.DOM.parseDOM'. This is especially useful + when combined with the helpful combinators from + "Text.Taggy.Combinators". + * If you build some HTML manually + or just transform some existing DOM tree + and want to turn it into a 'Data.Text.Lazy.Text' + head to "Text.Taggy.Renderer" and look at 'Text.Taggy.Renderer.render'. homepage: http://github.com/alpmestan/taggy license: BSD3 license-file: LICENSE