add a lot of documentation and also a 'parseDOM' function to directly go from Text to Nodes and Elements

This commit is contained in:
Alp Mestanogullari 2014-07-01 12:14:27 +02:00
parent 65c5288718
commit c586d9c632
8 changed files with 217 additions and 32 deletions

View File

@ -1,31 +1,54 @@
{-# LANGUAGE OverloadedStrings #-}
-- |
-- Module : Text.Taggy
-- Copyright : (c) 2014 Alp Mestanogullari
-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma
-- License : BSD3
-- Maintainer : alpmestan@gmail.com
-- Stability : experimental
--
-- ???
-- /taggy/ is a simple package for parsing HTML (and should work with XML)
-- written on top of the <http://hackage.haskell.org/package/attoparsec attoparsec>
-- library, which makes it one of the most efficient (space and time consumption wise)
-- on hackage.
--
-- This is the root module of /taggy/. It reexports everything
-- from the package. See each module's docs for details about
-- the functions and types involved in /taggy/.
--
-- While we've been testing the parser on /many/ pages, it may still
-- be a bit rough around the edges. Let us know on <http://github.com/alpmestan/taggy/issues github>
-- if you have any problem.
--
-- If you like to look at your HTML through
-- various optical instruments, feel free to take a look at
-- the companion <http://hackage.haskell.org/package/taggy-lens taggy-lens>
-- package we've put up together.
--
-- * If you want to parse a document as list of tags
-- and go through it as some kind of stream by just picking
-- what you need, head to "Text.Taggy.Parser" and take
-- a look at 'Text.Taggy.Parser.taggyWith' and
-- 'Text.Taggy.Parser.run'.
-- * If you want to parse the document as a DOM tree and
-- traverse it to find the information you need,
-- use 'Text.Taggy.DOM.parseDOM'. This is especially useful
-- when combined with the helpful combinators from
-- "Text.Taggy.Combinators".
-- * If you build some HTML manually
-- or just transform some existing DOM tree
-- and want to turn it into a 'Data.Text.Lazy.Text'
-- head to "Text.Taggy.Renderer" and look at 'Text.Taggy.Renderer.render'.
module Text.Taggy
( linksIn
, module Text.Taggy.Types
( -- * Exported modules
module Text.Taggy.Types
, module Text.Taggy.Parser
, module Text.Taggy.DOM
, module Text.Taggy.Combinators
, module Text.Taggy.Renderer
) where
import Data.Text (Text)
import Text.Taggy.Types
import Text.Taggy.Parser
import Text.Taggy.DOM
import Text.Taggy.Combinators
import Text.Taggy.Renderer
linksIn :: [Tag] -> [Text]
linksIn = map attrValue
. filter ((=="href") . attrKey)
. concat
. map attrs
. tagsNamed "a"

View File

@ -1,5 +1,13 @@
{-# LANGUAGE LambdaCase #-}
-- |
-- Module : Text.Taggy.DOM
-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma
-- License : BSD3
-- Maintainer : alpmestan@gmail.com
-- Stability : experimental
--
-- Many useful combinators for querying 'Element's
-- of a DOM tree.
module Text.Taggy.Combinators (hasName, hasAttr, getAttr, innerText, (//), (/&), (/*), trees, subtrees) where
import Prelude hiding (lookup)
@ -9,15 +17,26 @@ import Data.Text (Text)
import Text.Taggy.DOM (Element(..), Node(..), AttrName, AttrValue)
import Data.HashMap.Strict (lookup, keys)
-- | Does the given 'Element' have
-- the given name?
hasName :: Element -> Text -> Bool
hasName = (==) . eltName
-- | Does the given element have
-- an attribute with the given name (or /key/)
hasAttr :: Element -> AttrName -> Bool
hasAttr = flip elem . keys . eltAttrs
-- | Get the value for the given attribute name
-- in the given 'Element'. Returns 'Nothing' if
-- the provided 'Element' doesn't have an attribute
-- with that name.
getAttr :: Element -> AttrName -> Maybe AttrValue
getAttr = flip lookup . eltAttrs
-- | Get all the bits of raw text present
-- everywhere below the given 'Element'
-- in the DOM tree.
innerText :: Element -> Text
innerText = mconcat . map getContent . eltChildren
where getContent = \case { NodeElement e -> innerText e; NodeContent x -> x }

View File

@ -1,33 +1,75 @@
{-# LANGUAGE OverloadedStrings #-}
-- |
-- Module : Text.Taggy.DOM
-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma
-- License : BSD3
-- Maintainer : alpmestan@gmail.com
-- Stability : experimental
--
-- This module will help you represent
-- an HTML or XML document as a tree
-- and let you traverse it in whatever
-- way you like.
--
-- This is especially useful when used in
-- conjunction with <http://hackage.haskell.org/package/taggy-lens taggy-lens>.
module Text.Taggy.DOM where
import Data.HashMap.Strict (HashMap)
import Data.Monoid ((<>))
import Data.Text (Text)
import Text.Taggy.Parser (taggyWith)
import Text.Taggy.Types
import qualified Data.HashMap.Strict as HM
import qualified Data.Text.Lazy as LT
-- | An attribute name is just a 'Text' value
type AttrName = Text
-- | An attribute value is just a 'Text' value
type AttrValue = Text
-- | An 'Element' here refers to a tag name, the attributes
-- specified withing that tag, and all the children nodes
-- of that element. An 'Element' is basically anything but
-- \"raw\" content.
data Element =
Element { eltName :: !Text
, eltAttrs :: !(HashMap AttrName AttrValue)
, eltChildren :: [Node]
Element { eltName :: !Text -- ^ name of the element. e.g "a" for <a>
, eltAttrs :: !(HashMap AttrName AttrValue) -- ^ a (hash)map from attribute names to attribute values
, eltChildren :: [Node] -- ^ children 'Node's
}
deriving (Eq, Show)
-- | A 'Node' is either an 'Element' or some raw text.
data Node =
NodeElement Element
| NodeContent Text
deriving (Eq, Show)
-- | Get the children of a node.
--
-- If called on some raw text, this function returns @[]@.
nodeChildren :: Node -> [Node]
nodeChildren (NodeContent _) = []
nodeChildren (NodeElement e) = eltChildren e
-- | Parse an HTML or XML document
-- as a DOM tree.
--
-- The 'Bool' argument lets you specify
-- whether you want to convert HTML entities
-- to their corresponding unicode characters,
-- just like in "Text.Taggy.Parser".
--
-- > parseDOM convertEntities = domify . taggyWith cventities
parseDOM :: Bool -> LT.Text -> [Node]
parseDOM cventities =
domify . taggyWith cventities
-- | Transform a list of tags (produced with 'taggyWith')
-- into a list of toplevel nodes. If the document you're working
-- on is valid, there should only be one toplevel node, but let's
-- not assume we're living in an ideal world.
domify :: [Tag] -> [Node]
domify [] = []
domify (TagOpen name attribs True : tags)

View File

@ -1,6 +1,7 @@
{-# LANGUAGE OverloadedStrings #-}
module Text.Taggy.Entities where
module Text.Taggy.Entities
(convertEntities) where
import Control.Applicative
import Control.Monad
@ -9,6 +10,9 @@ import qualified Data.HashMap.Strict as HM
import qualified Data.Text as T
import qualified Data.Attoparsec.Text as Atto
-- | Convert all the (currently supported)
-- HTML entities to their corresponding
-- unicode characters.
convertEntities :: T.Text -> T.Text
convertEntities t =
either (const t) T.concat

View File

@ -6,8 +6,12 @@
-- Maintainer : alpmestan@gmail.com
-- Stability : experimental
--
-- ???
module Text.Taggy.Parser where
-- Parse an HTML or XML document as a list of 'Tag's
-- with 'taggyWith' or 'run'.
module Text.Taggy.Parser
( taggyWith
, run
) where
import Control.Applicative
import Data.Attoparsec.Combinator as Atto
@ -132,7 +136,7 @@ attributes cventities = postProcess `fmap` go emptyL
char '>'
return True
postProcess (l, b) = (toList l, b)
postProcess (l, b) = (toListL l, b)
attribute :: Bool -> Parser Attribute
attribute cventities = do

View File

@ -1,8 +1,14 @@
{-# LANGUAGE LambdaCase, RecordWildCards, FlexibleInstances, UndecidableInstances, OverloadedStrings #-}
module Text.Taggy.Renderer (
Renderable(..)
) where
-- |
-- Module : Text.Taggy.Renderer
-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma
-- License : BSD3
-- Maintainer : alpmestan@gmail.com
-- Stability : experimental
--
-- Render a DOM tree (from "Text.Taggy.DOM")
-- using the excellent blaze markup rendering library.
module Text.Taggy.Renderer where
import Data.Foldable (Foldable(foldMap))
import Data.HashMap.Strict (HashMap, foldlWithKey')
@ -24,11 +30,13 @@ class AsMarkup a where
-- conversion.
toMarkup :: Bool -> a -> Markup
-- | A 'Node' is convertible to 'Markup'
instance AsMarkup Node where
toMarkup convertEntities = \case
NodeContent text -> Content $ if convertEntities then Text text else PreEscaped (Text text)
NodeElement elmt -> toMarkup convertEntities elmt
-- | An 'Element' is convertible to 'Markup'
instance AsMarkup Element where
toMarkup convertEntities Element{..} = eltAttrs `toAttribute` Parent tag begin end kids
where tag = toStatic eltName

View File

@ -6,24 +6,67 @@
-- Maintainer : alpmestan@gmail.com
-- Stability : experimental
--
-- ???
module Text.Taggy.Types where
-- Core types of /taggy/.
module Text.Taggy.Types
( -- * 'Tag' type
Tag(..)
, tname
, isTagOpen
, isTagClose
, isTagText
, isTagComment
, isTagScript
, isTagStyle
, tagsNamed
, -- * 'Attribute's
Attribute(..)
, attrs
, attrKey
, attrValue
, -- * A small difference list implementation
L
, emptyL
, appL
, insertL
, singletonL
, toListL
) where
import Data.Text (Text, toCaseFold)
-- | An attribute is just an attribute name
-- and an attribute value.
data Attribute = Attribute !Text !Text
deriving (Show, Eq)
-- | Get the attributes of a 'Tag'.
attrs :: Tag -> [Attribute]
attrs (TagOpen _ as _) = as
attrs _ = []
-- | Get the name of an 'Attribute'.
attrKey :: Attribute -> Text
attrKey (Attribute k _) = k
-- | Get the value of an 'Attribute'.
attrValue :: Attribute -> Text
attrValue (Attribute _ v) = v
-- A 'Tag' can be one of the following types of tags:
--
-- * an opening tag that has a name, a list of attributes, and whether
-- it is a self-closing tag or not
-- * a closing tag with the name of the tag
-- * some raw 'Text'
-- * an HTML comment tag
-- * a @<script>...</script>@ tag
-- * a @<style>...</style>@ tag
--
-- The latter two are useful to be considered
-- separately in the parser and also lets you
-- collect these bits quite easily.
data Tag = TagOpen !Text [Attribute] !Bool -- is it a self-closing tag?
| TagClose !Text
| TagText !Text
@ -32,6 +75,9 @@ data Tag = TagOpen !Text [Attribute] !Bool -- is it a self-closing tag?
| TagStyle !Tag !Text !Tag
deriving (Show, Eq)
-- | Name of a 'Tag'.
--
-- > tname (TagClose "a") == "a"
tname :: Tag -> Text
tname (TagOpen n _ _) = n
tname (TagClose n) = n
@ -40,30 +86,37 @@ tname (TagComment _) = "<!-- -->"
tname (TagScript _ _ _) = "script"
tname (TagStyle _ _ _) = "style"
-- | Is this 'Tag' an opening tag?
isTagOpen :: Tag -> Bool
isTagOpen (TagOpen _ _ _) = True
isTagOpen _ = False
-- | Is this 'Tag' a closing tag?
isTagClose :: Tag -> Bool
isTagClose (TagClose _) = True
isTagClose _ = False
-- | Is this 'Tag' just some flat text?
isTagText :: Tag -> Bool
isTagText (TagText _) = True
isTagText _ = False
-- | Is this 'Tag' an HTML comment tag?
isTagComment :: Tag -> Bool
isTagComment (TagComment _) = True
isTagComment _ = False
-- | Is this 'Tag' a @<script>...</script>@ tag?
isTagScript :: Tag -> Bool
isTagScript (TagScript _ _ _) = True
isTagScript _ = False
-- | Is this 'Tag' a @<style>...</style>@ tag?
isTagStyle :: Tag -> Bool
isTagStyle (TagStyle _ _ _) = True
isTagStyle _ = False
-- | Get all the (opening) tags with the given name
tagsNamed :: Text -> [Tag] -> [Tag]
tagsNamed nam = filter (named nam)
@ -81,8 +134,8 @@ appL (L l1) (L l2) = L $ l1 . l2
singletonL :: a -> L a
singletonL x = L (x:)
toList :: L a -> [a]
toList (L f) = f []
toListL :: L a -> [a]
toListL (L f) = f []
insertL :: a -> L a -> L a
insertL x (L f) = L $ (x:) . f
insertL x (L f) = L $ (x:) . f

View File

@ -1,7 +1,39 @@
name: taggy
version: 0.1.0.0
synopsis: HTML parsing à la tagsoup using attoparsec
description: ???
synopsis: Efficient and simple HTML/XML parsing library
description:
/taggy/ is a simple package for parsing HTML (and should work with XML)
written on top of the <http://hackage.haskell.org/package/attoparsec attoparsec>
library, which makes it one of the most efficient (space and time consumption wise)
on hackage.
.
This is the root module of /taggy/. It reexports everything
from the package. See each module's docs for details about
the functions and types involved in /taggy/.
.
While we've been testing the parser on /many/ pages, it may still
be a bit rough around the edges. Let us know on <http://github.com/alpmestan/taggy/issues github>
if you have any problem.
.
If you like to look at your HTML through
various optical instruments, feel free to take a look at
the companion <http://hackage.haskell.org/package/taggy-lens taggy-lens>
package we've put up together.
.
* If you want to parse a document as list of tags
and go through it as some kind of stream by just picking
what you need, head to "Text.Taggy.Parser" and take
a look at 'Text.Taggy.Parser.taggyWith' and
'Text.Taggy.Parser.run'.
* If you want to parse the document as a DOM tree and
traverse it to find the information you need,
use 'Text.Taggy.DOM.parseDOM'. This is especially useful
when combined with the helpful combinators from
"Text.Taggy.Combinators".
* If you build some HTML manually
or just transform some existing DOM tree
and want to turn it into a 'Data.Text.Lazy.Text'
head to "Text.Taggy.Renderer" and look at 'Text.Taggy.Renderer.render'.
homepage: http://github.com/alpmestan/taggy
license: BSD3
license-file: LICENSE