mirror of
https://github.com/alpmestan/taggy.git
synced 2024-08-16 10:20:30 +03:00
add a lot of documentation and also a 'parseDOM' function to directly go from Text to Nodes and Elements
This commit is contained in:
parent
65c5288718
commit
c586d9c632
@ -1,31 +1,54 @@
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
-- |
|
||||
-- Module : Text.Taggy
|
||||
-- Copyright : (c) 2014 Alp Mestanogullari
|
||||
-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma
|
||||
-- License : BSD3
|
||||
-- Maintainer : alpmestan@gmail.com
|
||||
-- Stability : experimental
|
||||
--
|
||||
-- ???
|
||||
-- /taggy/ is a simple package for parsing HTML (and should work with XML)
|
||||
-- written on top of the <http://hackage.haskell.org/package/attoparsec attoparsec>
|
||||
-- library, which makes it one of the most efficient (space and time consumption wise)
|
||||
-- on hackage.
|
||||
--
|
||||
-- This is the root module of /taggy/. It reexports everything
|
||||
-- from the package. See each module's docs for details about
|
||||
-- the functions and types involved in /taggy/.
|
||||
--
|
||||
-- While we've been testing the parser on /many/ pages, it may still
|
||||
-- be a bit rough around the edges. Let us know on <http://github.com/alpmestan/taggy/issues github>
|
||||
-- if you have any problem.
|
||||
--
|
||||
-- If you like to look at your HTML through
|
||||
-- various optical instruments, feel free to take a look at
|
||||
-- the companion <http://hackage.haskell.org/package/taggy-lens taggy-lens>
|
||||
-- package we've put up together.
|
||||
--
|
||||
-- * If you want to parse a document as list of tags
|
||||
-- and go through it as some kind of stream by just picking
|
||||
-- what you need, head to "Text.Taggy.Parser" and take
|
||||
-- a look at 'Text.Taggy.Parser.taggyWith' and
|
||||
-- 'Text.Taggy.Parser.run'.
|
||||
-- * If you want to parse the document as a DOM tree and
|
||||
-- traverse it to find the information you need,
|
||||
-- use 'Text.Taggy.DOM.parseDOM'. This is especially useful
|
||||
-- when combined with the helpful combinators from
|
||||
-- "Text.Taggy.Combinators".
|
||||
-- * If you build some HTML manually
|
||||
-- or just transform some existing DOM tree
|
||||
-- and want to turn it into a 'Data.Text.Lazy.Text'
|
||||
-- head to "Text.Taggy.Renderer" and look at 'Text.Taggy.Renderer.render'.
|
||||
module Text.Taggy
|
||||
( linksIn
|
||||
, module Text.Taggy.Types
|
||||
( -- * Exported modules
|
||||
module Text.Taggy.Types
|
||||
, module Text.Taggy.Parser
|
||||
, module Text.Taggy.DOM
|
||||
, module Text.Taggy.Combinators
|
||||
, module Text.Taggy.Renderer
|
||||
) where
|
||||
|
||||
import Data.Text (Text)
|
||||
import Text.Taggy.Types
|
||||
import Text.Taggy.Parser
|
||||
import Text.Taggy.DOM
|
||||
import Text.Taggy.Combinators
|
||||
import Text.Taggy.Renderer
|
||||
|
||||
linksIn :: [Tag] -> [Text]
|
||||
linksIn = map attrValue
|
||||
. filter ((=="href") . attrKey)
|
||||
. concat
|
||||
. map attrs
|
||||
. tagsNamed "a"
|
||||
|
@ -1,5 +1,13 @@
|
||||
{-# LANGUAGE LambdaCase #-}
|
||||
|
||||
-- |
|
||||
-- Module : Text.Taggy.DOM
|
||||
-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma
|
||||
-- License : BSD3
|
||||
-- Maintainer : alpmestan@gmail.com
|
||||
-- Stability : experimental
|
||||
--
|
||||
-- Many useful combinators for querying 'Element's
|
||||
-- of a DOM tree.
|
||||
module Text.Taggy.Combinators (hasName, hasAttr, getAttr, innerText, (//), (/&), (/*), trees, subtrees) where
|
||||
|
||||
import Prelude hiding (lookup)
|
||||
@ -9,15 +17,26 @@ import Data.Text (Text)
|
||||
import Text.Taggy.DOM (Element(..), Node(..), AttrName, AttrValue)
|
||||
import Data.HashMap.Strict (lookup, keys)
|
||||
|
||||
-- | Does the given 'Element' have
|
||||
-- the given name?
|
||||
hasName :: Element -> Text -> Bool
|
||||
hasName = (==) . eltName
|
||||
|
||||
-- | Does the given element have
|
||||
-- an attribute with the given name (or /key/)
|
||||
hasAttr :: Element -> AttrName -> Bool
|
||||
hasAttr = flip elem . keys . eltAttrs
|
||||
|
||||
-- | Get the value for the given attribute name
|
||||
-- in the given 'Element'. Returns 'Nothing' if
|
||||
-- the provided 'Element' doesn't have an attribute
|
||||
-- with that name.
|
||||
getAttr :: Element -> AttrName -> Maybe AttrValue
|
||||
getAttr = flip lookup . eltAttrs
|
||||
|
||||
-- | Get all the bits of raw text present
|
||||
-- everywhere below the given 'Element'
|
||||
-- in the DOM tree.
|
||||
innerText :: Element -> Text
|
||||
innerText = mconcat . map getContent . eltChildren
|
||||
where getContent = \case { NodeElement e -> innerText e; NodeContent x -> x }
|
||||
|
@ -1,33 +1,75 @@
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
-- |
|
||||
-- Module : Text.Taggy.DOM
|
||||
-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma
|
||||
-- License : BSD3
|
||||
-- Maintainer : alpmestan@gmail.com
|
||||
-- Stability : experimental
|
||||
--
|
||||
-- This module will help you represent
|
||||
-- an HTML or XML document as a tree
|
||||
-- and let you traverse it in whatever
|
||||
-- way you like.
|
||||
--
|
||||
-- This is especially useful when used in
|
||||
-- conjunction with <http://hackage.haskell.org/package/taggy-lens taggy-lens>.
|
||||
module Text.Taggy.DOM where
|
||||
|
||||
import Data.HashMap.Strict (HashMap)
|
||||
import Data.Monoid ((<>))
|
||||
import Data.Text (Text)
|
||||
import Text.Taggy.Parser (taggyWith)
|
||||
import Text.Taggy.Types
|
||||
|
||||
import qualified Data.HashMap.Strict as HM
|
||||
import qualified Data.Text.Lazy as LT
|
||||
|
||||
-- | An attribute name is just a 'Text' value
|
||||
type AttrName = Text
|
||||
-- | An attribute value is just a 'Text' value
|
||||
type AttrValue = Text
|
||||
|
||||
-- | An 'Element' here refers to a tag name, the attributes
|
||||
-- specified withing that tag, and all the children nodes
|
||||
-- of that element. An 'Element' is basically anything but
|
||||
-- \"raw\" content.
|
||||
data Element =
|
||||
Element { eltName :: !Text
|
||||
, eltAttrs :: !(HashMap AttrName AttrValue)
|
||||
, eltChildren :: [Node]
|
||||
Element { eltName :: !Text -- ^ name of the element. e.g "a" for <a>
|
||||
, eltAttrs :: !(HashMap AttrName AttrValue) -- ^ a (hash)map from attribute names to attribute values
|
||||
, eltChildren :: [Node] -- ^ children 'Node's
|
||||
}
|
||||
deriving (Eq, Show)
|
||||
|
||||
-- | A 'Node' is either an 'Element' or some raw text.
|
||||
data Node =
|
||||
NodeElement Element
|
||||
| NodeContent Text
|
||||
deriving (Eq, Show)
|
||||
|
||||
-- | Get the children of a node.
|
||||
--
|
||||
-- If called on some raw text, this function returns @[]@.
|
||||
nodeChildren :: Node -> [Node]
|
||||
nodeChildren (NodeContent _) = []
|
||||
nodeChildren (NodeElement e) = eltChildren e
|
||||
|
||||
-- | Parse an HTML or XML document
|
||||
-- as a DOM tree.
|
||||
--
|
||||
-- The 'Bool' argument lets you specify
|
||||
-- whether you want to convert HTML entities
|
||||
-- to their corresponding unicode characters,
|
||||
-- just like in "Text.Taggy.Parser".
|
||||
--
|
||||
-- > parseDOM convertEntities = domify . taggyWith cventities
|
||||
parseDOM :: Bool -> LT.Text -> [Node]
|
||||
parseDOM cventities =
|
||||
domify . taggyWith cventities
|
||||
|
||||
-- | Transform a list of tags (produced with 'taggyWith')
|
||||
-- into a list of toplevel nodes. If the document you're working
|
||||
-- on is valid, there should only be one toplevel node, but let's
|
||||
-- not assume we're living in an ideal world.
|
||||
domify :: [Tag] -> [Node]
|
||||
domify [] = []
|
||||
domify (TagOpen name attribs True : tags)
|
||||
|
@ -1,6 +1,7 @@
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
|
||||
module Text.Taggy.Entities where
|
||||
module Text.Taggy.Entities
|
||||
(convertEntities) where
|
||||
|
||||
import Control.Applicative
|
||||
import Control.Monad
|
||||
@ -9,6 +10,9 @@ import qualified Data.HashMap.Strict as HM
|
||||
import qualified Data.Text as T
|
||||
import qualified Data.Attoparsec.Text as Atto
|
||||
|
||||
-- | Convert all the (currently supported)
|
||||
-- HTML entities to their corresponding
|
||||
-- unicode characters.
|
||||
convertEntities :: T.Text -> T.Text
|
||||
convertEntities t =
|
||||
either (const t) T.concat
|
||||
|
@ -6,8 +6,12 @@
|
||||
-- Maintainer : alpmestan@gmail.com
|
||||
-- Stability : experimental
|
||||
--
|
||||
-- ???
|
||||
module Text.Taggy.Parser where
|
||||
-- Parse an HTML or XML document as a list of 'Tag's
|
||||
-- with 'taggyWith' or 'run'.
|
||||
module Text.Taggy.Parser
|
||||
( taggyWith
|
||||
, run
|
||||
) where
|
||||
|
||||
import Control.Applicative
|
||||
import Data.Attoparsec.Combinator as Atto
|
||||
@ -132,7 +136,7 @@ attributes cventities = postProcess `fmap` go emptyL
|
||||
char '>'
|
||||
return True
|
||||
|
||||
postProcess (l, b) = (toList l, b)
|
||||
postProcess (l, b) = (toListL l, b)
|
||||
|
||||
attribute :: Bool -> Parser Attribute
|
||||
attribute cventities = do
|
||||
|
@ -1,8 +1,14 @@
|
||||
{-# LANGUAGE LambdaCase, RecordWildCards, FlexibleInstances, UndecidableInstances, OverloadedStrings #-}
|
||||
|
||||
module Text.Taggy.Renderer (
|
||||
Renderable(..)
|
||||
) where
|
||||
-- |
|
||||
-- Module : Text.Taggy.Renderer
|
||||
-- Copyright : (c) 2014 Alp Mestanogullari, Vikram Verma
|
||||
-- License : BSD3
|
||||
-- Maintainer : alpmestan@gmail.com
|
||||
-- Stability : experimental
|
||||
--
|
||||
-- Render a DOM tree (from "Text.Taggy.DOM")
|
||||
-- using the excellent blaze markup rendering library.
|
||||
module Text.Taggy.Renderer where
|
||||
|
||||
import Data.Foldable (Foldable(foldMap))
|
||||
import Data.HashMap.Strict (HashMap, foldlWithKey')
|
||||
@ -24,11 +30,13 @@ class AsMarkup a where
|
||||
-- conversion.
|
||||
toMarkup :: Bool -> a -> Markup
|
||||
|
||||
-- | A 'Node' is convertible to 'Markup'
|
||||
instance AsMarkup Node where
|
||||
toMarkup convertEntities = \case
|
||||
NodeContent text -> Content $ if convertEntities then Text text else PreEscaped (Text text)
|
||||
NodeElement elmt -> toMarkup convertEntities elmt
|
||||
|
||||
-- | An 'Element' is convertible to 'Markup'
|
||||
instance AsMarkup Element where
|
||||
toMarkup convertEntities Element{..} = eltAttrs `toAttribute` Parent tag begin end kids
|
||||
where tag = toStatic eltName
|
||||
|
@ -6,24 +6,67 @@
|
||||
-- Maintainer : alpmestan@gmail.com
|
||||
-- Stability : experimental
|
||||
--
|
||||
-- ???
|
||||
module Text.Taggy.Types where
|
||||
-- Core types of /taggy/.
|
||||
module Text.Taggy.Types
|
||||
( -- * 'Tag' type
|
||||
Tag(..)
|
||||
, tname
|
||||
, isTagOpen
|
||||
, isTagClose
|
||||
, isTagText
|
||||
, isTagComment
|
||||
, isTagScript
|
||||
, isTagStyle
|
||||
, tagsNamed
|
||||
|
||||
, -- * 'Attribute's
|
||||
Attribute(..)
|
||||
, attrs
|
||||
, attrKey
|
||||
, attrValue
|
||||
|
||||
, -- * A small difference list implementation
|
||||
L
|
||||
, emptyL
|
||||
, appL
|
||||
, insertL
|
||||
, singletonL
|
||||
, toListL
|
||||
) where
|
||||
|
||||
import Data.Text (Text, toCaseFold)
|
||||
|
||||
-- | An attribute is just an attribute name
|
||||
-- and an attribute value.
|
||||
data Attribute = Attribute !Text !Text
|
||||
deriving (Show, Eq)
|
||||
|
||||
-- | Get the attributes of a 'Tag'.
|
||||
attrs :: Tag -> [Attribute]
|
||||
attrs (TagOpen _ as _) = as
|
||||
attrs _ = []
|
||||
|
||||
-- | Get the name of an 'Attribute'.
|
||||
attrKey :: Attribute -> Text
|
||||
attrKey (Attribute k _) = k
|
||||
|
||||
-- | Get the value of an 'Attribute'.
|
||||
attrValue :: Attribute -> Text
|
||||
attrValue (Attribute _ v) = v
|
||||
|
||||
-- A 'Tag' can be one of the following types of tags:
|
||||
--
|
||||
-- * an opening tag that has a name, a list of attributes, and whether
|
||||
-- it is a self-closing tag or not
|
||||
-- * a closing tag with the name of the tag
|
||||
-- * some raw 'Text'
|
||||
-- * an HTML comment tag
|
||||
-- * a @<script>...</script>@ tag
|
||||
-- * a @<style>...</style>@ tag
|
||||
--
|
||||
-- The latter two are useful to be considered
|
||||
-- separately in the parser and also lets you
|
||||
-- collect these bits quite easily.
|
||||
data Tag = TagOpen !Text [Attribute] !Bool -- is it a self-closing tag?
|
||||
| TagClose !Text
|
||||
| TagText !Text
|
||||
@ -32,6 +75,9 @@ data Tag = TagOpen !Text [Attribute] !Bool -- is it a self-closing tag?
|
||||
| TagStyle !Tag !Text !Tag
|
||||
deriving (Show, Eq)
|
||||
|
||||
-- | Name of a 'Tag'.
|
||||
--
|
||||
-- > tname (TagClose "a") == "a"
|
||||
tname :: Tag -> Text
|
||||
tname (TagOpen n _ _) = n
|
||||
tname (TagClose n) = n
|
||||
@ -40,30 +86,37 @@ tname (TagComment _) = "<!-- -->"
|
||||
tname (TagScript _ _ _) = "script"
|
||||
tname (TagStyle _ _ _) = "style"
|
||||
|
||||
-- | Is this 'Tag' an opening tag?
|
||||
isTagOpen :: Tag -> Bool
|
||||
isTagOpen (TagOpen _ _ _) = True
|
||||
isTagOpen _ = False
|
||||
|
||||
-- | Is this 'Tag' a closing tag?
|
||||
isTagClose :: Tag -> Bool
|
||||
isTagClose (TagClose _) = True
|
||||
isTagClose _ = False
|
||||
|
||||
-- | Is this 'Tag' just some flat text?
|
||||
isTagText :: Tag -> Bool
|
||||
isTagText (TagText _) = True
|
||||
isTagText _ = False
|
||||
|
||||
-- | Is this 'Tag' an HTML comment tag?
|
||||
isTagComment :: Tag -> Bool
|
||||
isTagComment (TagComment _) = True
|
||||
isTagComment _ = False
|
||||
|
||||
-- | Is this 'Tag' a @<script>...</script>@ tag?
|
||||
isTagScript :: Tag -> Bool
|
||||
isTagScript (TagScript _ _ _) = True
|
||||
isTagScript _ = False
|
||||
|
||||
-- | Is this 'Tag' a @<style>...</style>@ tag?
|
||||
isTagStyle :: Tag -> Bool
|
||||
isTagStyle (TagStyle _ _ _) = True
|
||||
isTagStyle _ = False
|
||||
|
||||
-- | Get all the (opening) tags with the given name
|
||||
tagsNamed :: Text -> [Tag] -> [Tag]
|
||||
tagsNamed nam = filter (named nam)
|
||||
|
||||
@ -81,8 +134,8 @@ appL (L l1) (L l2) = L $ l1 . l2
|
||||
singletonL :: a -> L a
|
||||
singletonL x = L (x:)
|
||||
|
||||
toList :: L a -> [a]
|
||||
toList (L f) = f []
|
||||
toListL :: L a -> [a]
|
||||
toListL (L f) = f []
|
||||
|
||||
insertL :: a -> L a -> L a
|
||||
insertL x (L f) = L $ (x:) . f
|
||||
insertL x (L f) = L $ (x:) . f
|
||||
|
36
taggy.cabal
36
taggy.cabal
@ -1,7 +1,39 @@
|
||||
name: taggy
|
||||
version: 0.1.0.0
|
||||
synopsis: HTML parsing à la tagsoup using attoparsec
|
||||
description: ???
|
||||
synopsis: Efficient and simple HTML/XML parsing library
|
||||
description:
|
||||
/taggy/ is a simple package for parsing HTML (and should work with XML)
|
||||
written on top of the <http://hackage.haskell.org/package/attoparsec attoparsec>
|
||||
library, which makes it one of the most efficient (space and time consumption wise)
|
||||
on hackage.
|
||||
.
|
||||
This is the root module of /taggy/. It reexports everything
|
||||
from the package. See each module's docs for details about
|
||||
the functions and types involved in /taggy/.
|
||||
.
|
||||
While we've been testing the parser on /many/ pages, it may still
|
||||
be a bit rough around the edges. Let us know on <http://github.com/alpmestan/taggy/issues github>
|
||||
if you have any problem.
|
||||
.
|
||||
If you like to look at your HTML through
|
||||
various optical instruments, feel free to take a look at
|
||||
the companion <http://hackage.haskell.org/package/taggy-lens taggy-lens>
|
||||
package we've put up together.
|
||||
.
|
||||
* If you want to parse a document as list of tags
|
||||
and go through it as some kind of stream by just picking
|
||||
what you need, head to "Text.Taggy.Parser" and take
|
||||
a look at 'Text.Taggy.Parser.taggyWith' and
|
||||
'Text.Taggy.Parser.run'.
|
||||
* If you want to parse the document as a DOM tree and
|
||||
traverse it to find the information you need,
|
||||
use 'Text.Taggy.DOM.parseDOM'. This is especially useful
|
||||
when combined with the helpful combinators from
|
||||
"Text.Taggy.Combinators".
|
||||
* If you build some HTML manually
|
||||
or just transform some existing DOM tree
|
||||
and want to turn it into a 'Data.Text.Lazy.Text'
|
||||
head to "Text.Taggy.Renderer" and look at 'Text.Taggy.Renderer.render'.
|
||||
homepage: http://github.com/alpmestan/taggy
|
||||
license: BSD3
|
||||
license-file: LICENSE
|
||||
|
Loading…
Reference in New Issue
Block a user