add a lot of documentation and also a 'parseDOM' function to directly go from Text to Nodes and Elements

2024-08-16 10:20:30 +03:00 · 2014-07-01 12:14:27 +02:00 · 2014-07-01 12:14:27 +02:00 · c586d9c632
commit c586d9c632
parent 65c5288718
8 changed files with 217 additions and 32 deletions
--- a/src/Text/Taggy.hs
+++ b/src/Text/Taggy.hs
@ -1,31 +1,54 @@
 {-# LANGUAGE OverloadedStrings #-}
 -- |
 -- Module       : Text.Taggy
-- Copyright    : (c) 2014 Alp Mestanogullari
+-- Copyright    : (c) 2014 Alp Mestanogullari, Vikram Verma
 -- License      : BSD3
 -- Maintainer   : alpmestan@gmail.com
 -- Stability    : experimental
 -- 
-- ???
+-- /taggy/ is a simple package for parsing HTML (and should work with XML)
+-- written on top of the <http://hackage.haskell.org/package/attoparsec attoparsec>
+-- library, which makes it one of the most efficient (space and time consumption wise)
+-- on hackage.
+--
+-- This is the root module of /taggy/. It reexports everything
+-- from the package. See each module's docs for details about
+-- the functions and types involved in /taggy/.
+--
+-- While we've been testing the parser on /many/ pages, it may still
+-- be a bit rough around the edges. Let us know on <http://github.com/alpmestan/taggy/issues github>
+-- if you have any problem.
+--
+-- If you like to look at your HTML through
+-- various optical instruments, feel free to take a look at
+-- the companion <http://hackage.haskell.org/package/taggy-lens taggy-lens>
+-- package we've put up together.
+--
+-- * If you want to parse a document as list of tags
+--   and go through it as some kind of stream by just picking
+--   what you need, head to "Text.Taggy.Parser" and take
+--   a look at 'Text.Taggy.Parser.taggyWith' and
+--   'Text.Taggy.Parser.run'.
+-- * If you want to parse the document as a DOM tree and
+--   traverse it to find the information you need,
+--   use 'Text.Taggy.DOM.parseDOM'. This is especially useful
+--   when combined with the helpful combinators from
+--   "Text.Taggy.Combinators".
+-- * If you build some HTML manually
+--   or just transform some existing DOM tree
+--   and want to turn it into a 'Data.Text.Lazy.Text'
+--   head to "Text.Taggy.Renderer" and look at 'Text.Taggy.Renderer.render'.
 module Text.Taggy 
-  ( linksIn
-  , module Text.Taggy.Types
+  ( -- * Exported modules
+    module Text.Taggy.Types
  , module Text.Taggy.Parser
  , module Text.Taggy.DOM
  , module Text.Taggy.Combinators
  , module Text.Taggy.Renderer
  ) where 

-import Data.Text (Text)
 import Text.Taggy.Types
 import Text.Taggy.Parser
 import Text.Taggy.DOM
 import Text.Taggy.Combinators
 import Text.Taggy.Renderer
-
-linksIn :: [Tag] -> [Text]
-linksIn = map attrValue
-		. filter ((=="href") . attrKey)
-		. concat
-		. map attrs
-		. tagsNamed "a"
--- a/src/Text/Taggy/Combinators.hs
+++ b/src/Text/Taggy/Combinators.hs
@ -1,5 +1,13 @@
 {-# LANGUAGE LambdaCase #-}
-
+-- |
+-- Module       : Text.Taggy.DOM
+-- Copyright    : (c) 2014 Alp Mestanogullari, Vikram Verma
+-- License      : BSD3
+-- Maintainer   : alpmestan@gmail.com
+-- Stability    : experimental
+--
+-- Many useful combinators for querying 'Element's
+-- of a DOM tree.
 module Text.Taggy.Combinators (hasName, hasAttr, getAttr, innerText, (//), (/&), (/*), trees, subtrees) where

 import Prelude hiding (lookup)
@ -9,15 +17,26 @@ import Data.Text (Text)
 import Text.Taggy.DOM (Element(..), Node(..), AttrName, AttrValue)
 import Data.HashMap.Strict (lookup, keys)

+-- | Does the given 'Element' have
+--   the given name?
 hasName :: Element -> Text -> Bool
 hasName = (==) . eltName

+-- | Does the given element have
+--   an attribute with the given name (or /key/)
 hasAttr :: Element -> AttrName -> Bool
 hasAttr = flip elem . keys . eltAttrs

+-- | Get the value for the given attribute name
+--   in the given 'Element'. Returns 'Nothing' if
+--   the provided 'Element' doesn't have an attribute
+--   with that name.
 getAttr :: Element -> AttrName -> Maybe AttrValue
 getAttr = flip lookup . eltAttrs

+-- | Get all the bits of raw text present
+--   everywhere below the given 'Element'
+--   in the DOM tree.
 innerText :: Element -> Text
 innerText = mconcat . map getContent . eltChildren
  where getContent = \case { NodeElement e -> innerText e; NodeContent x -> x }
--- a/src/Text/Taggy/DOM.hs
+++ b/src/Text/Taggy/DOM.hs
@ -1,33 +1,75 @@
 {-# LANGUAGE OverloadedStrings #-}
-
+-- |
+-- Module       : Text.Taggy.DOM
+-- Copyright    : (c) 2014 Alp Mestanogullari, Vikram Verma
+-- License      : BSD3
+-- Maintainer   : alpmestan@gmail.com
+-- Stability    : experimental
+-- 
+-- This module will help you represent
+-- an HTML or XML document as a tree
+-- and let you traverse it in whatever
+-- way you like.
+--
+-- This is especially useful when used in
+-- conjunction with <http://hackage.haskell.org/package/taggy-lens taggy-lens>.
 module Text.Taggy.DOM where

 import Data.HashMap.Strict (HashMap)
 import Data.Monoid ((<>))
 import Data.Text (Text)
+import Text.Taggy.Parser (taggyWith)
 import Text.Taggy.Types

 import qualified Data.HashMap.Strict as HM
+import qualified Data.Text.Lazy as LT

+-- | An attribute name is just a 'Text' value
 type AttrName = Text
+-- | An attribute value is just a 'Text' value
 type AttrValue = Text

+-- | An 'Element' here refers to a tag name, the attributes
+--   specified withing that tag, and all the children nodes
+--   of that element. An 'Element' is basically anything but
+--   \"raw\" content.
 data Element = 
-  Element { eltName     :: !Text 
-          , eltAttrs    :: !(HashMap AttrName AttrValue)
-          , eltChildren :: [Node]
+  Element { eltName     :: !Text -- ^ name of the element. e.g "a" for <a>
+          , eltAttrs    :: !(HashMap AttrName AttrValue) -- ^ a (hash)map from attribute names to attribute values
+          , eltChildren :: [Node] -- ^ children 'Node's
          }
  deriving (Eq, Show)

+-- | A 'Node' is either an 'Element' or some raw text.
 data Node =
    NodeElement Element
  | NodeContent Text
  deriving (Eq, Show)

+-- | Get the children of a node.
+--
+--   If called on some raw text, this function returns @[]@.
 nodeChildren :: Node -> [Node]
 nodeChildren (NodeContent _) = []
 nodeChildren (NodeElement e) = eltChildren e

+-- | Parse an HTML or XML document
+--   as a DOM tree.
+--
+--   The 'Bool' argument lets you specify
+--   whether you want to convert HTML entities
+--   to their corresponding unicode characters,
+--   just like in "Text.Taggy.Parser".
+--
+--   > parseDOM convertEntities = domify . taggyWith cventities
+parseDOM :: Bool -> LT.Text -> [Node]
+parseDOM cventities =
+  domify . taggyWith cventities
+
+-- | Transform a list of tags (produced with 'taggyWith')
+--   into a list of toplevel nodes. If the document you're working
+--   on is valid, there should only be one toplevel node, but let's
+--   not assume we're living in an ideal world.
 domify :: [Tag] -> [Node]
 domify [] = []
 domify (TagOpen name attribs True : tags)
--- a/src/Text/Taggy/Entities.hs
+++ b/src/Text/Taggy/Entities.hs
@ -1,6 +1,7 @@
 {-# LANGUAGE OverloadedStrings #-}

-module Text.Taggy.Entities where
+module Text.Taggy.Entities
+  (convertEntities) where

 import Control.Applicative
 import Control.Monad
@ -9,6 +10,9 @@ import qualified Data.HashMap.Strict as HM
 import qualified Data.Text as T
 import qualified Data.Attoparsec.Text as Atto

+-- | Convert all the (currently supported)
+--   HTML entities to their corresponding
+--   unicode characters.
 convertEntities :: T.Text -> T.Text
 convertEntities t =
    either (const t) T.concat
--- a/src/Text/Taggy/Parser.hs
+++ b/src/Text/Taggy/Parser.hs
@ -6,8 +6,12 @@
 -- Maintainer   : alpmestan@gmail.com
 -- Stability    : experimental
 -- 
-- ???
-module Text.Taggy.Parser where 
+-- Parse an HTML or XML document as a list of 'Tag's
+-- with 'taggyWith' or 'run'.
+module Text.Taggy.Parser
+  ( taggyWith
+  , run
+  ) where 

 import Control.Applicative
 import Data.Attoparsec.Combinator as Atto
@ -132,7 +136,7 @@ attributes cventities = postProcess `fmap` go emptyL
      char '>'
      return True

-    postProcess (l, b) = (toList l, b)
+    postProcess (l, b) = (toListL l, b)

 attribute :: Bool -> Parser Attribute
 attribute cventities = do
--- a/src/Text/Taggy/Renderer.hs
+++ b/src/Text/Taggy/Renderer.hs
@ -1,8 +1,14 @@
 {-# LANGUAGE LambdaCase, RecordWildCards, FlexibleInstances, UndecidableInstances, OverloadedStrings #-}
-
-module Text.Taggy.Renderer (
-  Renderable(..)
-) where
+-- |
+-- Module       : Text.Taggy.Renderer
+-- Copyright    : (c) 2014 Alp Mestanogullari, Vikram Verma
+-- License      : BSD3
+-- Maintainer   : alpmestan@gmail.com
+-- Stability    : experimental
+-- 
+-- Render a DOM tree (from "Text.Taggy.DOM")
+-- using the excellent blaze markup rendering library.
+module Text.Taggy.Renderer where

 import Data.Foldable (Foldable(foldMap))
 import Data.HashMap.Strict (HashMap, foldlWithKey')
@ -24,11 +30,13 @@ class AsMarkup a where
  -- conversion.
  toMarkup :: Bool -> a -> Markup

+-- | A 'Node' is convertible to 'Markup'
 instance AsMarkup Node where
  toMarkup convertEntities = \case
    NodeContent text -> Content $ if convertEntities then Text text else PreEscaped (Text text)
    NodeElement elmt -> toMarkup convertEntities elmt

+-- | An 'Element' is convertible to 'Markup'
 instance AsMarkup Element where
  toMarkup convertEntities Element{..} = eltAttrs `toAttribute` Parent tag begin end kids
    where tag   = toStatic eltName
--- a/src/Text/Taggy/Types.hs
+++ b/src/Text/Taggy/Types.hs
@ -6,24 +6,67 @@
 -- Maintainer   : alpmestan@gmail.com
 -- Stability    : experimental
 -- 
-- ???
-module Text.Taggy.Types where
+-- Core types of /taggy/.
+module Text.Taggy.Types
+	( -- * 'Tag' type
+	  Tag(..)
+	, tname
+	, isTagOpen
+	, isTagClose
+	, isTagText
+	, isTagComment
+	, isTagScript
+	, isTagStyle
+	, tagsNamed
+
+    , -- * 'Attribute's
+      Attribute(..)
+	, attrs
+	, attrKey
+	, attrValue
+
+	, -- * A small difference list implementation
+	  L
+	, emptyL
+	, appL
+	, insertL
+	, singletonL
+	, toListL
+	) where

 import Data.Text (Text, toCaseFold)

+-- | An attribute is just an attribute name
+--   and an attribute value.
 data Attribute = Attribute !Text !Text
  deriving (Show, Eq)

+-- | Get the attributes of a 'Tag'.
 attrs :: Tag -> [Attribute]
 attrs (TagOpen _ as _) = as
 attrs                _ = []

+-- | Get the name of an 'Attribute'.
 attrKey :: Attribute -> Text
 attrKey (Attribute k _) = k

+-- | Get the value of an 'Attribute'.
 attrValue :: Attribute -> Text
 attrValue (Attribute _ v) = v

+-- A 'Tag' can be one of the following types of tags:
+--
+--   * an opening tag that has a name, a list of attributes, and whether
+--     it is a self-closing tag or not
+--   * a closing tag with the name of the tag
+--   * some raw 'Text'
+--   * an HTML comment tag
+--   * a @<script>...</script>@ tag
+--   * a @<style>...</style>@ tag
+--
+-- The latter two are useful to be considered
+-- separately in the parser and also lets you
+-- collect these bits quite easily.
 data Tag = TagOpen !Text [Attribute] !Bool -- is it a self-closing tag?
         | TagClose !Text
         | TagText !Text
@ -32,6 +75,9 @@ data Tag = TagOpen !Text [Attribute] !Bool -- is it a self-closing tag?
         | TagStyle !Tag !Text !Tag
  deriving (Show, Eq)

+-- | Name of a 'Tag'.
+--
+-- > tname (TagClose "a") == "a"
 tname :: Tag -> Text
 tname (TagOpen n _ _) = n
 tname (TagClose n) = n
@ -40,30 +86,37 @@ tname (TagComment _) = "<!-- -->"
 tname (TagScript _ _ _) = "script"
 tname (TagStyle _ _ _) = "style"

+-- | Is this 'Tag' an opening tag?
 isTagOpen :: Tag -> Bool
 isTagOpen (TagOpen _ _ _) = True
 isTagOpen _               = False

+-- | Is this 'Tag' a closing tag?
 isTagClose :: Tag -> Bool
 isTagClose (TagClose _) = True
 isTagClose _            = False

+-- | Is this 'Tag' just some flat text?
 isTagText :: Tag -> Bool
 isTagText (TagText _) = True
 isTagText _           = False

+-- | Is this 'Tag' an HTML comment tag?
 isTagComment :: Tag -> Bool
 isTagComment (TagComment _) = True
 isTagComment _              = False

+-- | Is this 'Tag' a @<script>...</script>@ tag?
 isTagScript :: Tag -> Bool
 isTagScript (TagScript _ _ _) = True
 isTagScript _           = False

+-- | Is this 'Tag' a @<style>...</style>@ tag?
 isTagStyle :: Tag -> Bool
 isTagStyle (TagStyle _ _ _) = True
 isTagStyle _          = False

+-- | Get all the (opening) tags with the given name
 tagsNamed :: Text -> [Tag] -> [Tag]
 tagsNamed nam = filter (named nam)
  
@ -81,8 +134,8 @@ appL (L l1) (L l2) = L $ l1 . l2
 singletonL :: a -> L a
 singletonL x = L (x:)

-toList :: L a -> [a]
-toList (L f) = f []
+toListL :: L a -> [a]
+toListL (L f) = f []

 insertL :: a -> L a -> L a
-insertL x (L f) = L $ (x:) . f
+insertL x (L f) = L $ (x:) . f
--- a/taggy.cabal
+++ b/taggy.cabal
@ -1,7 +1,39 @@
 name:                taggy
 version:             0.1.0.0
-synopsis:            HTML parsing à la tagsoup using attoparsec
-description:         ???
+synopsis:            Efficient and simple HTML/XML parsing library
+description:         
+  /taggy/ is a simple package for parsing HTML (and should work with XML)
+  written on top of the <http://hackage.haskell.org/package/attoparsec attoparsec>
+  library, which makes it one of the most efficient (space and time consumption wise)
+  on hackage.
+  .
+  This is the root module of /taggy/. It reexports everything
+  from the package. See each module's docs for details about
+  the functions and types involved in /taggy/.
+  .
+  While we've been testing the parser on /many/ pages, it may still
+  be a bit rough around the edges. Let us know on <http://github.com/alpmestan/taggy/issues github>
+  if you have any problem.
+  .
+  If you like to look at your HTML through
+  various optical instruments, feel free to take a look at
+  the companion <http://hackage.haskell.org/package/taggy-lens taggy-lens>
+  package we've put up together.
+  .
+  * If you want to parse a document as list of tags
+    and go through it as some kind of stream by just picking
+    what you need, head to "Text.Taggy.Parser" and take
+    a look at 'Text.Taggy.Parser.taggyWith' and
+    'Text.Taggy.Parser.run'.
+  * If you want to parse the document as a DOM tree and
+    traverse it to find the information you need,
+    use 'Text.Taggy.DOM.parseDOM'. This is especially useful
+    when combined with the helpful combinators from
+    "Text.Taggy.Combinators".
+  * If you build some HTML manually
+    or just transform some existing DOM tree
+    and want to turn it into a 'Data.Text.Lazy.Text'
+    head to "Text.Taggy.Renderer" and look at 'Text.Taggy.Renderer.render'.
 homepage:            http://github.com/alpmestan/taggy
 license:             BSD3
 license-file:        LICENSE