make the parser a bit more permissive so that it handles Zalora's HTML wel

This commit is contained in:
Alp Mestanogullari 2014-06-13 16:53:23 +02:00
parent bb5e968958
commit 7bf1e35232

View File

@ -71,11 +71,28 @@ possibly c = (char c *> return ())
<|> return () <|> return ()
ident :: Parser T.Text ident :: Parser T.Text
ident = takeWhile1 (\c -> isAlphaNum c || c == '-' || c == '_' || c == ':') ident =
takeWhile1 (\c -> isAlphaNum c
|| c == '-'
|| c == '_'
|| c == ':'
)
attribute_ident :: Parser T.Text
attribute_ident =
takeWhile1 (\c -> isAlphaNum c
|| c == '-'
|| c == '_'
|| c == ':'
|| c == '('
|| c == ')'
|| c == ','
)
tagopen :: Bool -> Parser Tag tagopen :: Bool -> Parser Tag
tagopen cventities = do tagopen cventities = do
char '<' char '<'
possibly '<'
possibly '!' possibly '!'
skipSpace skipSpace
i <- ident i <- ident
@ -84,7 +101,8 @@ tagopen cventities = do
tagclose :: Parser Tag tagclose :: Parser Tag
tagclose = do tagclose = do
"</" char '<'
char '/'
skipSpace skipSpace
i <- ident i <- ident
char '>' char '>'
@ -113,9 +131,11 @@ attributes cventities = postProcess `fmap` go emptyL
attribute :: Bool -> Parser Attribute attribute :: Bool -> Parser Attribute
attribute cventities = do attribute cventities = do
skipSpace skipSpace
key <- quoted <|> ident key <- quoted <|> attribute_ident
value <- option "" $ fmap (if cventities then convertEntities else id) $ do value <- option "" $ fmap (if cventities then convertEntities else id) $ do
possibly ' '
"=" "="
possibly ' '
quoted <|> singlequoted <|> unquoted quoted <|> singlequoted <|> unquoted
return $ Attribute key value return $ Attribute key value
@ -133,47 +153,34 @@ attribute cventities = do
unquoted = Atto.takeTill (\c -> isSpace c || c == '>') unquoted = Atto.takeTill (\c -> isSpace c || c == '>')
htmlWith :: Parser Tag -> Parser [Tag] htmlWith :: Bool -> Parser [Tag]
htmlWith tp = go htmlWith cventities = go
where go = do where go = do
finished <- atEnd finished <- atEnd
if finished if finished
then return [] then return []
else do t <- tp else do t <- tag cventities
(t:) `fmap` go (t:) `fmap` go
tag :: Parser Tag -> Parser Tag tag :: Bool -> Parser Tag
tag p = skipSpace >> p tag cventities = skipSpace >> tag' cventities
tag' :: Parser Tag tag' :: Bool -> Parser Tag
tag' = tag' b =
tagcomment tagcomment
<|> tagscript False <|> tagscript b
<|> tagstyle False <|> tagstyle b
<|> tagopen False <|> tagopen b
<|> tagclose <|> tagclose
<|> tagtext False <|> tagtext b
tag'2 :: Parser Tag
tag'2 =
tagcomment
<|> tagscript True
<|> tagstyle True
<|> tagopen True
<|> tagclose
<|> tagtext True
-- | Do we convert html entities to unicode ?
tagparser :: Bool -> Parser Tag
tagparser True = tag tag'2
tagparser False = tag tag'
-- | Do we want to convert html entities to their unicode chars
taggyWith :: Bool -> LT.Text -> [Tag] taggyWith :: Bool -> LT.Text -> [Tag]
taggyWith cventities = taggyWith cventities =
either (const []) id either (const []) id
. AttoLT.eitherResult . AttoLT.eitherResult
. AttoLT.parse (htmlWith $ tagparser cventities) . AttoLT.parse (htmlWith cventities)
run :: Bool -> LT.Text -> AttoLT.Result [Tag] run :: Bool -> LT.Text -> AttoLT.Result [Tag]
run cventities = AttoLT.parse (htmlWith $ tagparser cventities) run cventities = AttoLT.parse (htmlWith cventities)