2020-08-15 12:14:27 +03:00
{- # LANGUAGE FlexibleInstances # -}
{- # LANGUAGE MultiParamTypeClasses # -}
2020-12-27 02:52:39 +03:00
{- # LANGUAGE OverloadedStrings # -}
2020-08-15 12:14:27 +03:00
{- # LANGUAGE ScopedTypeVariables # -}
2014-07-08 22:50:19 +04:00
{- |
2014-07-07 01:03:28 +04:00
2015-05-17 18:32:45 +03:00
Easy regular expression helpers , currently based on regex - tdfa . These should :
2014-07-07 01:03:28 +04:00
2014-10-29 17:46:49 +03:00
- be cross - platform , not requiring C libraries
2014-07-08 22:50:19 +04:00
2014-10-29 17:46:49 +03:00
- support unicode
2014-07-08 22:50:19 +04:00
2014-07-07 01:03:28 +04:00
- support extended regular expressions
2014-07-08 22:50:19 +04:00
2014-10-29 17:46:49 +03:00
- support replacement , with backreferences etc .
2014-07-08 22:50:19 +04:00
2014-07-07 01:03:28 +04:00
- support splitting
2014-07-08 22:50:19 +04:00
2014-10-29 17:46:49 +03:00
- have mnemonic names
2014-07-08 22:50:19 +04:00
2014-10-29 17:46:49 +03:00
- have simple monomorphic types
2015-09-27 04:39:21 +03:00
- work with simple strings
2020-08-06 21:43:22 +03:00
Regex strings are automatically compiled into regular expressions the first
time they are seen , and these are cached . If you use a huge number of unique
regular expressions this might lead to increased memory usage . Several
functions have memoised variants ( * Memo ) , which also trade space for time .
Currently two APIs are provided :
2020-09-01 04:36:34 +03:00
- The old partial one ( with ' s u f f i x e s' ) which will call error on any problem
( eg with malformed regexps ) . This comes from hledger's origin as a
command - line tool .
2020-08-06 21:43:22 +03:00
2020-09-01 04:36:34 +03:00
- The new total one which will return an error message . This is better for
long - running apps like hledger - web .
2014-10-29 17:46:49 +03:00
Current limitations :
- ( ? i ) and similar are not supported
2014-07-07 01:03:28 +04:00
- }
2014-07-06 21:11:02 +04:00
module Hledger.Utils.Regex (
2020-08-15 12:14:27 +03:00
-- * Regexp type and constructors
Regexp ( reString )
2020-09-01 04:36:34 +03:00
, toRegex
, toRegexCI
2020-08-15 12:14:27 +03:00
, toRegex'
, toRegexCI'
2014-10-29 17:46:49 +03:00
-- * type aliases
2014-10-25 01:30:49 +04:00
, Replacement
2020-08-06 21:43:22 +03:00
, RegexError
2020-08-06 21:18:08 +03:00
-- * total regex operations
2020-09-01 04:36:34 +03:00
, regexMatch
2020-12-27 02:52:39 +03:00
, regexMatchText
2020-08-15 12:14:27 +03:00
, regexReplace
2020-09-01 04:36:34 +03:00
, regexReplaceUnmemo
, regexReplaceAllBy
2014-07-06 21:11:02 +04:00
)
where
2020-08-06 21:18:08 +03:00
import Control.Monad ( foldM )
2020-08-15 12:14:27 +03:00
import Data.Aeson ( ToJSON ( .. ) , Value ( String ) )
import Data.Array ( ( ! ) , elems , indices )
import Data.Char ( isDigit )
2014-07-07 01:03:28 +04:00
import Data.List ( foldl' )
2015-09-27 04:39:21 +03:00
import Data.MemoUgly ( memo )
2020-12-27 02:52:39 +03:00
import Data.Text ( Text )
2020-08-15 12:14:27 +03:00
import qualified Data.Text as T
2014-07-07 01:03:28 +04:00
import Text.Regex.TDFA (
2020-08-15 12:14:27 +03:00
Regex , CompOption ( .. ) , defaultCompOpt , defaultExecOpt ,
makeRegexOptsM , AllMatches ( getAllMatches ) , match , MatchText ,
RegexLike ( .. ) , RegexMaker ( .. ) , RegexOptions ( .. ) , RegexContext ( .. )
2014-07-07 01:03:28 +04:00
)
2014-07-06 21:11:02 +04:00
2014-07-07 01:03:28 +04:00
2014-10-29 17:46:49 +03:00
-- | Regular expression. Extended regular expression-ish syntax ? But does not support eg (?i) syntax.
2020-08-15 12:14:27 +03:00
data Regexp
2020-12-27 02:52:39 +03:00
= Regexp { reString :: Text , reCompiled :: Regex }
| RegexpCI { reString :: Text , reCompiled :: Regex }
2020-08-15 12:14:27 +03:00
instance Eq Regexp where
Regexp s1 _ == Regexp s2 _ = s1 == s2
RegexpCI s1 _ == RegexpCI s2 _ = s1 == s2
_ == _ = False
instance Ord Regexp where
Regexp s1 _ ` compare ` Regexp s2 _ = s1 ` compare ` s2
RegexpCI s1 _ ` compare ` RegexpCI s2 _ = s1 ` compare ` s2
Regexp _ _ ` compare ` RegexpCI _ _ = LT
RegexpCI _ _ ` compare ` Regexp _ _ = GT
instance Show Regexp where
2020-12-27 02:52:39 +03:00
showsPrec d r = showParen ( d > app_prec ) $ reCons . showsPrec ( app_prec + 1 ) ( T . unpack $ reString r )
2020-08-18 04:32:15 +03:00
where app_prec = 10
reCons = case r of Regexp _ _ -> showString " Regexp "
RegexpCI _ _ -> showString " RegexpCI "
2020-08-15 12:14:27 +03:00
instance Read Regexp where
2022-08-23 13:58:31 +03:00
readsPrec d r = readParen ( d > app_prec ) ( \ r' -> [ ( toRegexCI' m , t ) |
( " RegexCI " , s ) <- lex r' ,
2020-08-18 04:32:15 +03:00
( m , t ) <- readsPrec ( app_prec + 1 ) s ] ) r
2022-08-23 13:58:31 +03:00
++ readParen ( d > app_prec ) ( \ r' -> [ ( toRegex' m , t ) |
( " Regex " , s ) <- lex r' ,
2020-08-18 04:32:15 +03:00
( m , t ) <- readsPrec ( app_prec + 1 ) s ] ) r
where app_prec = 10
2020-08-15 12:14:27 +03:00
instance ToJSON Regexp where
2020-12-27 02:52:39 +03:00
toJSON ( Regexp s _ ) = String $ " Regexp " <> s
toJSON ( RegexpCI s _ ) = String $ " RegexpCI " <> s
2020-08-15 12:14:27 +03:00
instance RegexLike Regexp String where
matchOnce = matchOnce . reCompiled
matchAll = matchAll . reCompiled
matchCount = matchCount . reCompiled
matchTest = matchTest . reCompiled
matchAllText = matchAllText . reCompiled
matchOnceText = matchOnceText . reCompiled
instance RegexContext Regexp String String where
match = match . reCompiled
matchM = matchM . reCompiled
2014-07-07 01:03:28 +04:00
2020-08-15 12:14:27 +03:00
-- Convert a Regexp string to a compiled Regex, or return an error message.
2020-12-27 02:52:39 +03:00
toRegex :: Text -> Either RegexError Regexp
2020-12-27 10:59:30 +03:00
toRegex = memo $ \ s -> mkRegexErr s ( Regexp s <$> makeRegexM ( T . unpack s ) ) -- Have to unpack here because Text instance in regex-tdfa only appears in 1.3.1
2014-07-07 01:03:28 +04:00
2020-09-01 04:36:34 +03:00
-- Like toRegex, but make a case-insensitive Regex.
2020-12-27 02:52:39 +03:00
toRegexCI :: Text -> Either RegexError Regexp
2020-12-27 10:59:30 +03:00
toRegexCI = memo $ \ s -> mkRegexErr s ( RegexpCI s <$> makeRegexOptsM defaultCompOpt { caseSensitive = False } defaultExecOpt ( T . unpack s ) ) -- Have to unpack here because Text instance in regex-tdfa only appears in 1.3.1
2014-07-07 01:03:28 +04:00
2020-08-15 12:14:27 +03:00
-- | Make a nice error message for a regexp error.
2020-12-27 02:52:39 +03:00
mkRegexErr :: Text -> Maybe a -> Either RegexError a
2020-08-15 12:14:27 +03:00
mkRegexErr s = maybe ( Left errmsg ) Right
2022-07-13 04:29:36 +03:00
where errmsg = T . unpack $ " This regular expression is malformed, please correct it: \ n " <> s
2014-07-06 21:11:02 +04:00
2020-08-15 12:14:27 +03:00
-- Convert a Regexp string to a compiled Regex, throw an error
2020-12-27 02:52:39 +03:00
toRegex' :: Text -> Regexp
2021-08-28 15:51:28 +03:00
toRegex' = either errorWithoutStackTrace id . toRegex
2015-09-27 04:45:17 +03:00
2020-08-15 12:14:27 +03:00
-- Like toRegex', but make a case-insensitive Regex.
2020-12-27 02:52:39 +03:00
toRegexCI' :: Text -> Regexp
2021-08-28 15:51:28 +03:00
toRegexCI' = either errorWithoutStackTrace id . toRegexCI
2015-09-27 04:45:17 +03:00
2020-08-15 12:14:27 +03:00
-- | A replacement pattern. May include numeric backreferences (\N).
type Replacement = String
2020-08-06 21:18:08 +03:00
2020-11-26 07:59:07 +03:00
-- | An error message arising during a regular expression operation.
-- Eg: trying to compile a malformed regular expression, or
-- trying to apply a malformed replacement pattern.
2020-08-15 12:14:27 +03:00
type RegexError = String
2020-08-06 21:18:08 +03:00
-- helpers
2020-09-01 04:36:34 +03:00
-- | Test whether a Regexp matches a String. This is an alias for `matchTest` for consistent
-- naming.
regexMatch :: Regexp -> String -> Bool
regexMatch = matchTest
2020-08-06 21:18:08 +03:00
2020-12-27 02:52:39 +03:00
-- | Tests whether a Regexp matches a Text.
--
-- This currently unpacks the Text to a String an works on that. This is due to
-- a performance bug in regex-tdfa (#9), which may or may not be relevant here.
regexMatchText :: Regexp -> Text -> Bool
regexMatchText r = matchTest r . T . unpack
2020-08-06 21:18:08 +03:00
--------------------------------------------------------------------------------
-- new total functions
2020-09-01 04:36:34 +03:00
-- | A memoising version of regexReplace. Caches the result for each
2020-08-06 21:18:08 +03:00
-- search pattern, replacement pattern, target string tuple.
2020-11-24 20:16:08 +03:00
-- This won't generate a regular expression parsing error since that
-- is pre-compiled nowadays, but there can still be a runtime error
-- from the replacement pattern, eg with a backreference referring
-- to a nonexistent match group.
2020-09-01 04:36:34 +03:00
regexReplace :: Regexp -> Replacement -> String -> Either RegexError String
regexReplace re repl = memo $ regexReplaceUnmemo re repl
2020-08-06 21:18:08 +03:00
-- helpers:
2020-08-06 21:35:04 +03:00
-- Replace this regular expression with this replacement pattern in this
2020-11-24 20:16:08 +03:00
-- string, or return an error message. (There should be no regexp
-- parsing errors these days since Regexp's compiled form is used,
-- but there can still be a runtime error from the replacement
-- pattern, eg a backreference referring to a nonexistent match group.)
2020-09-01 04:36:34 +03:00
regexReplaceUnmemo :: Regexp -> Replacement -> String -> Either RegexError String
2022-08-23 13:58:31 +03:00
regexReplaceUnmemo re repl str = foldM ( replaceMatch repl ) str ( reverse $ match ( reCompiled re ) str :: [ MatchText String ] )
2014-10-29 17:46:49 +03:00
where
2020-08-06 21:18:08 +03:00
-- Replace one match within the string with the replacement text
-- appropriate for this match. Or return an error message.
2020-09-01 04:36:34 +03:00
replaceMatch :: Replacement -> String -> MatchText String -> Either RegexError String
replaceMatch replpat s matchgroups =
2021-12-07 00:07:28 +03:00
case elems matchgroups of
[] -> Right s
( ( _ , ( off , len ) ) : _ ) -> -- groups should have 0-based indexes, and there should always be at least one, since this is a match
2022-08-23 13:58:31 +03:00
erpl >>= \ rpl -> Right $ pre ++ rpl ++ post
2020-08-06 21:18:08 +03:00
where
2021-12-07 00:07:28 +03:00
( pre , post' ) = splitAt off s
post = drop len post'
-- The replacement text: the replacement pattern with all
-- numeric backreferences replaced by the appropriate groups
-- from this match. Or an error message.
2022-08-23 13:58:31 +03:00
erpl = regexReplaceAllByM backrefRegex ( lookupMatchGroup matchgroups ) replpat
2021-12-07 00:07:28 +03:00
where
-- Given some match groups and a numeric backreference,
-- return the referenced group text, or an error message.
lookupMatchGroup :: MatchText String -> String -> Either RegexError String
2022-08-23 13:58:31 +03:00
lookupMatchGroup grps ( '\\ ': s2 @ ( _ : _ ) ) | all isDigit s2 =
case read s2 of n | n ` elem ` indices grps -> Right $ fst ( grps ! n ) -- PARTIAL: should not fail, all digits
_ -> Left $ " no match group exists for backreference \ " \ \ " ++ s ++ " \ " "
lookupMatchGroup _ s2 = Left $ " lookupMatchGroup called on non-numeric-backreference \ " " ++ s2 ++ " \ " , shouldn't happen "
2020-09-03 19:49:28 +03:00
backrefRegex = toRegex' " \ \ \ \ [0-9]+ " -- PARTIAL: should not fail
2020-08-06 21:18:08 +03:00
2020-09-01 04:36:34 +03:00
-- regexReplace' :: Regexp -> Replacement -> String -> String
-- regexReplace' re repl s =
-- foldl (replaceMatch repl) s (reverse $ match (reCompiled re) s :: [MatchText String])
-- where
-- replaceMatch :: Replacement -> String -> MatchText String -> String
-- replaceMatch replpat s matchgroups = pre ++ repl ++ post
-- where
-- ((_,(off,len)):_) = elems matchgroups -- groups should have 0-based indexes, and there should always be at least one, since this is a match
-- (pre, post') = splitAt off s
-- post = drop len post'
-- repl = regexReplaceAllBy backrefRegex (lookupMatchGroup matchgroups) replpat
-- where
-- lookupMatchGroup :: MatchText String -> String -> String
-- lookupMatchGroup grps ('\\':s@(_:_)) | all isDigit s =
-- case read s of n | n `elem` indices grps -> fst (grps ! n)
-- -- PARTIAL:
-- _ -> error' $ "no match group exists for backreference \"\\"++s++"\""
-- lookupMatchGroup _ s = error' $ "lookupMatchGroup called on non-numeric-backreference \""++s++"\", shouldn't happen"
2020-09-03 19:49:28 +03:00
-- backrefRegex = toRegex' "\\\\[0-9]+" -- PARTIAL: should not fail
2020-09-01 04:36:34 +03:00
2020-08-06 21:18:08 +03:00
-- helpers
2020-08-06 21:35:04 +03:00
-- adapted from http://stackoverflow.com/questions/9071682/replacement-substition-with-haskell-regex-libraries:
2020-08-06 21:18:08 +03:00
2020-08-06 21:35:04 +03:00
-- Replace all occurrences of a regexp in a string, transforming each match
-- with the given pure function.
2020-09-01 04:36:34 +03:00
regexReplaceAllBy :: Regexp -> ( String -> String ) -> String -> String
regexReplaceAllBy re transform s = prependdone rest
2020-08-06 21:18:08 +03:00
where
( _ , rest , prependdone ) = foldl' go ( 0 , s , id ) matches
where
2020-08-15 12:14:27 +03:00
matches = getAllMatches $ match ( reCompiled re ) s :: [ ( Int , Int ) ] -- offset and length
2020-08-06 21:18:08 +03:00
go :: ( Int , String , String -> String ) -> ( Int , Int ) -> ( Int , String , String -> String )
go ( pos , todo , prepend ) ( off , len ) =
let ( prematch , matchandrest ) = splitAt ( off - pos ) todo
2022-08-23 13:58:31 +03:00
( matched , rest2 ) = splitAt len matchandrest
in ( off + len , rest2 , prepend . ( prematch ++ ) . ( transform matched ++ ) )
2020-08-06 21:18:08 +03:00
2020-08-06 21:35:04 +03:00
-- Replace all occurrences of a regexp in a string, transforming each match
-- with the given monadic function. Eg if the monad is Either, a Left result
-- from the transform function short-circuits and is returned as the overall
-- result.
2020-09-01 04:36:34 +03:00
regexReplaceAllByM :: forall m . Monad m => Regexp -> ( String -> m String ) -> String -> m String
regexReplaceAllByM re transform s =
foldM go ( 0 , s , id ) matches >>= \ ( _ , rest , prependdone ) -> pure $ prependdone rest
2014-07-07 01:03:28 +04:00
where
2020-08-15 12:14:27 +03:00
matches = getAllMatches $ match ( reCompiled re ) s :: [ ( Int , Int ) ] -- offset and length
2020-08-06 21:18:08 +03:00
go :: ( Int , String , String -> String ) -> ( Int , Int ) -> m ( Int , String , String -> String )
go ( pos , todo , prepend ) ( off , len ) =
let ( prematch , matchandrest ) = splitAt ( off - pos ) todo
( matched , rest ) = splitAt len matchandrest
in transform matched >>= \ matched' -> pure ( off + len , rest , prepend . ( prematch ++ ) . ( matched' ++ ) )