{-# LANGUAGE ScopedTypeVariables #-} {-| Easy regular expression helpers, currently based on regex-tdfa. These should: - be cross-platform, not requiring C libraries - support unicode - support extended regular expressions - support replacement, with backreferences etc. - support splitting - have mnemonic names - have simple monomorphic types - work with simple strings Regex strings are automatically compiled into regular expressions the first time they are seen, and these are cached. If you use a huge number of unique regular expressions this might lead to increased memory usage. Several functions have memoised variants (*Memo), which also trade space for time. Current limitations: - (?i) and similar are not supported -} module Hledger.Utils.Regex ( -- * type aliases Regexp ,Replacement -- * partial regex operations (may call error) ,regexMatches ,regexMatchesCI ,regexReplace ,regexReplaceCI ,regexReplaceMemo ,regexReplaceCIMemo ,regexReplaceBy ,regexReplaceByCI -- * total regex operations ,regexMatches_ ,regexMatchesCI_ ,regexReplace_ ,regexReplaceCI_ ,regexReplaceMemo_ ,regexReplaceCIMemo_ ,regexReplaceBy_ ,regexReplaceByCI_ ) where import Control.Monad (foldM) import Data.Array import Data.Char import Data.List (foldl') import Data.Maybe (fromMaybe) import Data.MemoUgly (memo) import Text.Regex.TDFA ( Regex, CompOption(..), ExecOption(..), defaultCompOpt, defaultExecOpt, makeRegexOptsM, AllMatches(getAllMatches), match, (=~), MatchText ) import Hledger.Utils.UTF8IOCompat (error') -- | Regular expression. Extended regular expression-ish syntax ? But does not support eg (?i) syntax. type Regexp = String -- | A replacement pattern. May include numeric backreferences (\N). type Replacement = String -- | An regular expression compilation/processing error message. type Error = String -------------------------------------------------------------------------------- -- old partial functions -- PARTIAL: -- regexMatch' :: RegexContext Regexp String a => Regexp -> String -> a -- regexMatch' r s = s =~ (toRegex' r) regexMatches :: Regexp -> String -> Bool regexMatches = flip (=~) regexMatchesCI :: Regexp -> String -> Bool regexMatchesCI r = match (toRegexCI r) -- | Replace all occurrences of the regexp with the replacement -- pattern. The replacement pattern supports numeric backreferences -- (\N) but no other RE syntax. regexReplace :: Regexp -> Replacement -> String -> String regexReplace re = replaceRegex (toRegex re) regexReplaceCI :: Regexp -> Replacement -> String -> String regexReplaceCI re = replaceRegex (toRegexCI re) -- | A memoising version of regexReplace. Caches the result for each -- search pattern, replacement pattern, target string tuple. regexReplaceMemo :: Regexp -> Replacement -> String -> String regexReplaceMemo re repl = memo (regexReplace re repl) regexReplaceCIMemo :: Regexp -> Replacement -> String -> String regexReplaceCIMemo re repl = memo (regexReplaceCI re repl) -- | Replace all occurrences of the regexp, transforming each match with the given function. regexReplaceBy :: Regexp -> (String -> String) -> String -> String regexReplaceBy r = replaceAllBy (toRegex r) regexReplaceByCI :: Regexp -> (String -> String) -> String -> String regexReplaceByCI r = replaceAllBy (toRegexCI r) -- helpers -- | Convert our string-based Regexp to a real Regex. -- Or if it's not well formed, call error with a "malformed regexp" message. toRegex :: Regexp -> Regex toRegex = memo (compileRegex defaultCompOpt defaultExecOpt) -- PARTIAL: -- | Like toRegex but make a case-insensitive Regex. toRegexCI :: Regexp -> Regex toRegexCI = memo (compileRegex defaultCompOpt{caseSensitive=False} defaultExecOpt) -- PARTIAL: compileRegex :: CompOption -> ExecOption -> Regexp -> Regex compileRegex compopt execopt r = fromMaybe (error $ "this regular expression could not be compiled: " ++ show r) $ -- PARTIAL: makeRegexOptsM compopt execopt r replaceRegex :: Regex -> Replacement -> String -> String replaceRegex re repl s = foldl (replaceMatch repl) s (reverse $ match re s :: [MatchText String]) where replaceMatch :: Replacement -> String -> MatchText String -> String replaceMatch replpat s matchgroups = pre ++ repl ++ post where ((_,(off,len)):_) = elems matchgroups -- groups should have 0-based indexes, and there should always be at least one, since this is a match (pre, post') = splitAt off s post = drop len post' repl = replaceAllBy (toRegex "\\\\[0-9]+") (lookupMatchGroup matchgroups) replpat where lookupMatchGroup :: MatchText String -> String -> String lookupMatchGroup grps ('\\':s@(_:_)) | all isDigit s = case read s of n | n `elem` indices grps -> fst (grps ! n) -- PARTIAL: _ -> error' $ "no match group exists for backreference \"\\"++s++"\"" lookupMatchGroup _ s = error' $ "lookupMatchGroup called on non-numeric-backreference \""++s++"\", shouldn't happen" -------------------------------------------------------------------------------- -- new total functions -- | Does this regexp match the given string ? -- Or return an error if the regexp is malformed. regexMatches_ :: Regexp -> String -> Either Error Bool regexMatches_ r s = (`match` s) <$> toRegex_ r -- | Like regexMatches_ but match case-insensitively. regexMatchesCI_ :: Regexp -> String -> Either Error Bool regexMatchesCI_ r s = (`match` s) <$> toRegexCI_ r -- | Replace all occurrences of the regexp with the replacement -- pattern, or return an error message. The replacement pattern -- supports numeric backreferences (\N) but no other RE syntax. regexReplace_ :: Regexp -> Replacement -> String -> Either Error String regexReplace_ re repl s = toRegex_ re >>= \rx -> replaceRegex_ rx repl s -- | Like regexReplace_ but match occurrences case-insensitively. regexReplaceCI_ :: Regexp -> Replacement -> String -> Either Error String regexReplaceCI_ re repl s = toRegexCI_ re >>= \rx -> replaceRegex_ rx repl s -- | A memoising version of regexReplace_. Caches the result for each -- search pattern, replacement pattern, target string tuple. regexReplaceMemo_ :: Regexp -> Replacement -> String -> Either Error String regexReplaceMemo_ re repl = memo (regexReplace_ re repl) -- | Like regexReplaceMemo_ but match occurrences case-insensitively. regexReplaceCIMemo_ :: Regexp -> Replacement -> String -> Either Error String regexReplaceCIMemo_ re repl = memo (regexReplaceCI_ re repl) -- | Replace all occurrences of the regexp, transforming each match -- with the given function, or return an error message. regexReplaceBy_ :: Regexp -> (String -> String) -> String -> Either Error String regexReplaceBy_ r f s = toRegex_ r >>= \rx -> Right $ replaceAllBy rx f s -- | Like regexReplaceBy_ but match occurrences case-insensitively. regexReplaceByCI_ :: Regexp -> (String -> String) -> String -> Either Error String regexReplaceByCI_ r f s = toRegexCI_ r >>= \rx -> Right $ replaceAllBy rx f s -- helpers: -- Convert a Regexp string to a compiled Regex, or return an error message. toRegex_ :: Regexp -> Either Error Regex toRegex_ = memo (compileRegex_ defaultCompOpt defaultExecOpt) -- Like toRegex, but make a case-insensitive Regex. toRegexCI_ :: Regexp -> Either Error Regex toRegexCI_ = memo (compileRegex_ defaultCompOpt{caseSensitive=False} defaultExecOpt) -- Compile a Regexp string to a Regex with the given options, or return an -- error message if this fails. compileRegex_ :: CompOption -> ExecOption -> Regexp -> Either Error Regex compileRegex_ compopt execopt r = maybe (Left $ "this regular expression could not be compiled: " ++ show r) Right $ makeRegexOptsM compopt execopt r -- Replace this regular expression with this replacement pattern in this -- string, or return an error message. replaceRegex_ :: Regex -> Replacement -> String -> Either Error String replaceRegex_ re repl s = foldM (replaceMatch_ repl) s (reverse $ match re s :: [MatchText String]) where -- Replace one match within the string with the replacement text -- appropriate for this match. Or return an error message. replaceMatch_ :: Replacement -> String -> MatchText String -> Either Error String replaceMatch_ replpat s matchgroups = erepl >>= \repl -> Right $ pre ++ repl ++ post where ((_,(off,len)):_) = elems matchgroups -- groups should have 0-based indexes, and there should always be at least one, since this is a match (pre, post') = splitAt off s post = drop len post' -- The replacement text: the replacement pattern with all -- numeric backreferences replaced by the appropriate groups -- from this match. Or an error message. erepl = toRegex_ "\\\\[0-9]+" >>= \rx -> replaceAllByM rx (lookupMatchGroup_ matchgroups) replpat where -- Given some match groups and a numeric backreference, -- return the referenced group text, or an error message. lookupMatchGroup_ :: MatchText String -> String -> Either Error String lookupMatchGroup_ grps ('\\':s@(_:_)) | all isDigit s = case read s of n | n `elem` indices grps -> Right $ fst (grps ! n) _ -> Left $ "no match group exists for backreference \"\\"++s++"\"" lookupMatchGroup_ _ s = Left $ "lookupMatchGroup called on non-numeric-backreference \""++s++"\", shouldn't happen" -- helpers -- adapted from http://stackoverflow.com/questions/9071682/replacement-substition-with-haskell-regex-libraries: -- Replace all occurrences of a regexp in a string, transforming each match -- with the given pure function. replaceAllBy :: Regex -> (String -> String) -> String -> String replaceAllBy re transform s = prependdone rest where (_, rest, prependdone) = foldl' go (0, s, id) matches where matches = getAllMatches $ match re s :: [(Int, Int)] -- offset and length go :: (Int,String,String->String) -> (Int,Int) -> (Int,String,String->String) go (pos,todo,prepend) (off,len) = let (prematch, matchandrest) = splitAt (off - pos) todo (matched, rest) = splitAt len matchandrest in (off + len, rest, prepend . (prematch++) . (transform matched ++)) -- Replace all occurrences of a regexp in a string, transforming each match -- with the given monadic function. Eg if the monad is Either, a Left result -- from the transform function short-circuits and is returned as the overall -- result. replaceAllByM :: forall m. Monad m => Regex -> (String -> m String) -> String -> m String replaceAllByM re transform s = foldM go (0, s, id) matches >>= \(_, rest, prependdone) -> pure $ prependdone rest where matches = getAllMatches $ match re s :: [(Int, Int)] -- offset and length go :: (Int,String,String->String) -> (Int,Int) -> m (Int,String,String->String) go (pos,todo,prepend) (off,len) = let (prematch, matchandrest) = splitAt (off - pos) todo (matched, rest) = splitAt len matchandrest in transform matched >>= \matched' -> pure (off + len, rest, prepend . (prematch++) . (matched' ++))