2015-04-29 00:06:22 +03:00
|
|
|
-- | String formatting helpers, starting to get a bit out of control.
|
|
|
|
|
2015-08-19 23:47:26 +03:00
|
|
|
module Hledger.Utils.String (
|
2020-07-16 13:28:48 +03:00
|
|
|
takeEnd,
|
2015-08-19 23:47:26 +03:00
|
|
|
-- * misc
|
|
|
|
lowercase,
|
|
|
|
uppercase,
|
|
|
|
underline,
|
|
|
|
stripbrackets,
|
|
|
|
-- quoting
|
|
|
|
quoteIfNeeded,
|
|
|
|
singleQuoteIfNeeded,
|
|
|
|
-- quotechars,
|
|
|
|
-- whitespacechars,
|
|
|
|
words',
|
|
|
|
unwords',
|
2017-04-26 04:27:25 +03:00
|
|
|
stripAnsi,
|
2015-08-19 23:47:26 +03:00
|
|
|
-- * single-line layout
|
|
|
|
strip,
|
|
|
|
lstrip,
|
|
|
|
rstrip,
|
|
|
|
chomp,
|
2020-12-31 03:31:13 +03:00
|
|
|
chomp1,
|
2019-10-12 02:50:06 +03:00
|
|
|
singleline,
|
2015-08-19 23:47:26 +03:00
|
|
|
elideLeft,
|
|
|
|
elideRight,
|
2015-08-20 06:28:24 +03:00
|
|
|
formatString,
|
2015-09-29 07:33:18 +03:00
|
|
|
-- * wide-character-aware layout
|
lib: textification begins! account names
The first of several conversions from String to (strict) Text, hopefully
reducing space and time usage.
This one shows a small improvement, with GHC 7.10.3 and text-1.2.2.1:
hledger -f data/100x100x10.journal stats
string: <<ghc: 39471064 bytes, 77 GCs, 198421/275048 avg/max bytes residency (3 samples), 2M in use, 0.000 INIT (0.001 elapsed), 0.015 MUT (0.020 elapsed), 0.010 GC (0.014 elapsed) :ghc>>
text: <<ghc: 39268024 bytes, 77 GCs, 197018/270840 avg/max bytes residency (3 samples), 2M in use, 0.000 INIT (0.002 elapsed), 0.016 MUT (0.022 elapsed), 0.009 GC (0.011 elapsed) :ghc>>
hledger -f data/1000x100x10.journal stats
string: <<ghc: 318555920 bytes, 617 GCs, 2178997/7134472 avg/max bytes residency (7 samples), 16M in use, 0.000 INIT (0.001 elapsed), 0.129 MUT (0.136 elapsed), 0.067 GC (0.077 elapsed) :ghc>>
text: <<ghc: 314248496 bytes, 612 GCs, 2074045/6617960 avg/max bytes residency (7 samples), 16M in use, 0.000 INIT (0.003 elapsed), 0.137 MUT (0.145 elapsed), 0.067 GC (0.079 elapsed) :ghc>>
hledger -f data/10000x100x10.journal stats
string: <<ghc: 3114763608 bytes, 6026 GCs, 18858950/75552024 avg/max bytes residency (11 samples), 201M in use, 0.000 INIT (0.000 elapsed), 1.331 MUT (1.372 elapsed), 0.699 GC (0.812 elapsed) :ghc>>
text: <<ghc: 3071468920 bytes, 5968 GCs, 14120344/62951360 avg/max bytes residency (9 samples), 124M in use, 0.000 INIT (0.003 elapsed), 1.272 MUT (1.349 elapsed), 0.513 GC (0.578 elapsed) :ghc>>
hledger -f data/100000x100x10.journal stats
string: <<ghc: 31186579432 bytes, 60278 GCs, 135332581/740228992 avg/max bytes residency (13 samples), 1697M in use, 0.000 INIT (0.008 elapsed), 14.677 MUT (15.508 elapsed), 7.081 GC (8.074 elapsed) :ghc>>
text: <<ghc: 30753427672 bytes, 59763 GCs, 117595958/666457240 avg/max bytes residency (14 samples), 1588M in use, 0.000 INIT (0.008 elapsed), 13.713 MUT (13.966 elapsed), 6.220 GC (7.108 elapsed) :ghc>>
2016-05-24 04:16:21 +03:00
|
|
|
charWidth,
|
2015-09-29 07:33:18 +03:00
|
|
|
strWidth,
|
2020-11-02 03:32:53 +03:00
|
|
|
strWidthAnsi,
|
2015-09-29 07:33:18 +03:00
|
|
|
takeWidth,
|
2015-08-19 23:47:26 +03:00
|
|
|
) where
|
|
|
|
|
|
|
|
|
fix a slowdown with report rendering in 1.19.1 (#1350)
stripAnsi is called many times during rendering (by strWidth), so
should be fast. It was originally a regex replacement, and more
recently a custom parser. The parser was slower, particularly the one
in 1.19.1. See #1350, and this rough test:
time118ish = timeIt $ print $ length $ concat $ map (fromRight undefined . regexReplace (toRegex' "\ESC\\[([0-9]+;)*([0-9]+)?[ABCDHJKfmsu]") "") testdata
time119 = timeparser (many (takeWhile1P Nothing (/='\ESC') <|> "" <$ ansi))
time1191 = timeparser (many ("" <$ try ansi <|> pure <$> anySingle))
timeparser p = timeIt $ print $ length $ concat $ map (concat . fromJust . parseMaybe p) testdata
testdata = concat $ replicate 10000
[ "2008-01-01 income assets:bank:checking $1 $1"
, "2008-06-01 gift assets:bank:checking $1 $2"
, "2008-06-02 save assets:bank:saving $1 $3"
, " assets:bank:checking ..m$-1\ESC[m\ESC[m $2"
, "2008-06-03 eat & shop assets:cash ..m$-2\ESC[m\ESC[m 0"
, "2008-12-31 pay off assets:bank:checking ..m$-1\ESC[m\ESC[m ..m$-1\ESC[m\ESC[m"
]
ghci> time118ish
4560000
CPU time: 0.17s
ghci> time119
4560000
CPU time: 0.91s
ghci> time1191
4560000
CPU time: 2.76s
Possibly a more careful parser could beat regexReplace. Note the
latter does memoisation, which could be faster and/or could also use
more resident memory in some situations.
Ideally we would calculate all widths before adding ANSI colour codes,
so we wouldn't have to wastefully strip them.
2020-09-11 03:46:16 +03:00
|
|
|
import Data.Char (isSpace, toLower, toUpper)
|
2020-11-04 02:44:15 +03:00
|
|
|
import Data.List (intercalate)
|
2020-11-09 08:54:28 +03:00
|
|
|
import qualified Data.Text as T
|
fix a slowdown with report rendering in 1.19.1 (#1350)
stripAnsi is called many times during rendering (by strWidth), so
should be fast. It was originally a regex replacement, and more
recently a custom parser. The parser was slower, particularly the one
in 1.19.1. See #1350, and this rough test:
time118ish = timeIt $ print $ length $ concat $ map (fromRight undefined . regexReplace (toRegex' "\ESC\\[([0-9]+;)*([0-9]+)?[ABCDHJKfmsu]") "") testdata
time119 = timeparser (many (takeWhile1P Nothing (/='\ESC') <|> "" <$ ansi))
time1191 = timeparser (many ("" <$ try ansi <|> pure <$> anySingle))
timeparser p = timeIt $ print $ length $ concat $ map (concat . fromJust . parseMaybe p) testdata
testdata = concat $ replicate 10000
[ "2008-01-01 income assets:bank:checking $1 $1"
, "2008-06-01 gift assets:bank:checking $1 $2"
, "2008-06-02 save assets:bank:saving $1 $3"
, " assets:bank:checking ..m$-1\ESC[m\ESC[m $2"
, "2008-06-03 eat & shop assets:cash ..m$-2\ESC[m\ESC[m 0"
, "2008-12-31 pay off assets:bank:checking ..m$-1\ESC[m\ESC[m ..m$-1\ESC[m\ESC[m"
]
ghci> time118ish
4560000
CPU time: 0.17s
ghci> time119
4560000
CPU time: 0.91s
ghci> time1191
4560000
CPU time: 2.76s
Possibly a more careful parser could beat regexReplace. Note the
latter does memoisation, which could be faster and/or could also use
more resident memory in some situations.
Ideally we would calculate all widths before adding ANSI colour codes,
so we wouldn't have to wastefully strip them.
2020-09-11 03:46:16 +03:00
|
|
|
import Text.Megaparsec ((<|>), between, many, noneOf, sepBy)
|
|
|
|
import Text.Megaparsec.Char (char)
|
2015-08-20 06:28:24 +03:00
|
|
|
import Text.Printf (printf)
|
2015-08-19 23:47:26 +03:00
|
|
|
|
|
|
|
import Hledger.Utils.Parse
|
fix a slowdown with report rendering in 1.19.1 (#1350)
stripAnsi is called many times during rendering (by strWidth), so
should be fast. It was originally a regex replacement, and more
recently a custom parser. The parser was slower, particularly the one
in 1.19.1. See #1350, and this rough test:
time118ish = timeIt $ print $ length $ concat $ map (fromRight undefined . regexReplace (toRegex' "\ESC\\[([0-9]+;)*([0-9]+)?[ABCDHJKfmsu]") "") testdata
time119 = timeparser (many (takeWhile1P Nothing (/='\ESC') <|> "" <$ ansi))
time1191 = timeparser (many ("" <$ try ansi <|> pure <$> anySingle))
timeparser p = timeIt $ print $ length $ concat $ map (concat . fromJust . parseMaybe p) testdata
testdata = concat $ replicate 10000
[ "2008-01-01 income assets:bank:checking $1 $1"
, "2008-06-01 gift assets:bank:checking $1 $2"
, "2008-06-02 save assets:bank:saving $1 $3"
, " assets:bank:checking ..m$-1\ESC[m\ESC[m $2"
, "2008-06-03 eat & shop assets:cash ..m$-2\ESC[m\ESC[m 0"
, "2008-12-31 pay off assets:bank:checking ..m$-1\ESC[m\ESC[m ..m$-1\ESC[m\ESC[m"
]
ghci> time118ish
4560000
CPU time: 0.17s
ghci> time119
4560000
CPU time: 0.91s
ghci> time1191
4560000
CPU time: 2.76s
Possibly a more careful parser could beat regexReplace. Note the
latter does memoisation, which could be faster and/or could also use
more resident memory in some situations.
Ideally we would calculate all widths before adding ANSI colour codes,
so we wouldn't have to wastefully strip them.
2020-09-11 03:46:16 +03:00
|
|
|
import Hledger.Utils.Regex (toRegex', regexReplace)
|
2020-12-24 03:18:25 +03:00
|
|
|
import Text.WideString (charWidth, strWidth)
|
2020-11-04 02:44:15 +03:00
|
|
|
|
2020-07-16 13:28:48 +03:00
|
|
|
|
|
|
|
-- | Take elements from the end of a list.
|
|
|
|
takeEnd n l = go (drop n l) l
|
|
|
|
where
|
|
|
|
go (_:xs) (_:ys) = go xs ys
|
|
|
|
go [] r = r
|
|
|
|
go _ [] = []
|
|
|
|
|
2015-08-19 23:47:26 +03:00
|
|
|
lowercase, uppercase :: String -> String
|
|
|
|
lowercase = map toLower
|
|
|
|
uppercase = map toUpper
|
|
|
|
|
|
|
|
-- | Remove leading and trailing whitespace.
|
|
|
|
strip :: String -> String
|
|
|
|
strip = lstrip . rstrip
|
|
|
|
|
|
|
|
-- | Remove leading whitespace.
|
|
|
|
lstrip :: String -> String
|
2016-12-20 20:29:12 +03:00
|
|
|
lstrip = dropWhile isSpace
|
2015-08-19 23:47:26 +03:00
|
|
|
|
|
|
|
-- | Remove trailing whitespace.
|
|
|
|
rstrip :: String -> String
|
|
|
|
rstrip = reverse . lstrip . reverse
|
|
|
|
|
2020-12-31 03:31:13 +03:00
|
|
|
-- | Remove all trailing newlines/carriage returns.
|
2015-08-19 23:47:26 +03:00
|
|
|
chomp :: String -> String
|
|
|
|
chomp = reverse . dropWhile (`elem` "\r\n") . reverse
|
|
|
|
|
2020-12-31 03:31:13 +03:00
|
|
|
-- | Remove all trailing newline/carriage returns, leaving just one trailing newline.
|
|
|
|
chomp1 :: String -> String
|
|
|
|
chomp1 = (++"\n") . chomp
|
|
|
|
|
2020-01-05 04:37:55 +03:00
|
|
|
-- | Remove consecutive line breaks, replacing them with single space
|
2019-10-12 02:50:06 +03:00
|
|
|
singleline :: String -> String
|
2019-10-12 12:16:38 +03:00
|
|
|
singleline = unwords . filter (/="") . (map strip) . lines
|
2019-10-12 02:50:06 +03:00
|
|
|
|
2015-08-19 23:47:26 +03:00
|
|
|
stripbrackets :: String -> String
|
|
|
|
stripbrackets = dropWhile (`elem` "([") . reverse . dropWhile (`elem` "])") . reverse :: String -> String
|
|
|
|
|
|
|
|
elideLeft :: Int -> String -> String
|
|
|
|
elideLeft width s =
|
2020-07-16 13:28:48 +03:00
|
|
|
if length s > width then ".." ++ takeEnd (width - 2) s else s
|
2015-08-19 23:47:26 +03:00
|
|
|
|
|
|
|
elideRight :: Int -> String -> String
|
|
|
|
elideRight width s =
|
|
|
|
if length s > width then take (width - 2) s ++ ".." else s
|
|
|
|
|
2015-08-20 06:28:24 +03:00
|
|
|
-- | Clip and pad a string to a minimum & maximum width, and/or left/right justify it.
|
|
|
|
-- Works on multi-line strings too (but will rewrite non-unix line endings).
|
|
|
|
formatString :: Bool -> Maybe Int -> Maybe Int -> String -> String
|
|
|
|
formatString leftJustified minwidth maxwidth s = intercalate "\n" $ map (printf fmt) $ lines s
|
|
|
|
where
|
|
|
|
justify = if leftJustified then "-" else ""
|
|
|
|
minwidth' = maybe "" show minwidth
|
|
|
|
maxwidth' = maybe "" (("."++).show) maxwidth
|
|
|
|
fmt = "%" ++ justify ++ minwidth' ++ maxwidth' ++ "s"
|
|
|
|
|
2015-08-19 23:47:26 +03:00
|
|
|
underline :: String -> String
|
|
|
|
underline s = s' ++ replicate (length s) '-' ++ "\n"
|
|
|
|
where s'
|
|
|
|
| last s == '\n' = s
|
|
|
|
| otherwise = s ++ "\n"
|
|
|
|
|
|
|
|
-- | Double-quote this string if it contains whitespace, single quotes
|
|
|
|
-- or double-quotes, escaping the quotes as needed.
|
|
|
|
quoteIfNeeded :: String -> String
|
2020-10-07 07:30:28 +03:00
|
|
|
quoteIfNeeded s | any (`elem` s) (quotechars++whitespacechars++redirectchars) = showChar '"' $ escapeQuotes s "\""
|
2015-08-19 23:47:26 +03:00
|
|
|
| otherwise = s
|
2020-10-07 07:30:28 +03:00
|
|
|
where
|
|
|
|
escapeQuotes [] x = x
|
|
|
|
escapeQuotes ('"':cs) x = showString "\\\"" $ escapeQuotes cs x
|
|
|
|
escapeQuotes (c:cs) x = showChar c $ escapeQuotes cs x
|
2020-09-01 04:41:55 +03:00
|
|
|
|
2015-08-19 23:47:26 +03:00
|
|
|
-- | Single-quote this string if it contains whitespace or double-quotes.
|
|
|
|
-- No good for strings containing single quotes.
|
|
|
|
singleQuoteIfNeeded :: String -> String
|
2020-10-07 07:30:28 +03:00
|
|
|
singleQuoteIfNeeded s | any (`elem` s) (quotechars++whitespacechars) = "'"++s++"'"
|
2015-08-19 23:47:26 +03:00
|
|
|
| otherwise = s
|
|
|
|
|
2019-11-30 16:34:59 +03:00
|
|
|
quotechars, whitespacechars, redirectchars :: [Char]
|
2015-08-19 23:47:26 +03:00
|
|
|
quotechars = "'\""
|
|
|
|
whitespacechars = " \t\n\r"
|
2019-11-30 16:34:59 +03:00
|
|
|
redirectchars = "<>"
|
2015-08-19 23:47:26 +03:00
|
|
|
|
|
|
|
-- | Quote-aware version of words - don't split on spaces which are inside quotes.
|
|
|
|
-- NB correctly handles "a'b" but not "''a''". Can raise an error if parsing fails.
|
|
|
|
words' :: String -> [String]
|
|
|
|
words' "" = []
|
2016-07-29 18:57:10 +03:00
|
|
|
words' s = map stripquotes $ fromparse $ parsewithString p s
|
2015-08-19 23:47:26 +03:00
|
|
|
where
|
2020-07-20 18:09:46 +03:00
|
|
|
p = do ss <- (singleQuotedPattern <|> doubleQuotedPattern <|> pattern) `sepBy` skipNonNewlineSpaces1
|
2015-08-19 23:47:26 +03:00
|
|
|
-- eof
|
|
|
|
return ss
|
|
|
|
pattern = many (noneOf whitespacechars)
|
|
|
|
singleQuotedPattern = between (char '\'') (char '\'') (many $ noneOf "'")
|
|
|
|
doubleQuotedPattern = between (char '"') (char '"') (many $ noneOf "\"")
|
|
|
|
|
|
|
|
-- | Quote-aware version of unwords - single-quote strings which contain whitespace
|
|
|
|
unwords' :: [String] -> String
|
|
|
|
unwords' = unwords . map quoteIfNeeded
|
|
|
|
|
|
|
|
-- | Strip one matching pair of single or double quotes on the ends of a string.
|
|
|
|
stripquotes :: String -> String
|
|
|
|
stripquotes s = if isSingleQuoted s || isDoubleQuoted s then init $ tail s else s
|
|
|
|
|
|
|
|
isSingleQuoted s@(_:_:_) = head s == '\'' && last s == '\''
|
|
|
|
isSingleQuoted _ = False
|
|
|
|
|
|
|
|
isDoubleQuoted s@(_:_:_) = head s == '"' && last s == '"'
|
|
|
|
isDoubleQuoted _ = False
|
|
|
|
|
2015-09-29 07:33:18 +03:00
|
|
|
-- Functions below treat wide (eg CJK) characters as double-width.
|
|
|
|
|
2015-04-29 00:06:22 +03:00
|
|
|
-- | Double-width-character-aware string truncation. Take as many
|
|
|
|
-- characters as possible from a string without exceeding the
|
|
|
|
-- specified width. Eg takeWidth 3 "りんご" = "り".
|
|
|
|
takeWidth :: Int -> String -> String
|
|
|
|
takeWidth _ "" = ""
|
|
|
|
takeWidth 0 _ = ""
|
|
|
|
takeWidth w (c:cs) | cw <= w = c:takeWidth (w-cw) cs
|
|
|
|
| otherwise = ""
|
|
|
|
where cw = charWidth c
|
|
|
|
|
2020-11-02 03:32:53 +03:00
|
|
|
-- | Like strWidth, but also strips ANSI escape sequences before
|
|
|
|
-- calculating the width.
|
|
|
|
--
|
|
|
|
-- This is no longer used in code, as widths are calculated before
|
|
|
|
-- adding ANSI escape sequences, but is being kept around for now.
|
|
|
|
strWidthAnsi :: String -> Int
|
|
|
|
strWidthAnsi = strWidth . stripAnsi
|
2017-04-26 04:27:25 +03:00
|
|
|
|
2020-09-07 04:12:46 +03:00
|
|
|
-- | Strip ANSI escape sequences from a string.
|
|
|
|
--
|
|
|
|
-- >>> stripAnsi "\ESC[31m-1\ESC[m"
|
|
|
|
-- "-1"
|
2017-04-26 04:27:25 +03:00
|
|
|
stripAnsi :: String -> String
|
fix a slowdown with report rendering in 1.19.1 (#1350)
stripAnsi is called many times during rendering (by strWidth), so
should be fast. It was originally a regex replacement, and more
recently a custom parser. The parser was slower, particularly the one
in 1.19.1. See #1350, and this rough test:
time118ish = timeIt $ print $ length $ concat $ map (fromRight undefined . regexReplace (toRegex' "\ESC\\[([0-9]+;)*([0-9]+)?[ABCDHJKfmsu]") "") testdata
time119 = timeparser (many (takeWhile1P Nothing (/='\ESC') <|> "" <$ ansi))
time1191 = timeparser (many ("" <$ try ansi <|> pure <$> anySingle))
timeparser p = timeIt $ print $ length $ concat $ map (concat . fromJust . parseMaybe p) testdata
testdata = concat $ replicate 10000
[ "2008-01-01 income assets:bank:checking $1 $1"
, "2008-06-01 gift assets:bank:checking $1 $2"
, "2008-06-02 save assets:bank:saving $1 $3"
, " assets:bank:checking ..m$-1\ESC[m\ESC[m $2"
, "2008-06-03 eat & shop assets:cash ..m$-2\ESC[m\ESC[m 0"
, "2008-12-31 pay off assets:bank:checking ..m$-1\ESC[m\ESC[m ..m$-1\ESC[m\ESC[m"
]
ghci> time118ish
4560000
CPU time: 0.17s
ghci> time119
4560000
CPU time: 0.91s
ghci> time1191
4560000
CPU time: 2.76s
Possibly a more careful parser could beat regexReplace. Note the
latter does memoisation, which could be faster and/or could also use
more resident memory in some situations.
Ideally we would calculate all widths before adding ANSI colour codes,
so we wouldn't have to wastefully strip them.
2020-09-11 03:46:16 +03:00
|
|
|
stripAnsi s = either err id $ regexReplace ansire "" s
|
|
|
|
where
|
|
|
|
err = error "stripAnsi: invalid replacement pattern" -- PARTIAL, shouldn't happen
|
2020-12-27 02:52:39 +03:00
|
|
|
ansire = toRegex' $ T.pack "\ESC\\[([0-9]+;)*([0-9]+)?[ABCDHJKfmsu]" -- PARTIAL, should succeed
|