mirror of
https://github.com/serokell/haskell-with-utf8.git
synced 2024-10-26 07:51:16 +03:00
Allow not changing file handles to UTF-8
The old interface only allowed one to perform terminal detection on a handle and change it to UTF-8 if it was not a terminal. Turns our, it makes sense to sometimes leave the original encoding on a handle even if it does not point to a terminal. * Change the interface to separate these two use-cases. * Extract main-wrappers to a new `Main` module.
This commit is contained in:
parent
8a76bb2577
commit
40069d4b2b
@ -12,9 +12,12 @@ Initial release.
|
||||
|
||||
### Added
|
||||
|
||||
- `withUtf8StdHandles`
|
||||
- `hSetEncoding`
|
||||
- `hWithEncoding`
|
||||
- `withUtf8`
|
||||
- `withStdTerminalHandles`
|
||||
- `setHandleEncoding`
|
||||
- `withHandle`
|
||||
- `setTerminalHandleEncoding`
|
||||
- `withTerminalHandle`
|
||||
- `openFile`
|
||||
- `withFile`
|
||||
- `readFile`
|
||||
|
@ -24,12 +24,12 @@ import qualified System.IO as IO
|
||||
import qualified System.IO.Utf8 as Utf8
|
||||
|
||||
|
||||
-- | Like @readFile@, but assumes the file is encoded in UTF-8, regardless
|
||||
-- | Like 'T.readFile', but assumes the file is encoded in UTF-8, regardless
|
||||
-- of the current locale.
|
||||
readFile :: MonadIO m => IO.FilePath -> m Text
|
||||
readFile path = Utf8.openFile path IO.ReadMode >>= liftIO . T.hGetContents
|
||||
|
||||
-- | Like @writeFile@, but encodes the data in UTF-8, regardless
|
||||
-- | Like 'T.writeFile', but encodes the data in UTF-8, regardless
|
||||
-- of the current locale.
|
||||
writeFile :: (MonadIO m, MonadMask m) => IO.FilePath -> Text -> m ()
|
||||
writeFile path = Utf8.withFile path IO.WriteMode . (liftIO .) . flip T.hPutStr
|
||||
|
82
lib/Main/Utf8.hs
Normal file
82
lib/Main/Utf8.hs
Normal file
@ -0,0 +1,82 @@
|
||||
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
|
||||
-
|
||||
- SPDX-License-Identifier: MPL-2.0
|
||||
-}
|
||||
|
||||
-----------------------------------------------------------------------------
|
||||
-- |
|
||||
--
|
||||
-- Functions in this module will help you make your /executable/ work
|
||||
-- correctly with encodings of text files and standard handles.
|
||||
--
|
||||
-- /Note: if you are developing a library, see "System.IO.Utf8"./
|
||||
--
|
||||
-- = Quick start
|
||||
--
|
||||
-- Wrap a call to 'withUtf8' around your @main@:
|
||||
--
|
||||
-- @
|
||||
-- import Main.Utf8 (withUtf8)
|
||||
--
|
||||
-- main :: IO ()
|
||||
-- main = 'withUtf8' $ do
|
||||
-- putStrLn "Hello, мир!"
|
||||
-- @
|
||||
--
|
||||
-- Basically, this is all you have to do for a program that uses
|
||||
-- @stdin@ and @stdout@ to interact with the user. However, some
|
||||
-- programs read input from and write output to files and,
|
||||
-- at the same time, allow the user to redirect @stdin@ and @stdout@
|
||||
-- instead of providing explicit file names.
|
||||
--
|
||||
-- If this is the case for your executable, you should also wrap
|
||||
-- @Utf8.@'System.IO.Utf8.withHandle' around the code that passes
|
||||
-- the handle to a third-party library. It is not necessary to do
|
||||
-- when passing it to your own library, assuming that it follows
|
||||
-- the recommendations from the documentation of "System.IO.Utf8".
|
||||
module Main.Utf8
|
||||
( withUtf8
|
||||
, withStdTerminalHandles
|
||||
) where
|
||||
|
||||
import Control.Exception.Safe (MonadMask, bracket)
|
||||
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||
import GHC.IO.Encoding (getLocaleEncoding, setLocaleEncoding, utf8)
|
||||
import System.IO (stderr, stdin, stdout)
|
||||
|
||||
import System.IO.Utf8 (withTerminalHandle)
|
||||
|
||||
|
||||
-- | Make standard handles safe to write anything to them and change
|
||||
-- program-global default file handle encoding to UTF-8.
|
||||
--
|
||||
-- This function will:
|
||||
--
|
||||
-- 1. Adjust the encoding of 'stdin', 'stdout', and 'stderr' to
|
||||
-- enable transliteration, like 'withStdTerminalHandles' does.
|
||||
-- 2. Call 'setLocaleEncoding' to change the program-global locale
|
||||
-- encoding to UTF-8.
|
||||
-- 3. Undo everything when the wrapped action finishes.
|
||||
withUtf8 :: (MonadIO m, MonadMask m) => m r -> m r
|
||||
withUtf8 act = withStdTerminalHandles $
|
||||
bracket
|
||||
(liftIO $ getLocaleEncoding <* setLocaleEncoding utf8)
|
||||
(liftIO . setLocaleEncoding)
|
||||
(\_ -> act)
|
||||
|
||||
-- | Make standard handles safe to write anything to them.
|
||||
--
|
||||
-- This function will for each of 'stdin', 'stdout', 'stderr' do:
|
||||
--
|
||||
-- 1. Tweak the existing encoding so that unrepresentable characters
|
||||
-- will get approximated (aka transliterated) by visually similar
|
||||
-- ones or question marks.
|
||||
-- 2. Restore the original encoding when the wrapped action finishes.
|
||||
--
|
||||
-- Use this function only if you do not want to change the program-global
|
||||
-- locale encoding. Otherwise prefer 'withUtf8'.
|
||||
withStdTerminalHandles :: (MonadIO m, MonadMask m) => m r -> m r
|
||||
withStdTerminalHandles
|
||||
= withTerminalHandle stdin
|
||||
. withTerminalHandle stdout
|
||||
. withTerminalHandle stderr
|
@ -5,12 +5,13 @@
|
||||
|
||||
{-# LANGUAGE LambdaCase #-}
|
||||
|
||||
-- | System IO for the modern world.
|
||||
-----------------------------------------------------------------------------
|
||||
-- |
|
||||
--
|
||||
-- Standard IO functions assume that the character encoding of the data
|
||||
-- they read or write is the same as the one used by current locale. In many
|
||||
-- situtations this assumption is wrong, as tools work with files, and
|
||||
-- the files nowadays are mostly UTF-8 encoded, regardless of the locale.
|
||||
-- files nowadays are mostly UTF-8 encoded, regardless of the locale.
|
||||
-- Therefore, it is almost always a good idea to switch the encoding of
|
||||
-- file handles to UTF-8.
|
||||
--
|
||||
@ -18,12 +19,58 @@
|
||||
-- there is an edge-case: if they are attached to a terminal, and the
|
||||
-- encoding is not UTF-8, using UTF-8 might actually be unsafe.
|
||||
--
|
||||
-- Functions in this module help deal with all these issues.
|
||||
-- If you are developing an executable, in most cases, it is enough to
|
||||
-- configure the environment accordingly on program start, see the
|
||||
-- "Main.Utf8" for functions that help with this.
|
||||
-- However, if you are a library author, you should avoid modifying the
|
||||
-- global environment.
|
||||
--
|
||||
-- = Quick start
|
||||
--
|
||||
-- == Opening new files
|
||||
--
|
||||
-- If you need to open a text file, use @Utf8.@'withFile'
|
||||
-- (or @Utf8.@'openFile'). These will not only open the file, but also
|
||||
-- set the handle’s encoding to UTF-8, regardless of the user’s locale.
|
||||
--
|
||||
-- == Working with existing handles
|
||||
--
|
||||
-- Suppose you are creating a function which produces some text and writes
|
||||
-- it to a file handle that is passed to it from the outside.
|
||||
-- Ask yourself this question: do I want to encode this text in UTF-8
|
||||
-- or using the encoding from the user’s locale?
|
||||
--
|
||||
-- In many cases this question is easy to answer. For example, if your
|
||||
-- function produces Haskell code, then you always want it in UTF-8,
|
||||
-- because that is what all other tools (including GHC) expect.
|
||||
--
|
||||
-- In some cases it is not that clear. What you can do then is consider
|
||||
-- what the user is going to do with the data produced.
|
||||
-- If it is, primarily, meant to be displayed on their screen and then
|
||||
-- forgotten, you don’t need UTF-8. On the other hand, if it is meant
|
||||
-- to be saved somewhere and then used or edited by other tools, then
|
||||
-- you need UTF-8.
|
||||
--
|
||||
-- If you decided that your function needs to try to switch the handle
|
||||
-- to UTF-8, it is very easy to achieve:
|
||||
--
|
||||
-- @
|
||||
-- import qualified System.IO.Utf8 as Utf8
|
||||
--
|
||||
-- writeData :: 'IO.Handle' -> InputDataType -> IO ()
|
||||
-- writeData hOut inData = Utf8.'withHandle' hOut $ do
|
||||
-- {- ... write the data ... -}
|
||||
-- @
|
||||
--
|
||||
-- If you decided that you don’t need to try to switch it to UTF-8,
|
||||
-- replace @withHandle@ with 'withTerminalHandle' to only make the
|
||||
-- handle safe to write to without runtime errors.
|
||||
module System.IO.Utf8
|
||||
( withUtf8StdHandles
|
||||
( withHandle
|
||||
, withTerminalHandle
|
||||
|
||||
, hSetEncoding
|
||||
, hWithEncoding
|
||||
, setHandleEncoding
|
||||
, setTerminalHandleEncoding
|
||||
|
||||
, openFile
|
||||
, withFile
|
||||
@ -32,8 +79,7 @@ module System.IO.Utf8
|
||||
import Control.Exception.Safe (MonadMask, bracket)
|
||||
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||
import Data.Functor (void)
|
||||
import GHC.IO.Encoding (mkTextEncoding)
|
||||
import System.IO (stderr, stdin, stdout)
|
||||
import GHC.IO.Encoding (mkTextEncoding, utf8)
|
||||
|
||||
import qualified System.IO as IO
|
||||
|
||||
@ -49,60 +95,74 @@ type EncRestoreAction m = IO.Handle -> m ()
|
||||
-- If the handle is not attached to a terminal, sets UTF-8.
|
||||
-- Otherwise, keeps its current encoding, but augments it to transliterate
|
||||
-- unsupported characters.
|
||||
hSetBestUtf8Enc :: MonadIO m => IO.Handle -> m (EncRestoreAction m)
|
||||
hSetBestUtf8Enc h = liftIO $ do
|
||||
IO.hGetEncoding h >>= chooseBestEnc h >>= \case
|
||||
hSetBestUtf8Enc
|
||||
:: MonadIO m
|
||||
=> (IO.Handle -> IO Bool)
|
||||
-> IO.Handle
|
||||
-> m (EncRestoreAction m)
|
||||
hSetBestUtf8Enc hIsTerm h = liftIO $ do
|
||||
IO.hGetEncoding h >>= chooseBestEnc h hIsTerm >>= \case
|
||||
Keep -> pure (\_ -> pure ())
|
||||
ChangeFromTo enc newName -> do
|
||||
mkTextEncoding newName >>= IO.hSetEncoding h
|
||||
pure $ liftIO . flip IO.hSetEncoding enc
|
||||
|
||||
-- | Configures the encodings of the three standard handles (stdin, stdout, stderr)
|
||||
-- to work with UTF-8 encoded data and runs the specified IO action.
|
||||
-- After the action finishes, restores the original encodings.
|
||||
withUtf8StdHandles :: IO a -> IO a
|
||||
withUtf8StdHandles action =
|
||||
hWithEncoding stdin $
|
||||
hWithEncoding stdout $
|
||||
hWithEncoding stderr $
|
||||
action
|
||||
|
||||
|
||||
-- | Set handle encoding to the best possible.
|
||||
--
|
||||
-- It is safe to call this function on any kind of handle whatsoever.
|
||||
--
|
||||
-- * If the handle is in binary mode, it will do nothing.
|
||||
-- * If the handle is a terminal, it will use the same encoding, but switch
|
||||
-- it to Unicode approximation mode so it won't throw errors on invalid
|
||||
-- byte sequences, but instead try to approximate unecodable characters
|
||||
-- with visually similar encodable ones.
|
||||
-- * For regular files it will always choose UTF-8, of course.
|
||||
--
|
||||
-- You probably shouldn't be using this function. If you open the file
|
||||
-- yourself, use 'openFile' (or, even better, 'withFile') instead.
|
||||
-- If you get the handle from somewhere else, use 'hWithEncoding',
|
||||
-- which will restore the previous encoding when you are done.
|
||||
hSetEncoding :: MonadIO m => IO.Handle -> m ()
|
||||
hSetEncoding = liftIO . void . hSetBestUtf8Enc
|
||||
-- See 'withHandle' for description and prefer it, if possible.
|
||||
setHandleEncoding :: MonadIO m => IO.Handle -> m ()
|
||||
setHandleEncoding = liftIO . void . hSetBestUtf8Enc IO.hIsTerminalDevice
|
||||
|
||||
-- | Temporarily set handle encoding to the best possible.
|
||||
--
|
||||
-- This is like 'hSetEncoding', but it will restore the encoding
|
||||
-- to the previous one when the action is done.
|
||||
hWithEncoding :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r
|
||||
hWithEncoding h = bracket (hSetBestUtf8Enc h) ($ h) . const
|
||||
-- “Best possible” means UTF-8, unless the handle points to a terminal
|
||||
-- device, in which case the encoding will be left the same, but tweaked
|
||||
-- to approximate unencodable characters.
|
||||
--
|
||||
-- This function is safe to call on handles open in binary mode and it will
|
||||
-- do nothing on them.
|
||||
--
|
||||
-- To sum up:
|
||||
--
|
||||
-- * If the handle is in binary mode, do nothing.
|
||||
-- * If the handle points to a terminal device, act like 'withTerminalHandle'.
|
||||
-- * For regular files always choose UTF-8, of course.
|
||||
withHandle :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r
|
||||
withHandle h = bracket (hSetBestUtf8Enc IO.hIsTerminalDevice h) ($ h) . const
|
||||
|
||||
-- | Make a handle safe to write any text to.
|
||||
--
|
||||
-- See 'withTerminalHandle' for description and prefer it, if possible.
|
||||
setTerminalHandleEncoding :: MonadIO m => IO.Handle -> m ()
|
||||
setTerminalHandleEncoding = liftIO . void . hSetBestUtf8Enc (const $ pure True)
|
||||
|
||||
-- | Temporarily make a handle safe to write any text to.
|
||||
--
|
||||
-- If the handle is not using UTF-8, adjust the encoding to remain the same
|
||||
-- as before, but approximate unencodable characters. When the action is done,
|
||||
-- restore it back to the previous one.
|
||||
--
|
||||
-- Use this function only if you are sure you want to treat this handle as
|
||||
-- a terminal (that is, you will be using it to interact with the user
|
||||
-- and to write user-visible messages, rather than something that can
|
||||
-- be reasonable expected to go to a file).
|
||||
--
|
||||
-- This function is safe to call on handles open in binary mode and it will
|
||||
-- do nothing on them.
|
||||
withTerminalHandle :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r
|
||||
withTerminalHandle h = bracket (hSetBestUtf8Enc (const $ pure True) h) ($ h) . const
|
||||
|
||||
|
||||
-- | Like @openFile@, but sets the file encoding to UTF-8, regardless
|
||||
-- | Like 'System.IO.openFile', but sets the file encoding to UTF-8, regardless
|
||||
-- of the current locale.
|
||||
openFile :: MonadIO m => IO.FilePath -> IO.IOMode -> m IO.Handle
|
||||
openFile path mode = do
|
||||
h <- liftIO $ IO.openFile path mode
|
||||
hSetEncoding h
|
||||
openFile path mode = liftIO $ do
|
||||
h <- IO.openFile path mode
|
||||
IO.hSetEncoding h utf8
|
||||
pure h
|
||||
|
||||
-- | Like @withFile@, but sets the file encoding to UTF-8, regardless
|
||||
-- | Like 'System.IO.withFile', but sets the file encoding to UTF-8, regardless
|
||||
-- of the current locale.
|
||||
withFile
|
||||
:: (MonadIO m, MonadMask m)
|
||||
|
@ -36,7 +36,7 @@ data EncodingAction
|
||||
| ChangeFromTo TextEncoding String
|
||||
-- ^ Change the first encoding to the second.
|
||||
|
||||
-- | Pure version of 'chooseBestUtf8Enc'.
|
||||
-- | Pure version of 'chooseBestEnc'.
|
||||
--
|
||||
-- This function is not actually used in the library. It exists only
|
||||
-- for documentation purposes to demonstrate the logic.
|
||||
@ -68,15 +68,19 @@ chooseBestEncPure True (Just name)
|
||||
-- (e.g. to be able to restore it), so we avoid repeating the query.
|
||||
-- 2. It first checks for the cases where it doesn't care whether the device
|
||||
-- is a terminal or not, so the query will be made only if really necessary.
|
||||
chooseBestEnc :: IO.Handle -> Maybe TextEncoding -> IO EncodingAction
|
||||
chooseBestEnc _ Nothing = pure Keep
|
||||
chooseBestEnc h (Just enc) = case textEncodingName enc of
|
||||
chooseBestEnc
|
||||
:: IO.Handle -- ^ Handle to choose encoding for
|
||||
-> (IO.Handle -> IO Bool) -- ^ @hIsTerminalDevice@
|
||||
-> Maybe TextEncoding -- ^ Current encoding.
|
||||
-> IO EncodingAction
|
||||
chooseBestEnc _ _ Nothing = pure Keep
|
||||
chooseBestEnc h hIsTerm (Just enc) = case textEncodingName enc of
|
||||
"UTF-8" -> pure Keep
|
||||
name
|
||||
-- XXX: The first branch is actually never used, because the encoding
|
||||
-- loses the @//TRANSLIT@ suffix after it is being created.
|
||||
-- TODO: Find a way to detect that the encoding is already trasliterating.
|
||||
| "//TRANSLIT" `isSuffixOf` name -> pure Keep
|
||||
| otherwise -> IO.hIsTerminalDevice h >>= \case
|
||||
| otherwise -> hIsTerm h >>= \case
|
||||
False -> pure $ ChangeFromTo enc (textEncodingName utf8)
|
||||
True -> pure $ ChangeFromTo enc (name ++ "//TRANSLIT")
|
||||
|
@ -8,14 +8,13 @@ module Main
|
||||
) where
|
||||
|
||||
import GHC.IO.Encoding (mkTextEncoding, setLocaleEncoding)
|
||||
import Main.Utf8 (withUtf8)
|
||||
import Test.Tasty (defaultMain)
|
||||
|
||||
import System.IO.Utf8 (withUtf8StdHandles)
|
||||
|
||||
import Tree (tests)
|
||||
|
||||
main :: IO ()
|
||||
main = withUtf8StdHandles $ do
|
||||
main = withUtf8 $ do
|
||||
-- The issue only shows up when current locale encoding is ASCII.
|
||||
-- Realistically, very often when running this test this will not be
|
||||
-- the case, so we unset locale encoding manually.
|
||||
|
@ -13,6 +13,7 @@ import qualified System.IO as IO
|
||||
import Test.HUnit (Assertion, assertFailure)
|
||||
import Test.Tasty.HUnit ((@=?))
|
||||
|
||||
import System.IO (hIsTerminalDevice)
|
||||
import System.IO.Utf8.Internal (EncodingAction (..), chooseBestEnc, chooseBestEncPure)
|
||||
|
||||
import qualified System.IO.Utf8 as Utf8
|
||||
@ -27,7 +28,7 @@ verifyOn h = do
|
||||
enc <- IO.hGetEncoding h
|
||||
|
||||
let pureResult = chooseBestEncPure isTerm (textEncodingName <$> enc)
|
||||
realResult <- chooseBestEnc h enc
|
||||
realResult <- chooseBestEnc h hIsTerminalDevice enc
|
||||
|
||||
case (pureResult, realResult) of
|
||||
(Nothing, Keep) -> pure ()
|
||||
@ -98,7 +99,7 @@ unit_term_idempotent :: Assertion
|
||||
unit_term_idempotent = withTerminalIn char8 $ \h -> do
|
||||
Just enc <- IO.hGetEncoding h
|
||||
"ISO-8859-1" @=? textEncodingName enc -- sanity check
|
||||
Utf8.hWithEncoding h $ do
|
||||
Utf8.withHandle h $ do
|
||||
-- XXX: Actually not true, there is no suffix in the name
|
||||
-- Just enc' <- IO.hGetEncoding h
|
||||
-- "ISO-8859-1//TRANSLIT" @=? textEncodingName enc' -- sanity check
|
||||
|
@ -15,6 +15,7 @@ import Control.Exception.Safe (MonadMask, try)
|
||||
import Control.Monad ((>=>))
|
||||
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||
import Data.Text (Text)
|
||||
import GHC.IO.Encoding (utf8)
|
||||
import System.IO (FilePath)
|
||||
import System.IO.Temp (withSystemTempFile)
|
||||
|
||||
@ -33,9 +34,7 @@ import qualified Hedgehog.Range as R
|
||||
-- | Helper that writes Text to a temp file.
|
||||
withTestFile :: (MonadIO m, MonadMask m) => Text -> (FilePath -> m r) -> m r
|
||||
withTestFile t act = withSystemTempFile "utf8.txt" $ \fp h -> do
|
||||
Utf8.hSetEncoding h
|
||||
liftIO $ T.hPutStr h t
|
||||
liftIO $ IO.hClose h
|
||||
liftIO $ IO.hSetEncoding h utf8 *> T.hPutStr h t *> IO.hClose h
|
||||
act fp
|
||||
|
||||
|
||||
|
@ -12,6 +12,7 @@ import qualified System.IO as IO
|
||||
import Test.HUnit (Assertion)
|
||||
import Test.Tasty.HUnit ((@=?))
|
||||
|
||||
import System.IO (hIsTerminalDevice)
|
||||
import System.IO.Utf8.Internal (EncodingAction (..), chooseBestEnc)
|
||||
|
||||
import qualified System.IO.Utf8 as Utf8
|
||||
@ -23,9 +24,9 @@ import Test.Util (withTmpFileIn)
|
||||
verifyOn :: IO.Handle -> Assertion
|
||||
verifyOn h = do
|
||||
menc <- IO.hGetEncoding h
|
||||
act <- chooseBestEnc h menc
|
||||
act <- chooseBestEnc h hIsTerminalDevice menc
|
||||
|
||||
Utf8.hWithEncoding h $ do
|
||||
Utf8.withHandle h $ do
|
||||
menc' <- IO.hGetEncoding h
|
||||
case act of
|
||||
Keep ->
|
||||
|
Loading…
Reference in New Issue
Block a user