Allow not changing file handles to UTF-8

The old interface only allowed one to perform terminal detection on a
handle and change it to UTF-8 if it was not a terminal. Turns our, it
makes sense to sometimes leave the original encoding on a handle even if
it does not point to a terminal.

* Change the interface to separate these two use-cases.
* Extract main-wrappers to a new `Main` module.
This commit is contained in:
Kirill Elagin 2020-02-17 17:48:05 -05:00
parent 8a76bb2577
commit 40069d4b2b
9 changed files with 214 additions and 65 deletions

View File

@ -12,9 +12,12 @@ Initial release.
### Added ### Added
- `withUtf8StdHandles` - `withUtf8`
- `hSetEncoding` - `withStdTerminalHandles`
- `hWithEncoding` - `setHandleEncoding`
- `withHandle`
- `setTerminalHandleEncoding`
- `withTerminalHandle`
- `openFile` - `openFile`
- `withFile` - `withFile`
- `readFile` - `readFile`

View File

@ -24,12 +24,12 @@ import qualified System.IO as IO
import qualified System.IO.Utf8 as Utf8 import qualified System.IO.Utf8 as Utf8
-- | Like @readFile@, but assumes the file is encoded in UTF-8, regardless -- | Like 'T.readFile', but assumes the file is encoded in UTF-8, regardless
-- of the current locale. -- of the current locale.
readFile :: MonadIO m => IO.FilePath -> m Text readFile :: MonadIO m => IO.FilePath -> m Text
readFile path = Utf8.openFile path IO.ReadMode >>= liftIO . T.hGetContents readFile path = Utf8.openFile path IO.ReadMode >>= liftIO . T.hGetContents
-- | Like @writeFile@, but encodes the data in UTF-8, regardless -- | Like 'T.writeFile', but encodes the data in UTF-8, regardless
-- of the current locale. -- of the current locale.
writeFile :: (MonadIO m, MonadMask m) => IO.FilePath -> Text -> m () writeFile :: (MonadIO m, MonadMask m) => IO.FilePath -> Text -> m ()
writeFile path = Utf8.withFile path IO.WriteMode . (liftIO .) . flip T.hPutStr writeFile path = Utf8.withFile path IO.WriteMode . (liftIO .) . flip T.hPutStr

82
lib/Main/Utf8.hs Normal file
View File

@ -0,0 +1,82 @@
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
-
- SPDX-License-Identifier: MPL-2.0
-}
-----------------------------------------------------------------------------
-- |
--
-- Functions in this module will help you make your /executable/ work
-- correctly with encodings of text files and standard handles.
--
-- /Note: if you are developing a library, see "System.IO.Utf8"./
--
-- = Quick start
--
-- Wrap a call to 'withUtf8' around your @main@:
--
-- @
-- import Main.Utf8 (withUtf8)
--
-- main :: IO ()
-- main = 'withUtf8' $ do
-- putStrLn "Hello, мир!"
-- @
--
-- Basically, this is all you have to do for a program that uses
-- @stdin@ and @stdout@ to interact with the user. However, some
-- programs read input from and write output to files and,
-- at the same time, allow the user to redirect @stdin@ and @stdout@
-- instead of providing explicit file names.
--
-- If this is the case for your executable, you should also wrap
-- @Utf8.@'System.IO.Utf8.withHandle' around the code that passes
-- the handle to a third-party library. It is not necessary to do
-- when passing it to your own library, assuming that it follows
-- the recommendations from the documentation of "System.IO.Utf8".
module Main.Utf8
( withUtf8
, withStdTerminalHandles
) where
import Control.Exception.Safe (MonadMask, bracket)
import Control.Monad.IO.Class (MonadIO, liftIO)
import GHC.IO.Encoding (getLocaleEncoding, setLocaleEncoding, utf8)
import System.IO (stderr, stdin, stdout)
import System.IO.Utf8 (withTerminalHandle)
-- | Make standard handles safe to write anything to them and change
-- program-global default file handle encoding to UTF-8.
--
-- This function will:
--
-- 1. Adjust the encoding of 'stdin', 'stdout', and 'stderr' to
-- enable transliteration, like 'withStdTerminalHandles' does.
-- 2. Call 'setLocaleEncoding' to change the program-global locale
-- encoding to UTF-8.
-- 3. Undo everything when the wrapped action finishes.
withUtf8 :: (MonadIO m, MonadMask m) => m r -> m r
withUtf8 act = withStdTerminalHandles $
bracket
(liftIO $ getLocaleEncoding <* setLocaleEncoding utf8)
(liftIO . setLocaleEncoding)
(\_ -> act)
-- | Make standard handles safe to write anything to them.
--
-- This function will for each of 'stdin', 'stdout', 'stderr' do:
--
-- 1. Tweak the existing encoding so that unrepresentable characters
-- will get approximated (aka transliterated) by visually similar
-- ones or question marks.
-- 2. Restore the original encoding when the wrapped action finishes.
--
-- Use this function only if you do not want to change the program-global
-- locale encoding. Otherwise prefer 'withUtf8'.
withStdTerminalHandles :: (MonadIO m, MonadMask m) => m r -> m r
withStdTerminalHandles
= withTerminalHandle stdin
. withTerminalHandle stdout
. withTerminalHandle stderr

View File

@ -5,12 +5,13 @@
{-# LANGUAGE LambdaCase #-} {-# LANGUAGE LambdaCase #-}
-- | System IO for the modern world. -----------------------------------------------------------------------------
-- |
-- --
-- Standard IO functions assume that the character encoding of the data -- Standard IO functions assume that the character encoding of the data
-- they read or write is the same as the one used by current locale. In many -- they read or write is the same as the one used by current locale. In many
-- situtations this assumption is wrong, as tools work with files, and -- situtations this assumption is wrong, as tools work with files, and
-- the files nowadays are mostly UTF-8 encoded, regardless of the locale. -- files nowadays are mostly UTF-8 encoded, regardless of the locale.
-- Therefore, it is almost always a good idea to switch the encoding of -- Therefore, it is almost always a good idea to switch the encoding of
-- file handles to UTF-8. -- file handles to UTF-8.
-- --
@ -18,12 +19,58 @@
-- there is an edge-case: if they are attached to a terminal, and the -- there is an edge-case: if they are attached to a terminal, and the
-- encoding is not UTF-8, using UTF-8 might actually be unsafe. -- encoding is not UTF-8, using UTF-8 might actually be unsafe.
-- --
-- Functions in this module help deal with all these issues. -- If you are developing an executable, in most cases, it is enough to
-- configure the environment accordingly on program start, see the
-- "Main.Utf8" for functions that help with this.
-- However, if you are a library author, you should avoid modifying the
-- global environment.
--
-- = Quick start
--
-- == Opening new files
--
-- If you need to open a text file, use @Utf8.@'withFile'
-- (or @Utf8.@'openFile'). These will not only open the file, but also
-- set the handles encoding to UTF-8, regardless of the users locale.
--
-- == Working with existing handles
--
-- Suppose you are creating a function which produces some text and writes
-- it to a file handle that is passed to it from the outside.
-- Ask yourself this question: do I want to encode this text in UTF-8
-- or using the encoding from the users locale?
--
-- In many cases this question is easy to answer. For example, if your
-- function produces Haskell code, then you always want it in UTF-8,
-- because that is what all other tools (including GHC) expect.
--
-- In some cases it is not that clear. What you can do then is consider
-- what the user is going to do with the data produced.
-- If it is, primarily, meant to be displayed on their screen and then
-- forgotten, you dont need UTF-8. On the other hand, if it is meant
-- to be saved somewhere and then used or edited by other tools, then
-- you need UTF-8.
--
-- If you decided that your function needs to try to switch the handle
-- to UTF-8, it is very easy to achieve:
--
-- @
-- import qualified System.IO.Utf8 as Utf8
--
-- writeData :: 'IO.Handle' -> InputDataType -> IO ()
-- writeData hOut inData = Utf8.'withHandle' hOut $ do
-- {- ... write the data ... -}
-- @
--
-- If you decided that you dont need to try to switch it to UTF-8,
-- replace @withHandle@ with 'withTerminalHandle' to only make the
-- handle safe to write to without runtime errors.
module System.IO.Utf8 module System.IO.Utf8
( withUtf8StdHandles ( withHandle
, withTerminalHandle
, hSetEncoding , setHandleEncoding
, hWithEncoding , setTerminalHandleEncoding
, openFile , openFile
, withFile , withFile
@ -32,8 +79,7 @@ module System.IO.Utf8
import Control.Exception.Safe (MonadMask, bracket) import Control.Exception.Safe (MonadMask, bracket)
import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Functor (void) import Data.Functor (void)
import GHC.IO.Encoding (mkTextEncoding) import GHC.IO.Encoding (mkTextEncoding, utf8)
import System.IO (stderr, stdin, stdout)
import qualified System.IO as IO import qualified System.IO as IO
@ -49,60 +95,74 @@ type EncRestoreAction m = IO.Handle -> m ()
-- If the handle is not attached to a terminal, sets UTF-8. -- If the handle is not attached to a terminal, sets UTF-8.
-- Otherwise, keeps its current encoding, but augments it to transliterate -- Otherwise, keeps its current encoding, but augments it to transliterate
-- unsupported characters. -- unsupported characters.
hSetBestUtf8Enc :: MonadIO m => IO.Handle -> m (EncRestoreAction m) hSetBestUtf8Enc
hSetBestUtf8Enc h = liftIO $ do :: MonadIO m
IO.hGetEncoding h >>= chooseBestEnc h >>= \case => (IO.Handle -> IO Bool)
-> IO.Handle
-> m (EncRestoreAction m)
hSetBestUtf8Enc hIsTerm h = liftIO $ do
IO.hGetEncoding h >>= chooseBestEnc h hIsTerm >>= \case
Keep -> pure (\_ -> pure ()) Keep -> pure (\_ -> pure ())
ChangeFromTo enc newName -> do ChangeFromTo enc newName -> do
mkTextEncoding newName >>= IO.hSetEncoding h mkTextEncoding newName >>= IO.hSetEncoding h
pure $ liftIO . flip IO.hSetEncoding enc pure $ liftIO . flip IO.hSetEncoding enc
-- | Configures the encodings of the three standard handles (stdin, stdout, stderr)
-- to work with UTF-8 encoded data and runs the specified IO action.
-- After the action finishes, restores the original encodings.
withUtf8StdHandles :: IO a -> IO a
withUtf8StdHandles action =
hWithEncoding stdin $
hWithEncoding stdout $
hWithEncoding stderr $
action
-- | Set handle encoding to the best possible. -- | Set handle encoding to the best possible.
-- --
-- It is safe to call this function on any kind of handle whatsoever. -- See 'withHandle' for description and prefer it, if possible.
-- setHandleEncoding :: MonadIO m => IO.Handle -> m ()
-- * If the handle is in binary mode, it will do nothing. setHandleEncoding = liftIO . void . hSetBestUtf8Enc IO.hIsTerminalDevice
-- * If the handle is a terminal, it will use the same encoding, but switch
-- it to Unicode approximation mode so it won't throw errors on invalid
-- byte sequences, but instead try to approximate unecodable characters
-- with visually similar encodable ones.
-- * For regular files it will always choose UTF-8, of course.
--
-- You probably shouldn't be using this function. If you open the file
-- yourself, use 'openFile' (or, even better, 'withFile') instead.
-- If you get the handle from somewhere else, use 'hWithEncoding',
-- which will restore the previous encoding when you are done.
hSetEncoding :: MonadIO m => IO.Handle -> m ()
hSetEncoding = liftIO . void . hSetBestUtf8Enc
-- | Temporarily set handle encoding to the best possible. -- | Temporarily set handle encoding to the best possible.
-- --
-- This is like 'hSetEncoding', but it will restore the encoding -- “Best possible” means UTF-8, unless the handle points to a terminal
-- to the previous one when the action is done. -- device, in which case the encoding will be left the same, but tweaked
hWithEncoding :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r -- to approximate unencodable characters.
hWithEncoding h = bracket (hSetBestUtf8Enc h) ($ h) . const --
-- This function is safe to call on handles open in binary mode and it will
-- do nothing on them.
--
-- To sum up:
--
-- * If the handle is in binary mode, do nothing.
-- * If the handle points to a terminal device, act like 'withTerminalHandle'.
-- * For regular files always choose UTF-8, of course.
withHandle :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r
withHandle h = bracket (hSetBestUtf8Enc IO.hIsTerminalDevice h) ($ h) . const
-- | Make a handle safe to write any text to.
--
-- See 'withTerminalHandle' for description and prefer it, if possible.
setTerminalHandleEncoding :: MonadIO m => IO.Handle -> m ()
setTerminalHandleEncoding = liftIO . void . hSetBestUtf8Enc (const $ pure True)
-- | Temporarily make a handle safe to write any text to.
--
-- If the handle is not using UTF-8, adjust the encoding to remain the same
-- as before, but approximate unencodable characters. When the action is done,
-- restore it back to the previous one.
--
-- Use this function only if you are sure you want to treat this handle as
-- a terminal (that is, you will be using it to interact with the user
-- and to write user-visible messages, rather than something that can
-- be reasonable expected to go to a file).
--
-- This function is safe to call on handles open in binary mode and it will
-- do nothing on them.
withTerminalHandle :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r
withTerminalHandle h = bracket (hSetBestUtf8Enc (const $ pure True) h) ($ h) . const
-- | Like @openFile@, but sets the file encoding to UTF-8, regardless -- | Like 'System.IO.openFile', but sets the file encoding to UTF-8, regardless
-- of the current locale. -- of the current locale.
openFile :: MonadIO m => IO.FilePath -> IO.IOMode -> m IO.Handle openFile :: MonadIO m => IO.FilePath -> IO.IOMode -> m IO.Handle
openFile path mode = do openFile path mode = liftIO $ do
h <- liftIO $ IO.openFile path mode h <- IO.openFile path mode
hSetEncoding h IO.hSetEncoding h utf8
pure h pure h
-- | Like @withFile@, but sets the file encoding to UTF-8, regardless -- | Like 'System.IO.withFile', but sets the file encoding to UTF-8, regardless
-- of the current locale. -- of the current locale.
withFile withFile
:: (MonadIO m, MonadMask m) :: (MonadIO m, MonadMask m)

View File

@ -36,7 +36,7 @@ data EncodingAction
| ChangeFromTo TextEncoding String | ChangeFromTo TextEncoding String
-- ^ Change the first encoding to the second. -- ^ Change the first encoding to the second.
-- | Pure version of 'chooseBestUtf8Enc'. -- | Pure version of 'chooseBestEnc'.
-- --
-- This function is not actually used in the library. It exists only -- This function is not actually used in the library. It exists only
-- for documentation purposes to demonstrate the logic. -- for documentation purposes to demonstrate the logic.
@ -68,15 +68,19 @@ chooseBestEncPure True (Just name)
-- (e.g. to be able to restore it), so we avoid repeating the query. -- (e.g. to be able to restore it), so we avoid repeating the query.
-- 2. It first checks for the cases where it doesn't care whether the device -- 2. It first checks for the cases where it doesn't care whether the device
-- is a terminal or not, so the query will be made only if really necessary. -- is a terminal or not, so the query will be made only if really necessary.
chooseBestEnc :: IO.Handle -> Maybe TextEncoding -> IO EncodingAction chooseBestEnc
chooseBestEnc _ Nothing = pure Keep :: IO.Handle -- ^ Handle to choose encoding for
chooseBestEnc h (Just enc) = case textEncodingName enc of -> (IO.Handle -> IO Bool) -- ^ @hIsTerminalDevice@
-> Maybe TextEncoding -- ^ Current encoding.
-> IO EncodingAction
chooseBestEnc _ _ Nothing = pure Keep
chooseBestEnc h hIsTerm (Just enc) = case textEncodingName enc of
"UTF-8" -> pure Keep "UTF-8" -> pure Keep
name name
-- XXX: The first branch is actually never used, because the encoding -- XXX: The first branch is actually never used, because the encoding
-- loses the @//TRANSLIT@ suffix after it is being created. -- loses the @//TRANSLIT@ suffix after it is being created.
-- TODO: Find a way to detect that the encoding is already trasliterating. -- TODO: Find a way to detect that the encoding is already trasliterating.
| "//TRANSLIT" `isSuffixOf` name -> pure Keep | "//TRANSLIT" `isSuffixOf` name -> pure Keep
| otherwise -> IO.hIsTerminalDevice h >>= \case | otherwise -> hIsTerm h >>= \case
False -> pure $ ChangeFromTo enc (textEncodingName utf8) False -> pure $ ChangeFromTo enc (textEncodingName utf8)
True -> pure $ ChangeFromTo enc (name ++ "//TRANSLIT") True -> pure $ ChangeFromTo enc (name ++ "//TRANSLIT")

View File

@ -8,14 +8,13 @@ module Main
) where ) where
import GHC.IO.Encoding (mkTextEncoding, setLocaleEncoding) import GHC.IO.Encoding (mkTextEncoding, setLocaleEncoding)
import Main.Utf8 (withUtf8)
import Test.Tasty (defaultMain) import Test.Tasty (defaultMain)
import System.IO.Utf8 (withUtf8StdHandles)
import Tree (tests) import Tree (tests)
main :: IO () main :: IO ()
main = withUtf8StdHandles $ do main = withUtf8 $ do
-- The issue only shows up when current locale encoding is ASCII. -- The issue only shows up when current locale encoding is ASCII.
-- Realistically, very often when running this test this will not be -- Realistically, very often when running this test this will not be
-- the case, so we unset locale encoding manually. -- the case, so we unset locale encoding manually.

View File

@ -13,6 +13,7 @@ import qualified System.IO as IO
import Test.HUnit (Assertion, assertFailure) import Test.HUnit (Assertion, assertFailure)
import Test.Tasty.HUnit ((@=?)) import Test.Tasty.HUnit ((@=?))
import System.IO (hIsTerminalDevice)
import System.IO.Utf8.Internal (EncodingAction (..), chooseBestEnc, chooseBestEncPure) import System.IO.Utf8.Internal (EncodingAction (..), chooseBestEnc, chooseBestEncPure)
import qualified System.IO.Utf8 as Utf8 import qualified System.IO.Utf8 as Utf8
@ -27,7 +28,7 @@ verifyOn h = do
enc <- IO.hGetEncoding h enc <- IO.hGetEncoding h
let pureResult = chooseBestEncPure isTerm (textEncodingName <$> enc) let pureResult = chooseBestEncPure isTerm (textEncodingName <$> enc)
realResult <- chooseBestEnc h enc realResult <- chooseBestEnc h hIsTerminalDevice enc
case (pureResult, realResult) of case (pureResult, realResult) of
(Nothing, Keep) -> pure () (Nothing, Keep) -> pure ()
@ -98,7 +99,7 @@ unit_term_idempotent :: Assertion
unit_term_idempotent = withTerminalIn char8 $ \h -> do unit_term_idempotent = withTerminalIn char8 $ \h -> do
Just enc <- IO.hGetEncoding h Just enc <- IO.hGetEncoding h
"ISO-8859-1" @=? textEncodingName enc -- sanity check "ISO-8859-1" @=? textEncodingName enc -- sanity check
Utf8.hWithEncoding h $ do Utf8.withHandle h $ do
-- XXX: Actually not true, there is no suffix in the name -- XXX: Actually not true, there is no suffix in the name
-- Just enc' <- IO.hGetEncoding h -- Just enc' <- IO.hGetEncoding h
-- "ISO-8859-1//TRANSLIT" @=? textEncodingName enc' -- sanity check -- "ISO-8859-1//TRANSLIT" @=? textEncodingName enc' -- sanity check

View File

@ -15,6 +15,7 @@ import Control.Exception.Safe (MonadMask, try)
import Control.Monad ((>=>)) import Control.Monad ((>=>))
import Control.Monad.IO.Class (MonadIO, liftIO) import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Text (Text) import Data.Text (Text)
import GHC.IO.Encoding (utf8)
import System.IO (FilePath) import System.IO (FilePath)
import System.IO.Temp (withSystemTempFile) import System.IO.Temp (withSystemTempFile)
@ -33,9 +34,7 @@ import qualified Hedgehog.Range as R
-- | Helper that writes Text to a temp file. -- | Helper that writes Text to a temp file.
withTestFile :: (MonadIO m, MonadMask m) => Text -> (FilePath -> m r) -> m r withTestFile :: (MonadIO m, MonadMask m) => Text -> (FilePath -> m r) -> m r
withTestFile t act = withSystemTempFile "utf8.txt" $ \fp h -> do withTestFile t act = withSystemTempFile "utf8.txt" $ \fp h -> do
Utf8.hSetEncoding h liftIO $ IO.hSetEncoding h utf8 *> T.hPutStr h t *> IO.hClose h
liftIO $ T.hPutStr h t
liftIO $ IO.hClose h
act fp act fp

View File

@ -12,6 +12,7 @@ import qualified System.IO as IO
import Test.HUnit (Assertion) import Test.HUnit (Assertion)
import Test.Tasty.HUnit ((@=?)) import Test.Tasty.HUnit ((@=?))
import System.IO (hIsTerminalDevice)
import System.IO.Utf8.Internal (EncodingAction (..), chooseBestEnc) import System.IO.Utf8.Internal (EncodingAction (..), chooseBestEnc)
import qualified System.IO.Utf8 as Utf8 import qualified System.IO.Utf8 as Utf8
@ -23,9 +24,9 @@ import Test.Util (withTmpFileIn)
verifyOn :: IO.Handle -> Assertion verifyOn :: IO.Handle -> Assertion
verifyOn h = do verifyOn h = do
menc <- IO.hGetEncoding h menc <- IO.hGetEncoding h
act <- chooseBestEnc h menc act <- chooseBestEnc h hIsTerminalDevice menc
Utf8.hWithEncoding h $ do Utf8.withHandle h $ do
menc' <- IO.hGetEncoding h menc' <- IO.hGetEncoding h
case act of case act of
Keep -> Keep ->