Allow not changing file handles to UTF-8

The old interface only allowed one to perform terminal detection on a
handle and change it to UTF-8 if it was not a terminal. Turns our, it
makes sense to sometimes leave the original encoding on a handle even if
it does not point to a terminal.

* Change the interface to separate these two use-cases.
* Extract main-wrappers to a new `Main` module.
This commit is contained in:
Kirill Elagin 2020-02-17 17:48:05 -05:00
parent 8a76bb2577
commit 40069d4b2b
9 changed files with 214 additions and 65 deletions

View File

@ -12,9 +12,12 @@ Initial release.
### Added
- `withUtf8StdHandles`
- `hSetEncoding`
- `hWithEncoding`
- `withUtf8`
- `withStdTerminalHandles`
- `setHandleEncoding`
- `withHandle`
- `setTerminalHandleEncoding`
- `withTerminalHandle`
- `openFile`
- `withFile`
- `readFile`

View File

@ -24,12 +24,12 @@ import qualified System.IO as IO
import qualified System.IO.Utf8 as Utf8
-- | Like @readFile@, but assumes the file is encoded in UTF-8, regardless
-- | Like 'T.readFile', but assumes the file is encoded in UTF-8, regardless
-- of the current locale.
readFile :: MonadIO m => IO.FilePath -> m Text
readFile path = Utf8.openFile path IO.ReadMode >>= liftIO . T.hGetContents
-- | Like @writeFile@, but encodes the data in UTF-8, regardless
-- | Like 'T.writeFile', but encodes the data in UTF-8, regardless
-- of the current locale.
writeFile :: (MonadIO m, MonadMask m) => IO.FilePath -> Text -> m ()
writeFile path = Utf8.withFile path IO.WriteMode . (liftIO .) . flip T.hPutStr

82
lib/Main/Utf8.hs Normal file
View File

@ -0,0 +1,82 @@
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
-
- SPDX-License-Identifier: MPL-2.0
-}
-----------------------------------------------------------------------------
-- |
--
-- Functions in this module will help you make your /executable/ work
-- correctly with encodings of text files and standard handles.
--
-- /Note: if you are developing a library, see "System.IO.Utf8"./
--
-- = Quick start
--
-- Wrap a call to 'withUtf8' around your @main@:
--
-- @
-- import Main.Utf8 (withUtf8)
--
-- main :: IO ()
-- main = 'withUtf8' $ do
-- putStrLn "Hello, мир!"
-- @
--
-- Basically, this is all you have to do for a program that uses
-- @stdin@ and @stdout@ to interact with the user. However, some
-- programs read input from and write output to files and,
-- at the same time, allow the user to redirect @stdin@ and @stdout@
-- instead of providing explicit file names.
--
-- If this is the case for your executable, you should also wrap
-- @Utf8.@'System.IO.Utf8.withHandle' around the code that passes
-- the handle to a third-party library. It is not necessary to do
-- when passing it to your own library, assuming that it follows
-- the recommendations from the documentation of "System.IO.Utf8".
module Main.Utf8
( withUtf8
, withStdTerminalHandles
) where
import Control.Exception.Safe (MonadMask, bracket)
import Control.Monad.IO.Class (MonadIO, liftIO)
import GHC.IO.Encoding (getLocaleEncoding, setLocaleEncoding, utf8)
import System.IO (stderr, stdin, stdout)
import System.IO.Utf8 (withTerminalHandle)
-- | Make standard handles safe to write anything to them and change
-- program-global default file handle encoding to UTF-8.
--
-- This function will:
--
-- 1. Adjust the encoding of 'stdin', 'stdout', and 'stderr' to
-- enable transliteration, like 'withStdTerminalHandles' does.
-- 2. Call 'setLocaleEncoding' to change the program-global locale
-- encoding to UTF-8.
-- 3. Undo everything when the wrapped action finishes.
withUtf8 :: (MonadIO m, MonadMask m) => m r -> m r
withUtf8 act = withStdTerminalHandles $
bracket
(liftIO $ getLocaleEncoding <* setLocaleEncoding utf8)
(liftIO . setLocaleEncoding)
(\_ -> act)
-- | Make standard handles safe to write anything to them.
--
-- This function will for each of 'stdin', 'stdout', 'stderr' do:
--
-- 1. Tweak the existing encoding so that unrepresentable characters
-- will get approximated (aka transliterated) by visually similar
-- ones or question marks.
-- 2. Restore the original encoding when the wrapped action finishes.
--
-- Use this function only if you do not want to change the program-global
-- locale encoding. Otherwise prefer 'withUtf8'.
withStdTerminalHandles :: (MonadIO m, MonadMask m) => m r -> m r
withStdTerminalHandles
= withTerminalHandle stdin
. withTerminalHandle stdout
. withTerminalHandle stderr

View File

@ -5,12 +5,13 @@
{-# LANGUAGE LambdaCase #-}
-- | System IO for the modern world.
-----------------------------------------------------------------------------
-- |
--
-- Standard IO functions assume that the character encoding of the data
-- they read or write is the same as the one used by current locale. In many
-- situtations this assumption is wrong, as tools work with files, and
-- the files nowadays are mostly UTF-8 encoded, regardless of the locale.
-- files nowadays are mostly UTF-8 encoded, regardless of the locale.
-- Therefore, it is almost always a good idea to switch the encoding of
-- file handles to UTF-8.
--
@ -18,12 +19,58 @@
-- there is an edge-case: if they are attached to a terminal, and the
-- encoding is not UTF-8, using UTF-8 might actually be unsafe.
--
-- Functions in this module help deal with all these issues.
-- If you are developing an executable, in most cases, it is enough to
-- configure the environment accordingly on program start, see the
-- "Main.Utf8" for functions that help with this.
-- However, if you are a library author, you should avoid modifying the
-- global environment.
--
-- = Quick start
--
-- == Opening new files
--
-- If you need to open a text file, use @Utf8.@'withFile'
-- (or @Utf8.@'openFile'). These will not only open the file, but also
-- set the handles encoding to UTF-8, regardless of the users locale.
--
-- == Working with existing handles
--
-- Suppose you are creating a function which produces some text and writes
-- it to a file handle that is passed to it from the outside.
-- Ask yourself this question: do I want to encode this text in UTF-8
-- or using the encoding from the users locale?
--
-- In many cases this question is easy to answer. For example, if your
-- function produces Haskell code, then you always want it in UTF-8,
-- because that is what all other tools (including GHC) expect.
--
-- In some cases it is not that clear. What you can do then is consider
-- what the user is going to do with the data produced.
-- If it is, primarily, meant to be displayed on their screen and then
-- forgotten, you dont need UTF-8. On the other hand, if it is meant
-- to be saved somewhere and then used or edited by other tools, then
-- you need UTF-8.
--
-- If you decided that your function needs to try to switch the handle
-- to UTF-8, it is very easy to achieve:
--
-- @
-- import qualified System.IO.Utf8 as Utf8
--
-- writeData :: 'IO.Handle' -> InputDataType -> IO ()
-- writeData hOut inData = Utf8.'withHandle' hOut $ do
-- {- ... write the data ... -}
-- @
--
-- If you decided that you dont need to try to switch it to UTF-8,
-- replace @withHandle@ with 'withTerminalHandle' to only make the
-- handle safe to write to without runtime errors.
module System.IO.Utf8
( withUtf8StdHandles
( withHandle
, withTerminalHandle
, hSetEncoding
, hWithEncoding
, setHandleEncoding
, setTerminalHandleEncoding
, openFile
, withFile
@ -32,8 +79,7 @@ module System.IO.Utf8
import Control.Exception.Safe (MonadMask, bracket)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Functor (void)
import GHC.IO.Encoding (mkTextEncoding)
import System.IO (stderr, stdin, stdout)
import GHC.IO.Encoding (mkTextEncoding, utf8)
import qualified System.IO as IO
@ -49,60 +95,74 @@ type EncRestoreAction m = IO.Handle -> m ()
-- If the handle is not attached to a terminal, sets UTF-8.
-- Otherwise, keeps its current encoding, but augments it to transliterate
-- unsupported characters.
hSetBestUtf8Enc :: MonadIO m => IO.Handle -> m (EncRestoreAction m)
hSetBestUtf8Enc h = liftIO $ do
IO.hGetEncoding h >>= chooseBestEnc h >>= \case
hSetBestUtf8Enc
:: MonadIO m
=> (IO.Handle -> IO Bool)
-> IO.Handle
-> m (EncRestoreAction m)
hSetBestUtf8Enc hIsTerm h = liftIO $ do
IO.hGetEncoding h >>= chooseBestEnc h hIsTerm >>= \case
Keep -> pure (\_ -> pure ())
ChangeFromTo enc newName -> do
mkTextEncoding newName >>= IO.hSetEncoding h
pure $ liftIO . flip IO.hSetEncoding enc
-- | Configures the encodings of the three standard handles (stdin, stdout, stderr)
-- to work with UTF-8 encoded data and runs the specified IO action.
-- After the action finishes, restores the original encodings.
withUtf8StdHandles :: IO a -> IO a
withUtf8StdHandles action =
hWithEncoding stdin $
hWithEncoding stdout $
hWithEncoding stderr $
action
-- | Set handle encoding to the best possible.
--
-- It is safe to call this function on any kind of handle whatsoever.
--
-- * If the handle is in binary mode, it will do nothing.
-- * If the handle is a terminal, it will use the same encoding, but switch
-- it to Unicode approximation mode so it won't throw errors on invalid
-- byte sequences, but instead try to approximate unecodable characters
-- with visually similar encodable ones.
-- * For regular files it will always choose UTF-8, of course.
--
-- You probably shouldn't be using this function. If you open the file
-- yourself, use 'openFile' (or, even better, 'withFile') instead.
-- If you get the handle from somewhere else, use 'hWithEncoding',
-- which will restore the previous encoding when you are done.
hSetEncoding :: MonadIO m => IO.Handle -> m ()
hSetEncoding = liftIO . void . hSetBestUtf8Enc
-- See 'withHandle' for description and prefer it, if possible.
setHandleEncoding :: MonadIO m => IO.Handle -> m ()
setHandleEncoding = liftIO . void . hSetBestUtf8Enc IO.hIsTerminalDevice
-- | Temporarily set handle encoding to the best possible.
--
-- This is like 'hSetEncoding', but it will restore the encoding
-- to the previous one when the action is done.
hWithEncoding :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r
hWithEncoding h = bracket (hSetBestUtf8Enc h) ($ h) . const
-- “Best possible” means UTF-8, unless the handle points to a terminal
-- device, in which case the encoding will be left the same, but tweaked
-- to approximate unencodable characters.
--
-- This function is safe to call on handles open in binary mode and it will
-- do nothing on them.
--
-- To sum up:
--
-- * If the handle is in binary mode, do nothing.
-- * If the handle points to a terminal device, act like 'withTerminalHandle'.
-- * For regular files always choose UTF-8, of course.
withHandle :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r
withHandle h = bracket (hSetBestUtf8Enc IO.hIsTerminalDevice h) ($ h) . const
-- | Make a handle safe to write any text to.
--
-- See 'withTerminalHandle' for description and prefer it, if possible.
setTerminalHandleEncoding :: MonadIO m => IO.Handle -> m ()
setTerminalHandleEncoding = liftIO . void . hSetBestUtf8Enc (const $ pure True)
-- | Temporarily make a handle safe to write any text to.
--
-- If the handle is not using UTF-8, adjust the encoding to remain the same
-- as before, but approximate unencodable characters. When the action is done,
-- restore it back to the previous one.
--
-- Use this function only if you are sure you want to treat this handle as
-- a terminal (that is, you will be using it to interact with the user
-- and to write user-visible messages, rather than something that can
-- be reasonable expected to go to a file).
--
-- This function is safe to call on handles open in binary mode and it will
-- do nothing on them.
withTerminalHandle :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r
withTerminalHandle h = bracket (hSetBestUtf8Enc (const $ pure True) h) ($ h) . const
-- | Like @openFile@, but sets the file encoding to UTF-8, regardless
-- | Like 'System.IO.openFile', but sets the file encoding to UTF-8, regardless
-- of the current locale.
openFile :: MonadIO m => IO.FilePath -> IO.IOMode -> m IO.Handle
openFile path mode = do
h <- liftIO $ IO.openFile path mode
hSetEncoding h
openFile path mode = liftIO $ do
h <- IO.openFile path mode
IO.hSetEncoding h utf8
pure h
-- | Like @withFile@, but sets the file encoding to UTF-8, regardless
-- | Like 'System.IO.withFile', but sets the file encoding to UTF-8, regardless
-- of the current locale.
withFile
:: (MonadIO m, MonadMask m)

View File

@ -36,7 +36,7 @@ data EncodingAction
| ChangeFromTo TextEncoding String
-- ^ Change the first encoding to the second.
-- | Pure version of 'chooseBestUtf8Enc'.
-- | Pure version of 'chooseBestEnc'.
--
-- This function is not actually used in the library. It exists only
-- for documentation purposes to demonstrate the logic.
@ -68,15 +68,19 @@ chooseBestEncPure True (Just name)
-- (e.g. to be able to restore it), so we avoid repeating the query.
-- 2. It first checks for the cases where it doesn't care whether the device
-- is a terminal or not, so the query will be made only if really necessary.
chooseBestEnc :: IO.Handle -> Maybe TextEncoding -> IO EncodingAction
chooseBestEnc _ Nothing = pure Keep
chooseBestEnc h (Just enc) = case textEncodingName enc of
chooseBestEnc
:: IO.Handle -- ^ Handle to choose encoding for
-> (IO.Handle -> IO Bool) -- ^ @hIsTerminalDevice@
-> Maybe TextEncoding -- ^ Current encoding.
-> IO EncodingAction
chooseBestEnc _ _ Nothing = pure Keep
chooseBestEnc h hIsTerm (Just enc) = case textEncodingName enc of
"UTF-8" -> pure Keep
name
-- XXX: The first branch is actually never used, because the encoding
-- loses the @//TRANSLIT@ suffix after it is being created.
-- TODO: Find a way to detect that the encoding is already trasliterating.
| "//TRANSLIT" `isSuffixOf` name -> pure Keep
| otherwise -> IO.hIsTerminalDevice h >>= \case
| otherwise -> hIsTerm h >>= \case
False -> pure $ ChangeFromTo enc (textEncodingName utf8)
True -> pure $ ChangeFromTo enc (name ++ "//TRANSLIT")

View File

@ -8,14 +8,13 @@ module Main
) where
import GHC.IO.Encoding (mkTextEncoding, setLocaleEncoding)
import Main.Utf8 (withUtf8)
import Test.Tasty (defaultMain)
import System.IO.Utf8 (withUtf8StdHandles)
import Tree (tests)
main :: IO ()
main = withUtf8StdHandles $ do
main = withUtf8 $ do
-- The issue only shows up when current locale encoding is ASCII.
-- Realistically, very often when running this test this will not be
-- the case, so we unset locale encoding manually.

View File

@ -13,6 +13,7 @@ import qualified System.IO as IO
import Test.HUnit (Assertion, assertFailure)
import Test.Tasty.HUnit ((@=?))
import System.IO (hIsTerminalDevice)
import System.IO.Utf8.Internal (EncodingAction (..), chooseBestEnc, chooseBestEncPure)
import qualified System.IO.Utf8 as Utf8
@ -27,7 +28,7 @@ verifyOn h = do
enc <- IO.hGetEncoding h
let pureResult = chooseBestEncPure isTerm (textEncodingName <$> enc)
realResult <- chooseBestEnc h enc
realResult <- chooseBestEnc h hIsTerminalDevice enc
case (pureResult, realResult) of
(Nothing, Keep) -> pure ()
@ -98,7 +99,7 @@ unit_term_idempotent :: Assertion
unit_term_idempotent = withTerminalIn char8 $ \h -> do
Just enc <- IO.hGetEncoding h
"ISO-8859-1" @=? textEncodingName enc -- sanity check
Utf8.hWithEncoding h $ do
Utf8.withHandle h $ do
-- XXX: Actually not true, there is no suffix in the name
-- Just enc' <- IO.hGetEncoding h
-- "ISO-8859-1//TRANSLIT" @=? textEncodingName enc' -- sanity check

View File

@ -15,6 +15,7 @@ import Control.Exception.Safe (MonadMask, try)
import Control.Monad ((>=>))
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Text (Text)
import GHC.IO.Encoding (utf8)
import System.IO (FilePath)
import System.IO.Temp (withSystemTempFile)
@ -33,9 +34,7 @@ import qualified Hedgehog.Range as R
-- | Helper that writes Text to a temp file.
withTestFile :: (MonadIO m, MonadMask m) => Text -> (FilePath -> m r) -> m r
withTestFile t act = withSystemTempFile "utf8.txt" $ \fp h -> do
Utf8.hSetEncoding h
liftIO $ T.hPutStr h t
liftIO $ IO.hClose h
liftIO $ IO.hSetEncoding h utf8 *> T.hPutStr h t *> IO.hClose h
act fp

View File

@ -12,6 +12,7 @@ import qualified System.IO as IO
import Test.HUnit (Assertion)
import Test.Tasty.HUnit ((@=?))
import System.IO (hIsTerminalDevice)
import System.IO.Utf8.Internal (EncodingAction (..), chooseBestEnc)
import qualified System.IO.Utf8 as Utf8
@ -23,9 +24,9 @@ import Test.Util (withTmpFileIn)
verifyOn :: IO.Handle -> Assertion
verifyOn h = do
menc <- IO.hGetEncoding h
act <- chooseBestEnc h menc
act <- chooseBestEnc h hIsTerminalDevice menc
Utf8.hWithEncoding h $ do
Utf8.withHandle h $ do
menc' <- IO.hGetEncoding h
case act of
Keep ->