diff --git a/CHANGELOG.md b/CHANGELOG.md index dbe664e..704e253 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,9 +12,12 @@ Initial release. ### Added -- `withUtf8StdHandles` -- `hSetEncoding` -- `hWithEncoding` +- `withUtf8` +- `withStdTerminalHandles` +- `setHandleEncoding` +- `withHandle` +- `setTerminalHandleEncoding` +- `withTerminalHandle` - `openFile` - `withFile` - `readFile` diff --git a/lib/Data/Text/IO/Utf8.hs b/lib/Data/Text/IO/Utf8.hs index 4099bd4..626f8ce 100644 --- a/lib/Data/Text/IO/Utf8.hs +++ b/lib/Data/Text/IO/Utf8.hs @@ -24,12 +24,12 @@ import qualified System.IO as IO import qualified System.IO.Utf8 as Utf8 --- | Like @readFile@, but assumes the file is encoded in UTF-8, regardless +-- | Like 'T.readFile', but assumes the file is encoded in UTF-8, regardless -- of the current locale. readFile :: MonadIO m => IO.FilePath -> m Text readFile path = Utf8.openFile path IO.ReadMode >>= liftIO . T.hGetContents --- | Like @writeFile@, but encodes the data in UTF-8, regardless +-- | Like 'T.writeFile', but encodes the data in UTF-8, regardless -- of the current locale. writeFile :: (MonadIO m, MonadMask m) => IO.FilePath -> Text -> m () writeFile path = Utf8.withFile path IO.WriteMode . (liftIO .) . flip T.hPutStr diff --git a/lib/Main/Utf8.hs b/lib/Main/Utf8.hs new file mode 100644 index 0000000..2ceec02 --- /dev/null +++ b/lib/Main/Utf8.hs @@ -0,0 +1,82 @@ +{- SPDX-FileCopyrightText: 2020 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +----------------------------------------------------------------------------- +-- | +-- +-- Functions in this module will help you make your /executable/ work +-- correctly with encodings of text files and standard handles. +-- +-- /Note: if you are developing a library, see "System.IO.Utf8"./ +-- +-- = Quick start +-- +-- Wrap a call to 'withUtf8' around your @main@: +-- +-- @ +-- import Main.Utf8 (withUtf8) +-- +-- main :: IO () +-- main = 'withUtf8' $ do +-- putStrLn "Hello, мир!" +-- @ +-- +-- Basically, this is all you have to do for a program that uses +-- @stdin@ and @stdout@ to interact with the user. However, some +-- programs read input from and write output to files and, +-- at the same time, allow the user to redirect @stdin@ and @stdout@ +-- instead of providing explicit file names. +-- +-- If this is the case for your executable, you should also wrap +-- @Utf8.@'System.IO.Utf8.withHandle' around the code that passes +-- the handle to a third-party library. It is not necessary to do +-- when passing it to your own library, assuming that it follows +-- the recommendations from the documentation of "System.IO.Utf8". +module Main.Utf8 + ( withUtf8 + , withStdTerminalHandles + ) where + +import Control.Exception.Safe (MonadMask, bracket) +import Control.Monad.IO.Class (MonadIO, liftIO) +import GHC.IO.Encoding (getLocaleEncoding, setLocaleEncoding, utf8) +import System.IO (stderr, stdin, stdout) + +import System.IO.Utf8 (withTerminalHandle) + + +-- | Make standard handles safe to write anything to them and change +-- program-global default file handle encoding to UTF-8. +-- +-- This function will: +-- +-- 1. Adjust the encoding of 'stdin', 'stdout', and 'stderr' to +-- enable transliteration, like 'withStdTerminalHandles' does. +-- 2. Call 'setLocaleEncoding' to change the program-global locale +-- encoding to UTF-8. +-- 3. Undo everything when the wrapped action finishes. +withUtf8 :: (MonadIO m, MonadMask m) => m r -> m r +withUtf8 act = withStdTerminalHandles $ + bracket + (liftIO $ getLocaleEncoding <* setLocaleEncoding utf8) + (liftIO . setLocaleEncoding) + (\_ -> act) + +-- | Make standard handles safe to write anything to them. +-- +-- This function will for each of 'stdin', 'stdout', 'stderr' do: +-- +-- 1. Tweak the existing encoding so that unrepresentable characters +-- will get approximated (aka transliterated) by visually similar +-- ones or question marks. +-- 2. Restore the original encoding when the wrapped action finishes. +-- +-- Use this function only if you do not want to change the program-global +-- locale encoding. Otherwise prefer 'withUtf8'. +withStdTerminalHandles :: (MonadIO m, MonadMask m) => m r -> m r +withStdTerminalHandles + = withTerminalHandle stdin + . withTerminalHandle stdout + . withTerminalHandle stderr diff --git a/lib/System/IO/Utf8.hs b/lib/System/IO/Utf8.hs index cc0331f..ea6c1f5 100644 --- a/lib/System/IO/Utf8.hs +++ b/lib/System/IO/Utf8.hs @@ -5,12 +5,13 @@ {-# LANGUAGE LambdaCase #-} --- | System IO for the modern world. +----------------------------------------------------------------------------- +-- | -- -- Standard IO functions assume that the character encoding of the data -- they read or write is the same as the one used by current locale. In many -- situtations this assumption is wrong, as tools work with files, and --- the files nowadays are mostly UTF-8 encoded, regardless of the locale. +-- files nowadays are mostly UTF-8 encoded, regardless of the locale. -- Therefore, it is almost always a good idea to switch the encoding of -- file handles to UTF-8. -- @@ -18,12 +19,58 @@ -- there is an edge-case: if they are attached to a terminal, and the -- encoding is not UTF-8, using UTF-8 might actually be unsafe. -- --- Functions in this module help deal with all these issues. +-- If you are developing an executable, in most cases, it is enough to +-- configure the environment accordingly on program start, see the +-- "Main.Utf8" for functions that help with this. +-- However, if you are a library author, you should avoid modifying the +-- global environment. +-- +-- = Quick start +-- +-- == Opening new files +-- +-- If you need to open a text file, use @Utf8.@'withFile' +-- (or @Utf8.@'openFile'). These will not only open the file, but also +-- set the handle’s encoding to UTF-8, regardless of the user’s locale. +-- +-- == Working with existing handles +-- +-- Suppose you are creating a function which produces some text and writes +-- it to a file handle that is passed to it from the outside. +-- Ask yourself this question: do I want to encode this text in UTF-8 +-- or using the encoding from the user’s locale? +-- +-- In many cases this question is easy to answer. For example, if your +-- function produces Haskell code, then you always want it in UTF-8, +-- because that is what all other tools (including GHC) expect. +-- +-- In some cases it is not that clear. What you can do then is consider +-- what the user is going to do with the data produced. +-- If it is, primarily, meant to be displayed on their screen and then +-- forgotten, you don’t need UTF-8. On the other hand, if it is meant +-- to be saved somewhere and then used or edited by other tools, then +-- you need UTF-8. +-- +-- If you decided that your function needs to try to switch the handle +-- to UTF-8, it is very easy to achieve: +-- +-- @ +-- import qualified System.IO.Utf8 as Utf8 +-- +-- writeData :: 'IO.Handle' -> InputDataType -> IO () +-- writeData hOut inData = Utf8.'withHandle' hOut $ do +-- {- ... write the data ... -} +-- @ +-- +-- If you decided that you don’t need to try to switch it to UTF-8, +-- replace @withHandle@ with 'withTerminalHandle' to only make the +-- handle safe to write to without runtime errors. module System.IO.Utf8 - ( withUtf8StdHandles + ( withHandle + , withTerminalHandle - , hSetEncoding - , hWithEncoding + , setHandleEncoding + , setTerminalHandleEncoding , openFile , withFile @@ -32,8 +79,7 @@ module System.IO.Utf8 import Control.Exception.Safe (MonadMask, bracket) import Control.Monad.IO.Class (MonadIO, liftIO) import Data.Functor (void) -import GHC.IO.Encoding (mkTextEncoding) -import System.IO (stderr, stdin, stdout) +import GHC.IO.Encoding (mkTextEncoding, utf8) import qualified System.IO as IO @@ -49,60 +95,74 @@ type EncRestoreAction m = IO.Handle -> m () -- If the handle is not attached to a terminal, sets UTF-8. -- Otherwise, keeps its current encoding, but augments it to transliterate -- unsupported characters. -hSetBestUtf8Enc :: MonadIO m => IO.Handle -> m (EncRestoreAction m) -hSetBestUtf8Enc h = liftIO $ do - IO.hGetEncoding h >>= chooseBestEnc h >>= \case +hSetBestUtf8Enc + :: MonadIO m + => (IO.Handle -> IO Bool) + -> IO.Handle + -> m (EncRestoreAction m) +hSetBestUtf8Enc hIsTerm h = liftIO $ do + IO.hGetEncoding h >>= chooseBestEnc h hIsTerm >>= \case Keep -> pure (\_ -> pure ()) ChangeFromTo enc newName -> do mkTextEncoding newName >>= IO.hSetEncoding h pure $ liftIO . flip IO.hSetEncoding enc --- | Configures the encodings of the three standard handles (stdin, stdout, stderr) --- to work with UTF-8 encoded data and runs the specified IO action. --- After the action finishes, restores the original encodings. -withUtf8StdHandles :: IO a -> IO a -withUtf8StdHandles action = - hWithEncoding stdin $ - hWithEncoding stdout $ - hWithEncoding stderr $ - action - -- | Set handle encoding to the best possible. -- --- It is safe to call this function on any kind of handle whatsoever. --- --- * If the handle is in binary mode, it will do nothing. --- * If the handle is a terminal, it will use the same encoding, but switch --- it to Unicode approximation mode so it won't throw errors on invalid --- byte sequences, but instead try to approximate unecodable characters --- with visually similar encodable ones. --- * For regular files it will always choose UTF-8, of course. --- --- You probably shouldn't be using this function. If you open the file --- yourself, use 'openFile' (or, even better, 'withFile') instead. --- If you get the handle from somewhere else, use 'hWithEncoding', --- which will restore the previous encoding when you are done. -hSetEncoding :: MonadIO m => IO.Handle -> m () -hSetEncoding = liftIO . void . hSetBestUtf8Enc +-- See 'withHandle' for description and prefer it, if possible. +setHandleEncoding :: MonadIO m => IO.Handle -> m () +setHandleEncoding = liftIO . void . hSetBestUtf8Enc IO.hIsTerminalDevice -- | Temporarily set handle encoding to the best possible. -- --- This is like 'hSetEncoding', but it will restore the encoding --- to the previous one when the action is done. -hWithEncoding :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r -hWithEncoding h = bracket (hSetBestUtf8Enc h) ($ h) . const +-- “Best possible” means UTF-8, unless the handle points to a terminal +-- device, in which case the encoding will be left the same, but tweaked +-- to approximate unencodable characters. +-- +-- This function is safe to call on handles open in binary mode and it will +-- do nothing on them. +-- +-- To sum up: +-- +-- * If the handle is in binary mode, do nothing. +-- * If the handle points to a terminal device, act like 'withTerminalHandle'. +-- * For regular files always choose UTF-8, of course. +withHandle :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r +withHandle h = bracket (hSetBestUtf8Enc IO.hIsTerminalDevice h) ($ h) . const + +-- | Make a handle safe to write any text to. +-- +-- See 'withTerminalHandle' for description and prefer it, if possible. +setTerminalHandleEncoding :: MonadIO m => IO.Handle -> m () +setTerminalHandleEncoding = liftIO . void . hSetBestUtf8Enc (const $ pure True) + +-- | Temporarily make a handle safe to write any text to. +-- +-- If the handle is not using UTF-8, adjust the encoding to remain the same +-- as before, but approximate unencodable characters. When the action is done, +-- restore it back to the previous one. +-- +-- Use this function only if you are sure you want to treat this handle as +-- a terminal (that is, you will be using it to interact with the user +-- and to write user-visible messages, rather than something that can +-- be reasonable expected to go to a file). +-- +-- This function is safe to call on handles open in binary mode and it will +-- do nothing on them. +withTerminalHandle :: (MonadIO m, MonadMask m) => IO.Handle -> m r -> m r +withTerminalHandle h = bracket (hSetBestUtf8Enc (const $ pure True) h) ($ h) . const --- | Like @openFile@, but sets the file encoding to UTF-8, regardless +-- | Like 'System.IO.openFile', but sets the file encoding to UTF-8, regardless -- of the current locale. openFile :: MonadIO m => IO.FilePath -> IO.IOMode -> m IO.Handle -openFile path mode = do - h <- liftIO $ IO.openFile path mode - hSetEncoding h +openFile path mode = liftIO $ do + h <- IO.openFile path mode + IO.hSetEncoding h utf8 pure h --- | Like @withFile@, but sets the file encoding to UTF-8, regardless +-- | Like 'System.IO.withFile', but sets the file encoding to UTF-8, regardless -- of the current locale. withFile :: (MonadIO m, MonadMask m) diff --git a/lib/System/IO/Utf8/Internal.hs b/lib/System/IO/Utf8/Internal.hs index 9243e11..4329c0e 100644 --- a/lib/System/IO/Utf8/Internal.hs +++ b/lib/System/IO/Utf8/Internal.hs @@ -36,7 +36,7 @@ data EncodingAction | ChangeFromTo TextEncoding String -- ^ Change the first encoding to the second. --- | Pure version of 'chooseBestUtf8Enc'. +-- | Pure version of 'chooseBestEnc'. -- -- This function is not actually used in the library. It exists only -- for documentation purposes to demonstrate the logic. @@ -68,15 +68,19 @@ chooseBestEncPure True (Just name) -- (e.g. to be able to restore it), so we avoid repeating the query. -- 2. It first checks for the cases where it doesn't care whether the device -- is a terminal or not, so the query will be made only if really necessary. -chooseBestEnc :: IO.Handle -> Maybe TextEncoding -> IO EncodingAction -chooseBestEnc _ Nothing = pure Keep -chooseBestEnc h (Just enc) = case textEncodingName enc of +chooseBestEnc + :: IO.Handle -- ^ Handle to choose encoding for + -> (IO.Handle -> IO Bool) -- ^ @hIsTerminalDevice@ + -> Maybe TextEncoding -- ^ Current encoding. + -> IO EncodingAction +chooseBestEnc _ _ Nothing = pure Keep +chooseBestEnc h hIsTerm (Just enc) = case textEncodingName enc of "UTF-8" -> pure Keep name -- XXX: The first branch is actually never used, because the encoding -- loses the @//TRANSLIT@ suffix after it is being created. -- TODO: Find a way to detect that the encoding is already trasliterating. | "//TRANSLIT" `isSuffixOf` name -> pure Keep - | otherwise -> IO.hIsTerminalDevice h >>= \case + | otherwise -> hIsTerm h >>= \case False -> pure $ ChangeFromTo enc (textEncodingName utf8) True -> pure $ ChangeFromTo enc (name ++ "//TRANSLIT") diff --git a/test/Test.hs b/test/Test.hs index ca6664a..6176daf 100644 --- a/test/Test.hs +++ b/test/Test.hs @@ -8,14 +8,13 @@ module Main ) where import GHC.IO.Encoding (mkTextEncoding, setLocaleEncoding) +import Main.Utf8 (withUtf8) import Test.Tasty (defaultMain) -import System.IO.Utf8 (withUtf8StdHandles) - import Tree (tests) main :: IO () -main = withUtf8StdHandles $ do +main = withUtf8 $ do -- The issue only shows up when current locale encoding is ASCII. -- Realistically, very often when running this test this will not be -- the case, so we unset locale encoding manually. diff --git a/test/Test/Utf8/Choice.hs b/test/Test/Utf8/Choice.hs index 2136326..927e28f 100644 --- a/test/Test/Utf8/Choice.hs +++ b/test/Test/Utf8/Choice.hs @@ -13,6 +13,7 @@ import qualified System.IO as IO import Test.HUnit (Assertion, assertFailure) import Test.Tasty.HUnit ((@=?)) +import System.IO (hIsTerminalDevice) import System.IO.Utf8.Internal (EncodingAction (..), chooseBestEnc, chooseBestEncPure) import qualified System.IO.Utf8 as Utf8 @@ -27,7 +28,7 @@ verifyOn h = do enc <- IO.hGetEncoding h let pureResult = chooseBestEncPure isTerm (textEncodingName <$> enc) - realResult <- chooseBestEnc h enc + realResult <- chooseBestEnc h hIsTerminalDevice enc case (pureResult, realResult) of (Nothing, Keep) -> pure () @@ -98,7 +99,7 @@ unit_term_idempotent :: Assertion unit_term_idempotent = withTerminalIn char8 $ \h -> do Just enc <- IO.hGetEncoding h "ISO-8859-1" @=? textEncodingName enc -- sanity check - Utf8.hWithEncoding h $ do + Utf8.withHandle h $ do -- XXX: Actually not true, there is no suffix in the name -- Just enc' <- IO.hGetEncoding h -- "ISO-8859-1//TRANSLIT" @=? textEncodingName enc' -- sanity check diff --git a/test/Test/Utf8/ReadWrite.hs b/test/Test/Utf8/ReadWrite.hs index 70124d6..904217c 100644 --- a/test/Test/Utf8/ReadWrite.hs +++ b/test/Test/Utf8/ReadWrite.hs @@ -15,6 +15,7 @@ import Control.Exception.Safe (MonadMask, try) import Control.Monad ((>=>)) import Control.Monad.IO.Class (MonadIO, liftIO) import Data.Text (Text) +import GHC.IO.Encoding (utf8) import System.IO (FilePath) import System.IO.Temp (withSystemTempFile) @@ -33,9 +34,7 @@ import qualified Hedgehog.Range as R -- | Helper that writes Text to a temp file. withTestFile :: (MonadIO m, MonadMask m) => Text -> (FilePath -> m r) -> m r withTestFile t act = withSystemTempFile "utf8.txt" $ \fp h -> do - Utf8.hSetEncoding h - liftIO $ T.hPutStr h t - liftIO $ IO.hClose h + liftIO $ IO.hSetEncoding h utf8 *> T.hPutStr h t *> IO.hClose h act fp diff --git a/test/Test/Utf8/Set.hs b/test/Test/Utf8/Set.hs index 2352215..191d922 100644 --- a/test/Test/Utf8/Set.hs +++ b/test/Test/Utf8/Set.hs @@ -12,6 +12,7 @@ import qualified System.IO as IO import Test.HUnit (Assertion) import Test.Tasty.HUnit ((@=?)) +import System.IO (hIsTerminalDevice) import System.IO.Utf8.Internal (EncodingAction (..), chooseBestEnc) import qualified System.IO.Utf8 as Utf8 @@ -23,9 +24,9 @@ import Test.Util (withTmpFileIn) verifyOn :: IO.Handle -> Assertion verifyOn h = do menc <- IO.hGetEncoding h - act <- chooseBestEnc h menc + act <- chooseBestEnc h hIsTerminalDevice menc - Utf8.hWithEncoding h $ do + Utf8.withHandle h $ do menc' <- IO.hGetEncoding h case act of Keep ->