First version

This commit is contained in:
Kirill Elagin 2020-02-15 19:15:52 -05:00
parent b7b3ef4bc5
commit 08407bbe4d
10 changed files with 400 additions and 0 deletions

View File

@ -7,3 +7,13 @@ SPDX-License-Identifier: MPL-2.0
# Changelog
## Unreleased
Initial release.
### Added
- `withUtf8StdHandles`
- `openFile`
- `withFile`
- `readFile`
- `writeFile`

View File

@ -22,6 +22,66 @@ exists and an explanation of some of the opinionated decisions it is based on.
## Use
### Step 1: Get it
The library is on [Hackage][hackage:utf8], just add it to the dependencies of
your project.
[hackage:utf8]: https://hackage.haskell.org/package/utf8
### Step 2: Wrap your `main`
Import `withUtf8StdHandles` from `System.IO.Utf8` and wrap it around your `main`:
```haskell
main :: IO ()
main = withUtf8StdHandles $ {- ... your main function ... -}
```
This will make sure that if your program reads something from `stdin` or
outputs something to `stdout`/`stderr`, it will not fail with a runtime
error due to encoding issues.
### Step 3: Read files using UTF-8
If you are going to read a text file (to be precise, if you are going to open
a file in text mode), youll probably use `withFile`, `openFile`, or `readFile`.
Grab the first two from `System.IO.Utf8` or the latter from `Data.Text.IO.Utf8`.
_Note: it is best to import these functions qualified._
_Note: there is no `System.IO.Utf8.readFile` because its 2020 and
you should not read `String`s from files._
All these functions will make sure that the content will be treated as if it
was encoded in UTF-8 (it is 2020, what else can it be encoded in?).
If, for some reason, you really need to use `withFile`/`openFile` from `base`,
just call `hSetEncoding h`, where `h` is your handle and `hSetEncoding` comes
from `System.IO.Utf8` for your convenience:
```haskell
import qualified System.IO as IO
import qualified System.IO.Utf8 as Utf8
doSomethingWithAFile :: IO ()
doSomethingWithAFile = IO.withFile "file.txt" IO.ReadMode $ \h -> do
Utf8.hSetEncoding h
{- ... work with the file ... -}
```
### Step 4: Write files using UTF-8
When writing a file either open it using `withFile`/`openFile` from
`System.IO.Utf8` or write to it directly with `writeFile` from
`Data.Text.IO.Utf8`.
_Note: it is best to import these functions qualified._
_Note that there is no `System.IO.Utf8.writeFile`._
If, for some reason, you really need to use `withFile`/`openFile` from `base`,
do the same as in the previous step.
## Contributing

35
lib/Data/Text/IO/Utf8.hs Normal file
View File

@ -0,0 +1,35 @@
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
-
- SPDX-License-Identifier: MPL-2.0
-}
-- | Data Text IO for the modern world.
--
-- Wrappers around simple file reading/writing functions from the
-- @text@ package that reset the handle encoding to UTF-8.
module Data.Text.IO.Utf8
( readFile
, writeFile
) where
import Prelude hiding (readFile, writeFile)
import Control.Exception.Safe (MonadMask)
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Text (Text)
import qualified Data.Text.IO as T
import qualified System.IO as IO
import qualified System.IO.Utf8 as Utf8
-- | Like @readFile@, but assumes the file is encoded in UTF-8, regardless
-- of the current locale.
readFile :: MonadIO m => IO.FilePath -> m Text
readFile path = Utf8.openFile path IO.ReadMode >>= liftIO . T.hGetContents
-- | Like @writeFile@, but encodes the data in UTF-8, regardless
-- of the current locale.
writeFile :: (MonadIO m, MonadMask m) => IO.FilePath -> Text -> m ()
writeFile path = Utf8.withFile path IO.WriteMode . (liftIO .) . flip T.hPutStr

99
lib/System/IO/Utf8.hs Normal file
View File

@ -0,0 +1,99 @@
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
-
- SPDX-License-Identifier: MPL-2.0
-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE ViewPatterns #-}
-- | System IO for the modern world.
--
-- Standard IO functions assume that the character encoding of the data
-- they read or write is the same as the one used by current locale. In many
-- situtations this assumption is wrong, as tools work with files, and
-- the files nowadays are mostly UTF-8 encoded, regardless of the locale.
-- Therefore, it is almost always a good idea to switch the encoding of
-- file handles to UTF-8.
--
-- The same applies to standard input, output, and error handles. However,
-- there is an edge-case: if they are attached to a terminal, and the
-- encoding is not UTF-8, using UTF-8 might actually be unsafe.
--
-- Functions in this module help deal with all these issues.
module System.IO.Utf8
( withUtf8StdHandles
, hSetEncoding
, openFile
, withFile
) where
import Control.Exception.Safe (MonadMask, bracket)
import Control.Monad.IO.Class (MonadIO, liftIO)
import GHC.IO.Encoding (mkTextEncoding, textEncodingName, utf8)
import System.IO (stderr, stdin, stdout)
import qualified System.IO as IO
type EncRestoreAction = IO.Handle -> IO ()
-- | Sets the best available UTF-8-compatible encoding for the handle.
-- Returns the action that will restore the previous one.
--
-- If the handle is in binary mode, does nothing.
-- If the handle is not attached to a terminal, sets UTF-8.
-- Otherwise, keeps its current encoding, but augments it to transliterate
-- unsupported characters.
hSetBestUtf8Enc :: IO.Handle -> IO EncRestoreAction
hSetBestUtf8Enc h = IO.hGetEncoding h >>= \case
Nothing -> pure (\_ -> pure ())
Just enc -> do
isTerm <- IO.hIsTerminalDevice h
enc' <- chooseBestEnc isTerm enc
IO.hSetEncoding h enc'
pure $ flip IO.hSetEncoding enc
where
chooseBestEnc False _ = pure utf8
chooseBestEnc True enc@(textEncodingName -> name)
| '/' `notElem` name = mkTextEncoding (name ++ "//TRANSLIT")
| otherwise = pure enc
-- | Configures the encodings of the three standard handles (stdin, stdout, stderr)
-- to work with UTF-8 encoded data and runs the specified IO action.
-- After the action finishes, restores the original encodings.
withUtf8StdHandles :: IO a -> IO a
withUtf8StdHandles action =
withConfiguredHandle stdin $
withConfiguredHandle stdout $
withConfiguredHandle stderr $
action
where
withConfiguredHandle :: IO.Handle -> IO a -> IO a
withConfiguredHandle h = bracket (hSetBestUtf8Enc h) ($ h) . const
-- | A shortucut for setting handle encoding to UTF-8.
--
-- @
-- hSetEncoding h = System.IO.hSetEncoding h GHC.IO.Endoding.utf8
-- @
hSetEncoding :: MonadIO m => IO.Handle -> m ()
hSetEncoding = liftIO . flip IO.hSetEncoding utf8
-- | Like @openFile@, but sets the file encoding to UTF-8, regardless
-- of the current locale.
openFile :: MonadIO m => IO.FilePath -> IO.IOMode -> m IO.Handle
openFile path mode = do
h <- liftIO $ IO.openFile path mode
hSetEncoding h
pure h
-- | Like @withFile@, but sets the file encoding to UTF-8, regardless
-- of the current locale.
withFile
:: (MonadIO m, MonadMask m)
=> IO.FilePath -> IO.IOMode -> (IO.Handle -> m r) -> m r
withFile path mode = bracket (openFile path mode) (liftIO . IO.hClose)

View File

@ -28,3 +28,28 @@ library:
dependencies:
- base
- safe-exceptions # only really needed for polymorphic bracket
- text
tests:
utf8-test:
source-dirs: test
main: Test.hs
dependencies:
- utf8
- base
- deepseq
- safe-exceptions
- temporary
- text
- hedgehog
- HUnit
- tasty
- tasty-hedgehog
- tasty-hunit
build-tools:
- tasty-discover:tasty-discover

12
stack.yaml.lock Normal file
View File

@ -0,0 +1,12 @@
# This file was autogenerated by Stack.
# You should not edit this file by hand.
# For more information, please see the documentation at:
# https://docs.haskellstack.org/en/stable/lock_files
packages: []
snapshots:
- completed:
size: 524163
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/14/25.yaml
sha256: 97548b4e927a8a862d4a203687ca18a3f5d2ae75b4b72c946fb29c17df1a5082
original: lts-14.25

18
test/Test.hs Normal file
View File

@ -0,0 +1,18 @@
module Main
( main
) where
import GHC.IO.Encoding (mkTextEncoding, setLocaleEncoding)
import Test.Tasty (defaultMain)
import System.IO.Utf8 (withUtf8StdHandles)
import Tree (tests)
main :: IO ()
main = withUtf8StdHandles $ do
-- The issue only shows up when current locale encoding is ASCII.
-- Realistically, very often when running this test this will not be
-- the case, so we unset locale encoding manually.
mkTextEncoding "ASCII" >>= setLocaleEncoding
tests >>= defaultMain

View File

@ -0,0 +1,87 @@
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
-
- SPDX-License-Identifier: MPL-2.0
-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Test.Utf8.ReadWrite where
import Control.DeepSeq (force)
import Control.Exception (IOException, evaluate)
import Control.Exception.Safe (MonadMask, try)
import Control.Monad ((>=>))
import Control.Monad.IO.Class (MonadIO, liftIO)
import Data.Text (Text)
import System.IO (FilePath)
import System.IO.Temp (withSystemTempFile)
import Hedgehog (Property, (===), forAll, property)
import Test.HUnit (Assertion, assertFailure)
import qualified Data.Text.IO as T
import qualified Data.Text.IO.Utf8 as Utf8
import qualified System.IO as IO
import qualified System.IO.Utf8 as Utf8
import qualified Hedgehog.Gen as G
import qualified Hedgehog.Range as R
-- Import MonadMask instance for Hedgehog Property
import Test.Util.Hedgehog ()
-- | Helper that writes Text to a temp file.
withTestFile :: (MonadIO m, MonadMask m) => Text -> (FilePath -> m r) -> m r
withTestFile t act = withSystemTempFile "utf8.txt" $ \fp h -> do
Utf8.hSetEncoding h
liftIO $ T.hPutStr h t
liftIO $ IO.hClose h
act fp
-- | Sanity check.
unit_standard_readFile_fails :: Assertion
unit_standard_readFile_fails = withTestFile str $ \fp ->
try (T.readFile fp) >>= \case
Right _ -> assertFailure "Standard `readFile` should fail."
Left (_ :: IOException) -> pure ()
where
-- We use an escape here because otherwise _both_ tasty-discover
-- and hedgehog fail when they are reading this file for their
-- discovery or nice error reporting purposes... I know, right?
str = "doma\285e"
hprop_readFile :: Property
hprop_readFile = property $ do
str <- forAll $ G.text (R.linear 0 1000) G.unicode
str' <- liftIO $ withTestFile str (Utf8.readFile >=> evaluate . force)
str === str'
hprop_writeFile :: Property
hprop_writeFile = property $ do
str <- forAll $ G.text (R.linear 0 1000) G.unicode
liftIO $ withTestFile str (flip Utf8.writeFile str)
hprop_openFile :: Property
hprop_openFile = property $ do
str <- forAll $ G.text (R.linear 0 1000) G.unicode
str' <- liftIO $ withTestFile str $ \fp -> do
h <- Utf8.openFile fp IO.ReadMode
res <- T.hGetContents h
res' <- evaluate . force $ res
IO.hClose h
pure res'
str === str'
hprop_withFile :: Property
hprop_withFile = property $ do
str <- forAll $ G.text (R.linear 0 1000) G.unicode
str' <- liftIO $ withTestFile str $ \fp ->
Utf8.withFile fp IO.ReadMode $ \h -> do
res <- T.hGetContents h
res' <- evaluate . force $ res
pure res'
str === str'

View File

@ -0,0 +1,53 @@
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
-
- SPDX-License-Identifier: MPL-2.0
-}
{-# OPTIONS_GHC -Wno-orphans #-}
{-# LANGUAGE DerivingStrategies #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE StandaloneDeriving #-}
-- | The missing MonadMask instance for Hedgehog.
module Test.Util.Hedgehog
() where
{-
import Control.Exception.Safe (MonadMask (..))
import Hedgehog.Internal.Gen
import Hedgehog.Internal.Property
import Hedgehog.Internal.Tree
import qualified Hedgehog.Internal.Seed as Seed
deriving newtype instance MonadMask m => MonadMask (TestT m)
deriving newtype instance MonadMask m => MonadMask (PropertyT m)
instance MonadMask m => MonadMask (GenT m) where
mask a = GenT $ \size seed ->
mask $ \u -> runGenT size seed (a $ mapGenT u)
uninterruptibleMask a = GenT $ \size seed ->
uninterruptibleMask $ \u -> runGenT size seed (a $ mapGenT u)
generalBracket acquire release use = GenT $ \size seed ->
case Seed.split seed of
(seed1, seed1') -> case Seed.split seed1' of
(seed2, seed3) -> do
generalBracket
(runGenT size seed1 acquire)
(\treeRes ec -> runGenT size seed2 (release treeRes ec))
(\treeRes -> runGenT size seed3 (use treeRes))
instance MonadMask m => MonadMask (TreeT m) where
mask a
= TreeT $ mask $ \u -> runTreeT (a $ mapTreeT u)
uninterruptibleMask a
= TreeT $ uninterruptibleMask $ \u -> runTreeT (a $ mapTreeT u)
generalBracket acquire release use = undefined
-}

1
test/Tree.hs Normal file
View File

@ -0,0 +1 @@
{-# OPTIONS_GHC -F -pgmF tasty-discover -optF --tree-display -optF --generated-module -optF Tree #-}