mirror of
https://github.com/serokell/haskell-with-utf8.git
synced 2024-10-26 07:51:16 +03:00
First version
This commit is contained in:
parent
b7b3ef4bc5
commit
08407bbe4d
10
CHANGELOG.md
10
CHANGELOG.md
@ -7,3 +7,13 @@ SPDX-License-Identifier: MPL-2.0
|
|||||||
# Changelog
|
# Changelog
|
||||||
|
|
||||||
## Unreleased
|
## Unreleased
|
||||||
|
|
||||||
|
Initial release.
|
||||||
|
|
||||||
|
### Added
|
||||||
|
|
||||||
|
- `withUtf8StdHandles`
|
||||||
|
- `openFile`
|
||||||
|
- `withFile`
|
||||||
|
- `readFile`
|
||||||
|
- `writeFile`
|
||||||
|
60
README.md
60
README.md
@ -22,6 +22,66 @@ exists and an explanation of some of the opinionated decisions it is based on.
|
|||||||
|
|
||||||
## Use
|
## Use
|
||||||
|
|
||||||
|
### Step 1: Get it
|
||||||
|
|
||||||
|
The library is on [Hackage][hackage:utf8], just add it to the dependencies of
|
||||||
|
your project.
|
||||||
|
|
||||||
|
[hackage:utf8]: https://hackage.haskell.org/package/utf8
|
||||||
|
|
||||||
|
### Step 2: Wrap your `main`
|
||||||
|
|
||||||
|
Import `withUtf8StdHandles` from `System.IO.Utf8` and wrap it around your `main`:
|
||||||
|
|
||||||
|
```haskell
|
||||||
|
main :: IO ()
|
||||||
|
main = withUtf8StdHandles $ {- ... your main function ... -}
|
||||||
|
```
|
||||||
|
|
||||||
|
This will make sure that if your program reads something from `stdin` or
|
||||||
|
outputs something to `stdout`/`stderr`, it will not fail with a runtime
|
||||||
|
error due to encoding issues.
|
||||||
|
|
||||||
|
### Step 3: Read files using UTF-8
|
||||||
|
|
||||||
|
If you are going to read a text file (to be precise, if you are going to open
|
||||||
|
a file in text mode), you’ll probably use `withFile`, `openFile`, or `readFile`.
|
||||||
|
Grab the first two from `System.IO.Utf8` or the latter from `Data.Text.IO.Utf8`.
|
||||||
|
|
||||||
|
_Note: it is best to import these functions qualified._
|
||||||
|
|
||||||
|
_Note: there is no `System.IO.Utf8.readFile` because it’s 2020 and
|
||||||
|
you should not read `String`s from files._
|
||||||
|
|
||||||
|
All these functions will make sure that the content will be treated as if it
|
||||||
|
was encoded in UTF-8 (it is 2020, what else can it be encoded in?).
|
||||||
|
|
||||||
|
If, for some reason, you really need to use `withFile`/`openFile` from `base`,
|
||||||
|
just call `hSetEncoding h`, where `h` is your handle and `hSetEncoding` comes
|
||||||
|
from `System.IO.Utf8` for your convenience:
|
||||||
|
|
||||||
|
```haskell
|
||||||
|
import qualified System.IO as IO
|
||||||
|
import qualified System.IO.Utf8 as Utf8
|
||||||
|
|
||||||
|
doSomethingWithAFile :: IO ()
|
||||||
|
doSomethingWithAFile = IO.withFile "file.txt" IO.ReadMode $ \h -> do
|
||||||
|
Utf8.hSetEncoding h
|
||||||
|
{- ... work with the file ... -}
|
||||||
|
```
|
||||||
|
|
||||||
|
### Step 4: Write files using UTF-8
|
||||||
|
|
||||||
|
When writing a file either open it using `withFile`/`openFile` from
|
||||||
|
`System.IO.Utf8` or write to it directly with `writeFile` from
|
||||||
|
`Data.Text.IO.Utf8`.
|
||||||
|
|
||||||
|
_Note: it is best to import these functions qualified._
|
||||||
|
|
||||||
|
_Note that there is no `System.IO.Utf8.writeFile`._
|
||||||
|
|
||||||
|
If, for some reason, you really need to use `withFile`/`openFile` from `base`,
|
||||||
|
do the same as in the previous step.
|
||||||
|
|
||||||
|
|
||||||
## Contributing
|
## Contributing
|
||||||
|
35
lib/Data/Text/IO/Utf8.hs
Normal file
35
lib/Data/Text/IO/Utf8.hs
Normal file
@ -0,0 +1,35 @@
|
|||||||
|
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
|
||||||
|
-
|
||||||
|
- SPDX-License-Identifier: MPL-2.0
|
||||||
|
-}
|
||||||
|
|
||||||
|
-- | Data Text IO for the modern world.
|
||||||
|
--
|
||||||
|
-- Wrappers around simple file reading/writing functions from the
|
||||||
|
-- @text@ package that reset the handle encoding to UTF-8.
|
||||||
|
module Data.Text.IO.Utf8
|
||||||
|
( readFile
|
||||||
|
, writeFile
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Prelude hiding (readFile, writeFile)
|
||||||
|
|
||||||
|
import Control.Exception.Safe (MonadMask)
|
||||||
|
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||||
|
import Data.Text (Text)
|
||||||
|
|
||||||
|
import qualified Data.Text.IO as T
|
||||||
|
import qualified System.IO as IO
|
||||||
|
|
||||||
|
import qualified System.IO.Utf8 as Utf8
|
||||||
|
|
||||||
|
|
||||||
|
-- | Like @readFile@, but assumes the file is encoded in UTF-8, regardless
|
||||||
|
-- of the current locale.
|
||||||
|
readFile :: MonadIO m => IO.FilePath -> m Text
|
||||||
|
readFile path = Utf8.openFile path IO.ReadMode >>= liftIO . T.hGetContents
|
||||||
|
|
||||||
|
-- | Like @writeFile@, but encodes the data in UTF-8, regardless
|
||||||
|
-- of the current locale.
|
||||||
|
writeFile :: (MonadIO m, MonadMask m) => IO.FilePath -> Text -> m ()
|
||||||
|
writeFile path = Utf8.withFile path IO.WriteMode . (liftIO .) . flip T.hPutStr
|
99
lib/System/IO/Utf8.hs
Normal file
99
lib/System/IO/Utf8.hs
Normal file
@ -0,0 +1,99 @@
|
|||||||
|
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
|
||||||
|
-
|
||||||
|
- SPDX-License-Identifier: MPL-2.0
|
||||||
|
-}
|
||||||
|
|
||||||
|
{-# LANGUAGE LambdaCase #-}
|
||||||
|
{-# LANGUAGE ViewPatterns #-}
|
||||||
|
|
||||||
|
-- | System IO for the modern world.
|
||||||
|
--
|
||||||
|
-- Standard IO functions assume that the character encoding of the data
|
||||||
|
-- they read or write is the same as the one used by current locale. In many
|
||||||
|
-- situtations this assumption is wrong, as tools work with files, and
|
||||||
|
-- the files nowadays are mostly UTF-8 encoded, regardless of the locale.
|
||||||
|
-- Therefore, it is almost always a good idea to switch the encoding of
|
||||||
|
-- file handles to UTF-8.
|
||||||
|
--
|
||||||
|
-- The same applies to standard input, output, and error handles. However,
|
||||||
|
-- there is an edge-case: if they are attached to a terminal, and the
|
||||||
|
-- encoding is not UTF-8, using UTF-8 might actually be unsafe.
|
||||||
|
--
|
||||||
|
-- Functions in this module help deal with all these issues.
|
||||||
|
module System.IO.Utf8
|
||||||
|
( withUtf8StdHandles
|
||||||
|
|
||||||
|
, hSetEncoding
|
||||||
|
|
||||||
|
, openFile
|
||||||
|
, withFile
|
||||||
|
) where
|
||||||
|
|
||||||
|
import Control.Exception.Safe (MonadMask, bracket)
|
||||||
|
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||||
|
import GHC.IO.Encoding (mkTextEncoding, textEncodingName, utf8)
|
||||||
|
import System.IO (stderr, stdin, stdout)
|
||||||
|
|
||||||
|
import qualified System.IO as IO
|
||||||
|
|
||||||
|
|
||||||
|
type EncRestoreAction = IO.Handle -> IO ()
|
||||||
|
|
||||||
|
-- | Sets the best available UTF-8-compatible encoding for the handle.
|
||||||
|
-- Returns the action that will restore the previous one.
|
||||||
|
--
|
||||||
|
-- If the handle is in binary mode, does nothing.
|
||||||
|
-- If the handle is not attached to a terminal, sets UTF-8.
|
||||||
|
-- Otherwise, keeps its current encoding, but augments it to transliterate
|
||||||
|
-- unsupported characters.
|
||||||
|
hSetBestUtf8Enc :: IO.Handle -> IO EncRestoreAction
|
||||||
|
hSetBestUtf8Enc h = IO.hGetEncoding h >>= \case
|
||||||
|
Nothing -> pure (\_ -> pure ())
|
||||||
|
Just enc -> do
|
||||||
|
isTerm <- IO.hIsTerminalDevice h
|
||||||
|
enc' <- chooseBestEnc isTerm enc
|
||||||
|
IO.hSetEncoding h enc'
|
||||||
|
pure $ flip IO.hSetEncoding enc
|
||||||
|
where
|
||||||
|
chooseBestEnc False _ = pure utf8
|
||||||
|
chooseBestEnc True enc@(textEncodingName -> name)
|
||||||
|
| '/' `notElem` name = mkTextEncoding (name ++ "//TRANSLIT")
|
||||||
|
| otherwise = pure enc
|
||||||
|
|
||||||
|
-- | Configures the encodings of the three standard handles (stdin, stdout, stderr)
|
||||||
|
-- to work with UTF-8 encoded data and runs the specified IO action.
|
||||||
|
-- After the action finishes, restores the original encodings.
|
||||||
|
withUtf8StdHandles :: IO a -> IO a
|
||||||
|
withUtf8StdHandles action =
|
||||||
|
withConfiguredHandle stdin $
|
||||||
|
withConfiguredHandle stdout $
|
||||||
|
withConfiguredHandle stderr $
|
||||||
|
action
|
||||||
|
where
|
||||||
|
withConfiguredHandle :: IO.Handle -> IO a -> IO a
|
||||||
|
withConfiguredHandle h = bracket (hSetBestUtf8Enc h) ($ h) . const
|
||||||
|
|
||||||
|
|
||||||
|
-- | A shortucut for setting handle encoding to UTF-8.
|
||||||
|
--
|
||||||
|
-- @
|
||||||
|
-- hSetEncoding h = System.IO.hSetEncoding h GHC.IO.Endoding.utf8
|
||||||
|
-- @
|
||||||
|
hSetEncoding :: MonadIO m => IO.Handle -> m ()
|
||||||
|
hSetEncoding = liftIO . flip IO.hSetEncoding utf8
|
||||||
|
|
||||||
|
|
||||||
|
-- | Like @openFile@, but sets the file encoding to UTF-8, regardless
|
||||||
|
-- of the current locale.
|
||||||
|
openFile :: MonadIO m => IO.FilePath -> IO.IOMode -> m IO.Handle
|
||||||
|
openFile path mode = do
|
||||||
|
h <- liftIO $ IO.openFile path mode
|
||||||
|
hSetEncoding h
|
||||||
|
pure h
|
||||||
|
|
||||||
|
-- | Like @withFile@, but sets the file encoding to UTF-8, regardless
|
||||||
|
-- of the current locale.
|
||||||
|
withFile
|
||||||
|
:: (MonadIO m, MonadMask m)
|
||||||
|
=> IO.FilePath -> IO.IOMode -> (IO.Handle -> m r) -> m r
|
||||||
|
withFile path mode = bracket (openFile path mode) (liftIO . IO.hClose)
|
25
package.yaml
25
package.yaml
@ -28,3 +28,28 @@ library:
|
|||||||
|
|
||||||
dependencies:
|
dependencies:
|
||||||
- base
|
- base
|
||||||
|
- safe-exceptions # only really needed for polymorphic bracket
|
||||||
|
- text
|
||||||
|
|
||||||
|
tests:
|
||||||
|
utf8-test:
|
||||||
|
source-dirs: test
|
||||||
|
main: Test.hs
|
||||||
|
|
||||||
|
dependencies:
|
||||||
|
- utf8
|
||||||
|
|
||||||
|
- base
|
||||||
|
- deepseq
|
||||||
|
- safe-exceptions
|
||||||
|
- temporary
|
||||||
|
- text
|
||||||
|
|
||||||
|
- hedgehog
|
||||||
|
- HUnit
|
||||||
|
- tasty
|
||||||
|
- tasty-hedgehog
|
||||||
|
- tasty-hunit
|
||||||
|
|
||||||
|
build-tools:
|
||||||
|
- tasty-discover:tasty-discover
|
||||||
|
12
stack.yaml.lock
Normal file
12
stack.yaml.lock
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
# This file was autogenerated by Stack.
|
||||||
|
# You should not edit this file by hand.
|
||||||
|
# For more information, please see the documentation at:
|
||||||
|
# https://docs.haskellstack.org/en/stable/lock_files
|
||||||
|
|
||||||
|
packages: []
|
||||||
|
snapshots:
|
||||||
|
- completed:
|
||||||
|
size: 524163
|
||||||
|
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/14/25.yaml
|
||||||
|
sha256: 97548b4e927a8a862d4a203687ca18a3f5d2ae75b4b72c946fb29c17df1a5082
|
||||||
|
original: lts-14.25
|
18
test/Test.hs
Normal file
18
test/Test.hs
Normal file
@ -0,0 +1,18 @@
|
|||||||
|
module Main
|
||||||
|
( main
|
||||||
|
) where
|
||||||
|
|
||||||
|
import GHC.IO.Encoding (mkTextEncoding, setLocaleEncoding)
|
||||||
|
import Test.Tasty (defaultMain)
|
||||||
|
|
||||||
|
import System.IO.Utf8 (withUtf8StdHandles)
|
||||||
|
|
||||||
|
import Tree (tests)
|
||||||
|
|
||||||
|
main :: IO ()
|
||||||
|
main = withUtf8StdHandles $ do
|
||||||
|
-- The issue only shows up when current locale encoding is ASCII.
|
||||||
|
-- Realistically, very often when running this test this will not be
|
||||||
|
-- the case, so we unset locale encoding manually.
|
||||||
|
mkTextEncoding "ASCII" >>= setLocaleEncoding
|
||||||
|
tests >>= defaultMain
|
87
test/Test/Utf8/ReadWrite.hs
Normal file
87
test/Test/Utf8/ReadWrite.hs
Normal file
@ -0,0 +1,87 @@
|
|||||||
|
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
|
||||||
|
-
|
||||||
|
- SPDX-License-Identifier: MPL-2.0
|
||||||
|
-}
|
||||||
|
|
||||||
|
{-# LANGUAGE LambdaCase #-}
|
||||||
|
{-# LANGUAGE OverloadedStrings #-}
|
||||||
|
{-# LANGUAGE ScopedTypeVariables #-}
|
||||||
|
|
||||||
|
module Test.Utf8.ReadWrite where
|
||||||
|
|
||||||
|
import Control.DeepSeq (force)
|
||||||
|
import Control.Exception (IOException, evaluate)
|
||||||
|
import Control.Exception.Safe (MonadMask, try)
|
||||||
|
import Control.Monad ((>=>))
|
||||||
|
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||||
|
import Data.Text (Text)
|
||||||
|
import System.IO (FilePath)
|
||||||
|
import System.IO.Temp (withSystemTempFile)
|
||||||
|
|
||||||
|
import Hedgehog (Property, (===), forAll, property)
|
||||||
|
import Test.HUnit (Assertion, assertFailure)
|
||||||
|
|
||||||
|
import qualified Data.Text.IO as T
|
||||||
|
import qualified Data.Text.IO.Utf8 as Utf8
|
||||||
|
import qualified System.IO as IO
|
||||||
|
import qualified System.IO.Utf8 as Utf8
|
||||||
|
|
||||||
|
import qualified Hedgehog.Gen as G
|
||||||
|
import qualified Hedgehog.Range as R
|
||||||
|
|
||||||
|
-- Import MonadMask instance for Hedgehog Property
|
||||||
|
import Test.Util.Hedgehog ()
|
||||||
|
|
||||||
|
-- | Helper that writes Text to a temp file.
|
||||||
|
withTestFile :: (MonadIO m, MonadMask m) => Text -> (FilePath -> m r) -> m r
|
||||||
|
withTestFile t act = withSystemTempFile "utf8.txt" $ \fp h -> do
|
||||||
|
Utf8.hSetEncoding h
|
||||||
|
liftIO $ T.hPutStr h t
|
||||||
|
liftIO $ IO.hClose h
|
||||||
|
act fp
|
||||||
|
|
||||||
|
|
||||||
|
-- | Sanity check.
|
||||||
|
unit_standard_readFile_fails :: Assertion
|
||||||
|
unit_standard_readFile_fails = withTestFile str $ \fp ->
|
||||||
|
try (T.readFile fp) >>= \case
|
||||||
|
Right _ -> assertFailure "Standard `readFile` should fail."
|
||||||
|
Left (_ :: IOException) -> pure ()
|
||||||
|
where
|
||||||
|
-- We use an escape here because otherwise _both_ tasty-discover
|
||||||
|
-- and hedgehog fail when they are reading this file for their
|
||||||
|
-- discovery or nice error reporting purposes... I know, right?
|
||||||
|
str = "doma\285e"
|
||||||
|
|
||||||
|
|
||||||
|
hprop_readFile :: Property
|
||||||
|
hprop_readFile = property $ do
|
||||||
|
str <- forAll $ G.text (R.linear 0 1000) G.unicode
|
||||||
|
str' <- liftIO $ withTestFile str (Utf8.readFile >=> evaluate . force)
|
||||||
|
str === str'
|
||||||
|
|
||||||
|
hprop_writeFile :: Property
|
||||||
|
hprop_writeFile = property $ do
|
||||||
|
str <- forAll $ G.text (R.linear 0 1000) G.unicode
|
||||||
|
liftIO $ withTestFile str (flip Utf8.writeFile str)
|
||||||
|
|
||||||
|
hprop_openFile :: Property
|
||||||
|
hprop_openFile = property $ do
|
||||||
|
str <- forAll $ G.text (R.linear 0 1000) G.unicode
|
||||||
|
str' <- liftIO $ withTestFile str $ \fp -> do
|
||||||
|
h <- Utf8.openFile fp IO.ReadMode
|
||||||
|
res <- T.hGetContents h
|
||||||
|
res' <- evaluate . force $ res
|
||||||
|
IO.hClose h
|
||||||
|
pure res'
|
||||||
|
str === str'
|
||||||
|
|
||||||
|
hprop_withFile :: Property
|
||||||
|
hprop_withFile = property $ do
|
||||||
|
str <- forAll $ G.text (R.linear 0 1000) G.unicode
|
||||||
|
str' <- liftIO $ withTestFile str $ \fp ->
|
||||||
|
Utf8.withFile fp IO.ReadMode $ \h -> do
|
||||||
|
res <- T.hGetContents h
|
||||||
|
res' <- evaluate . force $ res
|
||||||
|
pure res'
|
||||||
|
str === str'
|
53
test/Test/Util/Hedgehog.hs
Normal file
53
test/Test/Util/Hedgehog.hs
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
|
||||||
|
-
|
||||||
|
- SPDX-License-Identifier: MPL-2.0
|
||||||
|
-}
|
||||||
|
|
||||||
|
{-# OPTIONS_GHC -Wno-orphans #-}
|
||||||
|
|
||||||
|
{-# LANGUAGE DerivingStrategies #-}
|
||||||
|
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||||
|
{-# LANGUAGE StandaloneDeriving #-}
|
||||||
|
|
||||||
|
-- | The missing MonadMask instance for Hedgehog.
|
||||||
|
module Test.Util.Hedgehog
|
||||||
|
() where
|
||||||
|
|
||||||
|
{-
|
||||||
|
import Control.Exception.Safe (MonadMask (..))
|
||||||
|
|
||||||
|
import Hedgehog.Internal.Gen
|
||||||
|
import Hedgehog.Internal.Property
|
||||||
|
import Hedgehog.Internal.Tree
|
||||||
|
|
||||||
|
import qualified Hedgehog.Internal.Seed as Seed
|
||||||
|
|
||||||
|
|
||||||
|
deriving newtype instance MonadMask m => MonadMask (TestT m)
|
||||||
|
deriving newtype instance MonadMask m => MonadMask (PropertyT m)
|
||||||
|
|
||||||
|
instance MonadMask m => MonadMask (GenT m) where
|
||||||
|
mask a = GenT $ \size seed ->
|
||||||
|
mask $ \u -> runGenT size seed (a $ mapGenT u)
|
||||||
|
|
||||||
|
uninterruptibleMask a = GenT $ \size seed ->
|
||||||
|
uninterruptibleMask $ \u -> runGenT size seed (a $ mapGenT u)
|
||||||
|
|
||||||
|
generalBracket acquire release use = GenT $ \size seed ->
|
||||||
|
case Seed.split seed of
|
||||||
|
(seed1, seed1') -> case Seed.split seed1' of
|
||||||
|
(seed2, seed3) -> do
|
||||||
|
generalBracket
|
||||||
|
(runGenT size seed1 acquire)
|
||||||
|
(\treeRes ec -> runGenT size seed2 (release treeRes ec))
|
||||||
|
(\treeRes -> runGenT size seed3 (use treeRes))
|
||||||
|
|
||||||
|
instance MonadMask m => MonadMask (TreeT m) where
|
||||||
|
mask a
|
||||||
|
= TreeT $ mask $ \u -> runTreeT (a $ mapTreeT u)
|
||||||
|
|
||||||
|
uninterruptibleMask a
|
||||||
|
= TreeT $ uninterruptibleMask $ \u -> runTreeT (a $ mapTreeT u)
|
||||||
|
|
||||||
|
generalBracket acquire release use = undefined
|
||||||
|
-}
|
1
test/Tree.hs
Normal file
1
test/Tree.hs
Normal file
@ -0,0 +1 @@
|
|||||||
|
{-# OPTIONS_GHC -F -pgmF tasty-discover -optF --tree-display -optF --generated-module -optF Tree #-}
|
Loading…
Reference in New Issue
Block a user