mirror of
https://github.com/serokell/haskell-with-utf8.git
synced 2024-08-15 09:40:21 +03:00
First version
This commit is contained in:
parent
b7b3ef4bc5
commit
08407bbe4d
10
CHANGELOG.md
10
CHANGELOG.md
@ -7,3 +7,13 @@ SPDX-License-Identifier: MPL-2.0
|
||||
# Changelog
|
||||
|
||||
## Unreleased
|
||||
|
||||
Initial release.
|
||||
|
||||
### Added
|
||||
|
||||
- `withUtf8StdHandles`
|
||||
- `openFile`
|
||||
- `withFile`
|
||||
- `readFile`
|
||||
- `writeFile`
|
||||
|
60
README.md
60
README.md
@ -22,6 +22,66 @@ exists and an explanation of some of the opinionated decisions it is based on.
|
||||
|
||||
## Use
|
||||
|
||||
### Step 1: Get it
|
||||
|
||||
The library is on [Hackage][hackage:utf8], just add it to the dependencies of
|
||||
your project.
|
||||
|
||||
[hackage:utf8]: https://hackage.haskell.org/package/utf8
|
||||
|
||||
### Step 2: Wrap your `main`
|
||||
|
||||
Import `withUtf8StdHandles` from `System.IO.Utf8` and wrap it around your `main`:
|
||||
|
||||
```haskell
|
||||
main :: IO ()
|
||||
main = withUtf8StdHandles $ {- ... your main function ... -}
|
||||
```
|
||||
|
||||
This will make sure that if your program reads something from `stdin` or
|
||||
outputs something to `stdout`/`stderr`, it will not fail with a runtime
|
||||
error due to encoding issues.
|
||||
|
||||
### Step 3: Read files using UTF-8
|
||||
|
||||
If you are going to read a text file (to be precise, if you are going to open
|
||||
a file in text mode), you’ll probably use `withFile`, `openFile`, or `readFile`.
|
||||
Grab the first two from `System.IO.Utf8` or the latter from `Data.Text.IO.Utf8`.
|
||||
|
||||
_Note: it is best to import these functions qualified._
|
||||
|
||||
_Note: there is no `System.IO.Utf8.readFile` because it’s 2020 and
|
||||
you should not read `String`s from files._
|
||||
|
||||
All these functions will make sure that the content will be treated as if it
|
||||
was encoded in UTF-8 (it is 2020, what else can it be encoded in?).
|
||||
|
||||
If, for some reason, you really need to use `withFile`/`openFile` from `base`,
|
||||
just call `hSetEncoding h`, where `h` is your handle and `hSetEncoding` comes
|
||||
from `System.IO.Utf8` for your convenience:
|
||||
|
||||
```haskell
|
||||
import qualified System.IO as IO
|
||||
import qualified System.IO.Utf8 as Utf8
|
||||
|
||||
doSomethingWithAFile :: IO ()
|
||||
doSomethingWithAFile = IO.withFile "file.txt" IO.ReadMode $ \h -> do
|
||||
Utf8.hSetEncoding h
|
||||
{- ... work with the file ... -}
|
||||
```
|
||||
|
||||
### Step 4: Write files using UTF-8
|
||||
|
||||
When writing a file either open it using `withFile`/`openFile` from
|
||||
`System.IO.Utf8` or write to it directly with `writeFile` from
|
||||
`Data.Text.IO.Utf8`.
|
||||
|
||||
_Note: it is best to import these functions qualified._
|
||||
|
||||
_Note that there is no `System.IO.Utf8.writeFile`._
|
||||
|
||||
If, for some reason, you really need to use `withFile`/`openFile` from `base`,
|
||||
do the same as in the previous step.
|
||||
|
||||
|
||||
## Contributing
|
||||
|
35
lib/Data/Text/IO/Utf8.hs
Normal file
35
lib/Data/Text/IO/Utf8.hs
Normal file
@ -0,0 +1,35 @@
|
||||
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
|
||||
-
|
||||
- SPDX-License-Identifier: MPL-2.0
|
||||
-}
|
||||
|
||||
-- | Data Text IO for the modern world.
|
||||
--
|
||||
-- Wrappers around simple file reading/writing functions from the
|
||||
-- @text@ package that reset the handle encoding to UTF-8.
|
||||
module Data.Text.IO.Utf8
|
||||
( readFile
|
||||
, writeFile
|
||||
) where
|
||||
|
||||
import Prelude hiding (readFile, writeFile)
|
||||
|
||||
import Control.Exception.Safe (MonadMask)
|
||||
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||
import Data.Text (Text)
|
||||
|
||||
import qualified Data.Text.IO as T
|
||||
import qualified System.IO as IO
|
||||
|
||||
import qualified System.IO.Utf8 as Utf8
|
||||
|
||||
|
||||
-- | Like @readFile@, but assumes the file is encoded in UTF-8, regardless
|
||||
-- of the current locale.
|
||||
readFile :: MonadIO m => IO.FilePath -> m Text
|
||||
readFile path = Utf8.openFile path IO.ReadMode >>= liftIO . T.hGetContents
|
||||
|
||||
-- | Like @writeFile@, but encodes the data in UTF-8, regardless
|
||||
-- of the current locale.
|
||||
writeFile :: (MonadIO m, MonadMask m) => IO.FilePath -> Text -> m ()
|
||||
writeFile path = Utf8.withFile path IO.WriteMode . (liftIO .) . flip T.hPutStr
|
99
lib/System/IO/Utf8.hs
Normal file
99
lib/System/IO/Utf8.hs
Normal file
@ -0,0 +1,99 @@
|
||||
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
|
||||
-
|
||||
- SPDX-License-Identifier: MPL-2.0
|
||||
-}
|
||||
|
||||
{-# LANGUAGE LambdaCase #-}
|
||||
{-# LANGUAGE ViewPatterns #-}
|
||||
|
||||
-- | System IO for the modern world.
|
||||
--
|
||||
-- Standard IO functions assume that the character encoding of the data
|
||||
-- they read or write is the same as the one used by current locale. In many
|
||||
-- situtations this assumption is wrong, as tools work with files, and
|
||||
-- the files nowadays are mostly UTF-8 encoded, regardless of the locale.
|
||||
-- Therefore, it is almost always a good idea to switch the encoding of
|
||||
-- file handles to UTF-8.
|
||||
--
|
||||
-- The same applies to standard input, output, and error handles. However,
|
||||
-- there is an edge-case: if they are attached to a terminal, and the
|
||||
-- encoding is not UTF-8, using UTF-8 might actually be unsafe.
|
||||
--
|
||||
-- Functions in this module help deal with all these issues.
|
||||
module System.IO.Utf8
|
||||
( withUtf8StdHandles
|
||||
|
||||
, hSetEncoding
|
||||
|
||||
, openFile
|
||||
, withFile
|
||||
) where
|
||||
|
||||
import Control.Exception.Safe (MonadMask, bracket)
|
||||
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||
import GHC.IO.Encoding (mkTextEncoding, textEncodingName, utf8)
|
||||
import System.IO (stderr, stdin, stdout)
|
||||
|
||||
import qualified System.IO as IO
|
||||
|
||||
|
||||
type EncRestoreAction = IO.Handle -> IO ()
|
||||
|
||||
-- | Sets the best available UTF-8-compatible encoding for the handle.
|
||||
-- Returns the action that will restore the previous one.
|
||||
--
|
||||
-- If the handle is in binary mode, does nothing.
|
||||
-- If the handle is not attached to a terminal, sets UTF-8.
|
||||
-- Otherwise, keeps its current encoding, but augments it to transliterate
|
||||
-- unsupported characters.
|
||||
hSetBestUtf8Enc :: IO.Handle -> IO EncRestoreAction
|
||||
hSetBestUtf8Enc h = IO.hGetEncoding h >>= \case
|
||||
Nothing -> pure (\_ -> pure ())
|
||||
Just enc -> do
|
||||
isTerm <- IO.hIsTerminalDevice h
|
||||
enc' <- chooseBestEnc isTerm enc
|
||||
IO.hSetEncoding h enc'
|
||||
pure $ flip IO.hSetEncoding enc
|
||||
where
|
||||
chooseBestEnc False _ = pure utf8
|
||||
chooseBestEnc True enc@(textEncodingName -> name)
|
||||
| '/' `notElem` name = mkTextEncoding (name ++ "//TRANSLIT")
|
||||
| otherwise = pure enc
|
||||
|
||||
-- | Configures the encodings of the three standard handles (stdin, stdout, stderr)
|
||||
-- to work with UTF-8 encoded data and runs the specified IO action.
|
||||
-- After the action finishes, restores the original encodings.
|
||||
withUtf8StdHandles :: IO a -> IO a
|
||||
withUtf8StdHandles action =
|
||||
withConfiguredHandle stdin $
|
||||
withConfiguredHandle stdout $
|
||||
withConfiguredHandle stderr $
|
||||
action
|
||||
where
|
||||
withConfiguredHandle :: IO.Handle -> IO a -> IO a
|
||||
withConfiguredHandle h = bracket (hSetBestUtf8Enc h) ($ h) . const
|
||||
|
||||
|
||||
-- | A shortucut for setting handle encoding to UTF-8.
|
||||
--
|
||||
-- @
|
||||
-- hSetEncoding h = System.IO.hSetEncoding h GHC.IO.Endoding.utf8
|
||||
-- @
|
||||
hSetEncoding :: MonadIO m => IO.Handle -> m ()
|
||||
hSetEncoding = liftIO . flip IO.hSetEncoding utf8
|
||||
|
||||
|
||||
-- | Like @openFile@, but sets the file encoding to UTF-8, regardless
|
||||
-- of the current locale.
|
||||
openFile :: MonadIO m => IO.FilePath -> IO.IOMode -> m IO.Handle
|
||||
openFile path mode = do
|
||||
h <- liftIO $ IO.openFile path mode
|
||||
hSetEncoding h
|
||||
pure h
|
||||
|
||||
-- | Like @withFile@, but sets the file encoding to UTF-8, regardless
|
||||
-- of the current locale.
|
||||
withFile
|
||||
:: (MonadIO m, MonadMask m)
|
||||
=> IO.FilePath -> IO.IOMode -> (IO.Handle -> m r) -> m r
|
||||
withFile path mode = bracket (openFile path mode) (liftIO . IO.hClose)
|
25
package.yaml
25
package.yaml
@ -28,3 +28,28 @@ library:
|
||||
|
||||
dependencies:
|
||||
- base
|
||||
- safe-exceptions # only really needed for polymorphic bracket
|
||||
- text
|
||||
|
||||
tests:
|
||||
utf8-test:
|
||||
source-dirs: test
|
||||
main: Test.hs
|
||||
|
||||
dependencies:
|
||||
- utf8
|
||||
|
||||
- base
|
||||
- deepseq
|
||||
- safe-exceptions
|
||||
- temporary
|
||||
- text
|
||||
|
||||
- hedgehog
|
||||
- HUnit
|
||||
- tasty
|
||||
- tasty-hedgehog
|
||||
- tasty-hunit
|
||||
|
||||
build-tools:
|
||||
- tasty-discover:tasty-discover
|
||||
|
12
stack.yaml.lock
Normal file
12
stack.yaml.lock
Normal file
@ -0,0 +1,12 @@
|
||||
# This file was autogenerated by Stack.
|
||||
# You should not edit this file by hand.
|
||||
# For more information, please see the documentation at:
|
||||
# https://docs.haskellstack.org/en/stable/lock_files
|
||||
|
||||
packages: []
|
||||
snapshots:
|
||||
- completed:
|
||||
size: 524163
|
||||
url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/14/25.yaml
|
||||
sha256: 97548b4e927a8a862d4a203687ca18a3f5d2ae75b4b72c946fb29c17df1a5082
|
||||
original: lts-14.25
|
18
test/Test.hs
Normal file
18
test/Test.hs
Normal file
@ -0,0 +1,18 @@
|
||||
module Main
|
||||
( main
|
||||
) where
|
||||
|
||||
import GHC.IO.Encoding (mkTextEncoding, setLocaleEncoding)
|
||||
import Test.Tasty (defaultMain)
|
||||
|
||||
import System.IO.Utf8 (withUtf8StdHandles)
|
||||
|
||||
import Tree (tests)
|
||||
|
||||
main :: IO ()
|
||||
main = withUtf8StdHandles $ do
|
||||
-- The issue only shows up when current locale encoding is ASCII.
|
||||
-- Realistically, very often when running this test this will not be
|
||||
-- the case, so we unset locale encoding manually.
|
||||
mkTextEncoding "ASCII" >>= setLocaleEncoding
|
||||
tests >>= defaultMain
|
87
test/Test/Utf8/ReadWrite.hs
Normal file
87
test/Test/Utf8/ReadWrite.hs
Normal file
@ -0,0 +1,87 @@
|
||||
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
|
||||
-
|
||||
- SPDX-License-Identifier: MPL-2.0
|
||||
-}
|
||||
|
||||
{-# LANGUAGE LambdaCase #-}
|
||||
{-# LANGUAGE OverloadedStrings #-}
|
||||
{-# LANGUAGE ScopedTypeVariables #-}
|
||||
|
||||
module Test.Utf8.ReadWrite where
|
||||
|
||||
import Control.DeepSeq (force)
|
||||
import Control.Exception (IOException, evaluate)
|
||||
import Control.Exception.Safe (MonadMask, try)
|
||||
import Control.Monad ((>=>))
|
||||
import Control.Monad.IO.Class (MonadIO, liftIO)
|
||||
import Data.Text (Text)
|
||||
import System.IO (FilePath)
|
||||
import System.IO.Temp (withSystemTempFile)
|
||||
|
||||
import Hedgehog (Property, (===), forAll, property)
|
||||
import Test.HUnit (Assertion, assertFailure)
|
||||
|
||||
import qualified Data.Text.IO as T
|
||||
import qualified Data.Text.IO.Utf8 as Utf8
|
||||
import qualified System.IO as IO
|
||||
import qualified System.IO.Utf8 as Utf8
|
||||
|
||||
import qualified Hedgehog.Gen as G
|
||||
import qualified Hedgehog.Range as R
|
||||
|
||||
-- Import MonadMask instance for Hedgehog Property
|
||||
import Test.Util.Hedgehog ()
|
||||
|
||||
-- | Helper that writes Text to a temp file.
|
||||
withTestFile :: (MonadIO m, MonadMask m) => Text -> (FilePath -> m r) -> m r
|
||||
withTestFile t act = withSystemTempFile "utf8.txt" $ \fp h -> do
|
||||
Utf8.hSetEncoding h
|
||||
liftIO $ T.hPutStr h t
|
||||
liftIO $ IO.hClose h
|
||||
act fp
|
||||
|
||||
|
||||
-- | Sanity check.
|
||||
unit_standard_readFile_fails :: Assertion
|
||||
unit_standard_readFile_fails = withTestFile str $ \fp ->
|
||||
try (T.readFile fp) >>= \case
|
||||
Right _ -> assertFailure "Standard `readFile` should fail."
|
||||
Left (_ :: IOException) -> pure ()
|
||||
where
|
||||
-- We use an escape here because otherwise _both_ tasty-discover
|
||||
-- and hedgehog fail when they are reading this file for their
|
||||
-- discovery or nice error reporting purposes... I know, right?
|
||||
str = "doma\285e"
|
||||
|
||||
|
||||
hprop_readFile :: Property
|
||||
hprop_readFile = property $ do
|
||||
str <- forAll $ G.text (R.linear 0 1000) G.unicode
|
||||
str' <- liftIO $ withTestFile str (Utf8.readFile >=> evaluate . force)
|
||||
str === str'
|
||||
|
||||
hprop_writeFile :: Property
|
||||
hprop_writeFile = property $ do
|
||||
str <- forAll $ G.text (R.linear 0 1000) G.unicode
|
||||
liftIO $ withTestFile str (flip Utf8.writeFile str)
|
||||
|
||||
hprop_openFile :: Property
|
||||
hprop_openFile = property $ do
|
||||
str <- forAll $ G.text (R.linear 0 1000) G.unicode
|
||||
str' <- liftIO $ withTestFile str $ \fp -> do
|
||||
h <- Utf8.openFile fp IO.ReadMode
|
||||
res <- T.hGetContents h
|
||||
res' <- evaluate . force $ res
|
||||
IO.hClose h
|
||||
pure res'
|
||||
str === str'
|
||||
|
||||
hprop_withFile :: Property
|
||||
hprop_withFile = property $ do
|
||||
str <- forAll $ G.text (R.linear 0 1000) G.unicode
|
||||
str' <- liftIO $ withTestFile str $ \fp ->
|
||||
Utf8.withFile fp IO.ReadMode $ \h -> do
|
||||
res <- T.hGetContents h
|
||||
res' <- evaluate . force $ res
|
||||
pure res'
|
||||
str === str'
|
53
test/Test/Util/Hedgehog.hs
Normal file
53
test/Test/Util/Hedgehog.hs
Normal file
@ -0,0 +1,53 @@
|
||||
{- SPDX-FileCopyrightText: 2020 Serokell <https://serokell.io/>
|
||||
-
|
||||
- SPDX-License-Identifier: MPL-2.0
|
||||
-}
|
||||
|
||||
{-# OPTIONS_GHC -Wno-orphans #-}
|
||||
|
||||
{-# LANGUAGE DerivingStrategies #-}
|
||||
{-# LANGUAGE GeneralizedNewtypeDeriving #-}
|
||||
{-# LANGUAGE StandaloneDeriving #-}
|
||||
|
||||
-- | The missing MonadMask instance for Hedgehog.
|
||||
module Test.Util.Hedgehog
|
||||
() where
|
||||
|
||||
{-
|
||||
import Control.Exception.Safe (MonadMask (..))
|
||||
|
||||
import Hedgehog.Internal.Gen
|
||||
import Hedgehog.Internal.Property
|
||||
import Hedgehog.Internal.Tree
|
||||
|
||||
import qualified Hedgehog.Internal.Seed as Seed
|
||||
|
||||
|
||||
deriving newtype instance MonadMask m => MonadMask (TestT m)
|
||||
deriving newtype instance MonadMask m => MonadMask (PropertyT m)
|
||||
|
||||
instance MonadMask m => MonadMask (GenT m) where
|
||||
mask a = GenT $ \size seed ->
|
||||
mask $ \u -> runGenT size seed (a $ mapGenT u)
|
||||
|
||||
uninterruptibleMask a = GenT $ \size seed ->
|
||||
uninterruptibleMask $ \u -> runGenT size seed (a $ mapGenT u)
|
||||
|
||||
generalBracket acquire release use = GenT $ \size seed ->
|
||||
case Seed.split seed of
|
||||
(seed1, seed1') -> case Seed.split seed1' of
|
||||
(seed2, seed3) -> do
|
||||
generalBracket
|
||||
(runGenT size seed1 acquire)
|
||||
(\treeRes ec -> runGenT size seed2 (release treeRes ec))
|
||||
(\treeRes -> runGenT size seed3 (use treeRes))
|
||||
|
||||
instance MonadMask m => MonadMask (TreeT m) where
|
||||
mask a
|
||||
= TreeT $ mask $ \u -> runTreeT (a $ mapTreeT u)
|
||||
|
||||
uninterruptibleMask a
|
||||
= TreeT $ uninterruptibleMask $ \u -> runTreeT (a $ mapTreeT u)
|
||||
|
||||
generalBracket acquire release use = undefined
|
||||
-}
|
1
test/Tree.hs
Normal file
1
test/Tree.hs
Normal file
@ -0,0 +1 @@
|
||||
{-# OPTIONS_GHC -F -pgmF tasty-discover -optF --tree-display -optF --generated-module -optF Tree #-}
|
Loading…
Reference in New Issue
Block a user