From 08407bbe4d5745f9819d58a4b9d87525c245d4d3 Mon Sep 17 00:00:00 2001 From: Kirill Elagin Date: Sat, 15 Feb 2020 19:15:52 -0500 Subject: [PATCH] First version --- CHANGELOG.md | 10 ++++ README.md | 60 ++++++++++++++++++++++ lib/Data/Text/IO/Utf8.hs | 35 +++++++++++++ lib/System/IO/Utf8.hs | 99 +++++++++++++++++++++++++++++++++++++ package.yaml | 25 ++++++++++ stack.yaml.lock | 12 +++++ test/Test.hs | 18 +++++++ test/Test/Utf8/ReadWrite.hs | 87 ++++++++++++++++++++++++++++++++ test/Test/Util/Hedgehog.hs | 53 ++++++++++++++++++++ test/Tree.hs | 1 + 10 files changed, 400 insertions(+) create mode 100644 lib/Data/Text/IO/Utf8.hs create mode 100644 lib/System/IO/Utf8.hs create mode 100644 stack.yaml.lock create mode 100644 test/Test.hs create mode 100644 test/Test/Utf8/ReadWrite.hs create mode 100644 test/Test/Util/Hedgehog.hs create mode 100644 test/Tree.hs diff --git a/CHANGELOG.md b/CHANGELOG.md index 7a21d82..52eb570 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,3 +7,13 @@ SPDX-License-Identifier: MPL-2.0 # Changelog ## Unreleased + +Initial release. + +### Added + +- `withUtf8StdHandles` +- `openFile` +- `withFile` +- `readFile` +- `writeFile` diff --git a/README.md b/README.md index f3cfe58..e874d77 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,66 @@ exists and an explanation of some of the opinionated decisions it is based on. ## Use +### Step 1: Get it + +The library is on [Hackage][hackage:utf8], just add it to the dependencies of +your project. + +[hackage:utf8]: https://hackage.haskell.org/package/utf8 + +### Step 2: Wrap your `main` + +Import `withUtf8StdHandles` from `System.IO.Utf8` and wrap it around your `main`: + +```haskell +main :: IO () +main = withUtf8StdHandles $ {- ... your main function ... -} +``` + +This will make sure that if your program reads something from `stdin` or +outputs something to `stdout`/`stderr`, it will not fail with a runtime +error due to encoding issues. + +### Step 3: Read files using UTF-8 + +If you are going to read a text file (to be precise, if you are going to open +a file in text mode), you’ll probably use `withFile`, `openFile`, or `readFile`. +Grab the first two from `System.IO.Utf8` or the latter from `Data.Text.IO.Utf8`. + +_Note: it is best to import these functions qualified._ + +_Note: there is no `System.IO.Utf8.readFile` because it’s 2020 and +you should not read `String`s from files._ + +All these functions will make sure that the content will be treated as if it +was encoded in UTF-8 (it is 2020, what else can it be encoded in?). + +If, for some reason, you really need to use `withFile`/`openFile` from `base`, +just call `hSetEncoding h`, where `h` is your handle and `hSetEncoding` comes +from `System.IO.Utf8` for your convenience: + +```haskell +import qualified System.IO as IO +import qualified System.IO.Utf8 as Utf8 + +doSomethingWithAFile :: IO () +doSomethingWithAFile = IO.withFile "file.txt" IO.ReadMode $ \h -> do + Utf8.hSetEncoding h + {- ... work with the file ... -} +``` + +### Step 4: Write files using UTF-8 + +When writing a file either open it using `withFile`/`openFile` from +`System.IO.Utf8` or write to it directly with `writeFile` from +`Data.Text.IO.Utf8`. + +_Note: it is best to import these functions qualified._ + +_Note that there is no `System.IO.Utf8.writeFile`._ + +If, for some reason, you really need to use `withFile`/`openFile` from `base`, +do the same as in the previous step. ## Contributing diff --git a/lib/Data/Text/IO/Utf8.hs b/lib/Data/Text/IO/Utf8.hs new file mode 100644 index 0000000..3ba6c01 --- /dev/null +++ b/lib/Data/Text/IO/Utf8.hs @@ -0,0 +1,35 @@ +{- SPDX-FileCopyrightText: 2020 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +-- | Data Text IO for the modern world. +-- +-- Wrappers around simple file reading/writing functions from the +-- @text@ package that reset the handle encoding to UTF-8. +module Data.Text.IO.Utf8 + ( readFile + , writeFile + ) where + +import Prelude hiding (readFile, writeFile) + +import Control.Exception.Safe (MonadMask) +import Control.Monad.IO.Class (MonadIO, liftIO) +import Data.Text (Text) + +import qualified Data.Text.IO as T +import qualified System.IO as IO + +import qualified System.IO.Utf8 as Utf8 + + +-- | Like @readFile@, but assumes the file is encoded in UTF-8, regardless +-- of the current locale. +readFile :: MonadIO m => IO.FilePath -> m Text +readFile path = Utf8.openFile path IO.ReadMode >>= liftIO . T.hGetContents + +-- | Like @writeFile@, but encodes the data in UTF-8, regardless +-- of the current locale. +writeFile :: (MonadIO m, MonadMask m) => IO.FilePath -> Text -> m () +writeFile path = Utf8.withFile path IO.WriteMode . (liftIO .) . flip T.hPutStr diff --git a/lib/System/IO/Utf8.hs b/lib/System/IO/Utf8.hs new file mode 100644 index 0000000..6aecc4d --- /dev/null +++ b/lib/System/IO/Utf8.hs @@ -0,0 +1,99 @@ +{- SPDX-FileCopyrightText: 2020 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE ViewPatterns #-} + +-- | System IO for the modern world. +-- +-- Standard IO functions assume that the character encoding of the data +-- they read or write is the same as the one used by current locale. In many +-- situtations this assumption is wrong, as tools work with files, and +-- the files nowadays are mostly UTF-8 encoded, regardless of the locale. +-- Therefore, it is almost always a good idea to switch the encoding of +-- file handles to UTF-8. +-- +-- The same applies to standard input, output, and error handles. However, +-- there is an edge-case: if they are attached to a terminal, and the +-- encoding is not UTF-8, using UTF-8 might actually be unsafe. +-- +-- Functions in this module help deal with all these issues. +module System.IO.Utf8 + ( withUtf8StdHandles + + , hSetEncoding + + , openFile + , withFile + ) where + +import Control.Exception.Safe (MonadMask, bracket) +import Control.Monad.IO.Class (MonadIO, liftIO) +import GHC.IO.Encoding (mkTextEncoding, textEncodingName, utf8) +import System.IO (stderr, stdin, stdout) + +import qualified System.IO as IO + + +type EncRestoreAction = IO.Handle -> IO () + +-- | Sets the best available UTF-8-compatible encoding for the handle. +-- Returns the action that will restore the previous one. +-- +-- If the handle is in binary mode, does nothing. +-- If the handle is not attached to a terminal, sets UTF-8. +-- Otherwise, keeps its current encoding, but augments it to transliterate +-- unsupported characters. +hSetBestUtf8Enc :: IO.Handle -> IO EncRestoreAction +hSetBestUtf8Enc h = IO.hGetEncoding h >>= \case + Nothing -> pure (\_ -> pure ()) + Just enc -> do + isTerm <- IO.hIsTerminalDevice h + enc' <- chooseBestEnc isTerm enc + IO.hSetEncoding h enc' + pure $ flip IO.hSetEncoding enc + where + chooseBestEnc False _ = pure utf8 + chooseBestEnc True enc@(textEncodingName -> name) + | '/' `notElem` name = mkTextEncoding (name ++ "//TRANSLIT") + | otherwise = pure enc + +-- | Configures the encodings of the three standard handles (stdin, stdout, stderr) +-- to work with UTF-8 encoded data and runs the specified IO action. +-- After the action finishes, restores the original encodings. +withUtf8StdHandles :: IO a -> IO a +withUtf8StdHandles action = + withConfiguredHandle stdin $ + withConfiguredHandle stdout $ + withConfiguredHandle stderr $ + action + where + withConfiguredHandle :: IO.Handle -> IO a -> IO a + withConfiguredHandle h = bracket (hSetBestUtf8Enc h) ($ h) . const + + +-- | A shortucut for setting handle encoding to UTF-8. +-- +-- @ +-- hSetEncoding h = System.IO.hSetEncoding h GHC.IO.Endoding.utf8 +-- @ +hSetEncoding :: MonadIO m => IO.Handle -> m () +hSetEncoding = liftIO . flip IO.hSetEncoding utf8 + + +-- | Like @openFile@, but sets the file encoding to UTF-8, regardless +-- of the current locale. +openFile :: MonadIO m => IO.FilePath -> IO.IOMode -> m IO.Handle +openFile path mode = do + h <- liftIO $ IO.openFile path mode + hSetEncoding h + pure h + +-- | Like @withFile@, but sets the file encoding to UTF-8, regardless +-- of the current locale. +withFile + :: (MonadIO m, MonadMask m) + => IO.FilePath -> IO.IOMode -> (IO.Handle -> m r) -> m r +withFile path mode = bracket (openFile path mode) (liftIO . IO.hClose) diff --git a/package.yaml b/package.yaml index 1cfebd4..0ca693c 100644 --- a/package.yaml +++ b/package.yaml @@ -28,3 +28,28 @@ library: dependencies: - base + - safe-exceptions # only really needed for polymorphic bracket + - text + +tests: + utf8-test: + source-dirs: test + main: Test.hs + + dependencies: + - utf8 + + - base + - deepseq + - safe-exceptions + - temporary + - text + + - hedgehog + - HUnit + - tasty + - tasty-hedgehog + - tasty-hunit + + build-tools: + - tasty-discover:tasty-discover diff --git a/stack.yaml.lock b/stack.yaml.lock new file mode 100644 index 0000000..8d605d9 --- /dev/null +++ b/stack.yaml.lock @@ -0,0 +1,12 @@ +# This file was autogenerated by Stack. +# You should not edit this file by hand. +# For more information, please see the documentation at: +# https://docs.haskellstack.org/en/stable/lock_files + +packages: [] +snapshots: +- completed: + size: 524163 + url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/lts/14/25.yaml + sha256: 97548b4e927a8a862d4a203687ca18a3f5d2ae75b4b72c946fb29c17df1a5082 + original: lts-14.25 diff --git a/test/Test.hs b/test/Test.hs new file mode 100644 index 0000000..62ed937 --- /dev/null +++ b/test/Test.hs @@ -0,0 +1,18 @@ +module Main + ( main + ) where + +import GHC.IO.Encoding (mkTextEncoding, setLocaleEncoding) +import Test.Tasty (defaultMain) + +import System.IO.Utf8 (withUtf8StdHandles) + +import Tree (tests) + +main :: IO () +main = withUtf8StdHandles $ do + -- The issue only shows up when current locale encoding is ASCII. + -- Realistically, very often when running this test this will not be + -- the case, so we unset locale encoding manually. + mkTextEncoding "ASCII" >>= setLocaleEncoding + tests >>= defaultMain diff --git a/test/Test/Utf8/ReadWrite.hs b/test/Test/Utf8/ReadWrite.hs new file mode 100644 index 0000000..2ce5bd4 --- /dev/null +++ b/test/Test/Utf8/ReadWrite.hs @@ -0,0 +1,87 @@ +{- SPDX-FileCopyrightText: 2020 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +{-# LANGUAGE LambdaCase #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ScopedTypeVariables #-} + +module Test.Utf8.ReadWrite where + +import Control.DeepSeq (force) +import Control.Exception (IOException, evaluate) +import Control.Exception.Safe (MonadMask, try) +import Control.Monad ((>=>)) +import Control.Monad.IO.Class (MonadIO, liftIO) +import Data.Text (Text) +import System.IO (FilePath) +import System.IO.Temp (withSystemTempFile) + +import Hedgehog (Property, (===), forAll, property) +import Test.HUnit (Assertion, assertFailure) + +import qualified Data.Text.IO as T +import qualified Data.Text.IO.Utf8 as Utf8 +import qualified System.IO as IO +import qualified System.IO.Utf8 as Utf8 + +import qualified Hedgehog.Gen as G +import qualified Hedgehog.Range as R + +-- Import MonadMask instance for Hedgehog Property +import Test.Util.Hedgehog () + +-- | Helper that writes Text to a temp file. +withTestFile :: (MonadIO m, MonadMask m) => Text -> (FilePath -> m r) -> m r +withTestFile t act = withSystemTempFile "utf8.txt" $ \fp h -> do + Utf8.hSetEncoding h + liftIO $ T.hPutStr h t + liftIO $ IO.hClose h + act fp + + +-- | Sanity check. +unit_standard_readFile_fails :: Assertion +unit_standard_readFile_fails = withTestFile str $ \fp -> + try (T.readFile fp) >>= \case + Right _ -> assertFailure "Standard `readFile` should fail." + Left (_ :: IOException) -> pure () + where + -- We use an escape here because otherwise _both_ tasty-discover + -- and hedgehog fail when they are reading this file for their + -- discovery or nice error reporting purposes... I know, right? + str = "doma\285e" + + +hprop_readFile :: Property +hprop_readFile = property $ do + str <- forAll $ G.text (R.linear 0 1000) G.unicode + str' <- liftIO $ withTestFile str (Utf8.readFile >=> evaluate . force) + str === str' + +hprop_writeFile :: Property +hprop_writeFile = property $ do + str <- forAll $ G.text (R.linear 0 1000) G.unicode + liftIO $ withTestFile str (flip Utf8.writeFile str) + +hprop_openFile :: Property +hprop_openFile = property $ do + str <- forAll $ G.text (R.linear 0 1000) G.unicode + str' <- liftIO $ withTestFile str $ \fp -> do + h <- Utf8.openFile fp IO.ReadMode + res <- T.hGetContents h + res' <- evaluate . force $ res + IO.hClose h + pure res' + str === str' + +hprop_withFile :: Property +hprop_withFile = property $ do + str <- forAll $ G.text (R.linear 0 1000) G.unicode + str' <- liftIO $ withTestFile str $ \fp -> + Utf8.withFile fp IO.ReadMode $ \h -> do + res <- T.hGetContents h + res' <- evaluate . force $ res + pure res' + str === str' diff --git a/test/Test/Util/Hedgehog.hs b/test/Test/Util/Hedgehog.hs new file mode 100644 index 0000000..11e3d35 --- /dev/null +++ b/test/Test/Util/Hedgehog.hs @@ -0,0 +1,53 @@ +{- SPDX-FileCopyrightText: 2020 Serokell + - + - SPDX-License-Identifier: MPL-2.0 + -} + +{-# OPTIONS_GHC -Wno-orphans #-} + +{-# LANGUAGE DerivingStrategies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{-# LANGUAGE StandaloneDeriving #-} + +-- | The missing MonadMask instance for Hedgehog. +module Test.Util.Hedgehog + () where + +{- +import Control.Exception.Safe (MonadMask (..)) + +import Hedgehog.Internal.Gen +import Hedgehog.Internal.Property +import Hedgehog.Internal.Tree + +import qualified Hedgehog.Internal.Seed as Seed + + +deriving newtype instance MonadMask m => MonadMask (TestT m) +deriving newtype instance MonadMask m => MonadMask (PropertyT m) + +instance MonadMask m => MonadMask (GenT m) where + mask a = GenT $ \size seed -> + mask $ \u -> runGenT size seed (a $ mapGenT u) + + uninterruptibleMask a = GenT $ \size seed -> + uninterruptibleMask $ \u -> runGenT size seed (a $ mapGenT u) + + generalBracket acquire release use = GenT $ \size seed -> + case Seed.split seed of + (seed1, seed1') -> case Seed.split seed1' of + (seed2, seed3) -> do + generalBracket + (runGenT size seed1 acquire) + (\treeRes ec -> runGenT size seed2 (release treeRes ec)) + (\treeRes -> runGenT size seed3 (use treeRes)) + +instance MonadMask m => MonadMask (TreeT m) where + mask a + = TreeT $ mask $ \u -> runTreeT (a $ mapTreeT u) + + uninterruptibleMask a + = TreeT $ uninterruptibleMask $ \u -> runTreeT (a $ mapTreeT u) + + generalBracket acquire release use = undefined +-} diff --git a/test/Tree.hs b/test/Tree.hs new file mode 100644 index 0000000..e70d993 --- /dev/null +++ b/test/Tree.hs @@ -0,0 +1 @@ +{-# OPTIONS_GHC -F -pgmF tasty-discover -optF --tree-display -optF --generated-module -optF Tree #-}