Check absolute path in append

Also add a few other utilities.
This commit is contained in:
Harendra Kumar 2024-01-13 19:41:58 +05:30
parent b1b66d040d
commit ec8ebdf601

View File

@ -26,6 +26,11 @@
-- The 'Path' type can be converted to and from 'OsPath' type at zero cost
-- since the underlying representation of both is the same.
-- We should be able to manipulate windows paths on posix and posix paths on
-- windows as well. Therefore, we have WindowsPath and PosixPath types which
-- are supported on both platforms. However, the Path module aliases Path to
-- WindowsPath on Windows and PosixPath on Posix.
--
-- Conventions: A trailing separator on a path indicates that it is a directory.
-- However, the absence of a trailing separator does not convey any
-- information, it could either be a directory or a file.
@ -33,10 +38,17 @@
-- You may also find the 'str' quasiquoter from "Streamly.Unicode.String" to be
-- useful in creating paths.
--
-- * https://en.wikipedia.org/wiki/Path_(computing)
-- * https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file
-- * https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-dtyp/62e862f4-2a51-452e-8eeb-dc4ff5ee33cc
--
module Streamly.Internal.FileSystem.Path
(
-- * OS
OS (..)
-- * Path Types
Path (..)
, Path (..)
, File
, Dir
, Abs
@ -88,21 +100,29 @@ module Streamly.Internal.FileSystem.Path
-- , primarySeparator
-- , isSeparator
, append
, unsafeAppend
, appendRel
, dropTrailingSeparators
, isRelativeRaw
, isAbsoluteRaw
)
where
#include "assert.hs"
import Control.Exception (Exception)
import Control.Monad.Catch (MonadThrow(..))
import Data.Char (chr)
import Data.Char (ord, isAlpha)
import Data.Functor.Identity (Identity(..))
import Data.Word (Word8)
#if defined(mingw32_HOST_OS) || defined(__MINGW32__)
import Data.Word (Word16)
#endif
import GHC.Base (unsafeChr)
import Language.Haskell.TH (Q, Exp)
import Language.Haskell.TH.Quote (QuasiQuoter)
import Streamly.Internal.Data.Array (Array)
import Streamly.Internal.Data.Array (Array(..))
import Streamly.Internal.Data.MutByteArray (Unbox)
import Streamly.Internal.Data.Stream (Stream)
import System.IO.Unsafe (unsafePerformIO)
@ -114,18 +134,19 @@ import qualified Streamly.Internal.Unicode.Stream as Unicode
import Prelude hiding (abs)
#if defined(mingw32_HOST_OS) || defined(__MINGW32__)
#define WORD_TYPE Word16
#define SEPARATOR 92
#else
#define WORD_TYPE Word8
#define SEPARATOR 47
#endif
------------------------------------------------------------------------------
-- Exceptions
------------------------------------------------------------------------------
data OS = Windows | Posix
currentOS :: OS
#if defined(mingw32_HOST_OS) || defined(__MINGW32__)
currentOS = Windows
#else
currentOS = Posix
#endif
-- | Exceptions thrown by path operations.
data PathException =
InvalidPath String
@ -137,6 +158,10 @@ data PathException =
instance Exception PathException
------------------------------------------------------------------------------
-- Types
------------------------------------------------------------------------------
-- XXX Path must not contain null char on Posix. System calls consider the path
-- as null terminated.
-- XXX Maintain the Array with null termination because Unix system calls
@ -144,18 +169,31 @@ instance Exception PathException
-- as paths. Implementation of path append will have to handle the null
-- termination. Or we can choose to always copy the array when using it in
-- system calls.
-- XXX The eq instance needs to make sure that the paths are equivalent. If we
-- normalize the paths we can do a byte comparison. However, on windows paths
-- are case insensitive but the case is preserved, therefore, we cannot
-- normalize and need to do case insensitive comparison.
------------------------------------------------------------------------------
-- Types
------------------------------------------------------------------------------
-- ".." is not processed, is kept as is.
-- On Windows several characters other than null are not allowed but we do not
-- validate that yet when parsing a path.
-- | A type representing file system paths for directories or files.
newtype Path = Path (Array WORD_TYPE) -- deriving Eq
--
-- A Path is validated before construction unless unsafe constructors are used
-- to create it. Rules and invariants maintained by the safe construction
-- methods are as follows:
--
-- * Does not contain a null character.
-- * Does not have a trailing separator except in the root path.
-- * Does not have a trailing @.@ component.
-- * Does not have consecutive separators except in UNC prefixes on Windows.
-- * Does not contain @/./@ path components (can be replaced with @/@) except
-- in a UNC prefix on windows.
--
-- Note that in some cases the file system may perform unicode normalization on
-- paths (e.g. Apple HFS), it may cause surprising results as the path used by
-- the user may not have the same bytes as later returned by the file system.
#if defined(mingw32_HOST_OS) || defined(__MINGW32__)
newtype Path = Path (Array Word16)
#else
newtype Path = Path (Array Word8)
#endif
-- Show instance prints raw bytes without any decoding for rountdtripping.
-- Should we print this as a string instead, may be useful for ascii chars but
@ -166,6 +204,11 @@ instance Show Path where
show (Path x) = show x
-}
-- XXX The Eq instance needs to make sure that the paths are equivalent. If we
-- normalize the paths we can do a byte comparison. However, on windows paths
-- are case insensitive but the case is preserved, therefore, we cannot
-- normalize and need to do case insensitive comparison.
-- XXX Do we need a type for file or dir Name as names cannot have the
-- separator char and there may be other restrictions on names? For example,
-- length restriction. A file name cannot be "." or "..". We can use the types
@ -191,10 +234,13 @@ newtype Rel a = Rel a
-- | A member of 'IsPath' knows how to convert to and from the 'Path' type.
class IsPath a where
-- | Like 'fromPath' but does not check the properties of 'Path'. Provides
-- performance and simplicity when we know that the properties of the path
-- are already verified, for example, when we get the path from the file
-- system or the OS APIs.
-- | Like 'fromPath' but does not check the properties of 'Path'. The user
-- is responsible to maintain the invariants mentioned in the definition of
-- 'Path' type otherwise surprising behavior may result.
--
-- Provides performance and simplicity when we know that the properties of
-- the path are already verified, for example, when we get the path from
-- the file system or the OS APIs.
unsafeFromPath :: Path -> a
-- | Convert a raw 'Path' to other forms of well-typed paths. It may fail
@ -270,11 +316,12 @@ adapt p = fromPath $ toPath p
-- been using it consistently in streamly. We use "bytes" for a stream of
-- bytes.
-- | /Unsafe/: On Posix, a path cannot contain null characters. On Windows, the
-- array passed must be a multiple of 2 bytes as the underlying representation
-- uses 'Word16'.
-- | /Unsafe/: The user is responsible to maintain the invariants mentioned in
-- the definition of the 'Path' type. On Windows, the array passed must be a
-- multiple of 2 bytes as the underlying representation uses 'Word16'.
{-# INLINE unsafeFromChunk #-}
unsafeFromChunk :: Array Word8 -> Path
-- XXX add asserts to check safety
unsafeFromChunk arr = Path (Array.castUnsafe arr)
-- | On Posix it may fail if the byte array contains null characters. On
@ -300,6 +347,9 @@ toChunk (Path arr) = Array.asBytes arr
-- | Encode a Unicode char stream to 'Path' using strict UTF-8 encoding on
-- Posix. On Posix it may fail if the stream contains null characters.
-- TBD: Use UTF16LE on Windows.
--
-- Unicode normalization is not done. If normalization is needed the user can
-- normalize it and use the fromChunk API.
fromChars :: MonadThrow m => Stream Identity Char -> m Path
fromChars s =
let n = runIdentity $ Stream.fold Fold.length s
@ -454,8 +504,29 @@ mkRelFile = undefined
-- Operations
------------------------------------------------------------------------------
separatorWord :: WORD_TYPE
separatorWord = SEPARATOR
posixSeparator :: Char
posixSeparator = '/'
windowsSeparator :: Char
windowsSeparator = '\\'
-- XXX We can use Enum type class to include the Char type as well so that the
-- functions can work on Array Word8/Word16/Char but that may be slow.
-- | Unsafe, may tructate to shorter word types, can only be used safely for
-- characters that fit in the given word size.
charToWord :: Integral a => Char -> a
charToWord c =
let n = ord c
in assert (n <= 255) (fromIntegral n)
-- | Unsafe, should be a valid character.
wordToChar :: Integral a => a -> Char
wordToChar = unsafeChr . fromIntegral
-- | Index a word in an array and convert it to Char.
unsafeIndexChar :: (Unbox a, Integral a) => Int -> Array a -> Char
unsafeIndexChar i a = wordToChar (Array.getIndexUnsafe i a)
-- Portable definition for exporting.
@ -463,60 +534,180 @@ separatorWord = SEPARATOR
-- Windows supports @/@ too as a separator. Please use 'isSeparator' for
-- testing if a char is a separator char.
_primarySeparator :: Char
_primarySeparator = chr (SEPARATOR)
_primarySeparator = posixSeparator
------------------------------------------------------------------------------
-- Path parsing utilities
------------------------------------------------------------------------------
-- | On Posix only @/@ is a path separator but in windows it could be either
-- @/@ or @\\@.
_isSeparator :: Char -> Bool
#if defined(mingw32_HOST_OS) || defined(__MINGW32__)
_isSeparator c = (c == '/') || (c == '\\')
#else
_isSeparator = (== '/')
#endif
{-# INLINE isSeparator #-}
isSeparator :: OS -> Char -> Bool
isSeparator Windows c = (c == windowsSeparator) || (c == posixSeparator)
isSeparator Posix c = (c == posixSeparator)
-- If we append an absolute path it may fail with an error if the 'Path'
-- implementation stores absolute path information (a leading separator char).
-- However, the implementation may choose to store the path as a list of
-- components in which case we cannot distinguish an absolute path from
-- relative.
{-# INLINE isSeparatorWord #-}
isSeparatorWord :: Integral a => OS -> a -> Bool
isSeparatorWord os = isSeparator os . wordToChar
countTrailingBy :: Unbox a => (a -> Bool) -> Array a -> Int
countTrailingBy p arr =
runIdentity
$ Stream.fold Fold.length
$ Stream.takeWhile p
$ Array.readRev arr
-- | If the path is @//@ the result is @/@. If it is @a//@ then the result is
-- @a@.
dropTrailingBy :: Unbox a => (a -> Bool) -> Array a -> Array a
dropTrailingBy p arr@(Array barr start end) =
if end - start > 0
then
let n = countTrailingBy p arr
in Array barr start (max 1 (end - n))
else arr
-- | If the path is @//@ the result is @/@. If it is @a//@ then the result is
-- @a@.
{-# INLINE dropTrailingSeparators #-}
dropTrailingSeparators :: OS -> Path -> Path
dropTrailingSeparators os (Path arr) =
Path (dropTrailingBy (isSeparator os . wordToChar) arr)
-- | @C:...@
hasDrive :: (Unbox a, Integral a) => Array a -> Bool
hasDrive a =
if Array.byteLength a < 2
then False
-- Check colon first for quicker return
else if (unsafeIndexChar 1 a /= ':')
then False
-- XXX If we found a colon anyway this cannot be a valid path unless it has
-- a drive prefix. colon is not a valid path character.
-- XXX check isAlpha perf
else if not (isAlpha (unsafeIndexChar 0 a))
then False -- XXX if we are here it is not a valid path
else True
-- | On windows, the path starts with a separator.
isAbsoluteInDrive :: (Unbox a, Integral a) => Array a -> Bool
isAbsoluteInDrive a =
-- Assuming the path is not empty.
isSeparator Windows (wordToChar (Array.getIndexUnsafe 0 a))
-- | @C:\...@
isAbsoluteDrive :: (Unbox a, Integral a) => Array a -> Bool
isAbsoluteDrive a =
if Array.byteLength a < 3
then False
-- Check colon first for quicker return
else if (unsafeIndexChar 1 a /= ':')
then False
else if not (isSeparator Windows (unsafeIndexChar 2 a))
then False
-- XXX If we found a colon anyway this cannot be a valid path unless it has
-- a drive prefix. colon is not a valid path character.
-- XXX check isAlpha perf
else if not (isAlpha (unsafeIndexChar 0 a))
then False -- XXX if we are here it is not a valid path
else True
-- | @\\\\...@
isAbsoluteUNC :: (Unbox a, Integral a) => Array a -> Bool
isAbsoluteUNC a =
if Array.byteLength a < 2
then False
else if (unsafeIndexChar 0 a /= '\\')
then False
else if (unsafeIndexChar 1 a /= '\\')
then False
else True
-- | On Posix, a path starting with a separator is an absolute path.
--
-- On Windows:
-- * @C:\\@ local absolute
-- * @C:@ local relative
-- * @\\@ local relative to current drive root
-- * @\\\\@ UNC network path
-- * @\\\\?\\C:\\@ Long UNC local path
-- * @\\\\?\\UNC\\@ Long UNC server path
-- * @\\\\.\\@ DOS local device namespace
-- * @\\\\??\\@ DOS global namespace
isAbsoluteRaw :: (Unbox a, Integral a) => OS -> Array a -> Bool
isAbsoluteRaw Posix a =
-- Assuming path is not empty.
isSeparator Posix (wordToChar (Array.getIndexUnsafe 0 a))
isAbsoluteRaw Windows a = isAbsoluteDrive a || isAbsoluteUNC a
isRelativeRaw :: (Unbox a, Integral a) => OS -> Array a -> Bool
isRelativeRaw os = not . isAbsoluteRaw os
------------------------------------------------------------------------------
-- Operations of Path
------------------------------------------------------------------------------
-- XXX This can be generalized to an Array intersperse operation
-- | Like 'appendRel' but for the less restrictive 'Path' type, it always
-- creates a syntactically valid 'Path' type but it may not be semantically
-- valid because we may append an absolute path or we may append a path to a
-- file path. The onus lies on the user to ensure that the first path is not a
-- file and the second path is not absolute.
{-# INLINE doAppend #-}
doAppend :: (Unbox a, Integral a) => OS -> Array a -> Array a -> Array a
doAppend os a b = unsafePerformIO $ do
let lenA = Array.byteLength a
lenB = Array.byteLength b
assertM (lenA /= 0 && lenB /= 0)
assertM (countTrailingBy (isSeparatorWord os) a == 0)
let len = lenA + 1 + lenB
arr <- MutArray.new len
arr1 <- MutArray.spliceUnsafe arr (Array.unsafeThaw a)
arr2 <- MutArray.snocUnsafe arr1 (charToWord posixSeparator)
arr3 <- MutArray.spliceUnsafe arr2 (Array.unsafeThaw b)
return (Array.unsafeFreeze arr3)
{-# INLINE withAppendCheck #-}
withAppendCheck :: OS -> Path -> a -> a
withAppendCheck Posix p2@(Path arr) f =
if isAbsoluteRaw Posix arr
then error $ "append: cannot append absolute path " ++ toString p2
else f
withAppendCheck Windows p2@(Path arr) f =
if isAbsoluteInDrive arr
then error $ "append: cannot append drive absolute path " ++ toString p2
else if hasDrive arr
then error $ "append: cannot append path with drive " ++ toString p2
else if isAbsoluteUNC arr
then error $ "append: cannot append absolute UNC path " ++ toString p2
else f
-- | Does not check if any of the path is empty or if the second path is
-- absolute.
{-# INLINE unsafeAppendOS #-}
unsafeAppendOS :: OS -> Path -> Path -> Path
unsafeAppendOS os (Path a) p2@(Path b) =
assert (withAppendCheck os p2 True) (Path $ doAppend os a b)
{-# INLINE unsafeAppend #-}
unsafeAppend :: Path -> Path -> Path
unsafeAppend = unsafeAppendOS currentOS
{-# INLINE appendOS #-}
appendOS :: OS -> Path -> Path -> Path
appendOS os (Path a) p2@(Path b) =
withAppendCheck os p2 (Path $ doAppend os a b)
-- | Append a 'Path' to another. Fails if the second path is absolute.
--
-- Also see 'appendRel'.
append :: Path -> Path -> Path
append (Path a) (Path b) =
let len = Array.byteLength a + 1 + Array.byteLength b
-- XXX Check the leading separator or drive identifier. However,
-- checking the drive letter may add an additional overhead (can it be
-- arbitrarily long?), if it is significant we may want to have a
-- separate combinePathChecked API for that.
--
-- Also, do not add a separator char if the first path has a trailing
-- separator.
newArr = unsafePerformIO $ do
arr <- MutArray.new len
arr1 <- MutArray.spliceUnsafe arr (Array.unsafeThaw a)
arr2 <- MutArray.snocUnsafe arr1 separatorWord
arr3 <- MutArray.spliceUnsafe arr2 (Array.unsafeThaw b)
return (Array.unsafeFreeze arr3)
in Path newArr
append = appendOS currentOS
-- The only safety we need for paths is: (1) The first path can only be a Dir
-- type path, and (2) second path can only be a Rel path.
-- Can this be coerced to create unsafe versions?
-- | Extend a directory path by appending a relative path to it. This is the
-- equivalent to the @</>@ operator from the @filepath@ package.
-- | Append a 'Rel' 'Path' to a 'Dir' 'Path'. Never fails.
--
-- Also see 'append'.
{-# INLINE appendRel #-}
appendRel :: (IsPath (a (Dir Path)), IsPath b, IsPath (a b)) =>
(a (Dir Path)) -> Rel b -> a b
appendRel a (Rel b) =
unsafeFromPath $ append (toPath a) (toPath b)
appendRel a (Rel b) = unsafeFromPath $ unsafeAppend (toPath a) (toPath b)