diff --git a/core/src/Streamly/Internal/FileSystem/Path.hs b/core/src/Streamly/Internal/FileSystem/Path.hs index 4d06c0a78..0445267e7 100644 --- a/core/src/Streamly/Internal/FileSystem/Path.hs +++ b/core/src/Streamly/Internal/FileSystem/Path.hs @@ -26,6 +26,11 @@ -- The 'Path' type can be converted to and from 'OsPath' type at zero cost -- since the underlying representation of both is the same. +-- We should be able to manipulate windows paths on posix and posix paths on +-- windows as well. Therefore, we have WindowsPath and PosixPath types which +-- are supported on both platforms. However, the Path module aliases Path to +-- WindowsPath on Windows and PosixPath on Posix. +-- -- Conventions: A trailing separator on a path indicates that it is a directory. -- However, the absence of a trailing separator does not convey any -- information, it could either be a directory or a file. @@ -33,10 +38,17 @@ -- You may also find the 'str' quasiquoter from "Streamly.Unicode.String" to be -- useful in creating paths. -- +-- * https://en.wikipedia.org/wiki/Path_(computing) +-- * https://learn.microsoft.com/en-us/windows/win32/fileio/naming-a-file +-- * https://learn.microsoft.com/en-us/openspecs/windows_protocols/ms-dtyp/62e862f4-2a51-452e-8eeb-dc4ff5ee33cc +-- module Streamly.Internal.FileSystem.Path ( + -- * OS + OS (..) + -- * Path Types - Path (..) + , Path (..) , File , Dir , Abs @@ -88,21 +100,29 @@ module Streamly.Internal.FileSystem.Path -- , primarySeparator -- , isSeparator , append + , unsafeAppend , appendRel + , dropTrailingSeparators + , isRelativeRaw + , isAbsoluteRaw ) where +#include "assert.hs" + import Control.Exception (Exception) import Control.Monad.Catch (MonadThrow(..)) -import Data.Char (chr) +import Data.Char (ord, isAlpha) import Data.Functor.Identity (Identity(..)) import Data.Word (Word8) #if defined(mingw32_HOST_OS) || defined(__MINGW32__) import Data.Word (Word16) #endif +import GHC.Base (unsafeChr) import Language.Haskell.TH (Q, Exp) import Language.Haskell.TH.Quote (QuasiQuoter) -import Streamly.Internal.Data.Array (Array) +import Streamly.Internal.Data.Array (Array(..)) +import Streamly.Internal.Data.MutByteArray (Unbox) import Streamly.Internal.Data.Stream (Stream) import System.IO.Unsafe (unsafePerformIO) @@ -114,18 +134,19 @@ import qualified Streamly.Internal.Unicode.Stream as Unicode import Prelude hiding (abs) -#if defined(mingw32_HOST_OS) || defined(__MINGW32__) -#define WORD_TYPE Word16 -#define SEPARATOR 92 -#else -#define WORD_TYPE Word8 -#define SEPARATOR 47 -#endif - ------------------------------------------------------------------------------ -- Exceptions ------------------------------------------------------------------------------ +data OS = Windows | Posix + +currentOS :: OS +#if defined(mingw32_HOST_OS) || defined(__MINGW32__) +currentOS = Windows +#else +currentOS = Posix +#endif + -- | Exceptions thrown by path operations. data PathException = InvalidPath String @@ -137,6 +158,10 @@ data PathException = instance Exception PathException +------------------------------------------------------------------------------ +-- Types +------------------------------------------------------------------------------ + -- XXX Path must not contain null char on Posix. System calls consider the path -- as null terminated. -- XXX Maintain the Array with null termination because Unix system calls @@ -144,18 +169,31 @@ instance Exception PathException -- as paths. Implementation of path append will have to handle the null -- termination. Or we can choose to always copy the array when using it in -- system calls. - --- XXX The eq instance needs to make sure that the paths are equivalent. If we --- normalize the paths we can do a byte comparison. However, on windows paths --- are case insensitive but the case is preserved, therefore, we cannot --- normalize and need to do case insensitive comparison. - ------------------------------------------------------------------------------- --- Types ------------------------------------------------------------------------------- +-- ".." is not processed, is kept as is. +-- On Windows several characters other than null are not allowed but we do not +-- validate that yet when parsing a path. -- | A type representing file system paths for directories or files. -newtype Path = Path (Array WORD_TYPE) -- deriving Eq +-- +-- A Path is validated before construction unless unsafe constructors are used +-- to create it. Rules and invariants maintained by the safe construction +-- methods are as follows: +-- +-- * Does not contain a null character. +-- * Does not have a trailing separator except in the root path. +-- * Does not have a trailing @.@ component. +-- * Does not have consecutive separators except in UNC prefixes on Windows. +-- * Does not contain @/./@ path components (can be replaced with @/@) except +-- in a UNC prefix on windows. +-- +-- Note that in some cases the file system may perform unicode normalization on +-- paths (e.g. Apple HFS), it may cause surprising results as the path used by +-- the user may not have the same bytes as later returned by the file system. +#if defined(mingw32_HOST_OS) || defined(__MINGW32__) +newtype Path = Path (Array Word16) +#else +newtype Path = Path (Array Word8) +#endif -- Show instance prints raw bytes without any decoding for rountdtripping. -- Should we print this as a string instead, may be useful for ascii chars but @@ -166,6 +204,11 @@ instance Show Path where show (Path x) = show x -} +-- XXX The Eq instance needs to make sure that the paths are equivalent. If we +-- normalize the paths we can do a byte comparison. However, on windows paths +-- are case insensitive but the case is preserved, therefore, we cannot +-- normalize and need to do case insensitive comparison. + -- XXX Do we need a type for file or dir Name as names cannot have the -- separator char and there may be other restrictions on names? For example, -- length restriction. A file name cannot be "." or "..". We can use the types @@ -191,10 +234,13 @@ newtype Rel a = Rel a -- | A member of 'IsPath' knows how to convert to and from the 'Path' type. class IsPath a where - -- | Like 'fromPath' but does not check the properties of 'Path'. Provides - -- performance and simplicity when we know that the properties of the path - -- are already verified, for example, when we get the path from the file - -- system or the OS APIs. + -- | Like 'fromPath' but does not check the properties of 'Path'. The user + -- is responsible to maintain the invariants mentioned in the definition of + -- 'Path' type otherwise surprising behavior may result. + -- + -- Provides performance and simplicity when we know that the properties of + -- the path are already verified, for example, when we get the path from + -- the file system or the OS APIs. unsafeFromPath :: Path -> a -- | Convert a raw 'Path' to other forms of well-typed paths. It may fail @@ -270,11 +316,12 @@ adapt p = fromPath $ toPath p -- been using it consistently in streamly. We use "bytes" for a stream of -- bytes. --- | /Unsafe/: On Posix, a path cannot contain null characters. On Windows, the --- array passed must be a multiple of 2 bytes as the underlying representation --- uses 'Word16'. +-- | /Unsafe/: The user is responsible to maintain the invariants mentioned in +-- the definition of the 'Path' type. On Windows, the array passed must be a +-- multiple of 2 bytes as the underlying representation uses 'Word16'. {-# INLINE unsafeFromChunk #-} unsafeFromChunk :: Array Word8 -> Path +-- XXX add asserts to check safety unsafeFromChunk arr = Path (Array.castUnsafe arr) -- | On Posix it may fail if the byte array contains null characters. On @@ -300,6 +347,9 @@ toChunk (Path arr) = Array.asBytes arr -- | Encode a Unicode char stream to 'Path' using strict UTF-8 encoding on -- Posix. On Posix it may fail if the stream contains null characters. -- TBD: Use UTF16LE on Windows. +-- +-- Unicode normalization is not done. If normalization is needed the user can +-- normalize it and use the fromChunk API. fromChars :: MonadThrow m => Stream Identity Char -> m Path fromChars s = let n = runIdentity $ Stream.fold Fold.length s @@ -454,8 +504,29 @@ mkRelFile = undefined -- Operations ------------------------------------------------------------------------------ -separatorWord :: WORD_TYPE -separatorWord = SEPARATOR +posixSeparator :: Char +posixSeparator = '/' + +windowsSeparator :: Char +windowsSeparator = '\\' + +-- XXX We can use Enum type class to include the Char type as well so that the +-- functions can work on Array Word8/Word16/Char but that may be slow. + +-- | Unsafe, may tructate to shorter word types, can only be used safely for +-- characters that fit in the given word size. +charToWord :: Integral a => Char -> a +charToWord c = + let n = ord c + in assert (n <= 255) (fromIntegral n) + +-- | Unsafe, should be a valid character. +wordToChar :: Integral a => a -> Char +wordToChar = unsafeChr . fromIntegral + +-- | Index a word in an array and convert it to Char. +unsafeIndexChar :: (Unbox a, Integral a) => Int -> Array a -> Char +unsafeIndexChar i a = wordToChar (Array.getIndexUnsafe i a) -- Portable definition for exporting. @@ -463,60 +534,180 @@ separatorWord = SEPARATOR -- Windows supports @/@ too as a separator. Please use 'isSeparator' for -- testing if a char is a separator char. _primarySeparator :: Char -_primarySeparator = chr (SEPARATOR) +_primarySeparator = posixSeparator + +------------------------------------------------------------------------------ +-- Path parsing utilities +------------------------------------------------------------------------------ -- | On Posix only @/@ is a path separator but in windows it could be either -- @/@ or @\\@. -_isSeparator :: Char -> Bool -#if defined(mingw32_HOST_OS) || defined(__MINGW32__) -_isSeparator c = (c == '/') || (c == '\\') -#else -_isSeparator = (== '/') -#endif +{-# INLINE isSeparator #-} +isSeparator :: OS -> Char -> Bool +isSeparator Windows c = (c == windowsSeparator) || (c == posixSeparator) +isSeparator Posix c = (c == posixSeparator) --- If we append an absolute path it may fail with an error if the 'Path' --- implementation stores absolute path information (a leading separator char). --- However, the implementation may choose to store the path as a list of --- components in which case we cannot distinguish an absolute path from --- relative. +{-# INLINE isSeparatorWord #-} +isSeparatorWord :: Integral a => OS -> a -> Bool +isSeparatorWord os = isSeparator os . wordToChar + +countTrailingBy :: Unbox a => (a -> Bool) -> Array a -> Int +countTrailingBy p arr = + runIdentity + $ Stream.fold Fold.length + $ Stream.takeWhile p + $ Array.readRev arr + +-- | If the path is @//@ the result is @/@. If it is @a//@ then the result is +-- @a@. +dropTrailingBy :: Unbox a => (a -> Bool) -> Array a -> Array a +dropTrailingBy p arr@(Array barr start end) = + if end - start > 0 + then + let n = countTrailingBy p arr + in Array barr start (max 1 (end - n)) + else arr + +-- | If the path is @//@ the result is @/@. If it is @a//@ then the result is +-- @a@. +{-# INLINE dropTrailingSeparators #-} +dropTrailingSeparators :: OS -> Path -> Path +dropTrailingSeparators os (Path arr) = + Path (dropTrailingBy (isSeparator os . wordToChar) arr) + +-- | @C:...@ +hasDrive :: (Unbox a, Integral a) => Array a -> Bool +hasDrive a = + if Array.byteLength a < 2 + then False + -- Check colon first for quicker return + else if (unsafeIndexChar 1 a /= ':') + then False + -- XXX If we found a colon anyway this cannot be a valid path unless it has + -- a drive prefix. colon is not a valid path character. + -- XXX check isAlpha perf + else if not (isAlpha (unsafeIndexChar 0 a)) + then False -- XXX if we are here it is not a valid path + else True + +-- | On windows, the path starts with a separator. +isAbsoluteInDrive :: (Unbox a, Integral a) => Array a -> Bool +isAbsoluteInDrive a = + -- Assuming the path is not empty. + isSeparator Windows (wordToChar (Array.getIndexUnsafe 0 a)) + +-- | @C:\...@ +isAbsoluteDrive :: (Unbox a, Integral a) => Array a -> Bool +isAbsoluteDrive a = + if Array.byteLength a < 3 + then False + -- Check colon first for quicker return + else if (unsafeIndexChar 1 a /= ':') + then False + else if not (isSeparator Windows (unsafeIndexChar 2 a)) + then False + -- XXX If we found a colon anyway this cannot be a valid path unless it has + -- a drive prefix. colon is not a valid path character. + -- XXX check isAlpha perf + else if not (isAlpha (unsafeIndexChar 0 a)) + then False -- XXX if we are here it is not a valid path + else True + +-- | @\\\\...@ +isAbsoluteUNC :: (Unbox a, Integral a) => Array a -> Bool +isAbsoluteUNC a = + if Array.byteLength a < 2 + then False + else if (unsafeIndexChar 0 a /= '\\') + then False + else if (unsafeIndexChar 1 a /= '\\') + then False + else True + +-- | On Posix, a path starting with a separator is an absolute path. +-- +-- On Windows: +-- * @C:\\@ local absolute +-- * @C:@ local relative +-- * @\\@ local relative to current drive root +-- * @\\\\@ UNC network path +-- * @\\\\?\\C:\\@ Long UNC local path +-- * @\\\\?\\UNC\\@ Long UNC server path +-- * @\\\\.\\@ DOS local device namespace +-- * @\\\\??\\@ DOS global namespace +isAbsoluteRaw :: (Unbox a, Integral a) => OS -> Array a -> Bool +isAbsoluteRaw Posix a = + -- Assuming path is not empty. + isSeparator Posix (wordToChar (Array.getIndexUnsafe 0 a)) +isAbsoluteRaw Windows a = isAbsoluteDrive a || isAbsoluteUNC a + +isRelativeRaw :: (Unbox a, Integral a) => OS -> Array a -> Bool +isRelativeRaw os = not . isAbsoluteRaw os + +------------------------------------------------------------------------------ +-- Operations of Path +------------------------------------------------------------------------------ -- XXX This can be generalized to an Array intersperse operation --- | Like 'appendRel' but for the less restrictive 'Path' type, it always --- creates a syntactically valid 'Path' type but it may not be semantically --- valid because we may append an absolute path or we may append a path to a --- file path. The onus lies on the user to ensure that the first path is not a --- file and the second path is not absolute. +{-# INLINE doAppend #-} +doAppend :: (Unbox a, Integral a) => OS -> Array a -> Array a -> Array a +doAppend os a b = unsafePerformIO $ do + let lenA = Array.byteLength a + lenB = Array.byteLength b + assertM (lenA /= 0 && lenB /= 0) + assertM (countTrailingBy (isSeparatorWord os) a == 0) + let len = lenA + 1 + lenB + arr <- MutArray.new len + arr1 <- MutArray.spliceUnsafe arr (Array.unsafeThaw a) + arr2 <- MutArray.snocUnsafe arr1 (charToWord posixSeparator) + arr3 <- MutArray.spliceUnsafe arr2 (Array.unsafeThaw b) + return (Array.unsafeFreeze arr3) + +{-# INLINE withAppendCheck #-} +withAppendCheck :: OS -> Path -> a -> a +withAppendCheck Posix p2@(Path arr) f = + if isAbsoluteRaw Posix arr + then error $ "append: cannot append absolute path " ++ toString p2 + else f +withAppendCheck Windows p2@(Path arr) f = + if isAbsoluteInDrive arr + then error $ "append: cannot append drive absolute path " ++ toString p2 + else if hasDrive arr + then error $ "append: cannot append path with drive " ++ toString p2 + else if isAbsoluteUNC arr + then error $ "append: cannot append absolute UNC path " ++ toString p2 + else f + +-- | Does not check if any of the path is empty or if the second path is +-- absolute. +{-# INLINE unsafeAppendOS #-} +unsafeAppendOS :: OS -> Path -> Path -> Path +unsafeAppendOS os (Path a) p2@(Path b) = + assert (withAppendCheck os p2 True) (Path $ doAppend os a b) + +{-# INLINE unsafeAppend #-} +unsafeAppend :: Path -> Path -> Path +unsafeAppend = unsafeAppendOS currentOS + +{-# INLINE appendOS #-} +appendOS :: OS -> Path -> Path -> Path +appendOS os (Path a) p2@(Path b) = + withAppendCheck os p2 (Path $ doAppend os a b) + +-- | Append a 'Path' to another. Fails if the second path is absolute. -- -- Also see 'appendRel'. append :: Path -> Path -> Path -append (Path a) (Path b) = - let len = Array.byteLength a + 1 + Array.byteLength b - -- XXX Check the leading separator or drive identifier. However, - -- checking the drive letter may add an additional overhead (can it be - -- arbitrarily long?), if it is significant we may want to have a - -- separate combinePathChecked API for that. - -- - -- Also, do not add a separator char if the first path has a trailing - -- separator. - newArr = unsafePerformIO $ do - arr <- MutArray.new len - arr1 <- MutArray.spliceUnsafe arr (Array.unsafeThaw a) - arr2 <- MutArray.snocUnsafe arr1 separatorWord - arr3 <- MutArray.spliceUnsafe arr2 (Array.unsafeThaw b) - return (Array.unsafeFreeze arr3) - in Path newArr +append = appendOS currentOS -- The only safety we need for paths is: (1) The first path can only be a Dir -- type path, and (2) second path can only be a Rel path. --- Can this be coerced to create unsafe versions? --- | Extend a directory path by appending a relative path to it. This is the --- equivalent to the @@ operator from the @filepath@ package. +-- | Append a 'Rel' 'Path' to a 'Dir' 'Path'. Never fails. -- -- Also see 'append'. {-# INLINE appendRel #-} appendRel :: (IsPath (a (Dir Path)), IsPath b, IsPath (a b)) => (a (Dir Path)) -> Rel b -> a b -appendRel a (Rel b) = - unsafeFromPath $ append (toPath a) (toPath b) +appendRel a (Rel b) = unsafeFromPath $ unsafeAppend (toPath a) (toPath b)