Tease apart file reading and transcoding

2024-12-24 23:42:31 +03:00 · 2017-04-19 09:12:19 -07:00 · 2017-04-19 09:12:19 -07:00 · eb9547fb25
commit eb9547fb25
parent 48c4edf362
7 changed files with 111 additions and 70 deletions
--- a/semantic-diff.cabal
+++ b/semantic-diff.cabal
@ -158,6 +158,7 @@ test-suite test
                     , RangeSpec
                     , SES.Myers.Spec
                     , SourceSpec
+                     , SpecHelpers
                     , TermSpec
                     , TOCSpec
                     , IntegrationSpec
@ -185,6 +186,7 @@ test-suite test
                     , regex-compat
                     , semantic-diff
                     , text >= 1.2.1.3
+                     , text-icu
                     , unordered-containers
                     , these
                     , vector
--- a/src/Command.hs
+++ b/src/Command.hs
@ -189,7 +189,7 @@ runReadFilesAtSHAs gitDir alternateObjectDirs paths shas = do
        toSourceKind (Git.SymlinkBlob mode) = Source.SymlinkBlob mode

 runParse :: Maybe Language -> SourceBlob -> IO (Term (Syntax Text) (Record DefaultFields))
-runParse = maybe lineByLineParser parserForLanguage
+runParse = parserForLanguage

 runDiff :: HasField fields Category => Both (Term (Syntax Text) (Record fields)) -> Diff (Syntax Text) (Record fields)
 runDiff terms = stripDiff (runBothWith diffTerms (fmap decorate terms))
--- a/src/Command/Parse.hs
+++ b/src/Command/Parse.hs
@ -1,5 +1,19 @@
 {-# LANGUAGE DataKinds, GADTs, GeneralizedNewtypeDeriving, ScopedTypeVariables, TypeFamilies, TypeOperators #-}
-module Command.Parse where
+module Command.Parse
+(
+-- Render parser trees
+  jsonParseTree
+, jsonIndexParseTree
+, sExpressionParseTree
+-- Parsers
+, parserForLanguage
+, parserForFilePath -- TODO: Remove. Langauge detection should not come from file extension.
+-- Files and transcoding
+, transcode
+, sourceBlobsFromPaths
+-- Git
+, sourceBlobsFromSha
+) where

 import Category
 import Data.Aeson (ToJSON, toJSON, encode, object, (.=))
@ -7,6 +21,7 @@ import Data.Aeson.Types (Pair)
 import Data.Functor.Foldable hiding (Nil)
 import Data.Record
 import Data.String
+import qualified Data.ByteString as B
 import qualified Data.Text as T
 import Git.Blob
 import Git.Libgit2
@ -30,6 +45,47 @@ import Text.Parser.TreeSitter.Go
 import Text.Parser.TreeSitter.JavaScript
 import Text.Parser.TreeSitter.Ruby
 import Text.Parser.TreeSitter.TypeScript
+import System.IO
+import Control.Exception (catch, IOException)
+import qualified Data.Text.ICU.Convert as Convert
+import qualified Data.Text.ICU.Detect as Detect
+
+-- TODO: This doesn't belong in here
+
+-- | Return a parser for a given langauge or the lineByLineParser parser.
+parserForLanguage :: Maybe Language -> Parser (Syntax Text) (Record DefaultFields)
+parserForLanguage Nothing = lineByLineParser
+parserForLanguage (Just language) = case language of
+  C -> treeSitterParser C tree_sitter_c
+  JavaScript -> treeSitterParser JavaScript tree_sitter_javascript
+  TypeScript -> treeSitterParser TypeScript tree_sitter_typescript
+  Markdown -> cmarkParser
+  Ruby -> treeSitterParser Ruby tree_sitter_ruby
+  Language.Go -> treeSitterParser Language.Go tree_sitter_go
+
+-- | Return a parser based on the file extension (including the ".").
+-- | TODO: Remove.
+parserForType :: String -> Parser (Syntax Text) (Record DefaultFields)
+parserForType = parserForLanguage . languageForType
+
+-- | Return the parser that should be used for a given path.
+-- | TODO: Remove.
+parserForFilePath :: FilePath -> Parser (Syntax Text) (Record DefaultFields)
+parserForFilePath = parserForType . toS . takeExtension
+
+-- | A fallback parser that treats a file simply as rows of strings.
+lineByLineParser :: Parser (Syntax Text) (Record DefaultFields)
+lineByLineParser SourceBlob{..} = pure . cofree . root $ case foldl' annotateLeaves ([], 0) lines of
+  (leaves, _) -> cofree <$> leaves
+  where
+    lines = actualLines source
+    root children = (sourceRange :. Program :. rangeToSourceSpan source sourceRange :. Nil) :< Indexed children
+    sourceRange = Source.totalRange source
+    leaf charIndex line = (Range charIndex (charIndex + T.length line) :. Program :. rangeToSourceSpan source (Range charIndex (charIndex + T.length line)) :. Nil) :< Leaf line
+    annotateLeaves (accum, charIndex) line =
+      (accum <> [ leaf charIndex (Source.toText line) ] , charIndex + Source.length line)
+
+

 data ParseTreeFile = ParseTreeFile { parseTreeFilePath :: FilePath, node :: Rose ParseNode } deriving (Show)

@ -67,9 +123,7 @@ parseNodeToJSONFields ParseNode{..} =
 sExpressionParseTree :: Bool -> [SourceBlob] -> IO ByteString
 sExpressionParseTree _ blobs =
  pure . printTerms TreeOnly =<< parse blobs
-  where parse = traverse (\sourceBlob@SourceBlob{..} -> parserForType (toS (takeExtension path)) sourceBlob)
-
-type RAlgebra t a = Base t (t, a) -> a
+  where parse = traverse (\sourceBlob@SourceBlob{..} -> parserForFilePath path sourceBlob)

 parseRoot :: Bool -> (FilePath -> f ParseNode -> root) -> (ParseNode -> [f ParseNode] -> f ParseNode) -> [SourceBlob] -> IO [root]
 parseRoot debug construct combine blobs = for blobs (\ sourceBlob@SourceBlob{..} -> do
@ -137,21 +191,7 @@ identifierFor = fmap toS . extractLeafValue . unwrap <=< maybeIdentifier

 -- | Return a parser incorporating the provided TermDecorator.
 parseWithDecorator :: TermDecorator (Syntax Text) DefaultFields field -> FilePath -> Parser (Syntax Text) (Record (field ': DefaultFields))
-parseWithDecorator decorator path blob = decorateTerm decorator <$> parserForType (toS (takeExtension path)) blob
-
-- | Return a parser based on the file extension (including the ".").
-parserForType :: String -> Parser (Syntax Text) (Record DefaultFields)
-parserForType mediaType = maybe lineByLineParser parserForLanguage (languageForType mediaType)
-
-- | Select a parser for a given Language.
-parserForLanguage :: Language -> Parser (Syntax Text) (Record DefaultFields)
-parserForLanguage language = case language of
-  C -> treeSitterParser C tree_sitter_c
-  JavaScript -> treeSitterParser JavaScript tree_sitter_javascript
-  TypeScript -> treeSitterParser TypeScript tree_sitter_typescript
-  Markdown -> cmarkParser
-  Ruby -> treeSitterParser Ruby tree_sitter_ruby
-  Language.Go -> treeSitterParser Language.Go tree_sitter_go
+parseWithDecorator decorator path blob = decorateTerm decorator <$> parserForFilePath path blob

 -- | Decorate a 'Term' using a function to compute the annotation values at every node.
 decorateTerm :: (Functor f) => TermDecorator f fields field -> Term f (Record fields) -> Term f (Record (field ': fields))
@ -167,24 +207,32 @@ termSourceTextDecorator source (ann :< _) = Just (SourceText (toText (Source.sli
 newtype Identifier = Identifier Text
  deriving (Eq, Show, ToJSON)

-identifierDecorator :: (HasField fields Category, StringConv leaf Text) => TermDecorator (Syntax leaf) fields (Maybe Identifier)
-identifierDecorator = fmap (Command.Parse.Identifier . toS) . extractLeafValue . unwrap <=< maybeIdentifier . tailF
+-- | Read the file and convert it to Unicode.
+readAndTranscodeFile :: FilePath -> IO Source
+readAndTranscodeFile path = do
+  size <- fileSize path
+  text <- case size of
+    0 -> pure B.empty
+    _ -> B.readFile path
+  transcode text

-- | A fallback parser that treats a file simply as rows of strings.
-lineByLineParser :: Parser (Syntax Text) (Record DefaultFields)
-lineByLineParser SourceBlob{..} = pure . cofree . root $ case foldl' annotateLeaves ([], 0) lines of
-  (leaves, _) -> cofree <$> leaves
-  where
-    lines = actualLines source
-    root children = (sourceRange :. Program :. rangeToSourceSpan source sourceRange :. Nil) :< Indexed children
-    sourceRange = Source.totalRange source
-    leaf charIndex line = (Range charIndex (charIndex + T.length line) :. Program :. rangeToSourceSpan source (Range charIndex (charIndex + T.length line)) :. Nil) :< Leaf line
-    annotateLeaves (accum, charIndex) line =
-      (accum <> [ leaf charIndex (Source.toText line) ] , charIndex + Source.length line)
+-- Based on https://github.com/haskell/bytestring/pull/79/files
+-- Neccessary to be able to handle /dev/null as a file.
+fileSize :: FilePath -> IO Integer
+fileSize f = withBinaryFile f ReadMode $ \h -> do
+  -- hFileSize fails if file is not regular file (like /dev/null). Catch
+  -- exception and return 0 in that case.
+  filesz <- catch (hFileSize h) useZeroIfNotRegularFile
+  pure $ fromIntegral filesz `max` 0
+  where useZeroIfNotRegularFile :: IOException -> IO Integer
+        useZeroIfNotRegularFile _ = pure 0

-- | Return the parser that should be used for a given path.
-parserForFilepath :: FilePath -> Parser (Syntax Text) (Record DefaultFields)
-parserForFilepath = parserForType . toS . takeExtension
+-- | Transcode a file to a unicode source.
+transcode :: B.ByteString -> IO Source
+transcode text = fromText <$> do
+  match <- Detect.detectCharset text
+  converter <- Convert.open match Nothing
+  pure $ Convert.toUnicode converter text


 data RoseF a b = RoseF a [b]
--- a/src/Source.hs
+++ b/src/Source.hs
@ -2,7 +2,7 @@
 {-# OPTIONS_GHC -funbox-strict-fields #-}
 module Source where

-import Prelude (FilePath, fromIntegral)
+import Prelude (FilePath)
 import Prologue
 import qualified Data.ByteString as B
 import qualified Data.Text as T
@ -10,10 +10,6 @@ import Numeric
 import Range
 import SourceSpan
 import Test.LeanCheck
-import System.IO
-import Control.Exception (catch, IOException)
-import qualified Data.Text.ICU.Convert as Convert
-import qualified Data.Text.ICU.Detect as Detect

 -- | The source, oid, path, and Maybe SourceKind of a blob in a Git repo.
 data SourceBlob = SourceBlob { source :: Source, oid :: T.Text, path :: FilePath, blobKind :: Maybe SourceKind }
@ -27,32 +23,6 @@ newtype Source = Source { sourceText :: B.ByteString }
 data SourceKind = PlainBlob Word32  | ExecutableBlob Word32 | SymlinkBlob Word32
  deriving (Show, Eq)

-- | Read the file and convert it to Unicode.
-readAndTranscodeFile :: FilePath -> IO Source
-readAndTranscodeFile path = do
-  size <- fileSize path
-  text <- case size of
-    0 -> pure B.empty
-    _ -> B.readFile path
-  transcode text
-
-- Based on https://github.com/haskell/bytestring/pull/79/files
-fileSize :: FilePath -> IO Integer
-fileSize f = withBinaryFile f ReadMode $ \h -> do
-  -- hFileSize fails if file is not regular file (like /dev/null). Catch
-  -- exception and return 0 in that case.
-  filesz <- catch (hFileSize h) useZeroIfNotRegularFile
-  pure $ fromIntegral filesz `max` 0
-  where useZeroIfNotRegularFile :: IOException -> IO Integer
-        useZeroIfNotRegularFile _ = pure 0
-
-- | Transcode a file to a unicode source.
-transcode :: B.ByteString -> IO Source
-transcode text = fromText <$> do
-  match <- Detect.detectCharset text
-  converter <- Convert.open match Nothing
-  pure $ Convert.toUnicode converter text
-
 modeToDigits :: SourceKind -> Text
 modeToDigits (PlainBlob mode) = toS $ showOct mode ""
 modeToDigits (ExecutableBlob mode) = toS $ showOct mode ""
--- a/test/IntegrationSpec.hs
+++ b/test/IntegrationSpec.hs
@ -18,6 +18,7 @@ import Source
 import Syntax
 import System.FilePath
 import System.FilePath.Glob
+import SpecHelpers
 import Test.Hspec (Spec, describe, it, SpecWith, runIO, parallel)
 import Test.Hspec.Expectations.Pretty

@ -109,9 +110,9 @@ normalizeName path = dropExtension $ dropExtension path

 testParse :: FilePath -> FilePath -> Expectation
 testParse path expectedOutput = do
-  source <- readAndTranscodeFile path
+  source <- readFileToUnicode path
  let blob = sourceBlob source path
-  term <- parserForType (toS (takeExtension path)) blob
+  term <- parserForFilePath path blob
  let actual = (Verbatim . stripWhitespace) $ printTerm term 0 TreeOnly
  expected <- (Verbatim . stripWhitespace) <$> B.readFile expectedOutput
  actual `shouldBe` expected
--- a/test/SpecHelpers.hs
+++ b/test/SpecHelpers.hs
@ -0,0 +1,19 @@
+module SpecHelpers where
+
+import Prologue
+import qualified Data.ByteString as B
+import qualified Data.Text.ICU.Convert as Convert
+import qualified Data.Text.ICU.Detect as Detect
+import Source
+
+
+-- | Read a file and convert it to Unicode.
+readFileToUnicode :: FilePath -> IO Source
+readFileToUnicode path = B.readFile path >>= transcode
+  where
+    -- | Transcode a file to a unicode source.
+    transcode :: B.ByteString -> IO Source
+    transcode text = fromText <$> do
+      match <- Detect.detectCharset text
+      converter <- Convert.open match Nothing
+      pure $ Convert.toUnicode converter text
--- a/test/TOCSpec.hs
+++ b/test/TOCSpec.hs
@ -19,6 +19,7 @@ import Renderer.TOC
 import Source
 import Syntax as S
 import Term
+import SpecHelpers
 import Test.Hspec (Spec, describe, it, parallel)
 import Test.Hspec.Expectations.Pretty
 import Test.Hspec.LeanCheck
@ -193,7 +194,7 @@ testDiff blobs = runCommand $ do

 blobsForPaths :: Both FilePath -> IO (Both SourceBlob)
 blobsForPaths paths = do
-  sources <- traverse (readAndTranscodeFile . ("test/fixtures/toc/" <>)) paths
+  sources <- traverse (readFileToUnicode . ("test/fixtures/toc/" <>)) paths
  pure $ SourceBlob <$> sources <*> pure mempty <*> paths <*> pure (Just Source.defaultPlainBlob)

 sourceSpanBetween :: (Int, Int) -> (Int, Int) -> SourceSpan