1
1
mirror of https://github.com/github/semantic.git synced 2024-12-24 23:42:31 +03:00

Tease apart file reading and transcoding

This commit is contained in:
Timothy Clem 2017-04-19 09:12:19 -07:00
parent 48c4edf362
commit eb9547fb25
7 changed files with 111 additions and 70 deletions

View File

@ -158,6 +158,7 @@ test-suite test
, RangeSpec
, SES.Myers.Spec
, SourceSpec
, SpecHelpers
, TermSpec
, TOCSpec
, IntegrationSpec
@ -185,6 +186,7 @@ test-suite test
, regex-compat
, semantic-diff
, text >= 1.2.1.3
, text-icu
, unordered-containers
, these
, vector

View File

@ -189,7 +189,7 @@ runReadFilesAtSHAs gitDir alternateObjectDirs paths shas = do
toSourceKind (Git.SymlinkBlob mode) = Source.SymlinkBlob mode
runParse :: Maybe Language -> SourceBlob -> IO (Term (Syntax Text) (Record DefaultFields))
runParse = maybe lineByLineParser parserForLanguage
runParse = parserForLanguage
runDiff :: HasField fields Category => Both (Term (Syntax Text) (Record fields)) -> Diff (Syntax Text) (Record fields)
runDiff terms = stripDiff (runBothWith diffTerms (fmap decorate terms))

View File

@ -1,5 +1,19 @@
{-# LANGUAGE DataKinds, GADTs, GeneralizedNewtypeDeriving, ScopedTypeVariables, TypeFamilies, TypeOperators #-}
module Command.Parse where
module Command.Parse
(
-- Render parser trees
jsonParseTree
, jsonIndexParseTree
, sExpressionParseTree
-- Parsers
, parserForLanguage
, parserForFilePath -- TODO: Remove. Langauge detection should not come from file extension.
-- Files and transcoding
, transcode
, sourceBlobsFromPaths
-- Git
, sourceBlobsFromSha
) where
import Category
import Data.Aeson (ToJSON, toJSON, encode, object, (.=))
@ -7,6 +21,7 @@ import Data.Aeson.Types (Pair)
import Data.Functor.Foldable hiding (Nil)
import Data.Record
import Data.String
import qualified Data.ByteString as B
import qualified Data.Text as T
import Git.Blob
import Git.Libgit2
@ -30,6 +45,47 @@ import Text.Parser.TreeSitter.Go
import Text.Parser.TreeSitter.JavaScript
import Text.Parser.TreeSitter.Ruby
import Text.Parser.TreeSitter.TypeScript
import System.IO
import Control.Exception (catch, IOException)
import qualified Data.Text.ICU.Convert as Convert
import qualified Data.Text.ICU.Detect as Detect
-- TODO: This doesn't belong in here
-- | Return a parser for a given langauge or the lineByLineParser parser.
parserForLanguage :: Maybe Language -> Parser (Syntax Text) (Record DefaultFields)
parserForLanguage Nothing = lineByLineParser
parserForLanguage (Just language) = case language of
C -> treeSitterParser C tree_sitter_c
JavaScript -> treeSitterParser JavaScript tree_sitter_javascript
TypeScript -> treeSitterParser TypeScript tree_sitter_typescript
Markdown -> cmarkParser
Ruby -> treeSitterParser Ruby tree_sitter_ruby
Language.Go -> treeSitterParser Language.Go tree_sitter_go
-- | Return a parser based on the file extension (including the ".").
-- | TODO: Remove.
parserForType :: String -> Parser (Syntax Text) (Record DefaultFields)
parserForType = parserForLanguage . languageForType
-- | Return the parser that should be used for a given path.
-- | TODO: Remove.
parserForFilePath :: FilePath -> Parser (Syntax Text) (Record DefaultFields)
parserForFilePath = parserForType . toS . takeExtension
-- | A fallback parser that treats a file simply as rows of strings.
lineByLineParser :: Parser (Syntax Text) (Record DefaultFields)
lineByLineParser SourceBlob{..} = pure . cofree . root $ case foldl' annotateLeaves ([], 0) lines of
(leaves, _) -> cofree <$> leaves
where
lines = actualLines source
root children = (sourceRange :. Program :. rangeToSourceSpan source sourceRange :. Nil) :< Indexed children
sourceRange = Source.totalRange source
leaf charIndex line = (Range charIndex (charIndex + T.length line) :. Program :. rangeToSourceSpan source (Range charIndex (charIndex + T.length line)) :. Nil) :< Leaf line
annotateLeaves (accum, charIndex) line =
(accum <> [ leaf charIndex (Source.toText line) ] , charIndex + Source.length line)
data ParseTreeFile = ParseTreeFile { parseTreeFilePath :: FilePath, node :: Rose ParseNode } deriving (Show)
@ -67,9 +123,7 @@ parseNodeToJSONFields ParseNode{..} =
sExpressionParseTree :: Bool -> [SourceBlob] -> IO ByteString
sExpressionParseTree _ blobs =
pure . printTerms TreeOnly =<< parse blobs
where parse = traverse (\sourceBlob@SourceBlob{..} -> parserForType (toS (takeExtension path)) sourceBlob)
type RAlgebra t a = Base t (t, a) -> a
where parse = traverse (\sourceBlob@SourceBlob{..} -> parserForFilePath path sourceBlob)
parseRoot :: Bool -> (FilePath -> f ParseNode -> root) -> (ParseNode -> [f ParseNode] -> f ParseNode) -> [SourceBlob] -> IO [root]
parseRoot debug construct combine blobs = for blobs (\ sourceBlob@SourceBlob{..} -> do
@ -137,21 +191,7 @@ identifierFor = fmap toS . extractLeafValue . unwrap <=< maybeIdentifier
-- | Return a parser incorporating the provided TermDecorator.
parseWithDecorator :: TermDecorator (Syntax Text) DefaultFields field -> FilePath -> Parser (Syntax Text) (Record (field ': DefaultFields))
parseWithDecorator decorator path blob = decorateTerm decorator <$> parserForType (toS (takeExtension path)) blob
-- | Return a parser based on the file extension (including the ".").
parserForType :: String -> Parser (Syntax Text) (Record DefaultFields)
parserForType mediaType = maybe lineByLineParser parserForLanguage (languageForType mediaType)
-- | Select a parser for a given Language.
parserForLanguage :: Language -> Parser (Syntax Text) (Record DefaultFields)
parserForLanguage language = case language of
C -> treeSitterParser C tree_sitter_c
JavaScript -> treeSitterParser JavaScript tree_sitter_javascript
TypeScript -> treeSitterParser TypeScript tree_sitter_typescript
Markdown -> cmarkParser
Ruby -> treeSitterParser Ruby tree_sitter_ruby
Language.Go -> treeSitterParser Language.Go tree_sitter_go
parseWithDecorator decorator path blob = decorateTerm decorator <$> parserForFilePath path blob
-- | Decorate a 'Term' using a function to compute the annotation values at every node.
decorateTerm :: (Functor f) => TermDecorator f fields field -> Term f (Record fields) -> Term f (Record (field ': fields))
@ -167,24 +207,32 @@ termSourceTextDecorator source (ann :< _) = Just (SourceText (toText (Source.sli
newtype Identifier = Identifier Text
deriving (Eq, Show, ToJSON)
identifierDecorator :: (HasField fields Category, StringConv leaf Text) => TermDecorator (Syntax leaf) fields (Maybe Identifier)
identifierDecorator = fmap (Command.Parse.Identifier . toS) . extractLeafValue . unwrap <=< maybeIdentifier . tailF
-- | Read the file and convert it to Unicode.
readAndTranscodeFile :: FilePath -> IO Source
readAndTranscodeFile path = do
size <- fileSize path
text <- case size of
0 -> pure B.empty
_ -> B.readFile path
transcode text
-- | A fallback parser that treats a file simply as rows of strings.
lineByLineParser :: Parser (Syntax Text) (Record DefaultFields)
lineByLineParser SourceBlob{..} = pure . cofree . root $ case foldl' annotateLeaves ([], 0) lines of
(leaves, _) -> cofree <$> leaves
where
lines = actualLines source
root children = (sourceRange :. Program :. rangeToSourceSpan source sourceRange :. Nil) :< Indexed children
sourceRange = Source.totalRange source
leaf charIndex line = (Range charIndex (charIndex + T.length line) :. Program :. rangeToSourceSpan source (Range charIndex (charIndex + T.length line)) :. Nil) :< Leaf line
annotateLeaves (accum, charIndex) line =
(accum <> [ leaf charIndex (Source.toText line) ] , charIndex + Source.length line)
-- Based on https://github.com/haskell/bytestring/pull/79/files
-- Neccessary to be able to handle /dev/null as a file.
fileSize :: FilePath -> IO Integer
fileSize f = withBinaryFile f ReadMode $ \h -> do
-- hFileSize fails if file is not regular file (like /dev/null). Catch
-- exception and return 0 in that case.
filesz <- catch (hFileSize h) useZeroIfNotRegularFile
pure $ fromIntegral filesz `max` 0
where useZeroIfNotRegularFile :: IOException -> IO Integer
useZeroIfNotRegularFile _ = pure 0
-- | Return the parser that should be used for a given path.
parserForFilepath :: FilePath -> Parser (Syntax Text) (Record DefaultFields)
parserForFilepath = parserForType . toS . takeExtension
-- | Transcode a file to a unicode source.
transcode :: B.ByteString -> IO Source
transcode text = fromText <$> do
match <- Detect.detectCharset text
converter <- Convert.open match Nothing
pure $ Convert.toUnicode converter text
data RoseF a b = RoseF a [b]

View File

@ -2,7 +2,7 @@
{-# OPTIONS_GHC -funbox-strict-fields #-}
module Source where
import Prelude (FilePath, fromIntegral)
import Prelude (FilePath)
import Prologue
import qualified Data.ByteString as B
import qualified Data.Text as T
@ -10,10 +10,6 @@ import Numeric
import Range
import SourceSpan
import Test.LeanCheck
import System.IO
import Control.Exception (catch, IOException)
import qualified Data.Text.ICU.Convert as Convert
import qualified Data.Text.ICU.Detect as Detect
-- | The source, oid, path, and Maybe SourceKind of a blob in a Git repo.
data SourceBlob = SourceBlob { source :: Source, oid :: T.Text, path :: FilePath, blobKind :: Maybe SourceKind }
@ -27,32 +23,6 @@ newtype Source = Source { sourceText :: B.ByteString }
data SourceKind = PlainBlob Word32 | ExecutableBlob Word32 | SymlinkBlob Word32
deriving (Show, Eq)
-- | Read the file and convert it to Unicode.
readAndTranscodeFile :: FilePath -> IO Source
readAndTranscodeFile path = do
size <- fileSize path
text <- case size of
0 -> pure B.empty
_ -> B.readFile path
transcode text
-- Based on https://github.com/haskell/bytestring/pull/79/files
fileSize :: FilePath -> IO Integer
fileSize f = withBinaryFile f ReadMode $ \h -> do
-- hFileSize fails if file is not regular file (like /dev/null). Catch
-- exception and return 0 in that case.
filesz <- catch (hFileSize h) useZeroIfNotRegularFile
pure $ fromIntegral filesz `max` 0
where useZeroIfNotRegularFile :: IOException -> IO Integer
useZeroIfNotRegularFile _ = pure 0
-- | Transcode a file to a unicode source.
transcode :: B.ByteString -> IO Source
transcode text = fromText <$> do
match <- Detect.detectCharset text
converter <- Convert.open match Nothing
pure $ Convert.toUnicode converter text
modeToDigits :: SourceKind -> Text
modeToDigits (PlainBlob mode) = toS $ showOct mode ""
modeToDigits (ExecutableBlob mode) = toS $ showOct mode ""

View File

@ -18,6 +18,7 @@ import Source
import Syntax
import System.FilePath
import System.FilePath.Glob
import SpecHelpers
import Test.Hspec (Spec, describe, it, SpecWith, runIO, parallel)
import Test.Hspec.Expectations.Pretty
@ -109,9 +110,9 @@ normalizeName path = dropExtension $ dropExtension path
testParse :: FilePath -> FilePath -> Expectation
testParse path expectedOutput = do
source <- readAndTranscodeFile path
source <- readFileToUnicode path
let blob = sourceBlob source path
term <- parserForType (toS (takeExtension path)) blob
term <- parserForFilePath path blob
let actual = (Verbatim . stripWhitespace) $ printTerm term 0 TreeOnly
expected <- (Verbatim . stripWhitespace) <$> B.readFile expectedOutput
actual `shouldBe` expected

19
test/SpecHelpers.hs Normal file
View File

@ -0,0 +1,19 @@
module SpecHelpers where
import Prologue
import qualified Data.ByteString as B
import qualified Data.Text.ICU.Convert as Convert
import qualified Data.Text.ICU.Detect as Detect
import Source
-- | Read a file and convert it to Unicode.
readFileToUnicode :: FilePath -> IO Source
readFileToUnicode path = B.readFile path >>= transcode
where
-- | Transcode a file to a unicode source.
transcode :: B.ByteString -> IO Source
transcode text = fromText <$> do
match <- Detect.detectCharset text
converter <- Convert.open match Nothing
pure $ Convert.toUnicode converter text

View File

@ -19,6 +19,7 @@ import Renderer.TOC
import Source
import Syntax as S
import Term
import SpecHelpers
import Test.Hspec (Spec, describe, it, parallel)
import Test.Hspec.Expectations.Pretty
import Test.Hspec.LeanCheck
@ -193,7 +194,7 @@ testDiff blobs = runCommand $ do
blobsForPaths :: Both FilePath -> IO (Both SourceBlob)
blobsForPaths paths = do
sources <- traverse (readAndTranscodeFile . ("test/fixtures/toc/" <>)) paths
sources <- traverse (readFileToUnicode . ("test/fixtures/toc/" <>)) paths
pure $ SourceBlob <$> sources <*> pure mempty <*> paths <*> pure (Just Source.defaultPlainBlob)
sourceSpanBetween :: (Int, Int) -> (Int, Int) -> SourceSpan