Initial layout

This commit is contained in:
Alexander Vershilov 2020-03-10 01:07:58 +03:00
commit 630c1b6861
10 changed files with 356 additions and 0 deletions

5
.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
dist-newstyle/
tmp/
cabal.project.local
stack.yaml
package.yaml

5
CHANGELOG.md Normal file
View File

@ -0,0 +1,5 @@
# Revision history for ghc-timings
## 0.0.0.1 -- 2020-03-09
* Initial layout

101
Main.hs Normal file
View File

@ -0,0 +1,101 @@
{-# LANGUAGE TransformListComp #-}
module Main where
import Control.Monad
import Control.Monad.Trans.Resource
import qualified Data.ByteString.Lazy as BSL
import qualified Data.Binary.Builder as Builder
import Data.Conduit
import Data.Conduit.Combinators as CL
import Data.Conduit.List
import Data.Set as Set
import Data.Csv as Csv
import Data.Csv.Builder as Csv
import Data.Foldable
import Data.Functor
import Data.Function
import Data.Traversable
import qualified Data.Map as Map
import qualified Data.Text.Encoding as T
import qualified Data.Text.IO as T
import Data.Aeson
import Data.Either
import Data.List
import Data.Maybe as M
import GhcBuildPhase
import GhcFile
import GHC.Exts
import qualified Data.Vector as V
import System.Environment
import System.FilePath
import Prelude hiding (mapM_, print)
import qualified Prelude
-- | Entry point: takes a single CLI argument — the @dist-newstyle@ directory of
-- a cabal project — finds every @.dump-timings@ file under it, parses the
-- timings, and writes per-module JSON/CSV dumps plus one summary CSV per
-- package into 'output'.
main :: IO ()
main = do
  -- NOTE(review): partial pattern — dies with a pattern-match failure unless
  -- exactly one argument is given; a usage message would be friendlier.
  [dir] <- getArgs
  files <- findDumpTimings dir
  -- Split found files into those matching the cabal dist-newstyle layout
  -- (parsed into 'GhcFile') and those that do not (reported as failed).
  let ( files_failed,
        files_parsed)
        = partitionEithers $ files <&> \file ->
            case stripPrefix dir file of
              Nothing -> Left file
              -- Expected layout after the prefix:
              --   /build/<host-os>/<ghc-version>/<package>/build/<module path...>
              -- NOTE(review): the leading "/" component assumes 'dir' is passed
              -- without a trailing slash — confirm against typical invocation.
              Just x -> case splitDirectories x of
                ("/": "build": hostOs: ghcVersion: packageName: "build": modulePath) -> Right GhcFile{..}
                _ -> Left file
  unless (Prelude.null files_failed) $ do
    Prelude.putStrLn "Warning, some files are failed to be parsed"
    Prelude.print files_failed
  -- Output all files in json form for later analysis.
  results <- for files_parsed $ \f -> do
    steps <- fmap parsePhases $ T.readFile (rebuildFilePath dir f)
    -- One JSON and one CSV dump per module, named <host>--<ghc>--<pkg>--<module...>.
    encodeFile (output </> rebuildPlainPath f <.> "json") steps
    let bs = encodeDefaultOrderedByName steps
    BSL.writeFile (output </> rebuildPlainPath f <.> "csv") bs
    pure (f, steps)
  -- package name -> (module path -> phases); '(<>)' merges maps when a package
  -- appears several times (left-biased on duplicate module keys).
  let stats_by_package = Map.fromListWith (<>)
        [ (packageName, Map.singleton (joinPath modulePath) steps)
        | (GhcFile{..}, steps) <- results
        ]
  encodeFile (output </> "stats_by_package" <.> "json") stats_by_package
  -- Per-package summary CSV: one row per module, one column per phase name.
  for_ (Map.toList stats_by_package) $ \(package, stat) -> do
    -- Union of all phase names occurring in this package, deduplicated and sorted.
    let headers = Set.toList $ Set.fromList
          [ T.encodeUtf8 phaseName
          | (_, steps) <- Map.toList stat
          , Phase{..} <- steps
          ]
    let bs = Csv.encodeHeader (V.fromList ("module": "total": headers))
          <> mconcat
             [ Csv.encodeRecord
                 $ module_name
                 : show total
                 -- Empty cell when the module never ran the phase in this column.
                 : Prelude.map (\n -> maybe "" show $ Map.lookup n by_phase) headers
             | (module_name, steps) <- Map.toList stat
             , let total = Prelude.sum [phaseTime | Phase{..} <- steps]
             -- A phase may run several times for one module; times are summed.
             , let by_phase = Map.fromListWith (+)
                     [(T.encodeUtf8 phaseName, phaseTime)
                     | Phase{..} <- steps
                     ]
             -- TransformListComp: emit rows sorted by total time, slowest first.
             , then sortWith by (Down total)
             ]
    BSL.writeFile (output </> package <.> "csv")
      $ Builder.toLazyByteString bs
  -- Prelude.print byPackage
  where
    -- Report directory for all generated files.
    -- NOTE(review): nothing here creates it — presumably it must already exist.
    output = "./tmp"
-- | Find all files that are related to the dump timings.
--
-- Walks the directory tree (without following symlinks) and keeps every
-- path ending in @.dump-timings@.
--
-- XXX: this method is not effective enough as it eagerly builds a list of FilePath
findDumpTimings :: String -> IO [FilePath]
findDumpTimings input = do
  runResourceT $ runConduit $ sourceDirectoryDeep False input
    -- 'isSuffixOf' from Data.List replaces the hand-rolled double-reverse
    -- suffix check the original used.
    .| CL.filter (".dump-timings" `isSuffixOf`)
    .| consume

114
README.markdown Normal file
View File

@ -0,0 +1,114 @@
The idea of this tool is to get a report about program compilation, so
you will know where GHC spends time and what it does. This way you
can verify your ideas about how to make compilation faster without
touching GHC, and it leads to a better understanding of where GHC
spends time and whether that is reasonable or not. If improved, this tool can be
used as guidance for improving the codebase, and possibly GHC, to get
better timings without sacrificing functionality. At least I hope so.
For the author, this tool, even in its simplest form, made it possible to find
a way to speed up compilation by 20% just by reorganizing the code structure
in the project.
NOTE: this tool is in a very early stage, and I work on it in my
free time, basically during official holidays only. As a result I target
only my use case, so not all configurations are supported, but I will gladly
apply any merge requests that make life easier for other users.
## How to use.
1. Download the tool:
```haskell
hub clone https://github.com/qnikst/ghc-timing-report
```
2. Build it:
```bash
cabal v2-build
```
At this point I don't suggest you to install the tool because
at such an early stage it will likely require manual configuration
a lot.
3. Configure your project in order to generate timing files:
```bash
cabal v2-configure --ghc-options=-ddump-timings --ghc-options=-ddump-to-file
```
`-ddump-timings` tells GHC to generate timings report, `-ddump-to-file` tells GHC
to store those reports to files.
4. Running:
```bash
cabal v2-run ghc-timings /Users/qnikst/workspace/another-project/dist-newstyle
```
In the `tmp` folder you'll get all the reports.
# Report files.
(Note this section will likely always be outdated despite all efforts to keep it up to date,
sorry for that)
The reports that are generated keep a ton of -garbage- useful data. For each module in all the
projects you'll see files:
```
<host-os>--<ghc-version>--<package-version>--<module>.dump-timings.json
<host-os>--<ghc-version>--<package-version>--<module>.dump-timings.csv
```
That contains a table for each module with the name of the phase, the module, the number of allocations, and the time
spent on that phase.
And report files:
```
<package-version>.csv
```
That summarizes information for the package. The file keeps a list of modules, the total compilation time
for each module, and the total time for each phase. If you import the `<package-version>.csv` file into some
application that works with tables, like Numbers, you'll see something like this:
![screen1](https://github.com/qnikst/ghc-timing-report/screenshot1.png)
# Project ideology.
Here I want to share a bit of how I work on this project, as it's not a usual one for me.
I don't know where this project would lead, it has proved to be useful at least
for me from the very early development steps. But I don't know how to get good analysis
and I don't know that if data will be useful or not. So I use following guidance:
a. Write in a cheap and dirty way so you get your results as fast as possible; it allows
   me to actually see and verify whether I can make any use of the data. It is not worth
   spending much time on the design if you'll throw it away because it's not useful at all.
   But once code is proven to work and be useful, it is worth stabilizing it by proper refactoring.
   Also, this means that I sometimes do not use the most efficient and nice tricks, like the foldl
   package, unless I have a good understanding of their use, so I don't spend much time
   learning them. But I'd like to be pushed in directions that can improve the
   package pipeline in a cheap way.
b. Despite writing in a dirty way, it is worth keeping types that describe the problem at
   each step, to make refactoring simpler.
c. There are many tools that can be used for data visualization and analysis, so it is worth
   storing all intermediate data in a machine-readable format. This way it's possible to
   not be tied to Haskell. (Though I hope.)
# Notes.
1. It's a pity that we don't have memory residency statistics per module, because allocations
tell nothing and are basically just another "time" report. So for now I avoid working with
them in statistics reports.
2. I've tried to build a table where I output all the phases, but such a table became too
big, so "Numbers", which I was using for working with the data, refused to work with it. So I've
abandoned that idea unless I find a way to represent the data in a compact way.

2
Setup.hs Normal file
View File

@ -0,0 +1,2 @@
-- | Standard Cabal setup script for @build-type: Simple@.
import Distribution.Simple

-- Explicit signature added: top-level bindings should carry one
-- (flagged by -Wmissing-signatures under -Wall).
main :: IO ()
main = defaultMain

3
cabal.project Normal file
View File

@ -0,0 +1,3 @@
packages: .
optional-packages:
vendor/**/*.cabal

43
ghc-timings.cabal Normal file
View File

@ -0,0 +1,43 @@
cabal-version: >=1.10
name: ghc-timings-report
version: 0.1.0.0
synopsis: Get statistical report about how long files were compiled.
description: Simple package that can gather information about compilation
time for later analysis.
bug-reports: https://github.com/qnikst/ghc-timings-report
license: MIT
author: Alexander Vershilov
maintainer: alexander.vershilov@gmail.com
-- copyright:
-- category:
build-type: Simple
extra-source-files: CHANGELOG.md
executable ghc-timings
main-is: Main.hs
other-modules: GhcFile
GhcBuildPhase
hs-source-dirs: src
.
default-extensions: DerivingStrategies
DeriveGeneric
DeriveAnyClass
DerivingVia
OverloadedStrings
RecordWildCards
ViewPatterns
-- other-extensions:
build-depends: base >=4.13 && <4.14,
aeson,
binary,
bytestring,
cassava,
conduit >= 1.3,
containers,
resourcet,
filepath,
text,
text-show,
vector
ghc-options: -Wall -Werror
default-language: Haskell2010

BIN
screenshot1.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 293 KiB

47
src/GhcBuildPhase.hs Normal file
View File

@ -0,0 +1,47 @@
{-# LANGUAGE StrictData #-}
-- | Definition of the types used in the analysis.
module GhcBuildPhase
( Phase(..)
, parsePhases
) where
import Data.Aeson
import Data.Csv
import Data.Functor
import Data.Maybe
import qualified Data.Text as T
import GHC.Generics
import TextShow
import TextShow.Generic
-- | Build phase that is reported in the ghc timings output.
--
-- One value corresponds to one @name [Module]: alloc=INT time=DOUBLE@ line
-- of a @.dump-timings@ file (see 'parsePhases').
data Phase = Phase
  { phaseName :: T.Text   -- ^ Name of the compilation phase as printed by GHC.
  , phaseModule :: T.Text -- ^ Module the phase was run for (text between @[@ and @]@).
  , phaseAlloc :: Int     -- ^ Allocations reported for the phase (the @alloc=@ field).
  , phaseTime :: Double   -- ^ Time spent in the phase (the @time=@ field);
                          --   presumably milliseconds — TODO confirm against GHC output.
  }
  deriving stock (Generic)
  deriving anyclass (ToJSON, FromJSON)
  deriving anyclass (ToNamedRecord, DefaultOrdered)
  deriving TextShow via (FromGeneric Phase)
-- | Parse a @.dump-timings@ file and get the list of phases.
--
-- This is ad-hoc parsing procedure that doesn't do anything clever like parsers-combinators
-- regular expressions and stuff.
--
-- Assumes structure:
-- @name [Module]: alloc=INT time=DOUBLE@
--
-- Doesn't report errors: any line deviating from the assumed shape either
-- crashes ('fromJust'/'read' are partial) or hits the 'error' branch below.
parsePhases :: T.Text -> [Phase]
parsePhases input = T.lines input <&> parseStep where
  -- Split one line on the '[' .. ']' brackets, then on whitespace.
  parseStep x = case T.span (/='[') x of
    -- 'phaseName' keeps any trailing space before '['; 'T.drop 1' removes the '[' itself.
    (phaseName, T.drop 1 -> rest1) -> case T.span (/=']') rest1 of
      -- 'T.drop 2' skips the "]:" pair; 'T.words' absorbs the remaining spaces.
      (phaseModule, T.drop 2 -> rest2) -> case T.words rest2 of
        [allocs, time] ->
          let phaseAlloc = read $ T.unpack $ fromJust $ T.stripPrefix "alloc=" allocs -- !!! partial on malformed input
              phaseTime = read $ T.unpack $ fromJust $ T.stripPrefix "time=" time -- !!! partial on malformed input
          in Phase{..}
        _ -> error $ "illegal line: " <> T.unpack rest2

36
src/GhcFile.hs Normal file
View File

@ -0,0 +1,36 @@
{-# LANGUAGE StrictData #-}
module GhcFile
( GhcFile(..)
, rebuildFilePath
, rebuildPlainPath
) where
import Data.Aeson
import Data.List
import GHC.Generics (Generic)
import System.FilePath
-- | Representation of the file in the filesystem structure.
--
-- This file follows pattern used in cabal build and may differ for
-- other build systems. I don't care about those, but patches are welcome.
--
-- The fields mirror the dist-newstyle path components
-- @build/<host-os>/<ghc-version>/<package>/build/<module path...>@.
data GhcFile = GhcFile
  { hostOs :: String       -- ^ Host OS/arch path component (e.g. @x86_64-linux@) — TODO confirm
  , ghcVersion :: String   -- ^ Compiler path component (e.g. @ghc-8.8.3@) — TODO confirm
  , packageName :: String  -- ^ Package directory component (name with version)
  , modulePath :: [String] -- ^ Remaining components below the second @build@ directory
  }
  deriving (Show, Generic)
  deriving anyclass (ToJSON, FromJSON)
-- | Build path to the file in the file system based on prefix and 'GhcFile'.
--
-- Reconstructs the dist-newstyle location the file was originally found at:
-- @<base>/build/<host-os>/<ghc-version>/<package>/build/<module path...>@.
rebuildFilePath :: FilePath -> GhcFile -> FilePath
rebuildFilePath base GhcFile{..} =
  joinPath $ base : "build" : hostOs : ghcVersion : packageName : "build" : modulePath
-- | Convert 'GhcFile' into plain filename that we use in our report storage.
--
-- Every path component is glued with @--@, flattening the directory
-- hierarchy into a single file name.
rebuildPlainPath :: GhcFile -> FilePath
rebuildPlainPath GhcFile{..} =
  intercalate "--" (hostOs : ghcVersion : packageName : modulePath)