├── .ghci ├── .gitattributes ├── .github └── workflows │ └── haskell-ci.yml ├── .gitignore ├── .hlint.yaml ├── .projectile ├── ChangeLog.md ├── LICENSE ├── app ├── bench.hs └── explore.hs ├── cabal.project ├── other ├── ExampleSum-1000-1000-MeasureTime-StatAverage.perf ├── RunExample-ExampleNoOp-1000-1000-MeasureTime-StatAverage.perf ├── RunExampleIO-MeasureTime-StatAverage.perf ├── RunExamples-1000-1000-MeasureTime-StatAverage.perf ├── perf.svg ├── perf20000.svg └── perffuse.svg ├── perf.cabal ├── perf.org ├── readme.md └── src ├── Perf.hs └── Perf ├── Algos.hs ├── BigO.hs ├── Chart.hs ├── Count.hs ├── Measure.hs ├── Report.hs ├── Space.hs ├── Stats.hs ├── Time.hs └── Types.hs /.ghci: -------------------------------------------------------------------------------- 1 | :set -XOverloadedStrings 2 | :set -Wno-type-defaults 3 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | other/* linguist-documentation 2 | index.html linguist-documentation 3 | -------------------------------------------------------------------------------- /.github/workflows/haskell-ci.yml: -------------------------------------------------------------------------------- 1 | name: build 2 | on: [push] 3 | 4 | # INFO: The following configuration block ensures that only one build runs per branch, 5 | # which may be desirable for projects with a costly build process. 6 | # Remove this block from the CI workflow to let each CI job run to completion. 7 | concurrency: 8 | group: build-${{ github.ref }} 9 | cancel-in-progress: true 10 | 11 | jobs: 12 | hlint: 13 | runs-on: ubuntu-latest 14 | steps: 15 | - uses: actions/checkout@v4 16 | - uses: haskell-actions/hlint-setup@v2 17 | - uses: haskell-actions/hlint-run@v2 18 | with: 19 | path: . 20 | fail-on: warning 21 | ormolu: 22 | runs-on: ubuntu-latest 23 | steps: 24 | - uses: actions/checkout@v4 25 | - uses: haskell-actions/run-ormolu@v16 26 | build: 27 | name: GHC ${{ matrix.ghc-version }} on ${{ matrix.os }} 28 | runs-on: ${{ matrix.os }} 29 | strategy: 30 | fail-fast: false 31 | matrix: 32 | os: [ubuntu-latest] 33 | ghc-version: ['9.10', '9.8', '9.6'] 34 | 35 | include: 36 | - os: windows-latest 37 | ghc-version: '9.8' 38 | - os: macos-latest 39 | ghc-version: '9.8' 40 | 41 | steps: 42 | - uses: actions/checkout@v4 43 | 44 | - name: Set up GHC ${{ matrix.ghc-version }} 45 | uses: haskell-actions/setup@v2 46 | id: setup 47 | with: 48 | ghc-version: ${{ matrix.ghc-version }} 49 | 50 | - name: Configure the build 51 | run: | 52 | cabal configure --enable-tests --enable-benchmarks --disable-documentation 53 | cabal build --dry-run 54 | # The last step generates dist-newstyle/cache/plan.json for the cache key. 55 | 56 | - name: Restore cached dependencies 57 | uses: actions/cache/restore@v4 58 | id: cache 59 | env: 60 | key: ${{ runner.os }}-ghc-${{ steps.setup.outputs.ghc-version }}-cabal-${{ steps.setup.outputs.cabal-version }} 61 | with: 62 | path: ${{ steps.setup.outputs.cabal-store }} 63 | key: ${{ env.key }}-plan-${{ hashFiles('**/plan.json') }} 64 | restore-keys: ${{ env.key }}- 65 | 66 | - name: Install dependencies 67 | # If we had an exact cache hit, the dependencies will be up to date. 68 | if: steps.cache.outputs.cache-hit != 'true' 69 | run: cabal build all --only-dependencies 70 | 71 | # Cache dependencies already here, so that we do not have to rebuild them should the subsequent steps fail. 72 | - name: Save cached dependencies 73 | uses: actions/cache/save@v4 74 | # If we had an exact cache hit, trying to save the cache would error because of key clash. 75 | if: steps.cache.outputs.cache-hit != 'true' 76 | with: 77 | path: ${{ steps.setup.outputs.cabal-store }} 78 | key: ${{ steps.cache.outputs.cache-primary-key }} 79 | 80 | - name: Build 81 | run: cabal build all 82 | 83 | - name: Check cabal file 84 | run: cabal check 85 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /.stack-work/ 2 | .DS_Store 3 | /dist-newstyle/ 4 | /stack.yaml.lock 5 | /.hie/ 6 | /cabal.project.local* 7 | /.ghc.environment.* 8 | /.hkgr/ 9 | /checklist.org 10 | /other/*.perf 11 | -------------------------------------------------------------------------------- /.hlint.yaml: -------------------------------------------------------------------------------- 1 | - ignore: {name: Reduce duplication, within: Perf.Measure} 2 | - ignore: {name: Redundant Lambda} 3 | - ignore: {name: Eta reduce} 4 | - ignore: {name: Use if} 5 | -------------------------------------------------------------------------------- /.projectile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/tonyday567/perf/5f1cd67117f43511dd27064a933f2c10b69a96ce/.projectile -------------------------------------------------------------------------------- /ChangeLog.md: -------------------------------------------------------------------------------- 1 | 0.14 2 | === 3 | * Added charting 4 | * Added Perf.Chart 5 | * refactored reportMain 6 | * expanded ReportOptions 7 | * added more algorithms to Example 8 | * refactored Perf.BigO 9 | * added PerfDumpOptions 10 | * reportOrg2D ==> report2D 11 | * added reportBigO 12 | * added reportTasty' 13 | * added reportToConsole & parseClock 14 | * refactored SpaceStats 15 | * added tickIOWith, timesN & timesNWith 16 | * removed measureM from Measure 17 | * added multiN 18 | 19 | 20 | 0.13 21 | === 22 | * replaced rdtsc with clocks library. 23 | 24 | 0.12.0.0 25 | === 26 | * added reportMain and support for executable development. 27 | * refactored app/perf-explore 28 | * created app/perf-bench and aded as a `cabal bench` thing. 29 | 30 | 0.11.0.0 31 | === 32 | * added Perf.Count 33 | * GHC 9.6.2 support 34 | 35 | 0.10.0 36 | === 37 | * GHC 9.2.1 support 38 | * inclusion of BigO 39 | 40 | 0.9.0 41 | === 42 | * Removed deepseq dependency 43 | 44 | 0.8.0 45 | === 46 | * GHC 9.0.1 support 47 | * internal fixes to remove numhask dependency 48 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2018, Tony Day 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of Tony Day nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /app/bench.hs: -------------------------------------------------------------------------------- 1 | -- | Sum example, measured using default settings. 2 | module Main where 3 | 4 | import Data.List qualified as List 5 | import Perf 6 | import Prelude 7 | 8 | main :: IO () 9 | main = do 10 | let l = 1000 11 | let a = ExampleSum 12 | reportMain a defaultReportOptions (List.intercalate "-" [show a, show @Int l]) (testExample . examplePattern a) 13 | -------------------------------------------------------------------------------- /app/explore.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedLabels #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | -- | basic measurement and callibration 5 | module Main where 6 | 7 | import Control.DeepSeq 8 | import Control.Monad 9 | import Control.Monad.State.Lazy 10 | import Data.List (intercalate) 11 | import Data.Map.Strict qualified as Map 12 | import Data.Text (Text) 13 | import Data.Text qualified as Text 14 | import Data.Text.IO qualified as Text 15 | import GHC.Exts 16 | import GHC.Generics 17 | import Optics.Core 18 | import Options.Applicative 19 | import Options.Applicative.Help.Pretty 20 | import Perf 21 | import Prelude 22 | 23 | data Run = RunExample | RunExampleIO | RunSums | RunLengths | RunNoOps | RunTicks deriving (Eq, Show) 24 | 25 | data AppConfig = AppConfig 26 | { appRun :: Run, 27 | appExample :: Example, 28 | appReportOptions :: ReportOptions 29 | } 30 | deriving (Eq, Show, Generic) 31 | 32 | defaultAppConfig :: AppConfig 33 | defaultAppConfig = AppConfig RunExample ExampleSum defaultReportOptions 34 | 35 | parseRun :: Parser Run 36 | parseRun = 37 | flag' RunSums (long "sums" <> help "run on sum algorithms") 38 | <|> flag' RunLengths (long "lengths" <> help "run on length algorithms") 39 | <|> flag' RunExample (long "example" <> help "run on the example algorithm" <> style (annotate bold)) 40 | <|> flag' RunExampleIO (long "exampleIO" <> help "exampleIO test") 41 | <|> flag' RunNoOps (long "noops" <> help "noops test") 42 | <|> flag' RunTicks (long "ticks" <> help "tick test") 43 | <|> pure RunExample 44 | 45 | appParser :: AppConfig -> Parser AppConfig 46 | appParser def = 47 | AppConfig 48 | <$> parseRun 49 | <*> parseExample 50 | <*> parseReportOptions (view #appReportOptions def) 51 | 52 | appConfig :: AppConfig -> ParserInfo AppConfig 53 | appConfig def = 54 | info 55 | (appParser def <**> helper) 56 | (fullDesc <> header "Examples of perf usage (defaults in bold)") 57 | 58 | -- | * exampleIO 59 | exampleIO :: (Semigroup t) => PerfT IO t () 60 | exampleIO = do 61 | txt <- fam "file-read" (Text.readFile "src/Perf.hs") 62 | n <- ffap "length" Text.length txt 63 | fam "print-result" (Text.putStrLn $ "length of file is: " <> Text.pack (show n)) 64 | 65 | -- | * sums 66 | -- | measure the various versions of a tick. 67 | statTicks :: (NFData t, NFData b) => Text -> (t -> b) -> t -> Int -> StatDType -> StateT (Map.Map [Text] Double) IO () 68 | statTicks l f a n s = do 69 | addStat [l, "tick"] . statD s . fmap fromIntegral =<< lift (fst <$> multi tick n f a) 70 | addStat [l, "tickWHNF"] . statD s . fmap fromIntegral =<< lift (fst <$> multi tickWHNF n f a) 71 | addStat [l, "tickLazy"] . statD s . fmap fromIntegral =<< lift (fst <$> multi tickLazy n f a) 72 | addStat [l, "tickForce"] . statD s . fmap fromIntegral =<< lift (fst <$> multi tickForce n f a) 73 | addStat [l, "tickForceArgs"] . statD s . fmap fromIntegral =<< lift (fst <$> multi tickForceArgs n f a) 74 | addStat [l, "stepTime"] . statD s . fmap fromIntegral =<< lift (mconcat . fmap snd . take 1 . Map.toList <$> execPerfT (toMeasureN n stepTime) (f |$| a)) 75 | addStat [l, "times"] . statD s . fmap fromIntegral =<< lift (mconcat . fmap snd . take 1 . Map.toList <$> execPerfT (times n) (f |$| a)) 76 | addStat [l, "timesn"] . statD s . fmap fromIntegral =<< lift (mconcat . fmap snd . Map.toList <$> execPerfT (pure <$> timesN n) (f |$| a)) 77 | 78 | statTicksSum :: (NFData b, Enum b, Num b) => SumPattern b -> Int -> StatDType -> StateT (Map.Map [Text] Double) IO () 79 | statTicksSum (SumFuse label f a) n s = statTicks label f a n s 80 | statTicksSum (SumFusePoly label f a) n s = statTicks label f a n s 81 | statTicksSum (SumPoly label f a) n s = statTicks label f a n s 82 | statTicksSum (SumMono label f a) n s = statTicks label f a n s 83 | 84 | statTicksSums :: Int -> Int -> StatDType -> IO (Map.Map [Text] Double) 85 | statTicksSums n l s = flip execStateT Map.empty $ mapM_ (\x -> statTicksSum x n s) (allSums l) 86 | 87 | -- * no-op testing 88 | 89 | perfNoOps :: (Semigroup a) => Measure IO a -> IO (Map.Map Text a) 90 | perfNoOps meas = 91 | execPerfT meas $ do 92 | liftIO $ warmup 1000 93 | fap "const" (const ()) () 94 | fam "pure" (pure ()) 95 | 96 | main :: IO () 97 | main = do 98 | o <- execParser (appConfig defaultAppConfig) 99 | let repOptions = appReportOptions o 100 | let n = reportN repOptions 101 | let s = reportStatDType repOptions 102 | let mt = reportMeasureType repOptions 103 | let c = reportClock repOptions 104 | let !l = reportLength repOptions 105 | let a = appExample o 106 | let r = appRun o 107 | 108 | case r of 109 | RunExample -> do 110 | reportMain 111 | a 112 | repOptions 113 | (intercalate "-" [show r, show a, show l]) 114 | (testExample . examplePattern a) 115 | RunExampleIO -> do 116 | m1 <- execPerfT (measureDs mt c 1) exampleIO 117 | (_, (m', m2)) <- outer "outer-total" (measureDs mt c 1) (measureDs mt c 1) exampleIO 118 | let ms = mconcat [Map.mapKeys (\x -> ["normal", x]) m1, Map.mapKeys (\x -> ["outer", x]) (m2 <> m')] 119 | putStrLn "" 120 | let o' = replaceDefaultFilePath (intercalate "-" [show r, show mt, show s]) repOptions 121 | report o' (fmap (statD s) <$> ms) 122 | RunSums -> do 123 | m <- statSums n l (measureDs mt c) 124 | let o' = replaceDefaultFilePath (intercalate "-" [show r, show mt, show n, show l, show s]) repOptions 125 | report o' (statify s m) 126 | RunLengths -> do 127 | m <- statLengths n l (measureDs mt c) 128 | let o' = replaceDefaultFilePath (intercalate "-" [show r, show mt, show n, show l, show s]) repOptions 129 | report o' (statify s m) 130 | RunNoOps -> do 131 | m <- perfNoOps (measureDs mt c n) 132 | let o' = replaceDefaultFilePath (intercalate "-" [show r, show mt, show n]) repOptions 133 | report o' (allStats 4 (Map.mapKeys (: []) m)) 134 | RunTicks -> do 135 | m <- statTicksSums n l s 136 | report2D m 137 | -------------------------------------------------------------------------------- /cabal.project: -------------------------------------------------------------------------------- 1 | packages: 2 | perf.cabal 3 | -------------------------------------------------------------------------------- /other/ExampleSum-1000-1000-MeasureTime-StatAverage.perf: -------------------------------------------------------------------------------- 1 | fromList [(["sum","time"],13782.642)] -------------------------------------------------------------------------------- /other/RunExample-ExampleNoOp-1000-1000-MeasureTime-StatAverage.perf: -------------------------------------------------------------------------------- 1 | fromList [(["noop","time"],169.004)] -------------------------------------------------------------------------------- /other/RunExampleIO-MeasureTime-StatAverage.perf: -------------------------------------------------------------------------------- 1 | fromList [(["normal","file-read","time"],230708.0),(["normal","length","time"],2708.0),(["normal","print-result","time"],37459.0),(["outer","file-read","time"],60500.0),(["outer","length","time"],959.0),(["outer","outer-total","time"],73875.0),(["outer","print-result","time"],9792.0)] -------------------------------------------------------------------------------- /other/RunExamples-1000-1000-MeasureTime-StatAverage.perf: -------------------------------------------------------------------------------- 1 | fromList [(["constFuse","time"],736.132),(["lengthF","time"],5132.127),(["mapInc","time"],125.111),(["noop","time"],110.868),(["sum","time"],10436.082),(["sumFuse","time"],1812.501)] -------------------------------------------------------------------------------- /other/perf.svg: -------------------------------------------------------------------------------- 1 | 0 200 400 600 8001,000 5,00010,00015,00020,00025,00030,000sum: timeaverage: 9,170median: 8,680best: 8,630 -------------------------------------------------------------------------------- /perf.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 3.0 2 | name: perf 3 | version: 0.14.0.2 4 | license: BSD-3-Clause 5 | license-file: LICENSE 6 | copyright: Tony Day (c) 2018 7 | category: performance 8 | author: Tony Day, Marco Zocca 9 | maintainer: tonyday567@gmail.com 10 | homepage: https://github.com/tonyday567/perf#readme 11 | bug-reports: https://github.com/tonyday567/perf/issues 12 | synopsis: Performance methods and monad. 13 | description: 14 | A set of tools to measure performance of Haskell programs. 15 | See the Perf module and readme for an example and full API documentation. 16 | 17 | build-type: Simple 18 | tested-with: 19 | , GHC == 9.10.1 20 | , GHC == 9.6.5 21 | , GHC == 9.8.2 22 | extra-doc-files: 23 | ChangeLog.md 24 | readme.md 25 | 26 | source-repository head 27 | type: git 28 | location: https://github.com/tonyday567/perf 29 | 30 | common ghc-options-exe-stanza 31 | ghc-options: 32 | -fforce-recomp 33 | -funbox-strict-fields 34 | -rtsopts 35 | -threaded 36 | -with-rtsopts=-N 37 | 38 | common ghc-options-stanza 39 | ghc-options: 40 | -Wall 41 | -Wcompat 42 | -Widentities 43 | -Wincomplete-record-updates 44 | -Wincomplete-uni-patterns 45 | -Wpartial-fields 46 | -Wredundant-constraints 47 | -fproc-alignment=64 48 | 49 | common ghc2021-stanza 50 | default-language: GHC2021 51 | 52 | library 53 | import: ghc-options-stanza 54 | import: ghc2021-stanza 55 | hs-source-dirs: src 56 | build-depends: 57 | , base >=4.14 && <5 58 | , boxes >=0.1.5 && <0.2 59 | , chart-svg >=0.7 && <0.9 60 | , clock >=0.8 && <0.9 61 | , containers >=0.6 && <0.8 62 | , deepseq >=1.4.4 && <1.6 63 | , formatn >=0.2.1 && <0.4 64 | , mtl >=2.2.2 && <2.4 65 | , numhask-space >=0.10 && <0.14 66 | , optics-core >=0.4.1 && <0.5 67 | , optparse-applicative >=0.17 && <0.20 68 | , prettychart >=0.3 && <0.4 69 | , prettyprinter >=1.7.1 && <1.8 70 | , recursion-schemes >=5.2.2 && <5.3 71 | , tasty >=1.5.2 && <1.6 72 | , tasty-bench >=0.4 && <0.5 73 | , text >=1.2 && <2.2 74 | , vector >=0.12.3 && <0.14 75 | exposed-modules: 76 | Perf 77 | Perf.Algos 78 | Perf.BigO 79 | Perf.Chart 80 | Perf.Count 81 | Perf.Measure 82 | Perf.Report 83 | Perf.Space 84 | Perf.Stats 85 | Perf.Time 86 | Perf.Types 87 | ghc-options: -O2 88 | 89 | executable perf-explore 90 | import: ghc-options-exe-stanza 91 | import: ghc-options-stanza 92 | import: ghc2021-stanza 93 | main-is: explore.hs 94 | hs-source-dirs: app 95 | build-depends: 96 | , base >=4.14 && <5 97 | , containers >=0.6 && <0.8 98 | , deepseq >=1.4.4 && <1.6 99 | , mtl >=2.2.2 && <2.4 100 | , optics-core >=0.4.1 && <0.5 101 | , optparse-applicative >=0.17 && <0.20 102 | , perf 103 | , text >=1.2 && <2.2 104 | ghc-options: -O2 105 | 106 | benchmark perf-bench 107 | import: ghc-options-exe-stanza 108 | import: ghc-options-stanza 109 | import: ghc2021-stanza 110 | main-is: bench.hs 111 | hs-source-dirs: app 112 | build-depends: 113 | , base >=4.14 && <5 114 | , perf 115 | ghc-options: -O2 116 | type: exitcode-stdio-1.0 117 | -------------------------------------------------------------------------------- /perf.org: -------------------------------------------------------------------------------- 1 | #+TITLE: perf 2 | #+PROPERTY: header-args :exports both 3 | #+PROPERTY: header-args :eval no-export 4 | 5 | [[https://hackage.haskell.org/package/perf][file:https://img.shields.io/hackage/v/perf.svg]] [[https://github.com/tonyday567/perf/actions][file:https://github.com/tonyday567/perf/workflows/haskell-ci.yml/badge.svg]] 6 | 7 | * Features 8 | 9 | ~perf~ is an experimental library with a focus on the low-level empirics of Haskell code performance. If you are looking for a quick and reliable performance benchmark, criterion and tasty-bench are both good choices. If your results are confounding, however, you may need to dig deeper, and this is the problem space of ~perf~. 10 | 11 | The library: 12 | 13 | - provides a monad transformer, ~PerfT~. The criterion API tends towards an atomistic approach - bust code up into snippets, copy-paste into a bench.hs and measure their isolated performance. In contrast, with ~PerfT~ performance can be measured within a code snippet's original context. Differing code points can be labelled and measured as part of a single run, encouraging a much faster observation - experimentation - refactor cycle. 14 | 15 | - is polymorphic to what, exactly, is being measured, so that concepts such as counters, debug checks, time and space performance can share treatment. 16 | 17 | - attempts to measure big O for algorithms that can be defined in terms of input size growth. 18 | 19 | - includes live charting of raw performance results via chart-svg and prettychart 20 | 21 | * Usage 22 | 23 | Probably the best introduction to ~perf~ is via the perf-explore executable: 24 | 25 | #+begin_src sh :results output 26 | perf-explore 27 | #+end_src 28 | 29 | #+RESULTS: 30 | : label1 label2 old result new result change 31 | : 32 | : sum time 9.93e3 7.57e3 improvement 33 | 34 | Summing [1..1000] took 9,930 nanoseconds, an improvement versus the on file performance previously measured. 35 | 36 | Live charts of raw performance measurement can be obtained via the prettychart library with: 37 | 38 | #+begin_src sh :results output 39 | prettychart-watch --watch --filepath other --port 3566 40 | #+end_src 41 | 42 | ... and pointer your browser at localhost:3566 43 | 44 | #+begin_src sh :file other/perf.svg :results output graphics file :exports both 45 | perf-explore -n 1000 --nocheck --chart 46 | #+end_src 47 | 48 | #+RESULTS: 49 | [[file:other/perf.svg]] 50 | 51 | In this particular measure, there was an improvement, dropping from about 10,000 nanos to 8,600 nanos. Increasing the number of measurements: 52 | 53 | #+begin_src sh :file other/perf20000.svg :results output graphics file :exports both 54 | perf-explore -n 20000 --nocheck --chart --chartpath other/perf20000.svg 55 | #+end_src 56 | 57 | #+RESULTS: 58 | [[file:other/perf20000.svg]] 59 | 60 | Improvements seem to continue as n increases before stabilising (after a GC perhaps) at 3,500 nanos 61 | 62 | 63 | 64 | #+begin_src sh :results output 65 | perf-explore -n 20000 --order --nocheck --tasty 66 | #+end_src 67 | 68 | #+RESULTS: 69 | : label1 label2 results 70 | : 71 | : sum time 3.51e3 72 | : 73 | : sum:time 3.5 * O(N1) 74 | : tasty:time: 3510 75 | 76 | The order of the computation (=\l -> fap sum [1 .. l]=) is O(N1) and the results are very close to the tasty-bench result. 77 | 78 | In comparsion, (\l -> fap (\x -> sum [1 .. x]) l): 79 | 80 | #+begin_src sh :file other/perffuse.svg :results output graphics file :exports both 81 | perf-explore --nocheck --sumFuse -n 100000 --chart --chartpath other/perffuse.svg --order 82 | #+end_src 83 | 84 | #+RESULTS: 85 | [[file:other/perffuse.svg]] 86 | 87 | #+begin_src sh :results output :exports both 88 | perf-explore --nocheck --sumFuse -n 100000 --order 89 | #+end_src 90 | 91 | #+RESULTS: 92 | : label1 label2 results 93 | : 94 | : sumFuse time 6.78e2 95 | : 96 | : sumFuse:time 0.66 * O(N1) 97 | 98 | ... is much faster. Hooray for list fusion! 99 | 100 | * Issues 101 | 102 | ** fragility 103 | 104 | Results, especially for simple computations, are fragile and can show large variance in performance characteristics in identical runs, and across differing compilations. Whether this is due to library flaws or is just the nature of ghc is an open question. 105 | 106 | ** Statistics 107 | 108 | #+begin_quote 109 | Obligatory disclaimer: statistics is a tricky matter, there is no one-size-fits-all approach. In the absence of a good theory simplistic approaches are as (un)sound as obscure ones. Those who seek statistical soundness should rather collect raw data and process it themselves using a proper statistical toolbox. Data reported by tasty-bench is only of indicative and comparative significance. ~ [[https://hackage.haskell.org/package/tasty-bench-0.4/docs/Test-Tasty-Bench.html#t:Benchmarkable][tasty-bench]] 110 | #+end_quote 111 | 112 | #+begin_quote 113 | variance introduced by outliers: 88% (severely inflated) ~ [[https://hackage.haskell.org/package/criterion][criterion]] 114 | #+end_quote 115 | 116 | The library default is to report the 10th percentile as a summary statistic, and this is a matter of taste, determined mostly by the purpose of the measurement. 117 | 118 | ** ffap and fap 119 | 120 | #+begin_src haskell-ng :results output 121 | :t ffap 122 | #+end_src 123 | 124 | #+RESULTS: 125 | : ffap 126 | : :: (Control.DeepSeq.NFData a, Control.DeepSeq.NFData b, MonadIO m, 127 | : Semigroup t) => 128 | : Text.Text -> (a -> b) -> a -> PerfT m t b 129 | 130 | ffap and fap are broadly similar to criterion's nf and whnf respectively, but passes throught the results of the computation into the monad transformer, enabling in-context measurement. 131 | 132 | A fine-grained and detailed examination of the effect of measurement on laziness and on core details would be beneficial to the library. 133 | 134 | ** tasty 135 | 136 | The library was originally developed before tasty-bench, which does a great job of integrating into the tasty api, and a future refactor may integrate with this, rather than supply idiosyncratic methods. 137 | 138 | ** order 139 | 140 | BigOrder calculations tend to be fragile and sometimes differ from theory. 141 | 142 | * Development 143 | 144 | This org file has been used to develop and document library innovation and testing, and may be of use to users in understanding the library. Note that running ~perf~ via ghci is very slow compared with an external process which accesses the compiled version of the library. 145 | 146 | #+begin_src haskell-ng :results output 147 | :r 148 | :set -Wno-type-defaults 149 | :set -Wno-unused-do-bind 150 | :set -Wno-name-shadowing 151 | :set -XOverloadedStrings 152 | :set -XOverloadedLabels 153 | import Perf 154 | import Perf.Report 155 | import Data.FormatN 156 | import qualified Data.Text as Text 157 | import qualified Data.Text.IO as Text 158 | import qualified Data.Map.Strict as Map 159 | import Control.Monad 160 | import Data.Bifunctor 161 | import System.Clock 162 | import Data.List qualified as List 163 | import Control.Category ((>>>)) 164 | import Optics.Core 165 | import Data.Foldable 166 | import NumHask.Space 167 | putStrLn "ok" 168 | import Chart hiding (tick) 169 | import Prettychart 170 | import Chart.Examples 171 | import Perf.Chart 172 | (disp,q) <- startChartServer Nothing 173 | disp lineExample 174 | import Prettyprinter 175 | import Control.Monad.State.Lazy 176 | import Text.PrettyPrint.Boxes 177 | #+end_src 178 | 179 | #+RESULTS: 180 | : Ok, 11 modules loaded. 181 | : ok 182 | : Setting phasegrhsc it>o stun... (poTrrtu e9 183 | : 160) (cgthrcli->c to quitg)h 184 | 185 | #+begin_src haskell-ng :results output 186 | l = 1000 187 | n = 1000 188 | 189 | :{ 190 | p = do 191 | ffap "sum" sum [1 .. l] 192 | ffap "sumfuse" (\x -> sum [1 .. x]) l 193 | :} 194 | :t p 195 | run = runPerfT (times n) p 196 | :t run 197 | (res, m) <- run 198 | :t m 199 | median . fmap fromIntegral <$> m 200 | #+end_src 201 | 202 | #+RESULTS: 203 | : ghci| ghci| ghci| ghci| ghci> p :: (MonadIO m, Semigroup t, Control.DeepSeq.NFData b, Num b, 204 | : Enum b) => 205 | : PerfT m t b 206 | : run 207 | : :: (Control.DeepSeq.NFData a, Num a, Enum a) => 208 | : IO (a, Map.Map Text.Text [Nanos]) 209 | : m :: Map.Map Text.Text [Nanos] 210 | : fromList [("sum",21978.1),("sumfuse",26710.18)] 211 | 212 | * Details 213 | 214 | ** System.Clock 215 | 216 | The default clock is MonoticRaw for linux & macOS, and ThreadCPUTime for Windows. 217 | 218 | *** resolution 219 | 220 | #+begin_src haskell-ng :results output 221 | getRes Monotonic 222 | getRes Realtime 223 | getRes ProcessCPUTime 224 | getRes ThreadCPUTime 225 | getRes MonotonicRaw 226 | #+end_src 227 | 228 | #+RESULTS: 229 | : TimeSpec {sec = 0, nsec = 1000} 230 | : TimeSpec {sec = 0, nsec = 1000} 231 | : TimeSpec {sec = 0, nsec = 1000} 232 | : TimeSpec {sec = 0, nsec = 42} 233 | : TimeSpec {sec = 0, nsec = 42} 234 | 235 | ** ticks 236 | 237 | The various versions of tick and a variety of algorithms are artifacts of ongoing exploration. 238 | 239 | #+begin_src sh :results output 240 | perf-explore -n 20000 --best --ticks 241 | #+end_src 242 | 243 | #+RESULTS: 244 | #+begin_example 245 | algo stepTime tick tickForce tickForceArgs tickLazy tickWHNF times timesn 246 | sumAux 3.11e3 3.11e3 3.11e3 3.11e3 5.13e0 3.11e3 3.11e3 3.10e3 247 | sumCata 3.11e3 3.11e3 3.11e3 3.11e3 5.11e0 3.11e3 3.11e3 3.14e3 248 | sumCo 3.11e3 3.11e3 3.11e3 3.11e3 5.06e0 3.11e3 3.11e3 3.08e3 249 | sumCoCase 3.11e3 3.11e3 3.11e3 3.11e3 5.11e0 3.11e3 3.11e3 3.08e3 250 | sumCoGo 3.11e3 3.11e3 3.11e3 3.11e3 5.06e0 3.11e3 3.11e3 3.12e3 251 | sumF 3.48e3 3.49e3 3.46e3 3.46e3 5.06e0 3.48e3 3.48e3 3.48e3 252 | sumFlip 3.48e3 3.48e3 3.45e3 3.45e3 5.03e0 3.48e3 3.48e3 3.48e3 253 | sumFlipLazy 3.48e3 3.48e3 3.45e3 3.45e3 4.96e0 3.48e3 3.48e3 3.45e3 254 | sumFoldr 3.11e3 3.11e3 3.11e3 3.11e3 5.13e0 3.11e3 3.11e3 3.11e3 255 | sumFuse 6.54e2 6.54e2 6.54e2 6.54e2 5.17e0 6.54e2 6.54e2 6.39e2 256 | sumFuseFoldl' 6.54e2 6.54e2 6.54e2 6.54e2 5.00e0 6.54e2 6.54e2 6.44e2 257 | sumFuseFoldr 9.93e2 9.92e2 9.92e2 9.92e2 5.13e0 9.92e2 9.93e2 9.63e2 258 | sumFusePoly 6.56e2 6.56e2 6.56e2 6.56e2 5.12e0 6.56e2 6.57e2 6.47e2 259 | sumLambda 3.48e3 3.49e3 3.48e3 3.48e3 5.12e0 3.48e3 3.48e3 3.55e3 260 | sumMono 3.48e3 3.48e3 3.46e3 3.46e3 5.00e0 3.48e3 3.48e3 3.50e3 261 | sumPoly 3.62e3 3.49e3 3.54e3 3.56e3 5.04e0 3.71e3 3.62e3 3.70e3 262 | sumSum 3.48e3 3.49e3 3.48e3 3.48e3 4.98e0 3.48e3 3.48e3 3.49e3 263 | sumTail 3.48e3 3.49e3 3.45e3 3.45e3 5.00e0 3.48e3 3.48e3 3.51e3 264 | sumTailLazy 3.48e3 3.48e3 3.45e3 3.45e3 5.16e0 3.48e3 3.48e3 3.49e3 265 | 266 | #+end_example 267 | 268 | ** Time 269 | *** What is a tick? 270 | 271 | A fundamental operation of Perf.Time is tick, which sandwiches a (strict) function application between two readings of a clock, and returns time in nanoseconds, and the computation result. In this way, the `Perf` monad can be inserted into the midst of a computation in an attempt to measure performance in-situ as opposed to sitting off in a separate and decontextualized process. 272 | 273 | #+begin_src haskell-ng :results output :exports both 274 | :t tick 275 | #+end_src 276 | 277 | #+RESULTS: 278 | : tick :: (a -> b) -> a -> IO (Nanos, b) 279 | 280 | =tick= returns in the IO monad, because reading a cycle counter is an IO effect. A trivial but fundamental point is that performance measurement effects the computation being measured. 281 | 282 | 283 | *** tick_ 284 | 285 | tick_ measures the nanoseconds between two immediate clock reads. 286 | 287 | #+begin_src haskell-ng :results output :exports both 288 | :t tick_ 289 | #+end_src 290 | 291 | #+RESULTS: 292 | : tick_ :: IO Nanos 293 | 294 | 295 | #+begin_src haskell-ng :results output :exports both 296 | replicateM 10 tick_ 297 | #+end_src 298 | 299 | #+RESULTS: 300 | : [1833,500,416,416,416,375,375,416,416,416] 301 | 302 | *** multiple ticks 303 | 304 | #+begin_src haskell-ng :results output :exports both 305 | fmap (fmap (fst)) . replicateM 10 $ tick (const ()) () 306 | #+end_src 307 | 308 | #+RESULTS: 309 | : [7000,2333,2000,2208,1958,1959,1959,2000,2000,1959] 310 | 311 | Here, ~const () ()~ was evaluated and took 7 micro-seconds for the first effect, reducing down to 2 msecs after 10 effects. 312 | 313 | *** tickIO 314 | 315 | ~tickIO~ measures the evaluation of an IO value. 316 | 317 | #+begin_src haskell-ng :results output :exports both 318 | :t tickIO 319 | #+end_src 320 | 321 | #+RESULTS: 322 | : tickIO :: IO a -> IO (Cycles, a) 323 | 324 | #+begin_src haskell-ng :results output :exports both 325 | fmap (fmap fst) . replicateM 10 $ tickIO (pure ()) 326 | #+end_src 327 | 328 | #+RESULTS: 329 | : [5541,1625,1458,1833,1375,1416,1375,1375,1375,1375] 330 | 331 | *** sum example 332 | 333 | #+begin_src haskell-ng :exports both 334 | fmap (expt (Just 2) . fromIntegral) . fst <$> ticks 10 sum ([1..10000] :: [Double]) 335 | #+end_src 336 | 337 | #+RESULTS: 338 | : ["5.0e5","2.4e5","2.4e5","2.4e5","2.4e5","2.4e5","2.4e5","2.4e5","2.5e5","2.4e5"] 339 | 340 | 341 | #+begin_src haskell-ng :results output :exports both 342 | ts <- ticks 10000 sum ([1..1000] :: [Double]) 343 | print $ average (fmap fromIntegral $ fst ts) 344 | #+end_src 345 | 346 | #+RESULTS: 347 | : 10747.1975 348 | 349 | ** PerfT 350 | 351 | ~PerfT~ allows for multiple measurement points and is polymorphic in what is being measured. It returns a Map of results held in State. 352 | 353 | Compare a lower-level usage of ticks, measuring the average of summing to one thousand over one thousand trials: 354 | 355 | #+begin_src haskell-ng :results output :exports both 356 | first (average . fmap fromIntegral) <$> ticks 1000 sum [1..1000] 357 | #+end_src 358 | 359 | #+RESULTS: 360 | : (25947.635,500500) 361 | 362 | ... with PerfT usage 363 | 364 | #+begin_src haskell-ng :results output :exports both 365 | second (fmap (average . fmap fromIntegral)) <$> runPerfT (times 1000) (sum |$| [1..1000]) 366 | #+end_src 367 | 368 | #+RESULTS: 369 | : (500500,fromList [("",26217.098)]) 370 | 371 | An IO example 372 | 373 | #+begin_src haskell-ng 374 | exampleIO' :: IO () 375 | exampleIO' = do 376 | txt <- Text.readFile "src/Perf.hs" 377 | let n = Text.length txt 378 | Text.putStrLn $ "length of file is: " <> Text.pack (show n) 379 | #+end_src 380 | 381 | #+begin_src haskell-ng :results output :exports both 382 | exampleIO = execPerfT time (do 383 | txt <- fam "file_read" (Text.readFile "src/Perf.hs") 384 | n <- fap "length" Text.length txt 385 | fam "print_result" (Text.putStrLn $ "length of file is: " <> Text.pack (show n))) 386 | #+end_src 387 | 388 | #+begin_src sh :results output :exports both 389 | perf-explore --exampleIO 390 | #+end_src 391 | 392 | #+RESULTS: 393 | #+begin_example 394 | length of file is: 1794 395 | length of file is: 1794 396 | 397 | label1 label2 label3 old result new result change 398 | 399 | normal file-read time 2.31e5 1.28e5 improvement 400 | normal length time 2.71e3 2.00e3 improvement 401 | normal print-result time 3.75e4 1.32e4 improvement 402 | outer file-read time 6.05e4 3.64e4 improvement 403 | outer length time 9.59e2 6.25e2 improvement 404 | outer outer-total time 7.39e4 4.02e4 improvement 405 | outer print-result time 9.79e3 1.71e3 improvement 406 | #+end_example 407 | 408 | ** Perf.BigO 409 | 410 | Perf.BigO represents functionality to determine the complexity order for a computation. 411 | 412 | We could do a regression and minimise the error term, but we know that the largest run contains the most information; we would need to weight the simulations according to some heuristic. 413 | 414 | Instead, we: 415 | 416 | - estimate the order factor for each possible Order, from N3 to N0, setting the highest n run constant factor to zero, 417 | - pick the order based on lowest absolute error result summed across all the runs, 418 | 419 | #+begin_src haskell-ng :results output :exports both 420 | import qualified Prelude as P 421 | import Data.List (nub) 422 | estOrder (\x -> sum $ nub [1..x]) 100 [10,100,1000,1000] 423 | #+end_src 424 | 425 | #+RESULTS: 426 | : BigOrder {bigOrder = N2, bigFactor = 3.187417} 427 | 428 | #+begin_src haskell-ng :results output :exports both 429 | import qualified Prelude as P 430 | import Data.List (nub) 431 | estOrder (\x -> sum $ [1..x]) 10 [1,10,100,1000] 432 | #+end_src 433 | 434 | #+RESULTS: 435 | : BigOrder {bigOrder = N12, bigFactor = 695.0370069284081, bigConstant = 0.0} 436 | 437 | ** References 438 | 439 | https://wiki.haskell.org/Performance/GHC 440 | 441 | [[https://github.com/haskell-perf/checklist][The Haskell performance checklist]] 442 | 443 | [[https://github.com/ndmitchell/spaceleak][ndmitchell/spaceleak: Notes on space leaks]] 444 | 445 | *** Core 446 | 447 | [[https://ghc.gitlab.haskell.org/ghc/doc/users_guide/debugging.html#options-debugging][5.13. Debugging the compiler]] 448 | 449 | #+begin_src sh 450 | ghc app/speed.hs -ddump-simpl -ddump-to-file -fforce-recomp -dlint -O 451 | #+end_src 452 | 453 | [[https://wiki.haskell.org/Performance/GHC#Looking_at_the_Core][haskell wiki: Looking at the Core]] 454 | 455 | [[https://godbolt.org/][godbolt]] 456 | 457 | [[https://gitlab.haskell.org/ghc/ghc/-/issues/15185][ghc issue 15185: Enum instance for IntX / WordX are inefficient]] 458 | 459 | [[https://fixpt.de/blog/2017-12-04-strictness-analysis-part-1.html][fixpt - All About Strictness Analysis (part 1)]] 460 | 461 | *** Profiling 462 | **** setup 463 | [[https://ghc.gitlab.haskell.org/ghc/doc/users_guide/profiling.html#prof-heap][8. Profiling]] 464 | 465 | A typical configuration step for profiling: 466 | 467 | #+begin_src sh :results output 468 | cabal configure --enable-library-profiling --enable-executable-profiling -fprof-auto -fprof -write-ghc-environment-files=always 469 | #+end_src 470 | 471 | A cabal.project.local with profiling enabled: 472 | 473 | #+begin_quote 474 | write-ghc-environment-files: always 475 | ignore-project: False 476 | flags: +prof +prof-auto 477 | library-profiling: True 478 | executable-profiling: True 479 | #+end_quote 480 | 481 | Examples from markup-parse R&D: 482 | 483 | Executable compilation: 484 | 485 | #+begin_src sh :results output 486 | ghc -prof -fprof-auto -rtsopts app/speed0.hs -threaded -fforce-recomp 487 | #+end_src 488 | 489 | Executable run: 490 | 491 | #+begin_src sh :results output 492 | app/speed0 +RTS -s -p -hc -l -RTS 493 | #+end_src 494 | 495 | **** Space usage output (-s) 496 | 497 | #+begin_example 498 | 885,263,472 bytes allocated in the heap 499 | 8,507,448 bytes copied during GC 500 | 163,200 bytes maximum residency (4 sample(s)) 501 | 27,752 bytes maximum slop 502 | 6 MiB total memory in use (0 MiB lost due to fragmentation) 503 | 504 | Tot time (elapsed) Avg pause Max pause 505 | Gen 0 207 colls, 0 par 0.009s 0.010s 0.0001s 0.0002s 506 | Gen 1 4 colls, 0 par 0.001s 0.001s 0.0004s 0.0005s 507 | 508 | TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1) 509 | 510 | SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled) 511 | 512 | INIT time 0.006s ( 0.006s elapsed) 513 | MUT time 0.367s ( 0.360s elapsed) 514 | GC time 0.010s ( 0.011s elapsed) 515 | RP time 0.000s ( 0.000s elapsed) 516 | PROF time 0.000s ( 0.000s elapsed) 517 | EXIT time 0.001s ( 0.001s elapsed) 518 | Total time 0.384s ( 0.380s elapsed) 519 | #+end_example 520 | 521 | **** Cost center profile (-p) 522 | 523 | Dumped to speed0.prof 524 | 525 | #+begin_example 526 | COST CENTRE MODULE SRC %time %alloc 527 | 528 | token MarkupParse src/MarkupParse.hs:(259,1)-(260,20) 50.2 50.4 529 | wrappedQ' MarkupParse.FlatParse src/MarkupParse/FlatParse.hs:(215,1)-(217,78) 20.8 23.1 530 | ws_ MarkupParse.FlatParse src/MarkupParse/FlatParse.hs:(135,1)-(146,4) 14.3 5.5 531 | eq MarkupParse.FlatParse src/MarkupParse/FlatParse.hs:243:1-30 10.6 11.1 532 | gather MarkupParse src/MarkupParse.hs:(420,1)-(428,100) 2.4 3.7 533 | runParser FlatParse.Basic src/FlatParse/Basic.hs:(217,1)-(225,24) 1.0 6.0 534 | #+end_example 535 | 536 | **** heap analysis (-hc -l) 537 | 538 | #+begin_src sh :results output 539 | eventlog2html speed0.eventlog 540 | #+end_src 541 | 542 | Produces speed0.eventlog.html which contains heap charts. 543 | 544 | *** Cache speed 545 | 546 | The average cycles per + operation can get down to about 0.7 cycles, and there are about 4 cache registers per cycle, so a sum pipeline uses 2.8 register instructions per +. 547 | 548 | | Cache | nsecs | Cycles | 549 | |-------------------+-------+--------------| 550 | | register | 0.1 | 4 per cycle | 551 | | L1 Cache access | 1 | 3-4 cycles | 552 | | L2 Cache access | 4 | 11-12 cycles | 553 | | L3 unified access | 14 | 30 - 40 | 554 | | DRAM hit | 80 | 195 cycles | 555 | | L1 miss | 16 | 40 cycles | 556 | | L2 miss | >250 | >600 cycles | 557 | 558 | 559 | -------------------------------------------------------------------------------- /readme.md: -------------------------------------------------------------------------------- 1 | [![img](https://img.shields.io/hackage/v/perf.svg)](https://hackage.haskell.org/package/perf) [![img](https://github.com/tonyday567/perf/workflows/haskell-ci.yml/badge.svg)](https://github.com/tonyday567/perf/actions) 2 | 3 | 4 | # Features 5 | 6 | `perf` is an experimental library with a focus on the low-level empirics of Haskell code performance. If you are looking for a quick and reliable performance benchmark, criterion and tasty-bench are both good choices. If your results are confounding, however, you may need to dig deeper, and this is the problem space of `perf`. 7 | 8 | The library: 9 | 10 | - provides a monad transformer, `PerfT`. The criterion API tends towards an atomistic approach - bust code up into snippets, copy-paste into a bench.hs and measure their isolated performance. In contrast, with `PerfT` performance can be measured within a code snippet’s original context. Differing code points can be labelled and measured as part of a single run, encouraging a much faster observation - experimentation - refactor cycle. 11 | 12 | - is polymorphic to what, exactly, is being measured, so that concepts such as counters, debug checks, time and space performance can share treatment. 13 | 14 | - attempts to measure big O for algorithms that can be defined in terms of input size growth. 15 | 16 | - includes live charting of raw performance results via chart-svg and prettychart 17 | 18 | 19 | # Usage 20 | 21 | Probably the best introduction to `perf` is via the perf-explore executable: 22 | 23 | perf-explore 24 | 25 | label1 label2 old result new result change 26 | 27 | sum time 9.93e3 7.57e3 improvement 28 | 29 | Summing [1..1000] took 9,930 nanoseconds, an improvement versus the on file performance previously measured. 30 | 31 | Live charts of raw performance measurement can be obtained via the prettychart library with: 32 | 33 | prettychart-watch --watch --filepath other --port 3566 34 | 35 | … and pointer your browser at localhost:3566 36 | 37 | perf-explore -n 1000 --nocheck --chart 38 | 39 | ![img](other/perf.svg) 40 | 41 | In this particular measure, there was an improvement, dropping from about 10,000 nanos to 8,600 nanos. Increasing the number of measurements: 42 | 43 | perf-explore -n 20000 --nocheck --chart --chartpath other/perf20000.svg 44 | 45 | ![img](other/perf20000.svg) 46 | 47 | Improvements seem to continue as n increases before stabilising (after a GC perhaps) at 3,500 nanos 48 | 49 | perf-explore -n 20000 --order --nocheck --tasty 50 | 51 | label1 label2 results 52 | 53 | sum time 3.51e3 54 | 55 | sum:time 3.5 * O(N1) 56 | tasty:time: 3510 57 | 58 | The order of the computation (`\l -> fap sum [1 .. l]`) is O(N1) and the results are very close to the tasty-bench result. 59 | 60 | In comparsion, (\l -> fap (\x -> sum [1 .. x]) l): 61 | 62 | perf-explore --nocheck --sumFuse -n 100000 --chart --chartpath other/perffuse.svg --order 63 | 64 | ![img](other/perffuse.svg) 65 | 66 | perf-explore --nocheck --sumFuse -n 100000 --order 67 | 68 | label1 label2 results 69 | 70 | sumFuse time 6.78e2 71 | 72 | sumFuse:time 0.66 * O(N1) 73 | 74 | … is much faster. Hooray for list fusion! 75 | 76 | 77 | # Issues 78 | 79 | 80 | ## fragility 81 | 82 | Results, especially for simple computations, are fragile and can show large variance in performance characteristics in identical runs, and across differing compilations. Whether this is due to library flaws or is just the nature of ghc is an open question. 83 | 84 | 85 | ## Statistics 86 | 87 | > Obligatory disclaimer: statistics is a tricky matter, there is no one-size-fits-all approach. In the absence of a good theory simplistic approaches are as (un)sound as obscure ones. Those who seek statistical soundness should rather collect raw data and process it themselves using a proper statistical toolbox. Data reported by tasty-bench is only of indicative and comparative significance. ~ [tasty-bench](https://hackage.haskell.org/package/tasty-bench-0.4/docs/Test-Tasty-Bench.html#t:Benchmarkable) 88 | 89 | > variance introduced by outliers: 88% (severely inflated) ~ [criterion](https://hackage.haskell.org/package/criterion) 90 | 91 | The library default is to report the 10th percentile as a summary statistic, and this is a matter of taste, determined mostly by the purpose of the measurement. 92 | 93 | 94 | ## ffap and fap 95 | 96 | :t ffap 97 | 98 | ffap 99 | :: (Control.DeepSeq.NFData a, Control.DeepSeq.NFData b, MonadIO m, 100 | Semigroup t) => 101 | Text.Text -> (a -> b) -> a -> PerfT m t b 102 | 103 | ffap and fap are broadly similar to criterion’s nf and whnf respectively, but passes throught the results of the computation into the monad transformer, enabling in-context measurement. 104 | 105 | A fine-grained and detailed examination of the effect of measurement on laziness and on core details would be beneficial to the library. 106 | 107 | 108 | ## tasty 109 | 110 | The library was originally developed before tasty-bench, which does a great job of integrating into the tasty api, and a future refactor may integrate with this, rather than supply idiosyncratic methods. 111 | 112 | 113 | ## order 114 | 115 | BigOrder calculations tend to be fragile and sometimes differ from theory. 116 | 117 | 118 | # Development 119 | 120 | This org file has been used to develop and document library innovation and testing, and may be of use to users in understanding the library. Note that running `perf` via ghci is very slow compared with an external process which accesses the compiled version of the library. 121 | 122 | :r 123 | :set -Wno-type-defaults 124 | :set -Wno-unused-do-bind 125 | :set -Wno-name-shadowing 126 | :set -XOverloadedStrings 127 | :set -XOverloadedLabels 128 | import Perf 129 | import Perf.Report 130 | import Data.FormatN 131 | import qualified Data.Text as Text 132 | import qualified Data.Text.IO as Text 133 | import qualified Data.Map.Strict as Map 134 | import Control.Monad 135 | import Data.Bifunctor 136 | import System.Clock 137 | import Data.List qualified as List 138 | import Control.Category ((>>>)) 139 | import Optics.Core 140 | import Data.Foldable 141 | import NumHask.Space 142 | putStrLn "ok" 143 | import Chart hiding (tick) 144 | import Prettychart 145 | import Chart.Examples 146 | import Perf.Chart 147 | (disp,q) <- startChartServer Nothing 148 | disp lineExample 149 | import Prettyprinter 150 | import Control.Monad.State.Lazy 151 | import Text.PrettyPrint.Boxes 152 | 153 | Ok, 11 modules loaded. 154 | ok 155 | Setting phasegrhsc it>o stun... (poTrrtu e9 156 | 160) (cgthrcli->c to quitg)h 157 | 158 | l = 1000 159 | n = 1000 160 | 161 | :{ 162 | p = do 163 | ffap "sum" sum [1 .. l] 164 | ffap "sumfuse" (\x -> sum [1 .. x]) l 165 | :} 166 | :t p 167 | run = runPerfT (times n) p 168 | :t run 169 | (res, m) <- run 170 | :t m 171 | median . fmap fromIntegral <$> m 172 | 173 | ghci| ghci| ghci| ghci| ghci> p :: (MonadIO m, Semigroup t, Control.DeepSeq.NFData b, Num b, 174 | Enum b) => 175 | PerfT m t b 176 | run 177 | :: (Control.DeepSeq.NFData a, Num a, Enum a) => 178 | IO (a, Map.Map Text.Text [Nanos]) 179 | m :: Map.Map Text.Text [Nanos] 180 | fromList [("sum",21978.1),("sumfuse",26710.18)] 181 | 182 | 183 | # Details 184 | 185 | 186 | ## System.Clock 187 | 188 | The default clock is MonoticRaw for linux & macOS, and ThreadCPUTime for Windows. 189 | 190 | 191 | ### resolution 192 | 193 | getRes Monotonic 194 | getRes Realtime 195 | getRes ProcessCPUTime 196 | getRes ThreadCPUTime 197 | getRes MonotonicRaw 198 | 199 | TimeSpec {sec = 0, nsec = 1000} 200 | TimeSpec {sec = 0, nsec = 1000} 201 | TimeSpec {sec = 0, nsec = 1000} 202 | TimeSpec {sec = 0, nsec = 42} 203 | TimeSpec {sec = 0, nsec = 42} 204 | 205 | 206 | ## ticks 207 | 208 | The various versions of tick and a variety of algorithms are artifacts of ongoing exploration. 209 | 210 | perf-explore -n 20000 --best --ticks 211 | 212 | algo stepTime tick tickForce tickForceArgs tickLazy tickWHNF times timesn 213 | sumAux 3.11e3 3.11e3 3.11e3 3.11e3 5.13e0 3.11e3 3.11e3 3.10e3 214 | sumCata 3.11e3 3.11e3 3.11e3 3.11e3 5.11e0 3.11e3 3.11e3 3.14e3 215 | sumCo 3.11e3 3.11e3 3.11e3 3.11e3 5.06e0 3.11e3 3.11e3 3.08e3 216 | sumCoCase 3.11e3 3.11e3 3.11e3 3.11e3 5.11e0 3.11e3 3.11e3 3.08e3 217 | sumCoGo 3.11e3 3.11e3 3.11e3 3.11e3 5.06e0 3.11e3 3.11e3 3.12e3 218 | sumF 3.48e3 3.49e3 3.46e3 3.46e3 5.06e0 3.48e3 3.48e3 3.48e3 219 | sumFlip 3.48e3 3.48e3 3.45e3 3.45e3 5.03e0 3.48e3 3.48e3 3.48e3 220 | sumFlipLazy 3.48e3 3.48e3 3.45e3 3.45e3 4.96e0 3.48e3 3.48e3 3.45e3 221 | sumFoldr 3.11e3 3.11e3 3.11e3 3.11e3 5.13e0 3.11e3 3.11e3 3.11e3 222 | sumFuse 6.54e2 6.54e2 6.54e2 6.54e2 5.17e0 6.54e2 6.54e2 6.39e2 223 | sumFuseFoldl' 6.54e2 6.54e2 6.54e2 6.54e2 5.00e0 6.54e2 6.54e2 6.44e2 224 | sumFuseFoldr 9.93e2 9.92e2 9.92e2 9.92e2 5.13e0 9.92e2 9.93e2 9.63e2 225 | sumFusePoly 6.56e2 6.56e2 6.56e2 6.56e2 5.12e0 6.56e2 6.57e2 6.47e2 226 | sumLambda 3.48e3 3.49e3 3.48e3 3.48e3 5.12e0 3.48e3 3.48e3 3.55e3 227 | sumMono 3.48e3 3.48e3 3.46e3 3.46e3 5.00e0 3.48e3 3.48e3 3.50e3 228 | sumPoly 3.62e3 3.49e3 3.54e3 3.56e3 5.04e0 3.71e3 3.62e3 3.70e3 229 | sumSum 3.48e3 3.49e3 3.48e3 3.48e3 4.98e0 3.48e3 3.48e3 3.49e3 230 | sumTail 3.48e3 3.49e3 3.45e3 3.45e3 5.00e0 3.48e3 3.48e3 3.51e3 231 | sumTailLazy 3.48e3 3.48e3 3.45e3 3.45e3 5.16e0 3.48e3 3.48e3 3.49e3 232 | 233 | 234 | ## Time 235 | 236 | 237 | ### What is a tick? 238 | 239 | A fundamental operation of Perf.Time is tick, which sandwiches a (strict) function application between two readings of a clock, and returns time in nanoseconds, and the computation result. In this way, the \`Perf\` monad can be inserted into the midst of a computation in an attempt to measure performance in-situ as opposed to sitting off in a separate and decontextualized process. 240 | 241 | :t tick 242 | 243 | tick :: (a -> b) -> a -> IO (Nanos, b) 244 | 245 | `tick` returns in the IO monad, because reading a cycle counter is an IO effect. A trivial but fundamental point is that performance measurement effects the computation being measured. 246 | 247 | 248 | ### tick\_ 249 | 250 | tick\_ measures the nanoseconds between two immediate clock reads. 251 | 252 | :t tick_ 253 | 254 | tick_ :: IO Nanos 255 | 256 | replicateM 10 tick_ 257 | 258 | [1833,500,416,416,416,375,375,416,416,416] 259 | 260 | 261 | ### multiple ticks 262 | 263 | fmap (fmap (fst)) . replicateM 10 $ tick (const ()) () 264 | 265 | [7000,2333,2000,2208,1958,1959,1959,2000,2000,1959] 266 | 267 | Here, `const () ()` was evaluated and took 7 micro-seconds for the first effect, reducing down to 2 msecs after 10 effects. 268 | 269 | 270 | ### tickIO 271 | 272 | `tickIO` measures the evaluation of an IO value. 273 | 274 | :t tickIO 275 | 276 | tickIO :: IO a -> IO (Cycles, a) 277 | 278 | fmap (fmap fst) . replicateM 10 $ tickIO (pure ()) 279 | 280 | [5541,1625,1458,1833,1375,1416,1375,1375,1375,1375] 281 | 282 | 283 | ### sum example 284 | 285 | fmap (expt (Just 2) . fromIntegral) . fst <$> ticks 10 sum ([1..10000] :: [Double]) 286 | 287 | ["5.0e5","2.4e5","2.4e5","2.4e5","2.4e5","2.4e5","2.4e5","2.4e5","2.5e5","2.4e5"] 288 | 289 | ts <- ticks 10000 sum ([1..1000] :: [Double]) 290 | print $ average (fmap fromIntegral $ fst ts) 291 | 292 | 10747.1975 293 | 294 | 295 | ## PerfT 296 | 297 | `PerfT` allows for multiple measurement points and is polymorphic in what is being measured. It returns a Map of results held in State. 298 | 299 | Compare a lower-level usage of ticks, measuring the average of summing to one thousand over one thousand trials: 300 | 301 | first (average . fmap fromIntegral) <$> ticks 1000 sum [1..1000] 302 | 303 | (25947.635,500500) 304 | 305 | … with PerfT usage 306 | 307 | second (fmap (average . fmap fromIntegral)) <$> runPerfT (times 1000) (sum |$| [1..1000]) 308 | 309 | (500500,fromList [("",26217.098)]) 310 | 311 | An IO example 312 | 313 | exampleIO' :: IO () 314 | exampleIO' = do 315 | txt <- Text.readFile "src/Perf.hs" 316 | let n = Text.length txt 317 | Text.putStrLn $ "length of file is: " <> Text.pack (show n) 318 | 319 | exampleIO = execPerfT time (do 320 | txt <- fam "file_read" (Text.readFile "src/Perf.hs") 321 | n <- fap "length" Text.length txt 322 | fam "print_result" (Text.putStrLn $ "length of file is: " <> Text.pack (show n))) 323 | 324 | perf-explore --exampleIO 325 | 326 | length of file is: 1794 327 | length of file is: 1794 328 | 329 | label1 label2 label3 old result new result change 330 | 331 | normal file-read time 2.31e5 1.28e5 improvement 332 | normal length time 2.71e3 2.00e3 improvement 333 | normal print-result time 3.75e4 1.32e4 improvement 334 | outer file-read time 6.05e4 3.64e4 improvement 335 | outer length time 9.59e2 6.25e2 improvement 336 | outer outer-total time 7.39e4 4.02e4 improvement 337 | outer print-result time 9.79e3 1.71e3 improvement 338 | 339 | 340 | ## Perf.BigO 341 | 342 | Perf.BigO represents functionality to determine the complexity order for a computation. 343 | 344 | We could do a regression and minimise the error term, but we know that the largest run contains the most information; we would need to weight the simulations according to some heuristic. 345 | 346 | Instead, we: 347 | 348 | - estimate the order factor for each possible Order, from N3 to N0, setting the highest n run constant factor to zero, 349 | - pick the order based on lowest absolute error result summed across all the runs, 350 | 351 | import qualified Prelude as P 352 | import Data.List (nub) 353 | estOrder (\x -> sum $ nub [1..x]) 100 [10,100,1000,1000] 354 | 355 | BigOrder {bigOrder = N2, bigFactor = 3.187417} 356 | 357 | import qualified Prelude as P 358 | import Data.List (nub) 359 | estOrder (\x -> sum $ [1..x]) 10 [1,10,100,1000] 360 | 361 | BigOrder {bigOrder = N12, bigFactor = 695.0370069284081, bigConstant = 0.0} 362 | 363 | 364 | ## References 365 | 366 | 367 | 368 | [The Haskell performance checklist](https://github.com/haskell-perf/checklist) 369 | 370 | [ndmitchell/spaceleak: Notes on space leaks](https://github.com/ndmitchell/spaceleak) 371 | 372 | 373 | ### Core 374 | 375 | [5.13. Debugging the compiler](https://ghc.gitlab.haskell.org/ghc/doc/users_guide/debugging.html#options-debugging) 376 | 377 | ghc app/speed.hs -ddump-simpl -ddump-to-file -fforce-recomp -dlint -O 378 | 379 | [haskell wiki: Looking at the Core](https://wiki.haskell.org/Performance/GHC#Looking_at_the_Core) 380 | 381 | [godbolt](https://godbolt.org/) 382 | 383 | [ghc issue 15185: Enum instance for IntX / WordX are inefficient](https://gitlab.haskell.org/ghc/ghc/-/issues/15185) 384 | 385 | [fixpt - All About Strictness Analysis (part 1)](https://fixpt.de/blog/2017-12-04-strictness-analysis-part-1.html) 386 | 387 | 388 | ### Profiling 389 | 390 | 1. setup 391 | 392 | [8. Profiling](https://ghc.gitlab.haskell.org/ghc/doc/users_guide/profiling.html#prof-heap) 393 | 394 | A typical configuration step for profiling: 395 | 396 | cabal configure --enable-library-profiling --enable-executable-profiling -fprof-auto -fprof -write-ghc-environment-files=always 397 | 398 | A cabal.project.local with profiling enabled: 399 | 400 | > write-ghc-environment-files: always 401 | > ignore-project: False 402 | > flags: +prof +prof-auto 403 | > library-profiling: True 404 | > executable-profiling: True 405 | 406 | Examples from markup-parse R&D: 407 | 408 | Executable compilation: 409 | 410 | ghc -prof -fprof-auto -rtsopts app/speed0.hs -threaded -fforce-recomp 411 | 412 | Executable run: 413 | 414 | app/speed0 +RTS -s -p -hc -l -RTS 415 | 416 | 2. Space usage output (-s) 417 | 418 | 885,263,472 bytes allocated in the heap 419 | 8,507,448 bytes copied during GC 420 | 163,200 bytes maximum residency (4 sample(s)) 421 | 27,752 bytes maximum slop 422 | 6 MiB total memory in use (0 MiB lost due to fragmentation) 423 | 424 | Tot time (elapsed) Avg pause Max pause 425 | Gen 0 207 colls, 0 par 0.009s 0.010s 0.0001s 0.0002s 426 | Gen 1 4 colls, 0 par 0.001s 0.001s 0.0004s 0.0005s 427 | 428 | TASKS: 4 (1 bound, 3 peak workers (3 total), using -N1) 429 | 430 | SPARKS: 0 (0 converted, 0 overflowed, 0 dud, 0 GC'd, 0 fizzled) 431 | 432 | INIT time 0.006s ( 0.006s elapsed) 433 | MUT time 0.367s ( 0.360s elapsed) 434 | GC time 0.010s ( 0.011s elapsed) 435 | RP time 0.000s ( 0.000s elapsed) 436 | PROF time 0.000s ( 0.000s elapsed) 437 | EXIT time 0.001s ( 0.001s elapsed) 438 | Total time 0.384s ( 0.380s elapsed) 439 | 440 | 3. Cost center profile (-p) 441 | 442 | Dumped to speed0.prof 443 | 444 | COST CENTRE MODULE SRC %time %alloc 445 | 446 | token MarkupParse src/MarkupParse.hs:(259,1)-(260,20) 50.2 50.4 447 | wrappedQ' MarkupParse.FlatParse src/MarkupParse/FlatParse.hs:(215,1)-(217,78) 20.8 23.1 448 | ws_ MarkupParse.FlatParse src/MarkupParse/FlatParse.hs:(135,1)-(146,4) 14.3 5.5 449 | eq MarkupParse.FlatParse src/MarkupParse/FlatParse.hs:243:1-30 10.6 11.1 450 | gather MarkupParse src/MarkupParse.hs:(420,1)-(428,100) 2.4 3.7 451 | runParser FlatParse.Basic src/FlatParse/Basic.hs:(217,1)-(225,24) 1.0 6.0 452 | 453 | 4. heap analysis (-hc -l) 454 | 455 | eventlog2html speed0.eventlog 456 | 457 | Produces speed0.eventlog.html which contains heap charts. 458 | 459 | 460 | ### Cache speed 461 | 462 | The average cycles per + operation can get down to about 0.7 cycles, and there are about 4 cache registers per cycle, so a sum pipeline uses 2.8 register instructions per +. 463 | 464 | 465 | 466 | 467 | 468 | 469 | 470 | 471 | 472 | 473 | 474 | 475 | 476 | 477 | 478 | 479 | 480 | 481 | 482 | 483 | 484 | 485 | 486 | 487 | 488 | 489 | 490 | 491 | 492 | 493 | 494 | 495 | 496 | 497 | 498 | 499 | 500 | 501 | 502 | 503 | 504 | 505 | 506 | 507 | 508 | 509 | 510 | 511 | 512 | 513 | 514 | 515 | 516 | 517 | 518 | 519 | 520 | 521 | 522 | 523 | 524 |
CachensecsCycles
register0.14 per cycle
L1 Cache access13-4 cycles
L2 Cache access411-12 cycles
L3 unified access1430 - 40
DRAM hit80195 cycles
L1 miss1640 cycles
L2 miss>250>600 cycles
525 | 526 | -------------------------------------------------------------------------------- /src/Perf.hs: -------------------------------------------------------------------------------- 1 | -- | == Introduction 2 | -- 3 | -- @perf@ provides tools for measuring the runtime performance of Haskell functions. It includes: 4 | -- 5 | -- - time measurement via the [clock](https://hackage.haskell.org/package/clock) library. 6 | -- 7 | -- - a polymorphic approach to what a 'Measure' is so that a wide variety of measurements such as counting, space and time measurement can share the same API. 8 | -- 9 | -- - 'PerfT' which is a monad transformer designed to add the collection of performance information to existing code. Running the code produces a tuple of the original computation results, and a Map of performance measurements that were specified. 10 | -- 11 | -- - functionality to determine performance order, in 'Perf.BigO' 12 | -- 13 | -- - reporting functionality encapsulated in 'Perf.Report'. @perf@ can be run via 'cabal bench' and will, for example, error on performance degradation; see the project's cabal file for an example. 14 | module Perf 15 | ( -- * re-exports 16 | module Perf.Types, 17 | -- | Representation of what a Performance 'Measure' is. 18 | module Perf.Measure, 19 | -- | Low-level time performance 'Measure' counting 'Nanos' 20 | module Perf.Time, 21 | -- | Low-level space performance 'Measure's based on GHC's allocation statistics. 22 | module Perf.Space, 23 | -- | Simple loop counter 24 | module Perf.Count, 25 | -- | Various (fast loop) algorithms that have been used for testing perf functionality. 26 | module Perf.Algos, 27 | -- | Order of complexity computations 28 | module Perf.BigO, 29 | -- | Reporting 30 | module Perf.Report, 31 | -- | Statistical support 32 | module Perf.Stats, 33 | ) 34 | where 35 | 36 | import Perf.Algos 37 | import Perf.BigO 38 | import Perf.Count 39 | import Perf.Measure 40 | import Perf.Report 41 | import Perf.Space 42 | import Perf.Stats 43 | import Perf.Time 44 | import Perf.Types 45 | import Prelude hiding (cycle) 46 | -------------------------------------------------------------------------------- /src/Perf/Algos.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE LambdaCase #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# OPTIONS_GHC -Wno-name-shadowing #-} 4 | {-# OPTIONS_GHC -Wno-unrecognised-pragmas #-} 5 | 6 | {-# HLINT ignore "Redundant lambda" #-} 7 | {-# HLINT ignore "Avoid lambda" #-} 8 | {-# HLINT ignore "Use foldr" #-} 9 | {-# HLINT ignore "Use sum" #-} 10 | 11 | -- | Algorithms and functions for testing purposes 12 | module Perf.Algos 13 | ( -- * command-line options 14 | Example (..), 15 | parseExample, 16 | ExamplePattern (..), 17 | examplePattern, 18 | exampleLabel, 19 | testExample, 20 | tastyExample, 21 | 22 | -- * sum algorithms 23 | SumPattern (..), 24 | allSums, 25 | testSum, 26 | statSums, 27 | sumTail, 28 | sumTailLazy, 29 | sumFlip, 30 | sumFlipLazy, 31 | sumCo, 32 | sumCoGo, 33 | sumCoCase, 34 | sumAux, 35 | sumFoldr, 36 | sumCata, 37 | sumSum, 38 | sumMono, 39 | sumPoly, 40 | sumLambda, 41 | sumF, 42 | sumFuse, 43 | sumFusePoly, 44 | sumFuseFoldl', 45 | sumFuseFoldr, 46 | 47 | -- * length algorithms 48 | LengthPattern (..), 49 | allLengths, 50 | testLength, 51 | statLengths, 52 | 53 | -- * length 54 | lengthTail, 55 | lengthTailLazy, 56 | lengthFlip, 57 | lengthFlipLazy, 58 | lengthCo, 59 | lengthCoCase, 60 | lengthAux, 61 | lengthFoldr, 62 | lengthFoldrConst, 63 | lengthF, 64 | lengthFMono, 65 | 66 | -- * recursion patterns 67 | recurseTail, 68 | recurseTailLazy, 69 | recurseFlip, 70 | recurseFlipLazy, 71 | recurseCo, 72 | recurseCoLazy, 73 | recurseCata, 74 | 75 | -- * miscellaneous 76 | mapInc, 77 | constFuse, 78 | splitHalf, 79 | ) 80 | where 81 | 82 | import Control.Monad (void) 83 | import Control.Monad.IO.Class (MonadIO (..)) 84 | import Data.Bifunctor 85 | import Data.Foldable 86 | import Data.Functor.Foldable 87 | import Data.List qualified as List 88 | import Data.Map.Strict qualified as Map 89 | import Data.Text (Text) 90 | import Options.Applicative 91 | import Options.Applicative.Help.Pretty 92 | import Perf.Types 93 | import Test.Tasty.Bench 94 | 95 | -- | Algorithm examples for testing 96 | data Example = ExampleSumFuse | ExampleSum | ExampleLengthF | ExampleConstFuse | ExampleMapInc | ExampleNoOp | ExampleNub | ExampleFib deriving (Eq, Show) 97 | 98 | -- | Parse command-line options for algorithm examples. 99 | parseExample :: Parser Example 100 | parseExample = 101 | flag' ExampleSumFuse (long "sumFuse" <> help "fused sum pipeline") 102 | <|> flag' ExampleSum (long "sum" <> style (annotate bold) <> help "sum") 103 | <|> flag' ExampleLengthF (long "lengthF" <> help "foldr id length") 104 | <|> flag' ExampleConstFuse (long "constFuse" <> help "fused const pipeline") 105 | <|> flag' ExampleMapInc (long "mapInc" <> help "fmap (+1)") 106 | <|> flag' ExampleNoOp (long "noOp" <> help "const ()") 107 | <|> flag' ExampleFib (long "fib" <> help "fibonacci") 108 | <|> flag' ExampleNub (long "nub" <> help "List.nub") 109 | <|> pure ExampleSum 110 | 111 | -- | Unification of example function applications 112 | data ExamplePattern a 113 | = PatternSumFuse Text ((Num a) => (a -> a)) a 114 | | PatternSum Text ((Num a) => [a] -> a) [a] 115 | | PatternLengthF Text ([a] -> Int) [a] 116 | | PatternConstFuse Text (Int -> ()) Int 117 | | PatternMapInc Text ([Int] -> [Int]) [Int] 118 | | PatternNoOp Text (() -> ()) () 119 | | PatternNub Text ([Int] -> [Int]) [Int] 120 | | PatternFib Text (Int -> Integer) Int 121 | 122 | -- | Labels 123 | exampleLabel :: ExamplePattern a -> Text 124 | exampleLabel (PatternSumFuse l _ _) = l 125 | exampleLabel (PatternSum l _ _) = l 126 | exampleLabel (PatternLengthF l _ _) = l 127 | exampleLabel (PatternConstFuse l _ _) = l 128 | exampleLabel (PatternMapInc l _ _) = l 129 | exampleLabel (PatternNoOp l _ _) = l 130 | exampleLabel (PatternNub l _ _) = l 131 | exampleLabel (PatternFib l _ _) = l 132 | 133 | -- | Convert an 'Example' to an 'ExamplePattern'. 134 | examplePattern :: Example -> Int -> ExamplePattern Int 135 | examplePattern ExampleSumFuse l = PatternSumFuse "sumFuse" sumFuse l 136 | examplePattern ExampleSum l = PatternSum "sum" sum [1 .. l] 137 | examplePattern ExampleLengthF l = PatternLengthF "lengthF" lengthF [1 .. l] 138 | examplePattern ExampleConstFuse l = PatternConstFuse "constFuse" constFuse l 139 | examplePattern ExampleMapInc l = PatternMapInc "mapInc" mapInc [1 .. l] 140 | examplePattern ExampleNoOp _ = PatternNoOp "noop" (const ()) () 141 | examplePattern ExampleNub l = PatternNub "nub" List.nub [1 .. l] 142 | examplePattern ExampleFib l = PatternFib "fib" fib l 143 | 144 | -- | Convert an 'ExamplePattern' to a 'PerfT'. 145 | testExample :: (Semigroup a, MonadIO m) => ExamplePattern Int -> PerfT m a () 146 | testExample (PatternSumFuse label f a) = void $ ffap label f a 147 | testExample (PatternSum label f a) = void $ ffap label f a 148 | testExample (PatternLengthF label f a) = void $ ffap label f a 149 | testExample (PatternConstFuse label f a) = void $ ffap label f a 150 | testExample (PatternMapInc label f a) = void $ ffap label f a 151 | testExample (PatternNoOp label f a) = void $ ffap label f a 152 | testExample (PatternNub label f a) = void $ ffap label f a 153 | testExample (PatternFib label f a) = void $ ffap label f a 154 | 155 | -- | Convert an 'ExamplePattern' to a tasty-bench run. 156 | tastyExample :: ExamplePattern Int -> Benchmarkable 157 | tastyExample (PatternSumFuse _ f a) = nf f a 158 | tastyExample (PatternSum _ f a) = nf f a 159 | tastyExample (PatternLengthF _ f a) = nf f a 160 | tastyExample (PatternConstFuse _ f a) = nf f a 161 | tastyExample (PatternMapInc _ f a) = nf f a 162 | tastyExample (PatternNoOp _ f a) = nf f a 163 | tastyExample (PatternNub _ f a) = nf f a 164 | tastyExample (PatternFib _ f a) = nf f a 165 | 166 | -- | Unification of sum function applications 167 | data SumPattern a 168 | = SumFuse Text (Int -> Int) Int 169 | | SumFusePoly Text ((Enum a, Num a) => a -> a) a 170 | | SumPoly Text ((Num a) => [a] -> a) [a] 171 | | SumMono Text ([Int] -> Int) [Int] 172 | 173 | -- | All the sum algorithms. 174 | allSums :: Int -> [SumPattern Int] 175 | allSums l = 176 | [ SumPoly "sumTail" sumTail [1 .. l], 177 | SumPoly "sumTailLazy" sumTailLazy [1 .. l], 178 | SumPoly "sumFlip" sumFlip [1 .. l], 179 | SumPoly "sumFlipLazy" sumFlipLazy [1 .. l], 180 | SumPoly "sumCo" sumCo [1 .. l], 181 | SumPoly "sumCoGo" sumCoGo [1 .. l], 182 | SumPoly "sumCoCase" sumCoCase [1 .. l], 183 | SumPoly "sumAux" sumAux [1 .. l], 184 | SumPoly "sumFoldr" sumFoldr [1 .. l], 185 | SumPoly "sumCata" sumCata [1 .. l], 186 | SumPoly "sumSum" sumSum [1 .. l], 187 | SumMono "sumMono" sumMono [1 .. l], 188 | SumPoly "sumPoly" sumPoly [1 .. l], 189 | SumPoly "sumLambda" sumLambda [1 .. l], 190 | SumPoly "sumF" sumF [1 .. l], 191 | SumFuse "sumFuse" sumFuse l, 192 | SumFusePoly "sumFusePoly" sumFusePoly l, 193 | SumFuse "sumFuseFoldl'" sumFuseFoldl' l, 194 | SumFuse "sumFuseFoldr" sumFuseFoldr l 195 | ] 196 | 197 | -- | Convert an 'SumPattern' to a 'PerfT'. 198 | testSum :: (Semigroup a, MonadIO m) => SumPattern Int -> PerfT m a Int 199 | testSum (SumFuse label f a) = fap label f a 200 | testSum (SumFusePoly label f a) = fap label f a 201 | testSum (SumMono label f a) = fap label f a 202 | testSum (SumPoly label f a) = fap label f a 203 | 204 | -- | Run a sum algorithm measurement. 205 | statSums :: (MonadIO m) => Int -> Int -> (Int -> Measure m [a]) -> m (Map.Map Text [a]) 206 | statSums n l m = execPerfT (m n) $ mapM_ testSum (allSums l) 207 | 208 | -- | tail resursive 209 | sumTail :: (Num a) => [a] -> a 210 | sumTail = go 0 211 | where 212 | go acc [] = acc 213 | go acc (x : xs) = go (x + acc) $! xs 214 | 215 | -- | lazy recursion. 216 | sumTailLazy :: (Num a) => [a] -> a 217 | sumTailLazy = go 0 218 | where 219 | go acc [] = acc 220 | go acc (x : xs) = go (x + acc) $! xs 221 | 222 | -- | With argument order flipped 223 | sumFlip :: (Num a) => [a] -> a 224 | sumFlip xs0 = go xs0 0 225 | where 226 | go [] s = s 227 | go (x : xs) s = go xs $! x + s 228 | 229 | -- | Lazy with argument order flipped. 230 | sumFlipLazy :: (Num a) => [a] -> a 231 | sumFlipLazy xs0 = go xs0 0 232 | where 233 | go [] s = s 234 | go (x : xs) s = go xs $ x + s 235 | 236 | -- | Co-routine style 237 | sumCo :: (Num a) => [a] -> a 238 | sumCo [] = 0 239 | sumCo (x : xs) = x + sumCo xs 240 | 241 | -- | Co-routine, go style 242 | sumCoGo :: (Num a) => [a] -> a 243 | sumCoGo = go 244 | where 245 | go [] = 0 246 | go (x : xs) = x + go xs 247 | 248 | -- | Co-routine, case-style 249 | sumCoCase :: (Num a) => [a] -> a 250 | sumCoCase = \case 251 | [] -> 0 252 | (x : xs) -> x + sumCoCase xs 253 | 254 | -- | Auxillary style. 255 | sumAux :: (Num a) => [a] -> a 256 | sumAux = \case 257 | [] -> b 258 | (x : xs) -> f x (sumAux xs) 259 | where 260 | b = 0 261 | f x xs = x + xs 262 | 263 | -- | foldr style 264 | sumFoldr :: (Num a) => [a] -> a 265 | sumFoldr xs = foldr (+) 0 xs 266 | 267 | -- | cata style 268 | sumCata :: (Num a) => [a] -> a 269 | sumCata = cata $ \case 270 | Nil -> 0 271 | Cons x acc -> x + acc 272 | 273 | -- | sum 274 | sumSum :: (Num a) => [a] -> a 275 | sumSum xs = sum xs 276 | 277 | -- | Monomorphic sum 278 | sumMono :: [Int] -> Int 279 | sumMono xs = foldl' (+) 0 xs 280 | 281 | -- | Polymorphic sum 282 | sumPoly :: (Num a) => [a] -> a 283 | sumPoly xs = foldl' (+) 0 xs 284 | 285 | -- | Lambda-style sum 286 | sumLambda :: (Num a) => [a] -> a 287 | sumLambda = \xs -> foldl' (+) 0 xs 288 | 289 | sumF' :: (Num a) => a -> (a -> a) -> a -> a 290 | sumF' x r = \ !a -> r (x + a) 291 | 292 | -- | GHC-style foldr method. 293 | sumF :: (Num a) => [a] -> a 294 | sumF xs = foldr sumF' id xs 0 295 | 296 | -- | Fusion check 297 | sumFuse :: Int -> Int 298 | sumFuse x = sum [1 .. x] 299 | 300 | -- | Fusion under polymorph 301 | sumFusePoly :: (Enum a, Num a) => a -> a 302 | sumFusePoly x = sum [1 .. x] 303 | 304 | -- | foldl' fusion 305 | sumFuseFoldl' :: Int -> Int 306 | sumFuseFoldl' x = foldl' (+) 0 [1 .. x] 307 | 308 | -- | foldr fusion 309 | sumFuseFoldr :: Int -> Int 310 | sumFuseFoldr x = foldr (+) 0 [1 .. x] 311 | 312 | -- | Unification of length function applications 313 | data LengthPattern a 314 | = LengthPoly Text ([a] -> Int) [a] 315 | | LengthMono Text ([Int] -> Int) [Int] 316 | 317 | -- | All the length algorithms. 318 | allLengths :: Int -> [LengthPattern Int] 319 | allLengths l = 320 | [ LengthPoly "lengthTail" lengthTail [1 .. l], 321 | LengthPoly "lengthTailLazy" lengthTailLazy [1 .. l], 322 | LengthPoly "lengthFlip" lengthFlip [1 .. l], 323 | LengthPoly "lengthFlipLazy" lengthFlipLazy [1 .. l], 324 | LengthPoly "lengthCo" lengthCo [1 .. l], 325 | LengthPoly "lengthCoCase" lengthCoCase [1 .. l], 326 | LengthPoly "lengthAux" lengthAux [1 .. l], 327 | LengthPoly "lengthFoldr" lengthFoldr [1 .. l], 328 | LengthPoly "lengthFoldrConst" lengthFoldrConst [1 .. l], 329 | LengthPoly "lengthF" lengthF [1 .. l], 330 | LengthMono "lengthFMono" lengthFMono [1 .. l] 331 | ] 332 | 333 | -- | Convert an 'LengthPattern' to a 'PerfT'. 334 | testLength :: (Semigroup a, MonadIO m) => LengthPattern Int -> PerfT m a Int 335 | testLength (LengthMono label f a) = fap label f a 336 | testLength (LengthPoly label f a) = fap label f a 337 | 338 | -- | Run a lengths algorithm 339 | statLengths :: (MonadIO m) => Int -> Int -> (Int -> Measure m [a]) -> m (Map.Map Text [a]) 340 | statLengths n l m = execPerfT (m n) $ mapM_ testLength (allLengths l) 341 | 342 | -- | tail resursive 343 | lengthTail :: [a] -> Int 344 | lengthTail xs0 = go 0 xs0 345 | where 346 | go s [] = s 347 | go s (_ : xs) = go (s + 1) $! xs 348 | 349 | -- | lazy recursion. 350 | lengthTailLazy :: [a] -> Int 351 | lengthTailLazy xs0 = go 0 xs0 352 | where 353 | go s [] = s 354 | go s (_ : xs) = go (s + 1) xs 355 | 356 | -- | With argument order flipped 357 | lengthFlip :: [a] -> Int 358 | lengthFlip xs0 = go xs0 0 359 | where 360 | go [] s = s 361 | go (_ : xs) s = go xs $! s + 1 362 | 363 | -- | Lazy with argument order flipped. 364 | lengthFlipLazy :: [a] -> Int 365 | lengthFlipLazy xs0 = go xs0 0 366 | where 367 | go [] s = s 368 | go (_ : xs) s = go xs $ s + 1 369 | 370 | -- | Co-routine style 371 | lengthCo :: [a] -> Int 372 | lengthCo [] = 0 373 | lengthCo (_ : xs) = 1 + lengthCo xs 374 | 375 | -- | Co-routine style as a Case statement. 376 | lengthCoCase :: [a] -> Int 377 | lengthCoCase = \case 378 | [] -> 0 379 | (_ : xs) -> 1 + lengthCoCase xs 380 | 381 | -- | Auxillary version. 382 | lengthAux :: [a] -> Int 383 | lengthAux = \case 384 | [] -> b 385 | (x : xs) -> f x (lengthAux xs) 386 | where 387 | b = 0 388 | f _ xs = 1 + xs 389 | 390 | -- | foldr style 391 | lengthFoldr :: [a] -> Int 392 | lengthFoldr = foldr f b 393 | where 394 | b = 0 395 | f _ xs = 1 + xs 396 | 397 | -- | foldr style with explicit const usage. 398 | lengthFoldrConst :: [a] -> Int 399 | lengthFoldrConst = foldr (const (1 +)) 0 400 | 401 | {- 402 | -- from base: 403 | -- https://hackage.haskell.org/package/base-4.16.0.0/docs/src/GHC.List.html#length 404 | -- The lambda form turns out to be necessary to make this inline 405 | -- when we need it to and give good performance. 406 | {-# INLINE [0] lengthFB #-} 407 | lengthFB :: x -> (Int -> Int) -> Int -> Int 408 | lengthFB _ r !a = r (a + 1) 409 | 410 | -} 411 | lengthF' :: (Num a) => x -> (a -> a) -> a -> a 412 | lengthF' _ r = \ !a -> r (a + 1) 413 | 414 | -- | GHC style 415 | lengthF :: [a] -> Int 416 | lengthF xs0 = foldr lengthF' id xs0 0 417 | 418 | -- | Monomorphic, GHC style 419 | lengthFMono :: [Int] -> Int 420 | lengthFMono xs0 = foldr lengthF' id xs0 0 421 | 422 | -- * recursion patterns 423 | 424 | -- | Tail recursion 425 | recurseTail :: (a -> b -> b) -> b -> [a] -> b 426 | recurseTail f = go 427 | where 428 | go s [] = s 429 | go s (x : xs) = go (f x s) $! xs 430 | 431 | -- | Lazy tail recursion 432 | recurseTailLazy :: (a -> b -> b) -> b -> [a] -> b 433 | recurseTailLazy f = go 434 | where 435 | go s [] = s 436 | go s (x : xs) = go (f x s) xs 437 | 438 | -- | Tail resursion with flipped argument order. 439 | recurseFlip :: (a -> b -> b) -> b -> [a] -> b 440 | recurseFlip f s0 xs0 = go xs0 s0 441 | where 442 | go [] s = s 443 | go (x : xs) s = go xs $! f x s 444 | 445 | -- | Lazy tail resursion with flipped argument order. 446 | recurseFlipLazy :: (a -> b -> b) -> b -> [a] -> b 447 | recurseFlipLazy f s0 xs0 = go xs0 s0 448 | where 449 | go [] s = s 450 | go (x : xs) s = go xs $ f x s 451 | 452 | -- | Coroutine 453 | recurseCo :: (a -> b -> b) -> b -> [a] -> b 454 | recurseCo f s0 = go 455 | where 456 | go [] = s0 457 | go (x : xs) = f x $! go xs 458 | 459 | -- | Lazy, coroutine 460 | recurseCoLazy :: (a -> b -> b) -> b -> [a] -> b 461 | recurseCoLazy f s0 = go 462 | where 463 | go [] = s0 464 | go (x : xs) = f x $ go xs 465 | 466 | -- | Cata style 467 | recurseCata :: (a -> b -> b) -> b -> [a] -> b 468 | recurseCata f s0 = cata $ \case 469 | Nil -> s0 470 | Cons x acc -> f x acc 471 | 472 | -- * miscellaneous 473 | 474 | -- | Test of const fusion 475 | constFuse :: Int -> () 476 | constFuse x = foldl' const () [1 .. x] 477 | 478 | -- | Increment a list. 479 | mapInc :: [Int] -> [Int] 480 | mapInc xs = fmap (+ 1) xs 481 | 482 | -- | Split a list. 483 | splitHalf :: [a] -> ([a], [a]) 484 | splitHalf xs = go xs xs 485 | where 486 | go (y : ys) (_ : _ : zs) = first (y :) (go ys zs) 487 | go ys _ = ([], ys) 488 | 489 | -- | Fibonnacci 490 | fib :: Int -> Integer 491 | fib n = if n < 2 then toInteger n else fib (n - 1) + fib (n - 2) 492 | -------------------------------------------------------------------------------- /src/Perf/BigO.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | -- | [Order of complexity](https://en.wikibooks.org/wiki/Optimizing_Code_for_Speed/Order_of_Complexity_Optimizations#:~:text=of%2DComplexity%20Reduction-,What%20is%20order%20of%20complexity%3F,*log(N)) calculations. 4 | module Perf.BigO 5 | ( O (..), 6 | olist, 7 | promote, 8 | promote1, 9 | promote_, 10 | demote, 11 | demote1, 12 | spectrum, 13 | Order (..), 14 | bigO, 15 | runtime, 16 | BigOrder (..), 17 | fromOrder, 18 | toOrder, 19 | order, 20 | diffs, 21 | bestO, 22 | estO, 23 | estOs, 24 | makeNs, 25 | OrderOptions (..), 26 | defaultOrderOptions, 27 | parseOrderOptions, 28 | ) 29 | where 30 | 31 | import Data.Bool 32 | import Data.FormatN 33 | import Data.List qualified as List 34 | import Data.Maybe 35 | import Data.Monoid 36 | import Data.Vector qualified as V 37 | import GHC.Generics 38 | import Options.Applicative 39 | import Prettyprinter 40 | import Prelude 41 | 42 | -- $setup 43 | -- >>> import qualified Data.List as List 44 | -- >>> o = Order [0.0,1.0,100.0,0.0,0.0,0.0,0.0,0.0] 45 | -- >>> ms = [2805.0,3476.0,9989.0,92590.0,1029074.6947660954] 46 | -- >>> ns = [1,10,100,1000,10000] 47 | 48 | -- | order type 49 | data O 50 | = -- | cubic 51 | N3 52 | | -- | quadratic 53 | N2 54 | | -- | ^3/2 55 | N32 56 | | -- | N * log N 57 | NLogN 58 | | -- | linear 59 | N1 60 | | -- | sqrt N 61 | N12 62 | | -- | log N 63 | LogN 64 | | -- | constant 65 | N0 66 | deriving (Eq, Ord, Show, Generic, Enum) 67 | 68 | -- | enumeration of O types 69 | -- 70 | -- >>> olist 71 | -- [N3,N2,N32,NLogN,N1,N12,LogN,N0] 72 | olist :: [O] 73 | olist = [N3 .. N0] 74 | 75 | -- | functions to compute performance measure 76 | -- 77 | -- >>> fmap ($ 0) promote_ 78 | -- [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0] 79 | -- 80 | -- >>> fmap ($ 1) promote_ 81 | -- [1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0] 82 | -- 83 | -- Ordering makes sense around N=10 84 | -- 85 | -- >>> fmap ($ 10) promote_ 86 | -- [1000.0,100.0,31.622776601683793,23.02585092994046,10.0,3.1622776601683795,2.302585092994046,1.0] 87 | -- 88 | -- Having NP may cause big num problems 89 | -- 90 | -- >>> fmap ($ 1000) promote_ 91 | -- [1.0e9,1000000.0,31622.776601683792,6907.755278982137,1000.0,31.622776601683793,6.907755278982137,1.0] 92 | promote_ :: [Double -> Double] 93 | promote_ = 94 | [ -- \n -> min maxBound (bool (2**n) zero (n<=zero)), 95 | (^ (3 :: Integer)), 96 | (^ (2 :: Integer)), 97 | (** 1.5), 98 | \n -> bool (bool (n * log n) 1 (n <= 1)) 0 (n <= 0), 99 | id, 100 | (** 0.5), 101 | \n -> bool (bool (log n) 1 (n <= 1)) 0 (n <= 0), 102 | \n -> bool 1 0 (n <= 0) 103 | ] 104 | 105 | -- | a set of factors for each order, which represents a full Order specification. 106 | newtype Order a = Order {factors :: [a]} deriving (Eq, Ord, Show, Generic, Functor) 107 | 108 | -- | create an Order 109 | -- 110 | -- >>> order N2 1 111 | -- Order {factors = [0,1,0,0,0,0,0,0]} 112 | order :: (Num a) => O -> a -> Order a 113 | order o a = Order $ replicate n 0 <> [a] <> replicate (7 - n) 0 114 | where 115 | n = fromEnum o 116 | 117 | -- | Calculate the expected performance measure 118 | -- 119 | -- >>> promote (order N2 1) 10 120 | -- 100.0 121 | promote :: Order Double -> Double -> Double 122 | promote (Order fs) n = sum (zipWith (*) fs (($ n) <$> promote_)) 123 | 124 | -- | Calculate the expected performance measure per n 125 | -- 126 | -- >>> promote (order N2 1) 10 127 | -- 100.0 128 | promote1 :: Order Double -> Double 129 | promote1 o = promote o 1 130 | 131 | -- | Calculate an Order from a given O, an n, and a total performance measurement 132 | -- 133 | -- A measurement of 1e6 for n=1000 with an order of N2 is: 134 | -- 135 | -- >>> demote N2 1000 1000000 136 | -- Order {factors = [0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0]} 137 | -- 138 | -- > promote (demote N2 n m) n m == m 139 | demote :: O -> Double -> Double -> Order Double 140 | demote o n m = order o (m / (promote_ List.!! fromEnum o) n) 141 | 142 | -- | Calculate an Order from a measure, and an O 143 | -- 144 | -- >>> demote1 N2 1000 145 | -- Order {factors = [0.0,1000.0,0.0,0.0,0.0,0.0,0.0,0.0]} 146 | -- 147 | -- > demote1 N2 m == demote o 1 m 148 | demote1 :: O -> Double -> Order Double 149 | demote1 o m = demote o 1 m 150 | 151 | -- | find the dominant order, and it's factor 152 | -- 153 | -- >>> bigO o 154 | -- (N2,1.0) 155 | bigO :: (Ord a, Num a) => Order a -> (O, a) 156 | bigO (Order os) = (toEnum b, os List.!! b) 157 | where 158 | b = fromMaybe 7 $ List.findIndex (> 0) os 159 | 160 | -- | compute the runtime component of an Order, defined as the 161 | -- difference between the dominant order and the total for a single run. 162 | -- 163 | -- >>> runtime o 164 | -- 100.0 165 | runtime :: Order Double -> Double 166 | runtime (Order os) = promote (Order r) 1 167 | where 168 | b = fromMaybe 7 $ List.findIndex (> 0) os 169 | r = take b os <> [0] <> drop (b + 1) os 170 | 171 | instance (Num a) => Num (Order a) where 172 | -- 0 = Order $ replicate 9 0 173 | (+) (Order o) (Order o') = 174 | Order (zipWith (+) o o') 175 | negate (Order o) = Order $ negate <$> o 176 | (*) (Order o) (Order o') = 177 | Order (zipWith (*) o o') 178 | abs = undefined 179 | signum = undefined 180 | fromInteger x = Order $ replicate 9 (fromInteger x) 181 | 182 | -- | A set of factors consisting of the dominant order, the dominant order factor and a constant factor 183 | data BigOrder a = BigOrder {bigOrder :: O, bigFactor :: a} deriving (Eq, Ord, Show, Generic, Functor) 184 | 185 | instance Pretty (BigOrder Double) where 186 | pretty (BigOrder o f) = pretty (decimal (Just 2) f) <> " * O(" <> viaShow o <> ")" 187 | 188 | -- | compute the BigOrder 189 | -- 190 | -- >>> fromOrder o 191 | -- BigOrder {bigOrder = N2, bigFactor = 1.0} 192 | fromOrder :: Order Double -> BigOrder Double 193 | fromOrder o' = BigOrder o f 194 | where 195 | (o, f) = bigO o' 196 | 197 | -- | convert a BigOrder to an Order. 198 | -- 199 | -- toOrder . fromOrder is not a round trip iso. 200 | -- 201 | -- >>> toOrder (fromOrder o) 202 | -- Order {factors = [0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0]} 203 | toOrder :: BigOrder Double -> Order Double 204 | toOrder (BigOrder o f) = order o f 205 | 206 | -- | The factor for each O given an n, and a measurement. 207 | -- 208 | -- >>> spectrum 100 10000 209 | -- Order {factors = [1.0e-2,1.0,10.0,21.71472409516259,100.0,1000.0,2171.4724095162587,10000.0]} 210 | spectrum :: Double -> Double -> Order Double 211 | spectrum n m = Order ((m /) . ($ n) <$> promote_) 212 | 213 | -- | The errors for a list of n's and measurements, based on the spectrum of the last measurement. 214 | diffs :: [Double] -> [Double] -> [[Double]] 215 | diffs ns ms = List.transpose $ zipWith (\n m -> zipWith (\o' f -> m - promote (order o' f) n) olist fs) ns ms 216 | where 217 | fs = factors (spectrum (List.last ns) (List.last ms)) 218 | 219 | -- | minimum error order for a list of measurements 220 | -- 221 | -- >>> bestO ns ms 222 | -- N1 223 | bestO :: [Double] -> [Double] -> O 224 | bestO ns ms = 225 | toEnum $ 226 | V.minIndex $ 227 | V.fromList 228 | (sum <$> fmap (fmap abs) (diffs ns ms)) 229 | 230 | -- | fit the best order for the last measurement and return it, and the error terms for the measurements 231 | -- 232 | -- >>> estO ns ms 233 | -- (Order {factors = [0.0,0.0,0.0,0.0,102.90746947660953,0.0,0.0,0.0]},[2702.0925305233905,2446.9253052339045,-301.7469476609531,-10317.469476609534,0.0]) 234 | estO :: [Double] -> [Double] -> (Order Double, [Double]) 235 | estO [] _ = (0, []) 236 | estO ns ms = (lasto, diff) 237 | where 238 | diff = diffs ns ms List.!! fromEnum o 239 | o = bestO ns ms 240 | lasto = demote o (List.last ns) (List.last ms) 241 | 242 | -- | fit orders from the last measurement to the first, using the residuals at each step. 243 | -- 244 | -- >>> estOs ns ms 245 | -- [Order {factors = [0.0,0.0,0.0,0.0,102.90746947660953,0.0,0.0,0.0]},Order {factors = [0.0,0.0,-0.32626703235351473,0.0,0.0,0.0,0.0,0.0]},Order {factors = [0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.520084692561625]},Order {factors = [0.0,0.0,0.0,0.0,0.0,0.0,0.0,2432.722690017952]},Order {factors = [0.0,0.0,0.0,0.0,0.0,0.0,0.0,245.1760228452299]}] 246 | estOs :: [Double] -> [Double] -> [Order Double] 247 | estOs ns ms = go [] ns ms 248 | where 249 | go os _ [] = os 250 | go os _ [m] = os <> [order N0 m] 251 | go os ns' ms' = let (o', res) = estO ns' ms' in go (os <> [o']) (List.init ns') (List.init res) 252 | 253 | makeNs :: Int -> Double -> Int -> [Int] 254 | makeNs n0 d low = reverse $ go (next n0) [n0] 255 | where 256 | next n = floor (fromIntegral n / d) 257 | go :: Int -> [Int] -> [Int] 258 | go n acc = bool (go (next n) (acc <> [n])) acc (low >= n) 259 | 260 | data OrderOptions = OrderOptions 261 | { doOrder :: Bool, 262 | orderLow :: Int, 263 | orderDivisor :: Double 264 | } 265 | deriving (Eq, Show, Generic) 266 | 267 | defaultOrderOptions :: OrderOptions 268 | defaultOrderOptions = OrderOptions False 10 9 269 | 270 | parseOrderOptions :: OrderOptions -> Parser OrderOptions 271 | parseOrderOptions def = 272 | OrderOptions 273 | <$> switch (long "order" <> short 'o' <> help "calculate order") 274 | <*> option auto (value (orderLow def) <> long "orderlowest" <> showDefaultWith show <> metavar "DOUBLE" <> help "smallest order test") 275 | <*> option auto (value (orderDivisor def) <> long "orderdivisor" <> showDefaultWith show <> metavar "DOUBLE" <> help "divisor for order computation") 276 | -------------------------------------------------------------------------------- /src/Perf/Chart.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedLabels #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | module Perf.Chart where 5 | 6 | import Chart 7 | import Control.Category ((>>>)) 8 | import Data.Bifunctor 9 | import Data.Bool 10 | import Data.List qualified as List 11 | import Data.Map.Strict qualified as Map 12 | import Data.Maybe 13 | import Data.Text (Text) 14 | import Data.Text qualified as Text 15 | import GHC.Generics 16 | import Optics.Core 17 | import Options.Applicative 18 | import Perf.Stats as Perf 19 | import Prettychart 20 | 21 | -- m <- fromDump defaultPerfDumpOptions 22 | data PerfChartOptions 23 | = PerfChartOptions 24 | { doChart :: Bool, 25 | chartFilepath :: FilePath, 26 | truncateAt :: Double, 27 | doSmallChart :: Bool, 28 | doBigChart :: Bool, 29 | doHistChart :: Bool, 30 | doAveragesLegend :: Bool, 31 | averagesStyle :: Style, 32 | averagesPaletteStart :: Int, 33 | averagesLegend :: LegendOptions, 34 | smallStyle :: Style, 35 | smallHud :: HudOptions, 36 | bigStyle :: Style, 37 | bigHud :: HudOptions, 38 | titleSize :: Double, 39 | histGrain :: Int, 40 | bigWidth :: Double, 41 | excludeZeros :: Bool 42 | } 43 | deriving (Eq, Show, Generic) 44 | 45 | defaultPerfChartOptions :: PerfChartOptions 46 | defaultPerfChartOptions = PerfChartOptions False "other/perf.svg" 10 True True True True (defaultGlyphStyle & set #size 0.05) 2 (defaultLegendOptions & set #place PlaceBottom & set #numStacks 3 & set #scaleChartsBy 0.2 & set #legendSize 0.3 & set #alignCharts AlignLeft & set #hgap (-0.2) & set #vgap (-0.1)) (defaultGlyphStyle & set #size 0.01 & over #color (rgb (palette 0)) & set (#color % opac') 0.3 & set (#borderColor % opac') 0.3 & set #glyphShape (gpalette 0)) defaultHudOptions (defaultGlyphStyle & set #size 0.06 & over #color (rgb (palette 0)) & set #glyphShape (gpalette 0) & set (#color % opac') 0.3 & set (#borderColor % opac') 1) (defaultHudOptions & set (#axes % each % #item % #ticks % #textTick %? #style % #size) 0.07 & over #axes (drop 1) & set (#axes % each % #item % #ticks % #tick % numTicks') (Just 2)) 0.08 100 0.2 True 47 | 48 | -- | Parse charting options. 49 | parsePerfChartOptions :: PerfChartOptions -> Parser PerfChartOptions 50 | parsePerfChartOptions def = 51 | (\c fp trunAt small big hist avs -> PerfChartOptions c fp trunAt small big hist avs (view #averagesStyle def) (view #averagesPaletteStart def) (view #averagesLegend def) (view #smallStyle def) (view #smallHud def) (view #bigStyle def) (view #bigHud def) (view #titleSize def) (view #histGrain def) (view #bigWidth def) (view #excludeZeros def)) 52 | <$> switch (long "chart" <> short 'c' <> help "chart the result") 53 | <*> option str (value (view #chartFilepath def) <> showDefault <> long "chartpath" <> metavar "FILE" <> help "chart file name") 54 | <*> option auto (value (view #truncateAt def) <> showDefaultWith show <> long "truncateat" <> help "truncate chart data (multiple of median)") 55 | <*> switch (long "small") 56 | <*> switch (long "big") 57 | <*> switch (long "histogram") 58 | <*> switch (long "averages") 59 | 60 | perfCharts :: PerfChartOptions -> Maybe [Text] -> Map.Map Text [[Double]] -> ChartOptions 61 | perfCharts cfg labels m = bool (stackCO stackn AlignLeft NoAlign 0.1 cs) (head cs) (length cs == 1) 62 | where 63 | stackn = length cs & fromIntegral & sqrt @Double & ceiling 64 | cs = uncurry (perfChart cfg) <$> ps' 65 | ps = mconcat $ fmap (uncurry zip . bimap (\t -> fmap ((t <> ": ") <>) (fromMaybe (Text.pack . show @Int <$> [0 ..]) labels)) List.transpose) (Map.toList m) 66 | ps' = filter ((> 0) . sum . snd) ps 67 | 68 | perfChart :: PerfChartOptions -> Text -> [Double] -> ChartOptions 69 | perfChart cfg t xs = finalChart 70 | where 71 | xsSmall = xs & xify & filter (_y >>> (< upperCutoff)) & filter (\x -> view #excludeZeros cfg && (_y x > 0)) 72 | xsBig = xs & xify & filter (_y >>> (>= upperCutoff)) 73 | med = median xs 74 | best = tenth xs 75 | av = Perf.average xs 76 | upperCutoff = view #truncateAt cfg * med 77 | 78 | labels = 79 | [ "average: " <> comma (Just 3) av, 80 | "median: " <> comma (Just 3) med, 81 | "best: " <> comma (Just 3) best 82 | ] 83 | (Rect _ _ y' w') = fromMaybe one $ space1 xsSmall 84 | (Range x' z') = Range zero (fromIntegral $ length xs) 85 | rectx = BlankChart defaultStyle [Rect x' z' y' w'] 86 | averagesCT = named "averages" $ zipWith (\x i -> GlyphChart (view #averagesStyle cfg & set #color (palette i) & set #borderColor (palette i) & set #glyphShape (gpalette i)) [Point zero x]) [av, med, best] [(view #averagesPaletteStart cfg) ..] 87 | 88 | (smallDot, smallHist) = dotHistChart (view #histGrain cfg) (view #smallStyle cfg) (mempty @ChartOptions & set #chartTree (averagesCT <> named "xrange" [rectx]) & set #hudOptions (view #smallHud cfg)) xsSmall 89 | 90 | minb = minimum (_y <$> xsBig) 91 | bigrange = Rect x' z' (bool minb zero (length xsBig == 1)) minb 92 | (bigDot, bigHist) = dotHistChart (view #histGrain cfg) (view #bigStyle cfg) (mempty @ChartOptions & set #hudOptions (view #bigHud cfg) & set #chartTree (named "xrange" [BlankChart defaultStyle [bigrange]])) xsBig 93 | 94 | (Rect bdX bdW _ _) = fromMaybe one $ view styleBox' (asChartTree bigDot) 95 | bdr = Just $ Rect bdX bdW (-(view #bigWidth cfg)) (view #bigWidth cfg) 96 | (Rect bdhX bdhW _ _) = fromMaybe one $ view styleBox' (asChartTree bigHist) 97 | bhr = Just $ Rect bdhX bdhW (-(view #bigWidth cfg)) (view #bigWidth cfg) 98 | 99 | finalChart = 100 | mempty @ChartOptions 101 | & set 102 | #chartTree 103 | ( stack 104 | 2 105 | NoAlign 106 | NoAlign 107 | 0 108 | ( bool (asChartTree bigDot & set styleBox' bdr & pure) mempty (null xsBig) 109 | <> bool (asChartTree bigHist & set styleBox' bhr & pure) mempty (null xsBig) 110 | <> [ asChartTree smallDot, 111 | asChartTree smallHist 112 | ] 113 | ) 114 | ) 115 | & set 116 | (#hudOptions % #legends) 117 | [Priority 10 (view #averagesLegend cfg & set #legendCharts (zipWith (\t' c -> (t', [c])) labels (toListOf chart' averagesCT)))] 118 | & set 119 | (#hudOptions % #titles) 120 | [Priority 5 (defaultTitleOptions t & set (#style % #size) (view #titleSize cfg))] 121 | 122 | dotHistChart :: Int -> Style -> ChartOptions -> [Point Double] -> (ChartOptions, ChartOptions) 123 | dotHistChart grain gstyle co xs = (dotCO, histCO) 124 | where 125 | dotCT = named "dot" [GlyphChart gstyle xs] 126 | ys = fmap _y xs 127 | (Range l u) = fromMaybe one (space1 ys) 128 | r' = bool (Range l u) (Range 0 l) (l == u) 129 | r = computeRangeTick r' (fromMaybe defaultTick (co & preview (#hudOptions % #axes % ix 1 % #item % #ticks % #tick))) 130 | (y, w) = let (Range y' w') = r in bool (y', w') (y' - 0.5, y' + 0.5) (y' == w') 131 | 132 | histCO = hhistChart r grain ys & set (#markupOptions % #chartAspect) (CanvasAspect 0.3) & over #chartTree (<> unnamed [BlankChart defaultStyle [Rect 0 0 y w]]) 133 | dotCO = co & over #chartTree (dotCT <>) 134 | 135 | compareCharts :: [(PerfChartOptions, Text, [Double])] -> ChartOptions 136 | compareCharts xs = finalChart 137 | where 138 | xs' = xs & fmap (\(_, _, x) -> x) 139 | cfg' = xs & fmap (\(x, _, _) -> x) 140 | t' = xs & fmap (\(_, x, _) -> x) 141 | cfg = head cfg' 142 | xsSmall = xs' & fmap (xify >>> filter (_y >>> (< upperCutoff)) >>> filter (\x -> view #excludeZeros cfg && (_y x > 0))) 143 | xsBig = xs' & fmap (xify >>> filter (_y >>> (>= upperCutoff))) 144 | med = median <$> xs' 145 | upperCutoff = view #truncateAt cfg * maximum med 146 | 147 | (Rect _ _ y' w') = fromMaybe one $ space1 $ mconcat xsSmall 148 | (Range x' z') = Range zero (fromIntegral $ maximum (length <$> xs')) 149 | rectx = BlankChart defaultStyle [Rect x' z' y' w'] 150 | 151 | (smallDot, smallHist) = dotHistCharts (view #histGrain cfg) (mempty @ChartOptions & set #hudOptions (view #smallHud cfg) & set #chartTree (unnamed [rectx])) (zip (view #smallStyle <$> cfg') xsSmall) 152 | 153 | minb = minimum (_y <$> mconcat xsBig) 154 | bigrange = Rect x' z' (bool minb zero (length (mconcat xsBig) == 1)) minb 155 | (bigDot, bigHist) = dotHistCharts (view #histGrain cfg) (mempty @ChartOptions & set #hudOptions (view #bigHud cfg) & set #chartTree (named "xrange" [BlankChart defaultStyle [bigrange]])) (zip (view #bigStyle <$> cfg') xsBig) 156 | 157 | (Rect bdX bdW _ _) = fromMaybe one $ view styleBox' (asChartTree bigDot) 158 | bdr = Just $ Rect bdX bdW (-(view #bigWidth cfg)) (view #bigWidth cfg) 159 | (Rect bdhX bdhW _ _) = fromMaybe one $ view styleBox' (asChartTree bigHist) 160 | bhr = Just $ Rect bdhX bdhW (-(view #bigWidth cfg)) (view #bigWidth cfg) 161 | 162 | finalChart = 163 | mempty @ChartOptions 164 | & set 165 | #chartTree 166 | ( stack 167 | 2 168 | NoAlign 169 | NoAlign 170 | 0 171 | ( bool (asChartTree bigDot & set styleBox' bdr & pure) mempty (null xsBig) 172 | <> bool (asChartTree bigHist & set styleBox' bhr & pure) mempty (null xsBig) 173 | <> [ asChartTree smallDot, 174 | asChartTree smallHist 175 | ] 176 | ) 177 | ) 178 | & set 179 | (#hudOptions % #legends) 180 | [Priority 10 (view #averagesLegend cfg & set #legendCharts (zipWith (\t'' c -> (t'', [c])) t' (toListOf (#chartTree % chart') smallDot)))] 181 | 182 | dotHistCharts :: Int -> ChartOptions -> [(Style, [Point Double])] -> (ChartOptions, ChartOptions) 183 | dotHistCharts grain co xs = (dotCO, histCO) 184 | where 185 | dotCTs = named "dot" (uncurry GlyphChart <$> xs) 186 | ys = fmap _y . snd <$> xs 187 | (Range l u) = fromMaybe one (space1 (mconcat ys)) 188 | r' = bool (Range l u) (Range 0 l) (l == u) 189 | r = computeRangeTick r' (fromMaybe defaultTick (co & preview (#hudOptions % #axes % ix 1 % #item % #ticks % #tick))) 190 | (y, w) = let (Range y' w') = r in bool (y', w') (y' - 0.5, y' + 0.5) (y' == w') 191 | 192 | histCO = hhistCharts r grain (zip (fst <$> xs) ys) & set (#markupOptions % #chartAspect) (CanvasAspect 0.3) & over #chartTree (<> unnamed [BlankChart defaultStyle [Rect 0 0 y w]]) 193 | dotCO = co & over #chartTree (dotCTs <>) 194 | -------------------------------------------------------------------------------- /src/Perf/Count.hs: -------------------------------------------------------------------------------- 1 | -- | Simple counter. 2 | module Perf.Count 3 | ( count, 4 | countN, 5 | ) 6 | where 7 | 8 | import Perf.Types 9 | import Prelude 10 | 11 | -- | Register 1 as a performance measure 12 | count :: (Applicative m) => StepMeasure m Int 13 | count = StepMeasure start stop 14 | where 15 | start = pure () 16 | stop _ = pure 1 17 | {-# INLINEABLE count #-} 18 | 19 | -- | Count the number of times measured. 20 | countN :: Int -> Measure IO Int 21 | countN n = sum <$> toMeasureN n count 22 | {-# INLINEABLE countN #-} 23 | -------------------------------------------------------------------------------- /src/Perf/Measure.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | -- | Unification of the various different performance measure types, mostly to unify reporting and data management. 4 | module Perf.Measure 5 | ( MeasureType (..), 6 | parseMeasure, 7 | measureDs, 8 | measureLabels, 9 | measureFinalStat, 10 | ) 11 | where 12 | 13 | import Data.Text (Text) 14 | import Options.Applicative 15 | import Options.Applicative.Help.Pretty qualified as OA 16 | import Perf.Count 17 | import Perf.Space 18 | import Perf.Stats 19 | import Perf.Time 20 | import Perf.Types 21 | import System.Clock 22 | import Prelude hiding (cycle) 23 | 24 | -- | Command-line measurement options. 25 | data MeasureType = MeasureTime | MeasureNTime | MeasureSpace | MeasureSpaceTime | MeasureAllocation | MeasureCount deriving (Eq, Show) 26 | 27 | -- | Parse command-line 'MeasureType' options. 28 | parseMeasure :: Parser MeasureType 29 | parseMeasure = 30 | flag' MeasureTime (long "time" <> style (OA.annotate OA.bold) <> help "measure time performance") 31 | <|> flag' MeasureNTime (long "ntime" <> help "measure n*time performance") 32 | <|> flag' MeasureSpace (long "space" <> help "measure space performance") 33 | <|> flag' MeasureSpaceTime (long "spacetime" <> help "measure both space and time performance") 34 | <|> flag' MeasureAllocation (long "allocation" <> help "measure bytes allocated") 35 | <|> flag' MeasureCount (long "count" <> help "measure count") 36 | <|> pure MeasureTime 37 | 38 | -- | unification of the different measurements to being a list of doubles. 39 | measureDs :: MeasureType -> Clock -> Int -> Measure IO [[Double]] 40 | measureDs mt c n = 41 | case mt of 42 | MeasureTime -> fmap ((: []) . fromIntegral) <$> timesWith c n 43 | MeasureNTime -> pure . pure . fromIntegral <$> timesNWith c n 44 | MeasureSpace -> toMeasureN n (ssToList <$> space False) 45 | MeasureSpaceTime -> toMeasureN n ((\x y -> ssToList x <> [fromIntegral y]) <$> space False <*> stepTime) 46 | MeasureAllocation -> fmap ((: []) . fromIntegral) <$> toMeasureN n (allocation False) 47 | MeasureCount -> (: []) . fmap fromIntegral <$> toMeasureN n count 48 | 49 | -- | unification of measurement labels 50 | measureLabels :: MeasureType -> [Text] 51 | measureLabels mt = 52 | case mt of 53 | MeasureTime -> ["time"] 54 | MeasureNTime -> ["ntime"] 55 | MeasureSpace -> spaceLabels 56 | MeasureSpaceTime -> spaceLabels <> ["time"] 57 | MeasureAllocation -> ["allocation"] 58 | MeasureCount -> ["count"] 59 | 60 | -- | How to fold the list of performance measures. 61 | measureFinalStat :: MeasureType -> Int -> [Double] -> Double 62 | measureFinalStat mt n = 63 | case mt of 64 | MeasureTime -> average 65 | MeasureNTime -> (/ fromIntegral n) . sum 66 | MeasureSpace -> average 67 | MeasureSpaceTime -> average 68 | MeasureAllocation -> average 69 | MeasureCount -> sum 70 | -------------------------------------------------------------------------------- /src/Perf/Report.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | {-# LANGUAGE OverloadedLabels #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | 5 | -- | Reporting on performance, potentially checking versus a canned results. 6 | module Perf.Report 7 | ( Name, 8 | Header (..), 9 | parseHeader, 10 | CompareLevels (..), 11 | defaultCompareLevels, 12 | parseCompareLevels, 13 | ReportOptions (..), 14 | defaultReportOptions, 15 | parseReportOptions, 16 | PerfDumpOptions (..), 17 | defaultPerfDumpOptions, 18 | parsePerfDumpOptions, 19 | fromDump, 20 | report, 21 | reportMain, 22 | writeResult, 23 | readResult, 24 | CompareResult (..), 25 | compareNote, 26 | report2D, 27 | Golden (..), 28 | defaultGolden, 29 | parseGolden, 30 | replaceDefaultFilePath, 31 | parseClock, 32 | reportToConsole, 33 | ) 34 | where 35 | 36 | import Chart 37 | import Control.Exception 38 | import Control.Monad 39 | import Data.Bool 40 | import Data.Foldable 41 | import Data.List (intercalate) 42 | import Data.List qualified as List 43 | import Data.Map.Merge.Strict 44 | import Data.Map.Strict qualified as Map 45 | import Data.Text (Text) 46 | import Data.Text qualified as Text 47 | import Data.Text.IO qualified as Text 48 | import GHC.Generics 49 | import Optics.Core 50 | import Options.Applicative as OA 51 | import Options.Applicative.Help.Pretty 52 | import Perf.Algos 53 | import Perf.BigO 54 | import Perf.Chart 55 | import Perf.Measure 56 | import Perf.Stats 57 | import Perf.Time (defaultClock) 58 | import Perf.Types 59 | import Prettyprinter.Render.Text qualified as PP 60 | import System.Clock 61 | import System.Exit 62 | import System.Mem 63 | import Test.Tasty 64 | import Test.Tasty.Bench 65 | import Text.PrettyPrint.Boxes qualified as B 66 | import Text.Printf hiding (parseFormat) 67 | import Text.Read 68 | 69 | -- | Benchmark name 70 | type Name = String 71 | 72 | -- | Whether to include header information. 73 | data Header = Header | NoHeader deriving (Eq, Show, Generic) 74 | 75 | -- | Command-line parser for 'Header' 76 | parseHeader :: Parser Header 77 | parseHeader = 78 | flag' Header (long "header" <> help "include headers in reporting") 79 | <|> flag' NoHeader (long "noheader" <> help "dont include headers in reporting") 80 | <|> pure Header 81 | 82 | -- | Options for production of a performance report. 83 | data ReportOptions = ReportOptions 84 | { -- | Number of times to run a benchmark. 85 | reportN :: Int, 86 | reportLength :: Int, 87 | reportClock :: Clock, 88 | reportStatDType :: StatDType, 89 | reportMeasureType :: MeasureType, 90 | reportGolden :: Golden, 91 | reportHeader :: Header, 92 | reportCompare :: CompareLevels, 93 | reportChart :: PerfChartOptions, 94 | reportDump :: PerfDumpOptions, 95 | reportGC :: Bool, 96 | reportOrder :: OrderOptions, 97 | reportTasty :: Bool 98 | } 99 | deriving (Eq, Show, Generic) 100 | 101 | -- | Default reporting options 102 | defaultReportOptions :: ReportOptions 103 | defaultReportOptions = 104 | ReportOptions 105 | 1000 106 | 1000 107 | defaultClock 108 | StatAverage 109 | MeasureTime 110 | defaultGolden 111 | Header 112 | defaultCompareLevels 113 | defaultPerfChartOptions 114 | defaultPerfDumpOptions 115 | False 116 | defaultOrderOptions 117 | False 118 | 119 | -- | Command-line parser for 'ReportOptions' 120 | parseReportOptions :: ReportOptions -> Parser ReportOptions 121 | parseReportOptions def = 122 | ReportOptions 123 | <$> option auto (value (view #reportN def) <> showDefaultWith show <> long "runs" <> short 'n' <> metavar "INT" <> help "number of runs to perform") 124 | <*> option auto (value (view #reportLength def) <> long "length" <> showDefaultWith show <> short 'l' <> metavar "INT" <> help "length-like variable eg, used to alter list length and compute order") 125 | <*> parseClock 126 | <*> parseStatD 127 | <*> parseMeasure 128 | <*> parseGolden 129 | <*> parseHeader 130 | <*> parseCompareLevels defaultCompareLevels 131 | <*> parsePerfChartOptions defaultPerfChartOptions 132 | <*> parsePerfDumpOptions defaultPerfDumpOptions 133 | <*> switch (long "gc" <> help "run the GC prior to measurement") 134 | <*> parseOrderOptions defaultOrderOptions 135 | <*> switch (long "tasty" <> help "run tasty-bench") 136 | 137 | -- | Parse command-line 'Clock' options. 138 | parseClock :: Parser Clock 139 | parseClock = 140 | flag' Monotonic (long "Monotonic" <> OA.style (annotate bold) <> help "use Monotonic clock") 141 | <|> flag' Realtime (long "Realtime" <> help "use Realtime clock") 142 | <|> flag' ProcessCPUTime (long "ProcessCPUTime" <> help "use ProcessCPUTime clock") 143 | <|> flag' ThreadCPUTime (long "ThreadCPUTime" <> help "use ThreadCPUTime clock") 144 | #ifdef mingw32_HOST_OS 145 | <|> pure ThreadCPUTime 146 | #else 147 | <|> flag' MonotonicRaw (long "MonotonicRaw" <> help "use MonotonicRaw clock") 148 | <|> pure MonotonicRaw 149 | #endif 150 | 151 | data PerfDumpOptions = PerfDumpOptions {dumpFilepath :: FilePath, doDump :: Bool} deriving (Eq, Show, Generic) 152 | 153 | defaultPerfDumpOptions :: PerfDumpOptions 154 | defaultPerfDumpOptions = PerfDumpOptions "other/perf.map" False 155 | 156 | -- | Parse charting options. 157 | parsePerfDumpOptions :: PerfDumpOptions -> Parser PerfDumpOptions 158 | parsePerfDumpOptions def = 159 | PerfDumpOptions 160 | <$> option str (value (view #dumpFilepath def) <> showDefaultWith show <> long "dumppath" <> metavar "FILE" <> help "dump file name") 161 | <*> switch (long "dump" <> help "dump raw performance data as a Map Text [[Double]]") 162 | 163 | fromDump :: PerfDumpOptions -> IO (Map.Map Text [[Double]]) 164 | fromDump cfg = read <$> readFile (view #dumpFilepath cfg) 165 | 166 | -- | Run and report a benchmark with the specified reporting options. 167 | reportMain :: Example -> ReportOptions -> Name -> (Int -> PerfT IO [[Double]] a) -> IO a 168 | reportMain ex o name t = do 169 | let !n = reportN o 170 | let l = reportLength o 171 | let s = reportStatDType o 172 | let c = reportClock o 173 | let mt = reportMeasureType o 174 | let o' = replaceDefaultFilePath (intercalate "-" [name, show n, show mt, show s]) o 175 | when (reportGC o) performGC 176 | (a, m) <- runPerfT (measureDs mt c n) (t l) 177 | report o' (statify s m) 178 | (\cfg -> when (view #doChart cfg) (writeChartOptions (view #chartFilepath cfg) (perfCharts cfg (Just (measureLabels mt)) m))) (reportChart o) 179 | (\cfg -> when (view #doDump cfg) (writeFile (view #dumpFilepath cfg) (show m))) (reportDump o) 180 | when (view (#reportOrder % #doOrder) o) (reportBigO o t) 181 | when (view #reportTasty o) (reportTasty' ex o) 182 | pure a 183 | 184 | -- | Levels of geometric difference in compared performance that triggers reporting. 185 | data CompareLevels = CompareLevels {errorLevel :: Double, warningLevel :: Double, improvedLevel :: Double} deriving (Eq, Show) 186 | 187 | -- | 188 | -- >>> defaultCompareLevels 189 | -- CompareLevels {errorLevel = 0.2, warningLevel = 5.0e-2, improvedLevel = 5.0e-2} 190 | defaultCompareLevels :: CompareLevels 191 | defaultCompareLevels = CompareLevels 0.2 0.05 0.05 192 | 193 | -- | Command-line parser for 'CompareLevels' 194 | parseCompareLevels :: CompareLevels -> Parser CompareLevels 195 | parseCompareLevels c = 196 | CompareLevels 197 | <$> option auto (value (errorLevel c) <> showDefaultWith show <> long "error" <> metavar "DOUBLE" <> help "report an error if performance degrades by more than this") 198 | <*> option auto (value (warningLevel c) <> showDefaultWith show <> long "warning" <> metavar "DOUBLE" <> help "report a warning if performance degrades by more than this") 199 | <*> option auto (value (improvedLevel c) <> showDefaultWith show <> long "improved" <> metavar "DOUBLE" <> help "report if performance improves by more than this") 200 | 201 | -- | Write results to file 202 | writeResult :: FilePath -> Map.Map [Text] Double -> IO () 203 | writeResult f m = writeFile f (show m) 204 | 205 | -- | Read results from a file. 206 | readResult :: FilePath -> IO (Either String (Map.Map [Text] Double)) 207 | readResult f = do 208 | a :: Either SomeException String <- try (readFile f) 209 | pure $ either (Left . show) readEither a 210 | 211 | -- | Comparison data between two results. 212 | data CompareResult = CompareResult {oldResult :: Maybe Double, newResult :: Maybe Double, noteResult :: Text} deriving (Show, Eq) 213 | 214 | hasDegraded :: Map.Map a CompareResult -> Bool 215 | hasDegraded m = any (((== "degraded") . noteResult) . snd) (Map.toList m) 216 | 217 | -- | Compare two results and produce some notes given level triggers. 218 | compareNote :: (Ord a) => CompareLevels -> Map.Map a Double -> Map.Map a Double -> Map.Map a CompareResult 219 | compareNote cfg x y = 220 | merge 221 | (mapMissing (\_ x' -> CompareResult Nothing (Just x') "new result")) 222 | (mapMissing (\_ x' -> CompareResult (Just x') Nothing "old result not found")) 223 | ( zipWithMatched 224 | ( \_ x' y' -> 225 | CompareResult (Just x') (Just y') (note' x' y') 226 | ) 227 | ) 228 | x 229 | y 230 | where 231 | note' x' y' 232 | | y' / x' > 1 + errorLevel cfg = "degraded" 233 | | y' / x' > 1 + warningLevel cfg = "slightly-degraded" 234 | | y' / x' < (1 - improvedLevel cfg) = "improvement" 235 | | otherwise = "" 236 | 237 | -- | Console-style header information. 238 | formatHeader :: Map.Map [Text] a -> [Text] -> [Text] 239 | formatHeader m ts = 240 | [mconcat $ Text.pack . printf "%-16s" <$> ((("label" <>) . Text.pack . show <$> [1 .. labelCols]) <> ts), mempty] 241 | where 242 | labelCols = maximum $ length <$> Map.keys m 243 | 244 | -- | Format a comparison. 245 | formatCompare :: Header -> Map.Map [Text] CompareResult -> [Text] 246 | formatCompare h m = 247 | bool [] (formatHeader m ["old result", "new result", "change"]) (h == Header) 248 | <> Map.elems (Map.mapWithKey (\k a -> Text.pack . mconcat $ printf "%-16s" <$> (k <> compareReport a)) m) 249 | where 250 | compareReport (CompareResult x y n) = 251 | [ maybe mempty (expt (Just 3)) x, 252 | maybe mempty (expt (Just 3)) y, 253 | n 254 | ] 255 | 256 | -- | Format a result as lines of text. 257 | formatText :: Header -> Map.Map [Text] Text -> [Text] 258 | formatText h m = 259 | bool [] (formatHeader m ["results"]) (h == Header) 260 | <> Map.elems (Map.mapWithKey (\k a -> Text.pack . mconcat $ printf "%-16s" <$> (k <> [a])) m) 261 | 262 | -- | Format a result as a table. 263 | report2D :: Map.Map [Text] Double -> IO () 264 | report2D m = putStrLn $ B.render $ B.hsep 1 B.left $ cs' : rs' 265 | where 266 | rs = List.nub ((List.!! 1) . fst <$> Map.toList m) 267 | cs = List.nub ((List.!! 0) . fst <$> Map.toList m) 268 | bx = B.text . Text.unpack 269 | xs = (\c -> (\r -> m Map.! [c, r]) <$> rs) <$> cs 270 | xs' = fmap (fmap (bx . expt (Just 3))) xs 271 | cs' = B.vcat B.left (bx <$> ("algo" : cs)) 272 | rs' = B.vcat B.right <$> zipWith (:) (bx <$> rs) (List.transpose xs') 273 | 274 | reportToConsole :: [Text] -> IO () 275 | reportToConsole xs = traverse_ Text.putStrLn xs 276 | 277 | -- | Golden file options. 278 | data Golden = Golden {golden :: FilePath, check :: CheckGolden, record :: RecordGolden} deriving (Generic, Eq, Show) 279 | 280 | -- | Whether to check against a golden file 281 | data CheckGolden = CheckGolden | NoCheckGolden deriving (Eq, Show, Generic) 282 | 283 | -- | Whether to overwrite a golden file 284 | data RecordGolden = RecordGolden | NoRecordGolden deriving (Eq, Show, Generic) 285 | 286 | -- | Default is Golden "other/bench.perf" CheckGolden NoRecordGolden 287 | defaultGolden :: Golden 288 | defaultGolden = Golden "other/bench.perf" CheckGolden NoRecordGolden 289 | 290 | -- | Replace the golden file name stem if it's the default. 291 | replaceGoldenDefault :: FilePath -> Golden -> Golden 292 | replaceGoldenDefault s g = bool g g {golden = s} (golden g == golden defaultGolden) 293 | 294 | defaultGoldenPath :: FilePath -> FilePath 295 | defaultGoldenPath fp = "other/" <> fp <> ".perf" 296 | 297 | -- | Replace the Golden file path with the suggested stem, but only if the user did not specify a specific file path at the command line. 298 | replaceDefaultFilePath :: FilePath -> ReportOptions -> ReportOptions 299 | replaceDefaultFilePath fp o = 300 | o {reportGolden = replaceGoldenDefault (defaultGoldenPath fp) (reportGolden o)} 301 | 302 | -- | Parse command-line golden file options. 303 | parseGolden :: Parser Golden 304 | parseGolden = 305 | Golden 306 | <$> option str (value (golden defaultGolden) <> showDefaultWith show <> long "golden" <> short 'g' <> metavar "FILE" <> help "golden file name") 307 | -- True is the default for 'check'. 308 | <*> (bool NoCheckGolden CheckGolden <$> flag True False (long "nocheck" <> help "do not check versus the golden file")) 309 | <*> (bool NoRecordGolden RecordGolden <$> switch (long "record" <> short 'r' <> help "record the result to the golden file")) 310 | 311 | reportConsoleNoCompare :: Header -> Map.Map [Text] Double -> IO () 312 | reportConsoleNoCompare h m = 313 | reportToConsole (formatText h (expt (Just 3) <$> m)) 314 | 315 | reportConsoleCompare :: Header -> Map.Map [Text] CompareResult -> IO () 316 | reportConsoleCompare h m = 317 | reportToConsole (formatCompare h m) 318 | 319 | -- | Report results 320 | -- 321 | -- If a goldenFile is checked, and performance has degraded, the function will exit with 'ExitFailure' so that 'cabal bench' and other types of processes can signal performance issues. 322 | report :: ReportOptions -> Map.Map [Text] [Double] -> IO () 323 | report o m = do 324 | when 325 | ((== RecordGolden) $ record (reportGolden o)) 326 | (writeResult (golden (reportGolden o)) m') 327 | case check (reportGolden o) of 328 | NoCheckGolden -> reportConsoleNoCompare (reportHeader o) m' 329 | CheckGolden -> do 330 | mOrig <- readResult (golden (reportGolden o)) 331 | case mOrig of 332 | Left _ -> do 333 | reportConsoleNoCompare (reportHeader o) m' 334 | unless 335 | ((RecordGolden ==) $ record (reportGolden o)) 336 | (putStrLn "No golden file found. To create one, run with '-r'") 337 | Right orig -> do 338 | let n = compareNote (reportCompare o) orig m' 339 | _ <- reportConsoleCompare (reportHeader o) n 340 | when (hasDegraded n) (exitWith $ ExitFailure 1) 341 | where 342 | m' = Map.fromList $ mconcat $ (\(ks, xss) -> zipWith (\x l -> (ks <> [l], x)) xss (measureLabels (reportMeasureType o))) <$> Map.toList m 343 | 344 | reportBigO :: ReportOptions -> (Int -> PerfT IO [[Double]] a) -> IO () 345 | reportBigO o p = do 346 | m <- mapM (execPerfT (measureDs (view #reportMeasureType o) (view #reportClock o) (view #reportN o)) . p) ns 347 | putStrLn mempty 348 | reportToConsole $ PP.renderStrict . layoutPretty defaultLayoutOptions <$> os'' m 349 | pure () 350 | where 351 | l = view #reportLength o 352 | ns = makeNs l (view (#reportOrder % #orderDivisor) o) (view (#reportOrder % #orderLow) o) 353 | ms m' = fmap (fmap (statD (view #reportStatDType o)) . List.transpose) <$> m' 354 | os m' = fmap (fmap (pretty . fromOrder . fst . estO (fromIntegral <$> ns)) . List.transpose) (Map.unionsWith (<>) (fmap (fmap (: [])) (ms m'))) 355 | os' m' = mconcat $ (\(ks, xss) -> zipWith (\x l' -> ([ks] <> [l'], x)) xss (measureLabels (reportMeasureType o))) <$> Map.toList (os m') 356 | os'' m' = (\(k, v) -> (pretty . Text.intercalate ":") k <> " " <> v) <$> os' m' 357 | 358 | reportTasty' :: Example -> ReportOptions -> IO () 359 | reportTasty' ex o = do 360 | t <- measureCpuTime (mkTimeout 1000000) (RelStDev 0.05) (tastyExample (examplePattern ex (view #reportLength o))) 361 | Text.putStrLn $ "tasty:time: " <> decimal (Just 3) (t * 1e9) 362 | -------------------------------------------------------------------------------- /src/Perf/Space.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | -- | Space performance measurement. 4 | module Perf.Space 5 | ( SpaceStats (..), 6 | ssToList, 7 | spaceLabels, 8 | space, 9 | allocation, 10 | Bytes (..), 11 | ) 12 | where 13 | 14 | import Control.Monad 15 | import Data.Text (Text) 16 | import Data.Word 17 | import GHC.Stats 18 | import Perf.Types 19 | import System.Mem 20 | import Prelude hiding (cycle) 21 | 22 | -- | GHC allocation statistics. 23 | data SpaceStats = SpaceStats {allocated :: Word64, copied :: Word64, maxmem :: Word64, minorgcs :: Word32, majorgcs :: Word32} deriving (Read, Show, Eq) 24 | 25 | -- | Convert 'SpaceStats' to a list of numbers. 26 | ssToList :: (Num a) => SpaceStats -> [a] 27 | ssToList (SpaceStats x1 x2 x3 x4 x5) = [fromIntegral x1, fromIntegral x2, fromIntegral x3, fromIntegral x4, fromIntegral x5] 28 | 29 | instance Semigroup SpaceStats where 30 | (<>) = addSpace 31 | 32 | instance Monoid SpaceStats where 33 | mempty = SpaceStats 0 0 0 0 0 34 | 35 | instance Num SpaceStats where 36 | (+) = addSpace 37 | (-) = diffSpace 38 | (*) = error "SpaceStats times" 39 | abs = error "SpaceStats abs" 40 | signum = error "SpaceStats signum" 41 | fromInteger n = SpaceStats (fromIntegral n) (fromIntegral n) (fromIntegral n) (fromIntegral n) (fromIntegral n) 42 | 43 | diffSpace :: SpaceStats -> SpaceStats -> SpaceStats 44 | diffSpace (SpaceStats x1 x2 x3 x4 x5) (SpaceStats x1' x2' x3' x4' x5') = SpaceStats (x1' - x1) (x2' - x2) (x3' - x3) (x4' - x4) (x5' - x5) 45 | 46 | addSpace :: SpaceStats -> SpaceStats -> SpaceStats 47 | addSpace (SpaceStats x1 x2 x3 x4 x5) (SpaceStats x1' x2' x3' x4' x5') = SpaceStats (x1' + x1) (x2' + x2) (x3' + x3) (x4' + x4) (x5' + x5) 48 | 49 | getSpace :: RTSStats -> SpaceStats 50 | getSpace s = SpaceStats (allocated_bytes s) (copied_bytes s) (max_mem_in_use_bytes s) (gcs s) (major_gcs s) 51 | 52 | -- | Labels for 'SpaceStats'. 53 | spaceLabels :: [Text] 54 | spaceLabels = ["allocated", "copied", "maxmem", "minorgcs", "majorgcs"] 55 | 56 | -- | A allocation 'StepMeasure' with a flag to determine if 'performGC' should run prior to the measurement. 57 | space :: Bool -> StepMeasure IO SpaceStats 58 | space p = StepMeasure (start p) stop 59 | where 60 | start p' = do 61 | when p' performGC 62 | getSpace <$> getRTSStats 63 | stop s = do 64 | s' <- getSpace <$> getRTSStats 65 | pure $ diffSpace s s' 66 | {-# INLINEABLE space #-} 67 | 68 | -- | Number of bytes 69 | newtype Bytes = Bytes {unbytes :: Word64} 70 | deriving (Show, Read, Eq, Ord, Num, Real, Enum, Integral) 71 | 72 | instance Semigroup Bytes where 73 | (<>) = (+) 74 | 75 | instance Monoid Bytes where 76 | mempty = 0 77 | 78 | -- | Measure memory allocation, with a flag to run 'performGC' prior to the measurement. 79 | allocation :: Bool -> StepMeasure IO Bytes 80 | allocation p = StepMeasure (start p) stop 81 | where 82 | start p' = do 83 | when p' performGC 84 | Bytes . allocated_bytes <$> getRTSStats 85 | stop s = do 86 | s' <- Bytes . allocated_bytes <$> getRTSStats 87 | pure $ s' - s 88 | {-# INLINEABLE allocation #-} 89 | -------------------------------------------------------------------------------- /src/Perf/Stats.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | 3 | -- | Statistical choices for multiple performance measurements. 4 | module Perf.Stats 5 | ( average, 6 | median, 7 | tenth, 8 | averageI, 9 | StatDType (..), 10 | statD, 11 | statDs, 12 | parseStatD, 13 | -- stat reporting 14 | addStat, 15 | ordy, 16 | allStats, 17 | statify, 18 | ) 19 | where 20 | 21 | import Control.Monad.State.Lazy 22 | import Data.List qualified as List 23 | import Data.Map.Strict qualified as Map 24 | import Data.Text (Text, pack) 25 | import NumHask.Space (quantile) 26 | import Options.Applicative 27 | import Options.Applicative.Help.Pretty 28 | 29 | -- | Compute the median 30 | median :: [Double] -> Double 31 | median = quantile 0.5 32 | 33 | -- | Compute the average 34 | average :: [Double] -> Double 35 | average xs = sum xs / (fromIntegral . length $ xs) 36 | 37 | -- | Compute the tenth percentile 38 | tenth :: [Double] -> Double 39 | tenth = quantile 0.1 40 | 41 | -- | Compute the average of an Integral 42 | averageI :: (Integral a) => [a] -> Double 43 | averageI xs = sum (fromIntegral <$> xs) / (fromIntegral . length $ xs) 44 | 45 | -- | Command-line options for type of statistic. 46 | data StatDType = StatAverage | StatMedian | StatBest deriving (Eq, Show) 47 | 48 | -- | Compute a statistic. 49 | statD :: StatDType -> [Double] -> Double 50 | statD StatBest = tenth 51 | statD StatMedian = median 52 | statD StatAverage = average 53 | 54 | -- | Compute a list of statistics. 55 | statDs :: StatDType -> [[Double]] -> [Double] 56 | statDs StatBest = fmap tenth . List.transpose 57 | statDs StatMedian = fmap median . List.transpose 58 | statDs StatAverage = fmap average . List.transpose 59 | 60 | -- | Parse command-line 'StatDType' options. 61 | parseStatD :: Parser StatDType 62 | parseStatD = 63 | flag' StatBest (long "best" <> style (annotate bold) <> help "report upper decile") 64 | <|> flag' StatMedian (long "median" <> help "report median") 65 | <|> flag' StatAverage (long "average" <> help "report average") 66 | <|> pure StatBest 67 | 68 | -- | Add a statistic to a State Map 69 | addStat :: (Ord k, Monad m) => k -> s -> StateT (Map.Map k s) m () 70 | addStat label s = do 71 | modify (Map.insert label s) 72 | 73 | -- | Linguistic conversion of an ordinal 74 | ordy :: Int -> [Text] 75 | ordy f = zipWith (\x s -> (pack . show) x <> s) [1 .. f] (["st", "nd", "rd"] <> repeat "th") 76 | 77 | -- | Compute all stats. 78 | allStats :: Int -> Map.Map [Text] [[Double]] -> Map.Map [Text] [Double] 79 | allStats f m = 80 | Map.fromList $ 81 | mconcat 82 | [ mconcat ((\(ks, xss) -> zipWith (\l xs -> (ks <> [l], xs)) (ordy f) xss) <$> mlist), 83 | (\(ks, xss) -> (ks <> ["best"], quantile 0.1 <$> List.transpose xss)) <$> mlist, 84 | (\(ks, xss) -> (ks <> ["median"], quantile 0.5 <$> List.transpose xss)) <$> mlist, 85 | (\(ks, xss) -> (ks <> ["average"], av <$> List.transpose xss)) <$> mlist 86 | ] 87 | where 88 | mlist = Map.toList m 89 | av xs = sum xs / (fromIntegral . length $ xs) 90 | 91 | -- | Convert a Map of performance result to a statistic. 92 | statify :: (Ord a) => StatDType -> Map.Map a [[Double]] -> Map.Map [a] [Double] 93 | statify s m = fmap (statD s) . List.transpose <$> Map.mapKeys (: []) m 94 | -------------------------------------------------------------------------------- /src/Perf/Time.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | {-# LANGUAGE ViewPatterns #-} 3 | 4 | -- | Use of 'System.Clock' from the [clock](https://hackage.haskell.org/package/clock) library to measure time performance of a computation. 5 | module Perf.Time 6 | ( Nanos, 7 | defaultClock, 8 | toSecs, 9 | nanosWith, 10 | nanos, 11 | tick_, 12 | warmup, 13 | tickWith, 14 | tick, 15 | tickWHNF, 16 | tickLazy, 17 | tickForce, 18 | tickForceArgs, 19 | tickIO, 20 | tickIOWith, 21 | ticks, 22 | ticksIO, 23 | time, 24 | times, 25 | timesWith, 26 | timesN, 27 | timesNWith, 28 | stepTime, 29 | ) 30 | where 31 | 32 | import Control.DeepSeq 33 | import Control.Monad (replicateM_) 34 | import Perf.Types 35 | import System.Clock 36 | import Prelude 37 | 38 | -- | A performance measure of number of nanoseconds. 39 | type Nanos = Integer 40 | 41 | -- | Convert 'Nanos' to seconds. 42 | toSecs :: Nanos -> Double 43 | toSecs ns = fromIntegral ns / 1e9 44 | 45 | -- | 'MonotonicRaw' is the default for macOS & linux, at around 42 nano time resolution, and a 'tick_' measurement of around 170 nanos. For Windows, 'ThreadCPUTime' has a similar time resolution at 42 nanos and a 'tick_' of around 500 nanos. 46 | defaultClock :: Clock 47 | 48 | #ifdef mingw32_HOST_OS 49 | defaultClock = ThreadCPUTime 50 | #else 51 | defaultClock = MonotonicRaw 52 | #endif 53 | 54 | -- | A single 'defaultClock' reading (note that the absolute value is not meaningful). 55 | nanos :: IO Nanos 56 | nanos = nanosWith defaultClock 57 | 58 | -- | A single reading of a specific 'Clock'. 59 | nanosWith :: Clock -> IO Nanos 60 | nanosWith c = toNanoSecs <$> getTime c 61 | 62 | -- | tick_ measures the number of nanos it takes to read the clock. 63 | tick_ :: IO Nanos 64 | tick_ = do 65 | t <- nanos 66 | t' <- nanos 67 | pure (t' - t) 68 | 69 | -- | Warm up the clock, to avoid a high first measurement. Without a warmup, one or more larger values can occur at the start of a measurement spree, and often are in the zone of an L2 miss. 70 | warmup :: Int -> IO () 71 | warmup n = replicateM_ n tick_ 72 | 73 | -- | tick from a specific 'Clock' 74 | tickWith :: Clock -> (a -> b) -> a -> IO (Nanos, b) 75 | tickWith c !f !a = do 76 | !t <- nanosWith c 77 | !a' <- pure $! f a 78 | !t' <- nanosWith c 79 | pure (t' - t, a') 80 | {-# INLINEABLE tickWith #-} 81 | 82 | -- | /tick f a/ 83 | -- 84 | -- - strictly evaluates f and a to WHNF 85 | -- - reads the clock 86 | -- - strictly evaluates f a to WHNF 87 | -- - reads the clock 88 | -- - returns (nanos, f a) 89 | tick :: (a -> b) -> a -> IO (Nanos, b) 90 | tick !f !a = do 91 | !t <- nanos 92 | !a' <- pure $! f a 93 | !t' <- nanos 94 | pure (t' - t, a') 95 | {-# INLINEABLE tick #-} 96 | 97 | -- | /tickWHNF f a/ 98 | -- 99 | -- - reads the clock 100 | -- - strictly evaluates f a to WHNF (this may also kick off thunk evaluation in f or a which will also be captured in the cycle count) 101 | -- - reads the clock 102 | -- - returns (nanos, f a) 103 | tickWHNF :: (a -> b) -> a -> IO (Nanos, b) 104 | tickWHNF f a = do 105 | !t <- nanos 106 | !a' <- pure $! f a 107 | !t' <- nanos 108 | pure (t' - t, a') 109 | {-# INLINEABLE tickWHNF #-} 110 | 111 | -- | /tickLazy f a/ 112 | -- 113 | -- - reads the clock 114 | -- - lazily evaluates f a 115 | -- - reads the clock 116 | -- - returns (nanos, f a) 117 | tickLazy :: (a -> b) -> a -> IO (Nanos, b) 118 | tickLazy f a = do 119 | t <- nanos 120 | let a' = f a 121 | t' <- nanos 122 | pure (t' - t, a') 123 | {-# INLINEABLE tickLazy #-} 124 | 125 | -- | /tickForce f a/ 126 | -- 127 | -- - deeply evaluates f and a, 128 | -- - reads the clock 129 | -- - deeply evaluates f a 130 | -- - reads the clock 131 | -- - returns (nanos, f a) 132 | tickForce :: (NFData a, NFData b) => (a -> b) -> a -> IO (Nanos, b) 133 | tickForce (force -> !f) (force -> !a) = do 134 | !t <- nanos 135 | !a' <- pure (force (f a)) 136 | !t' <- nanos 137 | pure (t' - t, a') 138 | {-# INLINEABLE tickForce #-} 139 | 140 | -- | /tickForceArgs f a/ 141 | -- 142 | -- - deeply evaluates f and a, 143 | -- - reads the clock 144 | -- - strictly evaluates f a to WHNF 145 | -- - reads the clock 146 | -- - returns (nanos, f a) 147 | tickForceArgs :: (NFData a) => (a -> b) -> a -> IO (Nanos, b) 148 | tickForceArgs (force -> !f) (force -> !a) = do 149 | !t <- nanos 150 | !a' <- pure $! f a 151 | !t' <- nanos 152 | pure (t' - t, a') 153 | {-# INLINEABLE tickForceArgs #-} 154 | 155 | -- | measures an /IO a/ 156 | tickIO :: IO a -> IO (Nanos, a) 157 | tickIO a = do 158 | !t <- nanos 159 | !a' <- a 160 | !t' <- nanos 161 | pure (t' - t, a') 162 | {-# INLINEABLE tickIO #-} 163 | 164 | -- | measures an /IO a/ 165 | tickIOWith :: Clock -> IO a -> IO (Nanos, a) 166 | tickIOWith c a = do 167 | !t <- nanosWith c 168 | !a' <- a 169 | !t' <- nanosWith c 170 | pure (t' - t, a') 171 | {-# INLINEABLE tickIOWith #-} 172 | 173 | -- | n measurements of a tick 174 | -- 175 | -- returns a list of Nanos and the last evaluated f a 176 | ticks :: Int -> (a -> b) -> a -> IO ([Nanos], b) 177 | ticks = multi tick 178 | {-# INLINEABLE ticks #-} 179 | 180 | -- | n measurements of a tickIO 181 | -- 182 | -- returns an IO tuple; list of Nanos and the last evaluated f a 183 | ticksIO :: Int -> IO a -> IO ([Nanos], a) 184 | ticksIO = multiM tickIO 185 | {-# INLINEABLE ticksIO #-} 186 | 187 | -- | tick as a 'StepMeasure' 188 | stepTime :: StepMeasure IO Nanos 189 | stepTime = StepMeasure start stop 190 | where 191 | start = nanos 192 | stop r = fmap (\x -> x - r) nanos 193 | {-# INLINEABLE stepTime #-} 194 | 195 | -- | tick as a 'Measure' 196 | time :: Measure IO Nanos 197 | time = Measure tick 198 | {-# INLINEABLE time #-} 199 | 200 | -- | tick as a multi-Measure 201 | times :: Int -> Measure IO [Nanos] 202 | times n = Measure (ticks n) 203 | {-# INLINEABLE times #-} 204 | 205 | -- | tickWith as a multi-Measure 206 | timesWith :: Clock -> Int -> Measure IO [Nanos] 207 | timesWith c n = repeated n (Measure (tickWith c)) 208 | {-# INLINEABLE timesWith #-} 209 | 210 | -- | tickWith for n repeated applications 211 | timesN :: Int -> Measure IO Nanos 212 | timesN n = Measure (tickNWith defaultClock n) 213 | {-# INLINEABLE timesN #-} 214 | 215 | -- | tickWith for n repeated applications 216 | timesNWith :: Clock -> Int -> Measure IO Nanos 217 | timesNWith c n = Measure (tickNWith c n) 218 | {-# INLINEABLE timesNWith #-} 219 | 220 | tickNWith :: Clock -> Int -> (a -> b) -> a -> IO (Nanos, b) 221 | tickNWith c n !f !a = do 222 | !t <- nanosWith c 223 | !a' <- multiN id f a n 224 | !t' <- nanosWith c 225 | pure (floor @Double (fromIntegral (t' - t) / fromIntegral n), a') 226 | {-# INLINEABLE tickNWith #-} 227 | -------------------------------------------------------------------------------- /src/Perf/Types.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# OPTIONS_GHC -Wno-x-partial #-} 3 | 4 | -- | Abstract types of performance measurement. 5 | module Perf.Types 6 | ( -- * Measure 7 | Measure (..), 8 | repeated, 9 | StepMeasure (..), 10 | toMeasure, 11 | toMeasureN, 12 | step, 13 | stepM, 14 | multi, 15 | multiM, 16 | multiN, 17 | 18 | -- * function application 19 | fap, 20 | afap, 21 | ffap, 22 | fan, 23 | fam, 24 | (|$|), 25 | ($|), 26 | (|+|), 27 | 28 | -- * PerfT monad 29 | PerfT (..), 30 | Perf, 31 | runPerfT, 32 | evalPerfT, 33 | execPerfT, 34 | outer, 35 | slop, 36 | slops, 37 | ) 38 | where 39 | 40 | import Control.DeepSeq 41 | import Control.Monad 42 | import Control.Monad.State.Lazy 43 | import Data.Bifunctor 44 | import Data.Functor.Identity 45 | import Data.Map.Strict qualified as Map 46 | import Data.Text (Text) 47 | import GHC.Exts 48 | import GHC.IO hiding (liftIO) 49 | import Prelude 50 | 51 | -- | Abstraction of a performance measurement within a monadic context. 52 | -- 53 | -- - measure applies a function to a value, returning a tuple of the performance measure, and the computation result. 54 | -- - measureM evaluates a monadic value and returns a performance-result tuple. 55 | newtype Measure m t = Measure 56 | { measure :: forall a b. (a -> b) -> a -> m (t, b) 57 | } 58 | 59 | instance (Functor m) => Functor (Measure m) where 60 | fmap f (Measure m) = 61 | Measure 62 | (\f' a' -> fmap (first f) (m f' a')) 63 | 64 | -- | An inefficient application that runs the inner action twice. 65 | instance (Applicative m) => Applicative (Measure m) where 66 | pure t = Measure (\f a -> pure (t, f a)) 67 | (Measure mf) <*> (Measure mt) = 68 | Measure 69 | (\f a -> (\(nf', fa') (t', _) -> (nf' t', fa')) <$> mf f a <*> mt f a) 70 | 71 | -- | Convert a Measure into a multi measure. 72 | repeated :: (Applicative m) => Int -> Measure m t -> Measure m [t] 73 | repeated n (Measure p) = 74 | Measure 75 | (\f a -> fmap (\xs -> (fmap fst xs, snd (head xs))) (replicateM n (p f a))) 76 | {-# INLINEABLE repeated #-} 77 | 78 | -- | Abstraction of a performance measurement with a pre and a post step wrapping the computation. 79 | data StepMeasure m t = forall i. StepMeasure {pre :: m i, post :: i -> m t} 80 | 81 | instance (Functor m) => Functor (StepMeasure m) where 82 | fmap f (StepMeasure start stop) = StepMeasure start (fmap f . stop) 83 | 84 | instance (Applicative m) => Applicative (StepMeasure m) where 85 | pure t = StepMeasure (pure ()) (const (pure t)) 86 | (<*>) (StepMeasure fstart fstop) (StepMeasure start stop) = 87 | StepMeasure ((,) <$> fstart <*> start) (\(fi, i) -> fstop fi <*> stop i) 88 | 89 | -- | Convert a StepMeasure into a Measure 90 | toMeasure :: (Monad m) => StepMeasure m t -> Measure m t 91 | toMeasure (StepMeasure pre' post') = Measure (step pre' post') 92 | {-# INLINEABLE toMeasure #-} 93 | 94 | -- | Convert a StepMeasure into a Measure running the computation multiple times. 95 | toMeasureN :: (Monad m) => Int -> StepMeasure m t -> Measure m [t] 96 | toMeasureN n (StepMeasure pre' post') = Measure (multi (step pre' post') n) 97 | {-# INLINEABLE toMeasureN #-} 98 | 99 | -- | A single step measurement. 100 | step :: (Monad m) => m i -> (i -> m t) -> (a -> b) -> a -> m (t, b) 101 | step pre' post' !f !a = do 102 | !p <- pre' 103 | !b <- pure $! f a 104 | !t <- post' p 105 | pure (t, b) 106 | {-# INLINEABLE step #-} 107 | 108 | -- | A single step measurement. 109 | stepM :: (Monad m) => m i -> (i -> m t) -> m a -> m (t, a) 110 | stepM pre' post' a = do 111 | !p <- pre' 112 | !ma <- a 113 | !t <- post' p 114 | pure (t, ma) 115 | {-# INLINEABLE stepM #-} 116 | 117 | multi1 :: (Monad m) => ((a -> b) -> a -> m (t, b)) -> Int -> (a -> b) -> a -> m [(t, b)] 118 | multi1 action n !f !a = sequence $ replicate n $! action f a 119 | {-# INLINEABLE multi1 #-} 120 | 121 | -- | Return one result but multiple measurements. 122 | multi :: (Monad m) => ((a -> b) -> a -> m (t, b)) -> Int -> (a -> b) -> a -> m ([t], b) 123 | multi action n !f !a = do 124 | xs <- multi1 action n f a 125 | pure (fmap fst xs, snd (head xs)) 126 | {-# INLINEABLE multi #-} 127 | 128 | -- | Multiple measurements 129 | multiM :: (Monad m) => (m a -> m (t, a)) -> Int -> m a -> m ([t], a) 130 | multiM action n a = 131 | fmap (\xs -> (fmap fst xs, head $! fmap snd xs)) (replicateM n (action a)) 132 | {-# INLINEABLE multiM #-} 133 | 134 | multiN :: (b -> t) -> (a -> b) -> a -> Int -> IO t 135 | multiN frc = multiNLoop SPEC 136 | where 137 | multiNLoop !_ f x n 138 | | n == 1 = evaluate (frc (f x)) 139 | | otherwise = do 140 | _ <- evaluate (frc (f x)) 141 | multiNLoop SPEC f x (n - 1) 142 | {-# INLINE multiN #-} 143 | 144 | -- | Performance measurement transformer storing a 'Measure' and a map of named results. 145 | newtype PerfT m t a = PerfT 146 | { measurePerf :: StateT (Measure m t, Map.Map Text t) m a 147 | } 148 | deriving (Functor, Applicative, Monad) 149 | 150 | -- | The transformer over Identity 151 | type Perf t a = PerfT Identity t a 152 | 153 | instance (MonadIO m) => MonadIO (PerfT m t) where 154 | liftIO = PerfT . liftIO 155 | 156 | -- | Lift an application to a PerfT m, providing a label and a 'Measure'. 157 | -- 158 | -- Measurements with the same label will be mappended 159 | fap :: (MonadIO m, Semigroup t) => Text -> (a -> b) -> a -> PerfT m t b 160 | fap label f a = 161 | PerfT $ do 162 | m <- fst <$> get 163 | (t, fa) <- lift $ measure m f a 164 | modify $ second (Map.insertWith (<>) label t) 165 | return fa 166 | {-# INLINEABLE fap #-} 167 | 168 | -- | Lift an application to a PerfT m, forcing the argument. 169 | afap :: (NFData a, MonadIO m, Semigroup t) => Text -> (a -> b) -> a -> PerfT m t b 170 | afap label f a = fap label f (force a) 171 | {-# INLINEABLE afap #-} 172 | 173 | -- | Lift an application to a PerfT m, forcing argument and result. 174 | ffap :: (NFData a, NFData b, MonadIO m, Semigroup t) => Text -> (a -> b) -> a -> PerfT m t b 175 | ffap label f a = fap label (force . f) (force a) 176 | {-# INLINEABLE ffap #-} 177 | 178 | -- | Lift a number to a PerfT m, providing a label, function, and input. 179 | -- 180 | -- Measurements with the same label will be added 181 | fan :: (MonadIO m, Num t) => Text -> (a -> b) -> a -> PerfT m t b 182 | fan label f a = 183 | PerfT $ do 184 | m <- fst <$> get 185 | (t, fa) <- lift $ measure m f a 186 | modify $ second (Map.insertWith (+) label t) 187 | return fa 188 | {-# INLINEABLE fan #-} 189 | 190 | -- | Lift a monadic value to a PerfT m, providing a label and a 'Measure'. 191 | -- 192 | -- Measurements with the same label will be added 193 | fam :: (MonadIO m, Semigroup t) => Text -> m a -> PerfT m t a 194 | fam label a = 195 | PerfT $ do 196 | m <- fst <$> get 197 | (t, !ma) <- lift $ measure m (const a) () 198 | modify $ second (Map.insertWith (<>) label t) 199 | lift ma 200 | {-# INLINEABLE fam #-} 201 | 202 | -- | lift a pure, unnamed function application to PerfT 203 | (|$|) :: (Semigroup t) => (a -> b) -> a -> PerfT IO t b 204 | (|$|) f a = fap "" f a 205 | {-# INLINEABLE (|$|) #-} 206 | 207 | -- | lift a monadic, unnamed function application to PerfT 208 | ($|) :: (Semigroup t) => IO a -> PerfT IO t a 209 | ($|) a = fam "" a 210 | {-# INLINEABLE ($|) #-} 211 | 212 | -- | lift an unnamed numeric measure to PerfT 213 | (|+|) :: (Num t) => (a -> b) -> a -> PerfT IO t b 214 | (|+|) f a = fan "" f a 215 | {-# INLINEABLE (|+|) #-} 216 | 217 | -- | Run the performance measure, returning (computational result, measurement). 218 | runPerfT :: (Functor m) => Measure m t -> PerfT m t a -> m (a, Map.Map Text t) 219 | runPerfT m p = fmap (second snd) <$> flip runStateT (m, Map.empty) $ measurePerf p 220 | {-# INLINEABLE runPerfT #-} 221 | 222 | -- | Consume the PerfT layer and return the original monadic result. 223 | -- Fingers crossed, PerfT structure should be completely compiled away. 224 | evalPerfT :: (Monad m) => Measure m t -> PerfT m t a -> m a 225 | evalPerfT m p = fmap fst <$> flip runStateT (m, Map.empty) $ measurePerf p 226 | {-# INLINEABLE evalPerfT #-} 227 | 228 | -- | Consume a PerfT layer and return the measurement. 229 | execPerfT :: (Monad m) => Measure m t -> PerfT m t a -> m (Map.Map Text t) 230 | execPerfT m p = fmap snd <$> flip execStateT (m, Map.empty) $ measurePerf p 231 | {-# INLINEABLE execPerfT #-} 232 | 233 | -- | run a PerfT and also calculate performance over the entire computation 234 | outer :: (MonadIO m, Semigroup s) => Text -> Measure m s -> Measure m t -> PerfT m t a -> m (a, (Map.Map Text s, Map.Map Text t)) 235 | outer label outerm meas p = 236 | (\((a, m), m') -> (a, (m', m))) 237 | <$> runPerfT 238 | outerm 239 | (fam label (runPerfT meas p)) 240 | 241 | -- | run a PerfT and calculate excess performance over the entire computation 242 | slop :: (MonadIO m, Num t, Semigroup t) => Text -> Measure m t -> PerfT m t a -> m (a, Map.Map Text t) 243 | slop l meas p = 244 | (\((a, m), m') -> (a, m <> Map.insert "slop" (m' Map.! l - Map.foldl' (+) 0 m) m')) 245 | <$> runPerfT 246 | meas 247 | (fam l (runPerfT meas p)) 248 | 249 | -- | run a multi PerfT and calculate excess performance over the entire computation 250 | slops :: (MonadIO m, Num t, Semigroup t) => Int -> Measure m t -> PerfT m [t] a -> m (a, (Map.Map Text t, Map.Map Text [t])) 251 | slops n meas p = 252 | (\((a, ms), m') -> (a, (Map.insert "slop" (m' Map.! "outer" - Map.foldl' (+) 0 (fmap sum ms)) m', ms))) 253 | <$> runPerfT 254 | meas 255 | (fam "outer" (runPerfT (repeated n meas) p)) 256 | --------------------------------------------------------------------------------