├── .github └── workflows │ └── haskell-ci.yml ├── .gitignore ├── .travis.yml.bak ├── CHANGELOG.md ├── LICENSE ├── README.md ├── Setup.hs ├── bench └── Main.hs ├── foldl-statistics.cabal ├── src └── Control │ └── Foldl │ └── Statistics.hs ├── stack-6.yaml ├── stack-7.yaml ├── stack.yaml └── test └── Spec.hs /.github/workflows/haskell-ci.yml: -------------------------------------------------------------------------------- 1 | # This GitHub workflow config has been generated by a script via 2 | # 3 | # haskell-ci 'github' 'cabal.project' 4 | # 5 | # To regenerate the script (for example after adjusting tested-with) run 6 | # 7 | # haskell-ci regenerate 8 | # 9 | # For more information, see https://github.com/haskell-CI/haskell-ci 10 | # 11 | # version: 0.18.1 12 | # 13 | # REGENDATA ("0.18.1",["github","cabal.project"]) 14 | # 15 | name: Haskell-CI 16 | on: 17 | - push 18 | - pull_request 19 | jobs: 20 | linux: 21 | name: Haskell-CI - Linux - ${{ matrix.compiler }} 22 | runs-on: ubuntu-20.04 23 | timeout-minutes: 24 | 60 25 | container: 26 | image: buildpack-deps:bionic 27 | continue-on-error: ${{ matrix.allow-failure }} 28 | strategy: 29 | matrix: 30 | include: 31 | - compiler: ghc-9.8.2 32 | compilerKind: ghc 33 | compilerVersion: 9.8.2 34 | setup-method: ghcup 35 | allow-failure: false 36 | - compiler: ghc-9.6.4 37 | compilerKind: ghc 38 | compilerVersion: 9.6.4 39 | setup-method: ghcup 40 | allow-failure: false 41 | - compiler: ghc-9.4.8 42 | compilerKind: ghc 43 | compilerVersion: 9.4.8 44 | setup-method: ghcup 45 | allow-failure: false 46 | - compiler: ghc-9.2.8 47 | compilerKind: ghc 48 | compilerVersion: 9.2.8 49 | setup-method: ghcup 50 | allow-failure: false 51 | - compiler: ghc-9.0.2 52 | compilerKind: ghc 53 | compilerVersion: 9.0.2 54 | setup-method: ghcup 55 | allow-failure: false 56 | - compiler: ghc-8.6.1 57 | compilerKind: ghc 58 | compilerVersion: 8.6.1 59 | setup-method: hvr-ppa 60 | allow-failure: false 61 | - compiler: ghc-8.4.3 62 | compilerKind: ghc 63 | compilerVersion: 8.4.3 64 | setup-method: hvr-ppa 65 | allow-failure: false 66 | - compiler: ghc-8.2.2 67 | compilerKind: ghc 68 | compilerVersion: 8.2.2 69 | setup-method: hvr-ppa 70 | allow-failure: false 71 | - compiler: ghc-8.0.2 72 | compilerKind: ghc 73 | compilerVersion: 8.0.2 74 | setup-method: hvr-ppa 75 | allow-failure: false 76 | - compiler: ghc-7.10.3 77 | compilerKind: ghc 78 | compilerVersion: 7.10.3 79 | setup-method: hvr-ppa 80 | allow-failure: false 81 | - compiler: ghc-7.8.4 82 | compilerKind: ghc 83 | compilerVersion: 7.8.4 84 | setup-method: hvr-ppa 85 | allow-failure: false 86 | fail-fast: false 87 | steps: 88 | - name: apt 89 | run: | 90 | apt-get update 91 | apt-get install -y --no-install-recommends gnupg ca-certificates dirmngr curl git software-properties-common libtinfo5 92 | if [ "${{ matrix.setup-method }}" = ghcup ]; then 93 | mkdir -p "$HOME/.ghcup/bin" 94 | curl -sL https://downloads.haskell.org/ghcup/0.1.20.0/x86_64-linux-ghcup-0.1.20.0 > "$HOME/.ghcup/bin/ghcup" 95 | chmod a+x "$HOME/.ghcup/bin/ghcup" 96 | "$HOME/.ghcup/bin/ghcup" install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false) 97 | "$HOME/.ghcup/bin/ghcup" install cabal 3.10.2.0 || (cat "$HOME"/.ghcup/logs/*.* && false) 98 | else 99 | apt-add-repository -y 'ppa:hvr/ghc' 100 | apt-get update 101 | apt-get install -y "$HCNAME" 102 | mkdir -p "$HOME/.ghcup/bin" 103 | curl -sL https://downloads.haskell.org/ghcup/0.1.20.0/x86_64-linux-ghcup-0.1.20.0 > "$HOME/.ghcup/bin/ghcup" 104 | chmod a+x "$HOME/.ghcup/bin/ghcup" 105 | 
"$HOME/.ghcup/bin/ghcup" install cabal 3.10.2.0 || (cat "$HOME"/.ghcup/logs/*.* && false) 106 | fi 107 | env: 108 | HCKIND: ${{ matrix.compilerKind }} 109 | HCNAME: ${{ matrix.compiler }} 110 | HCVER: ${{ matrix.compilerVersion }} 111 | - name: Set PATH and environment variables 112 | run: | 113 | echo "$HOME/.cabal/bin" >> $GITHUB_PATH 114 | echo "LANG=C.UTF-8" >> "$GITHUB_ENV" 115 | echo "CABAL_DIR=$HOME/.cabal" >> "$GITHUB_ENV" 116 | echo "CABAL_CONFIG=$HOME/.cabal/config" >> "$GITHUB_ENV" 117 | HCDIR=/opt/$HCKIND/$HCVER 118 | if [ "${{ matrix.setup-method }}" = ghcup ]; then 119 | HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER") 120 | HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#') 121 | HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#') 122 | echo "HC=$HC" >> "$GITHUB_ENV" 123 | echo "HCPKG=$HCPKG" >> "$GITHUB_ENV" 124 | echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV" 125 | echo "CABAL=$HOME/.ghcup/bin/cabal-3.10.2.0 -vnormal+nowrap" >> "$GITHUB_ENV" 126 | else 127 | HC=$HCDIR/bin/$HCKIND 128 | echo "HC=$HC" >> "$GITHUB_ENV" 129 | echo "HCPKG=$HCDIR/bin/$HCKIND-pkg" >> "$GITHUB_ENV" 130 | echo "HADDOCK=$HCDIR/bin/haddock" >> "$GITHUB_ENV" 131 | echo "CABAL=$HOME/.ghcup/bin/cabal-3.10.2.0 -vnormal+nowrap" >> "$GITHUB_ENV" 132 | fi 133 | 134 | HCNUMVER=$(${HC} --numeric-version|perl -ne '/^(\d+)\.(\d+)\.(\d+)(\.(\d+))?$/; print(10000 * $1 + 100 * $2 + ($3 == 0 ? $5 != 1 : $3))') 135 | echo "HCNUMVER=$HCNUMVER" >> "$GITHUB_ENV" 136 | echo "ARG_TESTS=--enable-tests" >> "$GITHUB_ENV" 137 | echo "ARG_BENCH=--enable-benchmarks" >> "$GITHUB_ENV" 138 | echo "HEADHACKAGE=false" >> "$GITHUB_ENV" 139 | echo "ARG_COMPILER=--$HCKIND --with-compiler=$HC" >> "$GITHUB_ENV" 140 | echo "GHCJSARITH=0" >> "$GITHUB_ENV" 141 | env: 142 | HCKIND: ${{ matrix.compilerKind }} 143 | HCNAME: ${{ matrix.compiler }} 144 | HCVER: ${{ matrix.compilerVersion }} 145 | - name: env 146 | run: | 147 | env 148 | - name: write cabal config 149 | run: | 150 | mkdir -p $CABAL_DIR 151 | cat >> $CABAL_CONFIG <> $CABAL_CONFIG < cabal-plan.xz 184 | echo 'f62ccb2971567a5f638f2005ad3173dba14693a45154c1508645c52289714cb2 cabal-plan.xz' | sha256sum -c - 185 | xz -d < cabal-plan.xz > $HOME/.cabal/bin/cabal-plan 186 | rm -f cabal-plan.xz 187 | chmod a+x $HOME/.cabal/bin/cabal-plan 188 | cabal-plan --version 189 | - name: checkout 190 | uses: actions/checkout@v3 191 | with: 192 | path: source 193 | - name: initial cabal.project for sdist 194 | run: | 195 | touch cabal.project 196 | echo "packages: $GITHUB_WORKSPACE/source/." 
>> cabal.project 197 | cat cabal.project 198 | - name: sdist 199 | run: | 200 | mkdir -p sdist 201 | $CABAL sdist all --output-dir $GITHUB_WORKSPACE/sdist 202 | - name: unpack 203 | run: | 204 | mkdir -p unpacked 205 | find sdist -maxdepth 1 -type f -name '*.tar.gz' -exec tar -C $GITHUB_WORKSPACE/unpacked -xzvf {} \; 206 | - name: generate cabal.project 207 | run: | 208 | PKGDIR_foldl_statistics="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/foldl-statistics-[0-9.]*')" 209 | echo "PKGDIR_foldl_statistics=${PKGDIR_foldl_statistics}" >> "$GITHUB_ENV" 210 | rm -f cabal.project cabal.project.local 211 | touch cabal.project 212 | touch cabal.project.local 213 | echo "packages: ${PKGDIR_foldl_statistics}" >> cabal.project 214 | if [ $((HCNUMVER >= 80200)) -ne 0 ] ; then echo "package foldl-statistics" >> cabal.project ; fi 215 | if [ $((HCNUMVER >= 80200)) -ne 0 ] ; then echo " ghc-options: -Werror=missing-methods" >> cabal.project ; fi 216 | cat >> cabal.project <> cabal.project.local 219 | cat cabal.project 220 | cat cabal.project.local 221 | - name: dump install plan 222 | run: | 223 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dry-run all 224 | cabal-plan 225 | - name: restore cache 226 | uses: actions/cache/restore@v3 227 | with: 228 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 229 | path: ~/.cabal/store 230 | restore-keys: ${{ runner.os }}-${{ matrix.compiler }}- 231 | - name: install dependencies 232 | run: | 233 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks --dependencies-only -j2 all 234 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dependencies-only -j2 all 235 | - name: build w/o tests 236 | run: | 237 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 238 | - name: build 239 | run: | 240 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --write-ghc-environment-files=always 241 | - name: tests 242 | run: | 243 | $CABAL v2-test $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --test-show-details=direct 244 | - name: cabal check 245 | run: | 246 | cd ${PKGDIR_foldl_statistics} || false 247 | ${CABAL} -vnormal check 248 | - name: haddock 249 | run: | 250 | $CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all 251 | - name: unconstrained build 252 | run: | 253 | rm -f cabal.project.local 254 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 255 | - name: save cache 256 | uses: actions/cache/save@v3 257 | if: always() 258 | with: 259 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 260 | path: ~/.cabal/store 261 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | cabal-dev 3 | *.o 4 | *.hi 5 | *.chi 6 | *.chs.h 7 | *.dyn_o 8 | *.dyn_hi 9 | .hpc 10 | .hsenv 11 | .cabal-sandbox/ 12 | cabal.sandbox.config 13 | *.prof 14 | *.aux 15 | *.hp 16 | .stack-work/ 17 | -------------------------------------------------------------------------------- /.travis.yml.bak: -------------------------------------------------------------------------------- 1 | # This Travis job script has been generated by a script via 2 | # 3 | # runghc make_travis_yml_2.hs 'foldl-statistics.cabal' 4 | # 5 | # For more information, see https://github.com/haskell-CI/haskell-ci 6 | # 7 | language: c 8 | sudo: false 9 | 10 | git: 11 | submodules: false # whether to recursively clone 
submodules 12 | 13 | cache: 14 | directories: 15 | - $HOME/.cabal/packages 16 | - $HOME/.cabal/store 17 | 18 | before_cache: 19 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/build-reports.log 20 | # remove files that are regenerated by 'cabal update' 21 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/00-index.* 22 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/*.json 23 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.cache 24 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.tar 25 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.tar.idx 26 | 27 | - rm -rfv $HOME/.cabal/packages/head.hackage 28 | 29 | matrix: 30 | include: 31 | - compiler: "ghc-8.6.1" 32 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 33 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-8.6.1], sources: [hvr-ghc]}} 34 | - compiler: "ghc-8.4.3" 35 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 36 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-2.2,ghc-8.4.3], sources: [hvr-ghc]}} 37 | - compiler: "ghc-8.2.2" 38 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 39 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-2.2,ghc-8.2.2], sources: [hvr-ghc]}} 40 | - compiler: "ghc-8.0.2" 41 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 42 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-2.2,ghc-8.0.2], sources: [hvr-ghc]}} 43 | - compiler: "ghc-7.10.2" 44 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 45 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-2.2,ghc-7.10.2], sources: [hvr-ghc]}} 46 | - compiler: "ghc-7.8.4" 47 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 48 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-2.2,ghc-7.8.4], sources: [hvr-ghc]}} 49 | 50 | before_install: 51 | - HC=${CC} 52 | - HCPKG=${HC/ghc/ghc-pkg} 53 | - unset CC 54 | - ROOTDIR=$(pwd) 55 | - mkdir -p $HOME/.local/bin 56 | - "PATH=/opt/ghc/bin:/opt/ghc-ppa-tools/bin:$HOME/local/bin:$PATH" 57 | - HCNUMVER=$(( $(${HC} --numeric-version|sed -E 's/([0-9]+)\.([0-9]+)\.([0-9]+).*/\1 * 10000 + \2 * 100 + \3/') )) 58 | - echo $HCNUMVER 59 | 60 | install: 61 | - cabal --version 62 | - echo "$(${HC} --version) [$(${HC} --print-project-git-commit-id 2> /dev/null || echo '?')]" 63 | - BENCH=${BENCH---enable-benchmarks} 64 | - TEST=${TEST---enable-tests} 65 | - HADDOCK=${HADDOCK-true} 66 | - UNCONSTRAINED=${UNCONSTRAINED-true} 67 | - NOINSTALLEDCONSTRAINTS=${NOINSTALLEDCONSTRAINTS-false} 68 | - GHCHEAD=${GHCHEAD-false} 69 | - travis_retry cabal update -v 70 | - "sed -i.bak 's/^jobs:/-- jobs:/' ${HOME}/.cabal/config" 71 | - rm -fv cabal.project cabal.project.local 72 | - grep -Ev -- '^\s*--' ${HOME}/.cabal/config | grep -Ev '^\s*$' 73 | - "printf 'packages: \".\"\\n' > cabal.project" 74 | - touch cabal.project.local 75 | - "if ! $NOINSTALLEDCONSTRAINTS; then for pkg in $($HCPKG list --simple-output); do echo $pkg | grep -vw -- foldl-statistics | sed 's/^/constraints: /' | sed 's/-[^-]*$/ installed/' >> cabal.project.local; done; fi" 76 | - cat cabal.project || true 77 | - cat cabal.project.local || true 78 | - if [ -f "./configure.ac" ]; then 79 | (cd "." 
&& autoreconf -i); 80 | fi 81 | - rm -f cabal.project.freeze 82 | - cabal new-build -w ${HC} ${TEST} ${BENCH} --project-file="cabal.project" --dep -j2 all 83 | - cabal new-build -w ${HC} --disable-tests --disable-benchmarks --project-file="cabal.project" --dep -j2 all 84 | - rm -rf .ghc.environment.* "."/dist 85 | - DISTDIR=$(mktemp -d /tmp/dist-test.XXXX) 86 | 87 | # Here starts the actual work to be performed for the package under test; 88 | # any command which exits with a non-zero exit code causes the build to fail. 89 | script: 90 | # test that source-distributions can be generated 91 | - (cd "." && cabal sdist) 92 | - mv "."/dist/foldl-statistics-*.tar.gz ${DISTDIR}/ 93 | - cd ${DISTDIR} || false 94 | - find . -maxdepth 1 -name '*.tar.gz' -exec tar -xvf '{}' \; 95 | - "printf 'packages: foldl-statistics-*/*.cabal\\n' > cabal.project" 96 | - touch cabal.project.local 97 | - "if ! $NOINSTALLEDCONSTRAINTS; then for pkg in $($HCPKG list --simple-output); do echo $pkg | grep -vw -- foldl-statistics | sed 's/^/constraints: /' | sed 's/-[^-]*$/ installed/' >> cabal.project.local; done; fi" 98 | - cat cabal.project || true 99 | - cat cabal.project.local || true 100 | # this builds all libraries and executables (without tests/benchmarks) 101 | - cabal new-build -w ${HC} --disable-tests --disable-benchmarks all 102 | 103 | # build & run tests, build benchmarks 104 | - cabal new-build -w ${HC} ${TEST} ${BENCH} all 105 | - if [ "x$TEST" = "x--enable-tests" ]; then cabal new-test -w ${HC} ${TEST} ${BENCH} all; fi 106 | 107 | # cabal check 108 | - (cd foldl-statistics-* && cabal check) 109 | 110 | # haddock 111 | - rm -rf ./dist-newstyle 112 | - if $HADDOCK; then cabal new-haddock -w ${HC} ${TEST} ${BENCH} all; else echo "Skipping haddock generation";fi 113 | 114 | # Build without installed constraints for packages in global-db 115 | - if $UNCONSTRAINED; then rm -f cabal.project.local; echo cabal new-build -w ${HC} --disable-tests --disable-benchmarks all; else echo "Not building without installed constraints"; fi 116 | 117 | # REGENDATA ["foldl-statistics.cabal"] 118 | # EOF 119 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.1.5.1 2 | 3 | - GHC 8.6 compatibility (Thanks @gwils) 4 | 5 | # 0.1.5.0 6 | 7 | - Added `histogram`, `histogram'` and `ordersOfMagnitude`. 8 | 9 | # 0.1.4.6 10 | 11 | - Relax bounds on tasty-quickcheck 12 | 13 | # 0.1.4.5 14 | 15 | - Fix dependencies for GHC < 8.0 16 | 17 | # 0.1.4.4 18 | 19 | - Update to foldl < 1.4 20 | 21 | # 0.1.4.3 22 | 23 | - Update to statistics 0.14 24 | 25 | # 0.1.4.1 26 | 27 | - foldl >= 1.2.2 exports `mean` and `variance`, so hide them.
28 | - export `lrrCount` 29 | 30 | # 0.1.4.0 31 | 32 | - Added monoidal interface to linear regression 33 | 34 | # 0.1.3.0 35 | 36 | - Added unbiased versions of LMVSK functions 37 | 38 | # 0.1.2.0 39 | 40 | - Exposed monoidal LMVSKState 41 | - Improved testing, including for fastLMVSK 42 | 43 | # 0.1.1.0 44 | 45 | - Add fastLMVSK (length, mean, variance, skewness and kurtosis) 46 | - Add fastLinearReg (count, slope, (Y) intercept and correlation of `(x,y)`) 47 | 48 | 49 | # 0.1.0.0 50 | 51 | - Initial release 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009, 2010 Bryan O'Sullivan 2 | Copyright (c) 2016, Commonwealth Scientific and Industrial Research Organisation 3 | (CSIRO) ABN 41 687 119 230. 4 | 5 | All rights reserved. CSIRO is willing to grant you a license to this 6 | software on the following terms, except where otherwise indicated for 7 | third party material. 8 | 9 | Redistribution and use of this software in source and binary forms, with or 10 | without modification, are permitted provided that the following conditions are 11 | met: 12 | 13 | * Redistributions of source code must retain the above copyright notice, this 14 | list of conditions and the following disclaimer. 15 | 16 | * Redistributions in binary form must reproduce the above copyright notice, this 17 | list of conditions and the following disclaimer in the documentation and/or 18 | other materials provided with the distribution. 19 | 20 | * Neither the name of CSIRO nor the names of its contributors may be used to 21 | endorse or promote products derived from this software without specific prior 22 | written permission of CSIRO. 23 | 24 | EXCEPT AS EXPRESSLY STATED IN THIS AGREEMENT AND TO THE FULL EXTENT PERMITTED BY 25 | APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO MAKES NO 26 | REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, 27 | INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS 28 | REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, 29 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, THE ABSENCE 30 | OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 31 | DISCOVERABLE. 32 | 33 | TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO BE 34 | LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION FOR 35 | BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR 36 | OTHER LIABILITY HOWSOEVER INCURRED. WITHOUT LIMITING THE SCOPE OF THE PREVIOUS 37 | SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR 38 | OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF 39 | ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC 40 | LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY 41 | DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS AGREEMENT, ACCESS OF THE 42 | SOFTWARE OR ANY OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO HAS BEEN ADVISED 43 | OF THE POSSIBILITY OF SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY.
44 | 45 | APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY APPLY 46 | REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS OR LIABILITY 47 | ON CSIRO THAT CANNOT BE EXCLUDED, RESTRICTED OR MODIFIED TO THE FULL EXTENT SET 48 | OUT IN THE EXPRESS TERMS OF THIS CLAUSE ABOVE "CONSUMER GUARANTEES". TO THE 49 | EXTENT THAT SUCH CONSUMER GUARANTEES CONTINUE TO APPLY, THEN TO THE FULL EXTENT 50 | PERMITTED BY THE APPLICABLE LEGISLATION, THE LIABILITY OF CSIRO UNDER THE 51 | RELEVANT CONSUMER GUARANTEE IS LIMITED (WHERE PERMITTED AT CSIRO'S OPTION) TO 52 | ONE OF FOLLOWING REMEDIES OR SUBSTANTIALLY EQUIVALENT REMEDIES: 53 | 54 | (a) THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT 55 | SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN; 56 | (b) THE REPAIR OF THE SOFTWARE; 57 | (c) THE PAYMENT OF THE COST OF REPLACING THE 58 | SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE 59 | RELEVANT SERVICES SUPPLIED AGAIN, OR HAVING THE SOFTWARE 60 | REPAIRED. 61 | 62 | IN THIS CLAUSE, CSIRO INCLUDES ANY THIRD PARTY AUTHOR OR OWNER OF ANY PART OF 63 | THE SOFTWARE OR MATERIAL DISTRIBUTED WITH IT. CSIRO MAY ENFORCE ANY RIGHTS ON 64 | BEHALF OF THE RELEVANT THIRD PARTY. 65 | 66 | Third Party Components 67 | 68 | The following third party components are distributed with the Software. You 69 | agree to comply with the license terms for these components as part of accessing 70 | the Software. Other third party software may also be identified in separate 71 | files distributed with the Software. 72 | 73 | ___________________________________________________________________ 74 | ___________________________________________________________________ 75 | The following Haskell library dependencies may be obtained from 76 | https://hackage.haskell.org/packages/ 77 | 78 | StateVar 1.1.0.4 79 | array 0.5.1.0 80 | base 4.8.2.0 81 | base-orphans 0.5.4 82 | bifunctors 5.2 83 | binary 0.7.5.0 84 | bytestring 0.10.6.0 85 | comonad 4.2.7.2 86 | containers 0.5.6.2 87 | contravariant 1.4 88 | deepseq 1.4.1.1 89 | distributive 0.5.0.2 90 | erf 2.0.0.0 91 | foldl 1.2.1 92 | foldl-statistics 0.1.0.0 93 | ghc-prim 0.4.0.0 94 | hashable 1.2.4.0 95 | integer-gmp 1.0.0.0 96 | math-functions 0.1.7.0 97 | mwc-random 0.13.4.0 98 | primitive 0.6.1.0 99 | profunctors 5.2 100 | semigroups 0.18.1 101 | stm 2.4.4.1 102 | tagged 0.8.4 103 | template-haskell 2.10.0.0 104 | text 1.2.2.1 105 | time 1.5.0.1 106 | transformers 0.4.2.0 107 | transformers-compat 0.4.0.4 108 | unordered-containers 0.2.7.1 109 | vector 0.11.0.0 110 | vector-th-unbox 0.2.1.6 111 | void 0.7.1 112 | ___________________________________________________________________ 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # foldl-statistics [![Build Status](https://travis-ci.org/data61/foldl-statistics.svg?branch=master)](https://travis-ci.org/data61/foldl-statistics) 2 | A reimplementation of the [Statistics.Sample](https://hackage.haskell.org/package/statistics/docs/Statistics-Sample.html) 3 | module using the [foldl](https://www.stackage.org/lts-5.1/package/foldl) package. 4 | The intention of this package is to allow these algorithms to be used on a much broader set of data input types, 5 | including lists and streaming libraries such as `conduit` and `pipes`, and any other type which is `Foldable`. 
6 | 7 | All statistics in this package can be computed with no more than two passes over the data - once to compute the mean and once to compute 8 | any statistics which require the mean. This is achieved because foldl `Fold`s are `Applicative`, which means that to compute, for example, four central moments (the 2nd through 5th) as well as the count, the following could be used: 9 | 10 | ```haskell 11 | import Control.Foldl as F 12 | 13 | ... 14 | 15 | dataseries :: [Double] 16 | dataseries = ... 17 | 18 | ... 19 | let m = F.fold mean dataseries 20 | (c2,c3,c4,c5,n) = flip F.fold dataseries $ 21 | (\(c2,c3) (c4,c5) n -> (c2,c3,c4,c5,n)) 22 | <$> centralMoments 2 3 m 23 | <*> centralMoments 4 5 m 24 | <*> F.length 25 | ``` 26 | 27 | which traverses the data twice, once to compute the mean `m`, and once to compute all the central moments and the count concurrently. This also brings with it, for free, the ability to compute streaming statistics, such as the mean of all data seen so far, using `foldl`'s `scan` function (see the example at the end of this README). 28 | 29 | Where possible, care has been taken to ensure the numerical stability of the computation of statistics. 30 | 31 | Several algorithms require the mean of the data to be known before computing the statistic, such as `skewness`, `kurtosis` and other `centralMoment`s. 32 | There are 'fast' implementations for calculating the variance, unbiased variance and standard deviation, which can be computed without knowing the mean 33 | *a priori*, but which may produce less accurate results. 34 | 35 | ## Performance & Correctness 36 | Benchmarks are included comparing performance to the [statistics](https://hackage.haskell.org/package/statistics) package. In nearly all cases, the implementations in this package perform better than those in `statistics` on the same inputs, and in several cases, performing two passes (to compute the mean and another statistic) is faster than the equivalent `statistics` implementation. 37 | 38 | This speed has not come at the cost of correctness; all `Fold`s are tested against their `statistics` counterparts to ensure the results are identical. 39 | 40 | These results can be confirmed by running 41 | 42 | stack build --test --bench --benchmark-arguments "--output bench.html" 43 | 44 | which will print out the results of the tests against `statistics` and then run the benchmark (this may take several minutes and is best run on a "quiet" machine which is doing very little other than running the benchmark). The results of the benchmarking are then available in the file `bench.html`.
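## Streaming statistics

Because all of the statistics here are ordinary `foldl` `Fold`s, a running statistic over every prefix of the data can be obtained with `foldl`'s `scan` function. A minimal sketch, reusing the hypothetical `dataseries` from above:

```haskell
-- The mean of each successive prefix of the series; the first entry,
-- corresponding to the empty prefix, is NaN (0/0).
runningMeans :: [Double]
runningMeans = F.scan mean dataseries
```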
45 | 46 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /bench/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | 3 | module Main where 4 | 5 | import Control.Monad.ST (runST) 6 | import Criterion.Main 7 | import qualified Data.Vector.Unboxed as U 8 | import qualified Statistics.Sample as S 9 | -- import Statistics.Transform 10 | import System.Random.MWC 11 | #if MIN_VERSION_foldl(1,2,2) 12 | import Control.Foldl as F hiding (mean, variance) 13 | #else 14 | import Control.Foldl as F 15 | #endif 16 | 17 | import Control.Foldl.Statistics 18 | 19 | 20 | -- Test sample 21 | {-# NOINLINE sample #-} 22 | sample :: U.Vector Double 23 | #if MIN_VERSION_mwc_random(0,15,0) 24 | sample = runST $ do 25 | g <- create 26 | U.replicateM 10000 (uniformRM (-10.0,10.0) g) 27 | #else 28 | sample = runST $ flip uniformVector 10000 =<< create 29 | #endif 30 | 31 | {-# NOINLINE sample2 #-} 32 | sample2 :: U.Vector (Double,Double) 33 | #if MIN_VERSION_mwc_random(0,15,0) 34 | sample2 = runST $ do 35 | g <- create 36 | U.replicateM 10000 ((,) <$> (uniformRM (-10.0,10.0) g) <*> (uniformRM (0.2,5.0) g)) 37 | #else 38 | sample2 = runST $ flip uniformVector 10000 =<< create 39 | #endif 40 | 41 | {-# NOINLINE absSample #-} 42 | absSample :: U.Vector Double 43 | absSample = U.map abs sample 44 | 45 | -- Weighted test sample 46 | {-# NOINLINE sampleW #-} 47 | sampleW :: U.Vector (Double,Double) 48 | sampleW = U.zip sample (U.reverse sample) 49 | 50 | m, mw :: Double 51 | m = F.fold mean (U.toList sample) 52 | 53 | mw = F.fold meanWeighted (U.toList sampleW) 54 | 55 | 56 | 57 | 58 | main :: IO () 59 | main = defaultMain 60 | [ bgroup "Statistics of location" 61 | [ bgroup "mean" 62 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold mean (U.toList vec)) sample 63 | , bench "Statistics.Sample" $ nf S.mean sample 64 | ] 65 | , bgroup "meanWeighted" 66 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold meanWeighted (U.toList vec)) sampleW 67 | , bench "Statistics.Sample" $ nf S.meanWeighted sampleW 68 | ] 69 | , bgroup "welfordMean" 70 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold welfordMean (U.toList vec)) sample 71 | , bench "Statistics.Sample" $ nf S.welfordMean sample 72 | ] 73 | , bgroup "harmonicMean" 74 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold harmonicMean (U.toList vec)) sample 75 | , bench "Statistics.Sample" $ nf S.harmonicMean sample 76 | ] 77 | , bgroup "geometricMean" 78 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold geometricMean (U.toList vec)) absSample 79 | , bench "Statistics.Sample" $ nf S.geometricMean absSample 80 | ] 81 | ] 82 | , bgroup "Single-pass" 83 | [ bgroup "fastVariance" 84 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold fastVariance (U.toList vec)) sample 85 | , bench "Statistics.Sample" $ nf S.fastVariance sample 86 | ] 87 | , bgroup "fastVarianceUnbiased" 88 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold fastVarianceUnbiased (U.toList vec)) sample 89 | , bench "Statistics.Sample" $ nf S.fastVarianceUnbiased sample 90 | ] 91 | , bgroup "fastStdDev" 92 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold fastStdDev (U.toList vec)) sample 93 | , bench "Statistics.Sample" $ nf S.fastStdDev sample 94 | ] 95 | , bgroup "fastLMVSK" 96 | -- T4 is strict in all arguments, 
so WHNF ok here 97 | [bench "C.F.Statistics" $ whnf (\vec -> F.fold fastLMVSK (U.toList vec)) sample 98 | ] 99 | , bgroup "fastLinearReg" 100 | [bench "fastLinearReg" $ whnf (\vec -> F.fold fastLinearReg (U.toList vec)) sample2 101 | ] 102 | ] 103 | 104 | , bgroup "requiring the mean" 105 | [ bgroup "variance" 106 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (variance m) (U.toList vec)) sample 107 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (variance (F.fold mean (U.toList vec))) (U.toList vec)) sample 108 | , bench "Statistics.Sample" $ nf S.variance sample 109 | ] 110 | , bgroup "varianceUnbiased" 111 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (varianceUnbiased m) (U.toList vec)) sample 112 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (varianceUnbiased (F.fold mean (U.toList vec))) (U.toList vec)) sample 113 | , bench "Statistics.Sample" $ nf S.varianceUnbiased sample 114 | ] 115 | , bgroup "stdDev" 116 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (stdDev m) (U.toList vec)) sample 117 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (stdDev (F.fold mean (U.toList vec))) (U.toList vec)) sample 118 | , bench "Statistics.Sample" $ nf S.stdDev sample 119 | ] 120 | , bgroup "varianceWeighted" 121 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (varianceWeighted m) (U.toList vec)) sampleW 122 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (varianceWeighted (F.fold meanWeighted (U.toList vec))) (U.toList vec)) sampleW 123 | , bench "Statistics.Sample" $ nf S.varianceWeighted sampleW 124 | ] 125 | ] 126 | 127 | , bgroup "over central moments" 128 | [ bgroup "skewness" 129 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (skewness m) (U.toList vec)) sample 130 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (skewness (F.fold mean (U.toList vec))) (U.toList vec)) sample 131 | , bench "Statistics.Sample" $ nf S.skewness sample 132 | ] 133 | , bgroup "kurtosis" 134 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (kurtosis m) (U.toList vec)) sample 135 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (kurtosis (F.fold mean (U.toList vec))) (U.toList vec)) sample 136 | , bench "Statistics.Sample" $ nf S.kurtosis sample 137 | ] 138 | , bgroup "centralMoment 2" 139 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (centralMoment 2 m) (U.toList vec)) sample 140 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoment 2 (F.fold mean (U.toList vec))) (U.toList vec)) sample 141 | , bench "Statistics.Sample" $ nf (S.centralMoment 2) sample 142 | ] 143 | , bgroup "centralMoment 3" 144 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (centralMoment 3 m) (U.toList vec)) sample 145 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoment 3 (F.fold mean (U.toList vec))) (U.toList vec)) sample 146 | , bench "Statistics.Sample" $ nf (S.centralMoment 3) sample 147 | ] 148 | , bgroup "centralMoment 4" 149 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (centralMoment 4 m) (U.toList vec)) sample 150 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoment 4 (F.fold mean (U.toList vec))) (U.toList vec)) sample 151 | , bench "Statistics.Sample" $ nf (S.centralMoment 4) sample 152 | ] 153 | , bgroup "centralMoment 7" 154 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (centralMoment 7 m) (U.toList vec)) sample 155 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoment 7 (F.fold mean (U.toList vec))) (U.toList vec)) sample 156 | , bench "Statistics.Sample" $ nf (S.centralMoment 7) sample 157 | ] 158 | , bgroup "centralMoments 4 9" 159 | [ bench "C.F.Statistics" $ 
nf (\vec -> F.fold (centralMoments 4 9 m) (U.toList vec)) sample 160 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoments 4 9 (F.fold mean (U.toList vec))) (U.toList vec)) sample 161 | , bench "Statistics.Sample" $ nf (S.centralMoments 4 9) sample 162 | ] 163 | , bgroup "centralMoments' 4 9" 164 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (centralMoments' 4 9 m) (U.toList vec)) sample 165 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoments' 4 9 (F.fold mean (U.toList vec))) (U.toList vec)) sample 166 | ] 167 | ] 168 | ] 169 | -------------------------------------------------------------------------------- /foldl-statistics.cabal: -------------------------------------------------------------------------------- 1 | name: foldl-statistics 2 | version: 0.1.5.1 3 | synopsis: Statistical functions from the statistics package implemented as 4 | Folds. 5 | description: The use of this package allows statistics to be computed using at most two 6 | passes over the input data, one to compute a mean and one to compute a further 7 | statistic such as variance and /n/th central moments. All algorithms are the 8 | obvious implementation of Bryan O\'Sullivan\'s 9 | <https://hackage.haskell.org/package/statistics statistics> package implemented 10 | as `Fold's from the 11 | <https://hackage.haskell.org/package/foldl foldl> package. 12 | homepage: http://github.com/Data61/foldl-statistics#readme 13 | license: BSD3 14 | license-file: LICENSE 15 | author: Alex Mason 16 | maintainer: Alex.Mason@data61.csiro.au 17 | copyright: 2016 Data61 (CSIRO) 18 | category: Math, Statistics 19 | build-type: Simple 20 | extra-source-files: CHANGELOG.md, README.md 21 | cabal-version: >=1.10 22 | tested-with: GHC == 7.8.4, 23 | GHC == 7.10.3, 24 | GHC == 8.0.2, 25 | GHC == 8.2.2, 26 | GHC == 8.4.3, 27 | GHC == 8.6.1, 28 | GHC == 9.0.2, 29 | GHC == 9.2.8, 30 | GHC == 9.4.8, 31 | GHC == 9.6.4, 32 | GHC == 9.8.2 33 | 34 | library 35 | hs-source-dirs: src 36 | exposed-modules: Control.Foldl.Statistics 37 | default-language: Haskell2010 38 | build-depends: base >= 4.7 && < 5 39 | , foldl >= 1.1 && < 1.5 40 | , math-functions >= 0.1 && < 0.4 41 | , profunctors >= 5.2 && < 5.7 42 | , containers >= 0.1.0.0 && < 0.8 43 | , unordered-containers >= 0.1.0.0 && < 0.3 44 | , hashable >=1.0.1.1 && < 1.5 45 | if impl(ghc < 8.0) 46 | build-depends: semigroups >= 0.18 && < 1.0 47 | 48 | 49 | test-suite foldl-statistics-test 50 | type: exitcode-stdio-1.0 51 | hs-source-dirs: test 52 | main-is: Spec.hs 53 | ghc-options: -threaded -rtsopts -with-rtsopts=-N 54 | default-language: Haskell2010 55 | build-depends: base 56 | , foldl-statistics 57 | , foldl 58 | , statistics >= 0.13 && < 0.17 59 | , tasty >= 0.11 && < 1.6 60 | , tasty-quickcheck >= 0.8 && < 0.11 61 | , vector >= 0.11 && < 0.14 62 | , quickcheck-instances >= 0.3 && < 0.4 63 | , profunctors 64 | if impl(ghc < 8.0) 65 | build-depends: semigroups 66 | 67 | Benchmark bench-folds 68 | type: exitcode-stdio-1.0 69 | hs-source-dirs: bench 70 | main-is: Main.hs 71 | default-language: Haskell2010 72 | build-depends: base 73 | , foldl-statistics 74 | , foldl 75 | , statistics >= 0.13 && < 0.17 76 | , criterion >= 1.1 && < 1.7 77 | , vector >= 0.10 && < 1.0 78 | , mwc-random >= 0.13 && < 0.16 79 | if impl(ghc < 8.0) 80 | build-depends: semigroups 81 | 82 | source-repository head 83 | type: git 84 | location: https://github.com/Data61/foldl-statistics 85 | -------------------------------------------------------------------------------- /src/Control/Foldl/Statistics.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 |
3 | -- | 4 | -- Module : Control.Foldl.Statistics 5 | -- Copyright : (c) 2011 Bryan O'Sullivan, 2016 National ICT Australia, 2018 CSIRO 6 | -- License : BSD3 7 | -- 8 | -- Maintainer : Alex.Mason@data61.csiro.au 9 | -- Stability : experimental 10 | -- Portability : portable 11 | -- 12 | 13 | module Control.Foldl.Statistics ( 14 | -- * Introduction 15 | -- $intro 16 | -- * Descriptive functions 17 | range 18 | , sum' 19 | , histogram 20 | , histogram' 21 | , ordersOfMagnitude 22 | 23 | -- * Statistics of location 24 | , mean 25 | , welfordMean 26 | , meanWeighted 27 | , harmonicMean 28 | , geometricMean 29 | 30 | -- * Statistics of dispersion 31 | -- $variance 32 | 33 | -- ** Functions over central moments 34 | , centralMoment 35 | , centralMoments 36 | , centralMoments' 37 | , skewness 38 | , kurtosis 39 | 40 | -- ** Functions requiring the mean to be known (numerically robust) 41 | -- $robust 42 | , variance 43 | , varianceUnbiased 44 | , stdDev 45 | , varianceWeighted 46 | 47 | -- ** Single-pass functions (faster, less safe) 48 | -- $cancellation 49 | , fastVariance 50 | , fastVarianceUnbiased 51 | , fastStdDev 52 | , fastLMVSK 53 | , fastLMVSKu 54 | , LMVSK(..) 55 | , LMVSKState 56 | , foldLMVSKState 57 | , getLMVSK 58 | , getLMVSKu 59 | 60 | -- ** Linear Regression 61 | , fastLinearReg 62 | , foldLinRegState 63 | , getLinRegResult 64 | , LinRegResult(..) 65 | , LinRegState 66 | , lrrCount 67 | , correlation 68 | 69 | -- * References 70 | -- $references 71 | , module Control.Foldl 72 | 73 | ) where 74 | 75 | #if MIN_VERSION_foldl(1,2,2) 76 | import Control.Foldl as F hiding (mean, variance) 77 | #else 78 | import Control.Foldl as F 79 | #endif 80 | 81 | import qualified Control.Foldl 82 | import Data.Profunctor 83 | import Data.Semigroup 84 | 85 | #if !MIN_VERSION_base(4,8,0) 86 | import Control.Applicative 87 | #endif 88 | 89 | import Data.Hashable (Hashable) 90 | import qualified Data.HashMap.Strict as Hash 91 | import qualified Data.Map.Strict as Map 92 | 93 | import Numeric.Sum (KBNSum, add, kbn, zero) 94 | 95 | data T = T {-# UNPACK #-}!Double {-# UNPACK #-}!Int 96 | data TS = TS {-# UNPACK #-}!KBNSum {-# UNPACK #-}!Int 97 | data T1 = T1 {-# UNPACK #-}!Int {-# UNPACK #-}!Double {-# UNPACK #-}!Double 98 | data V = V {-# UNPACK #-}!Double {-# UNPACK #-}!Double 99 | data V1 = V1 {-# UNPACK #-}!Double {-# UNPACK #-}!Double {-# UNPACK #-}!Int 100 | data V1S = V1S {-# UNPACK #-}!KBNSum {-# UNPACK #-}!KBNSum {-# UNPACK #-}!Int 101 | 102 | 103 | -- $intro 104 | -- Statistical functions from the 105 | -- <https://hackage.haskell.org/package/statistics/docs/Statistics-Sample.html Statistics.Sample> 106 | -- module of the 107 | -- <https://hackage.haskell.org/package/statistics statistics> package by 108 | -- Bryan O'Sullivan, implemented as `Control.Foldl.Fold's from the 109 | -- <https://hackage.haskell.org/package/foldl foldl> package. 110 | -- 111 | -- This allows many statistics to be computed concurrently with at most 112 | -- two passes over the data, usually by computing the `mean' first, and 113 | -- passing it to further `Fold's. 114 | 115 | -- | A numerically stable sum using Kahan-Babuška-Neumaier 116 | -- summation from "Numeric.Sum" 117 | {-# INLINE sum' #-} 118 | sum' :: Fold Double Double 119 | sum' = Fold (add :: KBNSum -> Double -> KBNSum) 120 | (zero :: KBNSum) 121 | kbn 122 | 123 | 124 | -- | The difference between the largest and smallest 125 | -- elements of a sample. 126 | {-# INLINE range #-} 127 | range :: Fold Double Double 128 | range = (\(Just lo) (Just hi) -> hi - lo) 129 | <$> F.minimum 130 | <*> F.maximum 131 | 132 | -- | Create a histogram of each value of type a.
Useful for folding over 133 | -- categorical values, for example, a CSV where you have a data type for a 134 | -- selection of categories. 135 | -- 136 | -- It should not be used for continuous values which would lead to a high number 137 | -- of keys. One way to avoid this is to use the `Profunctor` instance for `Fold` 138 | -- to break your values into categories. For an example of doing this, see 139 | -- `ordersOfMagnitude`. 140 | histogram :: Ord a => Fold a (Map.Map a Int) 141 | histogram = Fold step Map.empty id where 142 | step m a = Map.insertWith (+) a 1 m 143 | 144 | -- | Like `histogram`, but for use when hashmaps would be more efficient for the 145 | -- particular type @a@. 146 | histogram' :: (Hashable a, Eq a) => Fold a (Hash.HashMap a Int) 147 | histogram' = Fold step Hash.empty id where 148 | step m a = Hash.insertWith (+) a 1 m 149 | 150 | -- | Provides a histogram of the orders of magnitude of the values in a series. 151 | -- Negative values are placed in the @0.0@ category due to the behaviour of 152 | -- `logBase`. It may be useful to use @lmap abs@ on this Fold to get a histogram 153 | -- of the absolute magnitudes. 154 | 155 | -- TODO: logBase 10 1000000 /= 6 but 5, fix this 156 | ordersOfMagnitude :: Fold Double (Map.Map Double Int) 157 | ordersOfMagnitude = 158 | dimap 159 | ((floor :: Double -> Int) . logBase 10) 160 | (Map.mapKeysMonotonic (10^^)) 161 | histogram 162 | 163 | -- | Arithmetic mean. This uses Kahan-Babuška-Neumaier 164 | -- summation, so is more accurate than 'welfordMean' unless the input 165 | -- values are very large. 166 | -- 167 | -- Since foldl-1.2.2, 'Control.Foldl' exports a `mean` function, so you will 168 | -- have to hide one. 169 | {-# INLINE mean #-} 170 | mean :: Fold Double Double 171 | mean = Fold step (TS zero 0) final where 172 | step (TS s n) x = TS (add s x) (n+1) 173 | final (TS s n) = kbn s / fromIntegral n 174 | 175 | 176 | -- | Arithmetic mean. This uses Welford's algorithm to provide 177 | -- numerical stability, using a single pass over the sample data. 178 | -- 179 | -- Compared to 'mean', this loses a surprising amount of precision 180 | -- unless the inputs are very large. 181 | {-# INLINE welfordMean #-} 182 | welfordMean :: Fold Double Double 183 | welfordMean = Fold step (T 0 0) final where 184 | final (T m _) = m 185 | step (T m n) x = T m' n' where 186 | m' = m + (x - m) / fromIntegral n' 187 | n' = n + 1 188 | 189 | 190 | -- | Arithmetic mean for weighted sample. It uses a single-pass 191 | -- algorithm analogous to the one used by 'welfordMean'. 192 | {-# INLINE meanWeighted #-} 193 | meanWeighted :: Fold (Double,Double) Double 194 | meanWeighted = Fold step (V 0 0) final 195 | where 196 | final (V a _) = a 197 | step (V m w) (x,xw) = V m' w' 198 | where m' | w' == 0 = 0 199 | | otherwise = m + xw * (x - m) / w' 200 | w' = w + xw 201 | 202 | -- | Harmonic mean. 203 | {-# INLINE harmonicMean #-} 204 | harmonicMean :: Fold Double Double 205 | harmonicMean = Fold step (T 0 0) final 206 | where 207 | final (T b a) = fromIntegral a / b 208 | step (T x y) n = T (x + (1/n)) (y+1) 209 | 210 | -- | Geometric mean of a sample containing no negative values. 211 | {-# INLINE geometricMean #-} 212 | geometricMean :: Fold Double Double 213 | geometricMean = dimap log exp mean 214 | 215 | -- | Compute the /k/th central moment of a sample. The central moment 216 | -- is also known as the moment about the mean. 217 | -- 218 | -- This function requires the mean of the data to compute the central moment.
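--
-- A typical two-pass use, with @xs@ standing for any 'Foldable' of 'Double's
-- (a sketch):
--
-- > let m = F.fold mean xs
-- > in F.fold (centralMoment 4 m) xs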
219 | -- 220 | -- For samples containing many values very close to the mean, this 221 | -- function is subject to inaccuracy due to catastrophic cancellation. 222 | {-# INLINE centralMoment #-} 223 | centralMoment :: Int -> Double -> Fold Double Double 224 | centralMoment a m 225 | | a < 0 = error "Statistics.Sample.centralMoment: negative input" 226 | | a == 0 = 1 227 | | a == 1 = 0 228 | | otherwise = Fold step (TS zero 0) final where 229 | step (TS s n) x = TS (add s $ go x) (n+1) 230 | final (TS s n) = kbn s / fromIntegral n 231 | go x = (x - m) ^^^ a 232 | 233 | -- | Compute the /k/th and /j/th central moments of a sample. 234 | -- 235 | -- This fold requires the mean of the data to be known. 236 | -- 237 | -- For samples containing many values very close to the mean, this 238 | -- function is subject to inaccuracy due to catastrophic cancellation. 239 | {-# INLINE centralMoments #-} 240 | centralMoments :: Int -> Int -> Double -> Fold Double (Double, Double) 241 | centralMoments a b m 242 | | a < 2 || b < 2 = (,) <$> centralMoment a m <*> centralMoment b m 243 | | otherwise = Fold step (V1 0 0 0) final 244 | where final (V1 i j n) = (i / fromIntegral n , j / fromIntegral n) 245 | step (V1 i j n) x = V1 (i + d^^^a) (j + d^^^b) (n+1) 246 | where d = x - m 247 | 248 | 249 | -- | Compute the /k/th and /j/th central moments of a sample. 250 | -- 251 | -- This fold requires the mean of the data to be known. 252 | -- 253 | -- This variation of `centralMoments' uses Kahan-Babuška-Neumaier 254 | -- summation to attempt to improve the accuracy of results, which may 255 | -- make computation slower. 256 | {-# INLINE centralMoments' #-} 257 | centralMoments' :: Int -> Int -> Double -> Fold Double (Double, Double) 258 | centralMoments' a b m 259 | | a < 2 || b < 2 = (,) <$> centralMoment a m <*> centralMoment b m 260 | | otherwise = Fold step (V1S zero zero 0) final 261 | where final (V1S i j n) = (kbn i / fromIntegral n , kbn j / fromIntegral n) 262 | step (V1S i j n) x = V1S (add i $ d^^^a) (add j $ d^^^b) (n+1) 263 | where d = x - m 264 | 265 | -- | Compute the skewness of a sample. This is a measure of the 266 | -- asymmetry of its distribution. 267 | -- 268 | -- A sample with negative skew is said to be /left-skewed/. Most of 269 | -- its mass is on the right of the distribution, with the tail on the 270 | -- left. 271 | -- 272 | -- > let xs = [1,100,101,102,103] in F.fold (skewness (F.fold mean xs)) xs 273 | -- > ==> -1.497681449918257 274 | -- 275 | -- A sample with positive skew is said to be /right-skewed/. 276 | -- 277 | -- > let xs = [1,2,3,4,100] in F.fold (skewness (F.fold mean xs)) xs 278 | -- > ==> 1.4975367033335198 279 | -- 280 | -- A sample's skewness is not defined if its 'variance' is zero. 281 | -- 282 | -- This fold requires the mean of the data to be known. 283 | -- 284 | -- For samples containing many values very close to the mean, this 285 | -- function is subject to inaccuracy due to catastrophic cancellation. 286 | {-# INLINE skewness #-} 287 | skewness :: Double -> Fold Double Double 288 | skewness m = (\(c3, c2) -> c3 * c2 ** (-1.5)) <$> centralMoments 3 2 m 289 | 290 | 291 | -- | Compute the excess kurtosis of a sample. This is a measure of 292 | -- the \"peakedness\" of its distribution. A high kurtosis indicates 293 | -- that more of the sample's variance is due to infrequent severe 294 | -- deviations, rather than more frequent modest deviations. 295 | -- 296 | -- A sample's excess kurtosis is not defined if its 'variance' is 297 | -- zero. 298 | -- 299 | -- This fold requires the mean of the data to be known.
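--
-- Since 'Fold's are 'Applicative', the skewness and kurtosis can share the
-- mean-computing pass, e.g. (a sketch, @xs@ being the sample):
--
-- > let m = F.fold mean xs
-- > in F.fold ((,) <$> skewness m <*> kurtosis m) xs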
300 | -- 301 | -- For samples containing many values very close to the mean, this 302 | -- function is subject to inaccuracy due to catastrophic cancellation. 303 | {-# INLINE kurtosis #-} 304 | kurtosis :: Double -> Fold Double Double 305 | kurtosis m = (\(c4,c2) -> c4 / (c2 * c2) - 3) <$> centralMoments 4 2 m 306 | 307 | 308 | -- $variance 309 | -- 310 | -- The variance—and hence the standard deviation—of a 311 | -- sample of fewer than two elements are both defined to be zero. 312 | -- 313 | -- Many of these Folds take the mean as an argument for constructing 314 | -- the variance, and as such require two passes over the data. 315 | 316 | -- $robust 317 | -- 318 | -- These functions use the compensated summation algorithm of Chan et 319 | -- al. for numerical robustness, but require two passes over the 320 | -- sample data as a result. 321 | 322 | 323 | -- Multiply a number by itself. 324 | {-# INLINE square #-} 325 | square :: Double -> Double 326 | square x = x * x 327 | 328 | {-# INLINE robustSumVar #-} 329 | robustSumVar :: Double -> Fold Double TS 330 | robustSumVar m = Fold step (TS zero 0) id where 331 | step (TS s n) x = TS (add s . square . subtract m $ x) (n+1) 332 | 333 | -- | Maximum likelihood estimate of a sample's variance. Also known 334 | -- as the population variance, where the denominator is /n/. 335 | {-# INLINE variance #-} 336 | variance :: Double -> Fold Double Double 337 | variance m = 338 | (\(TS sv n) -> if n > 1 then kbn sv / fromIntegral n else 0) 339 | <$> robustSumVar m 340 | 341 | -- | Unbiased estimate of a sample's variance. Also known as the 342 | -- sample variance, where the denominator is /n/-1. 343 | {-# INLINE varianceUnbiased #-} 344 | varianceUnbiased :: Double -> Fold Double Double 345 | varianceUnbiased m = 346 | (\(TS sv n) -> if n > 1 then kbn sv / fromIntegral (n-1) else 0) 347 | <$> robustSumVar m 348 | 349 | 350 | -- | Standard deviation. This is simply the square root of the 351 | -- unbiased estimate of the variance. 352 | {-# INLINE stdDev #-} 353 | stdDev :: Double -> Fold Double Double 354 | stdDev m = sqrt (varianceUnbiased m) 355 | 356 | 357 | {-# INLINE robustSumVarWeighted #-} 358 | robustSumVarWeighted :: Double -> Fold (Double,Double) V1 359 | robustSumVarWeighted m = Fold step (V1 0 0 0) id 360 | where 361 | step (V1 s w n) (x,xw) = V1 (s + xw*d*d) (w + xw) (n+1) 362 | where d = x - m 363 | 364 | -- | Weighted variance. This is biased estimation. Requires the 365 | -- weighted mean of the input data. 366 | {-# INLINE varianceWeighted #-} 367 | varianceWeighted :: Double -> Fold (Double,Double) Double 368 | varianceWeighted m = 369 | (\(V1 s w n) -> if n > 1 then s / w else 0) 370 | <$> robustSumVarWeighted m 371 | 372 | -- $cancellation 373 | -- 374 | -- The functions prefixed with the name @fast@ below perform a single 375 | -- pass over the sample data using Knuth's algorithm. They usually 376 | -- work well, but see below for caveats. These functions are subject 377 | -- to fusion and do not require the mean to be passed. 378 | -- 379 | -- /Note/: in cases where most sample data is close to the sample's 380 | -- mean, Knuth's algorithm gives inaccurate results due to 381 | -- catastrophic cancellation. 382 | 383 | {-# INLINE fastVar #-} 384 | fastVar :: Fold Double T1 385 | fastVar = Fold step (T1 0 0 0) id 386 | where 387 | step (T1 n m s) x = T1 n' m' s' 388 | where n' = n + 1 389 | m' = m + d / fromIntegral n' 390 | s' = s + d * (x - m') 391 | d = x - m 392 | 393 | -- | Maximum likelihood estimate of a sample's variance. 
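--
-- For example:
--
-- > F.fold fastVariance [1,2,3,4,5]
-- > ==> 2.0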
394 | {-# INLINE fastVariance #-} 395 | fastVariance :: Fold Double Double 396 | fastVariance = final <$> fastVar 397 | where final (T1 n _m s) 398 | | n > 1 = s / fromIntegral n 399 | | otherwise = 0 400 | 401 | 402 | -- | Unbiased estimate of a sample's variance, where the denominator is /n/-1. 403 | {-# INLINE fastVarianceUnbiased #-} 404 | fastVarianceUnbiased :: Fold Double Double 405 | fastVarianceUnbiased = final <$> fastVar 406 | where final (T1 n _m s) 407 | | n > 1 = s / fromIntegral (n-1) 408 | | otherwise = 0 409 | 410 | 411 | -- | Standard deviation. This is simply the square root of the 412 | -- maximum likelihood estimate of the variance. 413 | {-# INLINE fastStdDev #-} 414 | fastStdDev :: Fold Double Double 415 | fastStdDev = sqrt fastVariance 416 | 417 | 418 | 419 | -- | When returned by `fastLMVSK`, contains the count, mean, 420 | -- variance, skewness and kurtosis of a series of samples. 421 | -- 422 | -- /Since: 0.1.1.0/ 423 | data LMVSK = LMVSK 424 | { lmvskCount :: {-# UNPACK #-}!Int 425 | , lmvskMean :: {-# UNPACK #-}!Double 426 | , lmvskVariance :: {-# UNPACK #-}!Double 427 | , lmvskSkewness :: {-# UNPACK #-}!Double 428 | , lmvskKurtosis :: {-# UNPACK #-}!Double 429 | } deriving (Show, Eq) 430 | 431 | newtype LMVSKState = LMVSKState LMVSK 432 | 433 | instance Monoid LMVSKState where 434 | {-# INLINE mempty #-} 435 | mempty = LMVSKState lmvsk0 436 | {-# INLINE mappend #-} 437 | mappend = (<>) 438 | 439 | instance Semigroup LMVSKState where 440 | {-# INLINE (<>) #-} 441 | (LMVSKState (LMVSK an am1 am2 am3 am4)) <> (LMVSKState (LMVSK bn bm1 bm2 bm3 bm4)) 442 | = LMVSKState (LMVSK n m1 m2 m3 m4) where 443 | fi :: Int -> Double 444 | fi = fromIntegral 445 | -- combined.n = a.n + b.n; 446 | n = an+bn 447 | n2 = n*n 448 | nd = fi n 449 | nda = fi an 450 | ndb = fi bn 451 | -- delta = b.M1 - a.M1; 452 | delta = bm1 - am1 453 | -- delta2 = delta*delta; 454 | delta2 = delta*delta 455 | -- delta3 = delta*delta2; 456 | delta3 = delta*delta2 457 | -- delta4 = delta2*delta2; 458 | delta4 = delta2*delta2 459 | -- combined.M1 = (a.n*a.M1 + b.n*b.M1) / combined.n; 460 | m1 = (nda*am1 + ndb*bm1 ) / nd 461 | -- combined.M2 = a.M2 + b.M2 + delta2*a.n*b.n / combined.n; 462 | m2 = am2 + bm2 + delta2*nda*ndb / nd 463 | -- combined.M3 = a.M3 + b.M3 + delta3*a.n*b.n* (a.n - b.n)/(combined.n*combined.n); 464 | m3 = am3 + bm3 + delta3*nda*ndb* fi( an - bn )/ fi n2 465 | -- combined.M3 += 3.0*delta * (a.n*b.M2 - b.n*a.M2) / combined.n; 466 | + 3.0*delta * (nda*bm2 - ndb*am2 ) / nd 467 | -- 468 | -- combined.M4 = a.M4 + b.M4 + delta4*a.n*b.n * (a.n*a.n - a.n*b.n + b.n*b.n) /(combined.n*combined.n*combined.n); 469 | m4 = am4 + bm4 + delta4*nda*ndb *fi(an*an - an*bn + bn*bn ) / fi (n*n*n) 470 | -- combined.M4 += 6.0*delta2 * (a.n*a.n*b.M2 + b.n*b.n*a.M2)/(combined.n*combined.n) + 471 | + 6.0*delta2 * (nda*nda*bm2 + ndb*ndb*am2) / fi n2 472 | -- 4.0*delta*(a.n*b.M3 - b.n*a.M3) / combined.n; 473 | + 4.0*delta*(nda*bm3 - ndb*am3) / nd 474 | 475 | -- | Efficiently compute the 476 | -- __length, mean, variance, skewness and kurtosis__ with a single pass. 477 | -- 478 | -- /Since: 0.1.1.0/ 479 | {-# INLINE fastLMVSK #-} 480 | fastLMVSK :: Fold Double LMVSK 481 | fastLMVSK = getLMVSK <$> foldLMVSKState 482 | 483 | -- | Efficiently compute the 484 | -- __length, mean, unbiased variance, skewness and kurtosis__ with a single pass.
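--
-- For example, @F.fold fastLMVSKu [1,2,3,4,5]@ yields a count of 5, a mean
-- of 3 and an unbiased variance of 2.5 (a quick sketch; the skewness and
-- kurtosis are computed in the same pass).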
485 | -- 486 | -- /Since: 0.1.3.0/ 487 | {-# INLINE fastLMVSKu #-} 488 | fastLMVSKu :: Fold Double LMVSK 489 | fastLMVSKu = getLMVSKu <$> foldLMVSKState 490 | 491 | {-# INLINE lmvsk0 #-} 492 | lmvsk0 :: LMVSK 493 | lmvsk0 = LMVSK 0 0 0 0 0 494 | 495 | -- | Performs the heavy lifting of fastLMVSK. This is exposed 496 | -- because the internal `LMVSKState` is monoidal, allowing you 497 | -- to run these statistics in parallel over datasets which are 498 | -- split and then combine the results. 499 | -- 500 | -- /Since: 0.1.2.0/ 501 | {-# INLINE foldLMVSKState #-} 502 | foldLMVSKState :: Fold Double LMVSKState 503 | foldLMVSKState = Fold stepLMVSKState (LMVSKState lmvsk0) id 504 | 505 | {-# INLINE stepLMVSKState #-} 506 | stepLMVSKState :: LMVSKState -> Double -> LMVSKState 507 | stepLMVSKState (LMVSKState (LMVSK n1 m1 m2 m3 m4)) x = LMVSKState $ LMVSK n m1' m2' m3' m4' where 508 | fi :: Int -> Double 509 | fi = fromIntegral 510 | -- long long n1 = n; 511 | -- n++; 512 | n = n1+1 513 | -- delta = x - M1; 514 | delta = x - m1 515 | -- delta_n = delta / n; 516 | delta_n = delta / fi n 517 | -- delta_n2 = delta_n * delta_n; 518 | delta_n2 = delta_n * delta_n 519 | -- term1 = delta * delta_n * n1; 520 | term1 = delta * delta_n * fi n1 521 | -- M1 += delta_n; 522 | m1' = m1 + delta_n 523 | -- M4 += term1 * delta_n2 * (n*n - 3*n + 3) + 6 * delta_n2 * M2 - 4 * delta_n * M3; 524 | m4' = m4 + term1 * delta_n2 * fi (n*n - 3*n + 3) + 6 * delta_n2 * m2 - 4 * delta_n * m3 525 | -- M3 += term1 * delta_n * (n - 2) - 3 * delta_n * M2; 526 | m3' = m3 + term1 * delta_n * fi (n - 2) - 3 * delta_n * m2 527 | -- M2 += term1; 528 | m2' = m2 + term1 529 | 530 | -- | Returns the stats which have been computed in a LMVSKState. 531 | -- 532 | -- /Since: 0.1.2.0/ 533 | getLMVSK :: LMVSKState -> LMVSK 534 | getLMVSK (LMVSKState (LMVSK n m1 m2 m3 m4)) = LMVSK n m1 m2' m3' m4' where 535 | nd = fromIntegral n 536 | -- M2/n (biased variance) 537 | m2' = m2 / nd 538 | -- sqrt(double(n)) * M3/ pow(M2, 1.5) 539 | m3' = sqrt nd * m3 / (m2 ** 1.5) 540 | -- double(n)*M4 / (M2*M2) - 3.0 541 | m4' = nd*m4 / (m2*m2) - 3.0 542 | 543 | -- | Returns the stats which have been computed in a LMVSKState, 544 | -- with the unbiased variance. 545 | -- 546 | -- /Since: 0.1.3.0/ 547 | getLMVSKu :: LMVSKState -> LMVSK 548 | getLMVSKu (LMVSKState (LMVSK n m1 m2 m3 m4)) = LMVSK n m1 m2' m3' m4' where 549 | nd = fromIntegral n 550 | -- M2/(n-1.0) 551 | m2' = m2 / (nd-1) 552 | -- sqrt(double(n)) * M3/ pow(M2, 1.5) 553 | m3' = sqrt nd * m3 / (m2 ** 1.5) 554 | -- double(n)*M4 / (M2*M2) - 3.0 555 | m4' = nd*m4 / (m2*m2) - 3.0 556 | 557 | 558 | -- | When returned by `fastLinearReg`, contains the count, 559 | -- slope, intercept and correlation of combining @(x,y)@ pairs. 560 | -- 561 | -- /Since: 0.1.1.0/ 562 | data LinRegResult = LinRegResult 563 | {lrrSlope :: {-# UNPACK #-}!Double 564 | ,lrrIntercept :: {-# UNPACK #-}!Double 565 | ,lrrCorrelation :: {-# UNPACK #-}!Double 566 | ,lrrXStats :: {-# UNPACK #-}!LMVSK 567 | ,lrrYStats :: {-# UNPACK #-}!LMVSK 568 | } deriving (Show, Eq) 569 | 570 | -- | The number of elements which make up this 'LinRegResult'. 571 | -- /Since: 0.1.4.1/ 572 | lrrCount :: LinRegResult -> Int 573 | lrrCount = lmvskCount . lrrXStats 574 | 575 | -- | The Monoidal state used to compute linear regression, see `fastLinearReg`.
576 | -- 577 | -- /Since: 0.1.4.0/ 578 | data LinRegState = LinRegState 579 | {-# UNPACK #-}!LMVSKState 580 | {-# UNPACK #-}!LMVSKState 581 | {-# UNPACK #-}!Double 582 | 583 | 584 | {- 585 | RunningRegression operator+(const RunningRegression a, const RunningRegression b) 586 | { 587 | RunningRegression combined; 588 | 589 | combined.x_stats = a.x_stats + b.x_stats; 590 | combined.y_stats = a.y_stats + b.y_stats; 591 | combined.n = a.n + b.n; 592 | 593 | double delta_x = b.x_stats.Mean() - a.x_stats.Mean(); 594 | double delta_y = b.y_stats.Mean() - a.y_stats.Mean(); 595 | combined.S_xy = a.S_xy + b.S_xy + 596 | double(a.n*b.n)*delta_x*delta_y/double(combined.n); 597 | 598 | return combined; 599 | } 600 | -} 601 | instance Semigroup LinRegState where 602 | {-# INLINE (<>) #-} 603 | (LinRegState ax@(LMVSKState ax') ay a_xy) 604 | <> (LinRegState bx@(LMVSKState bx') by b_xy) 605 | = LinRegState x y s_xy where 606 | an = lmvskCount ax' 607 | bn = lmvskCount bx' 608 | x = ax <> bx 609 | y = ay <> by 610 | delta_x = lmvskMean (getLMVSK bx) - lmvskMean (getLMVSK ax) 611 | delta_y = lmvskMean (getLMVSK by) - lmvskMean (getLMVSK ay) 612 | s_xy = a_xy+b_xy + fromIntegral (an*bn) * delta_x * delta_y/fromIntegral (an+bn) 613 | 614 | 615 | instance Monoid LinRegState where 616 | {-# INLINE mempty #-} 617 | mempty = LinRegState mempty mempty 0 618 | {-# INLINE mappend #-} 619 | mappend = (<>) 620 | 621 | 622 | 623 | -- | Computes the __slope, (Y) intercept and correlation__ of @(x,y)@ 624 | -- pairs, as well as the `LMVSK` stats for both the x and y series. 625 | -- 626 | -- >>> F.fold fastLinearReg $ map (\x -> (x,3*x+7)) [1..100] 627 | -- LinRegResult 628 | -- {lrrSlope = 3.0 629 | -- , lrrIntercept = 7.0 630 | -- , lrrCorrelation = 1.0 631 | -- , lrrXStats = LMVSK 632 | -- {lmvskCount = 100 633 | -- , lmvskMean = 50.5 634 | -- , lmvskVariance = 833.25 635 | -- , lmvskSkewness = 0.0 636 | -- , lmvskKurtosis = -1.2002400240024003} 637 | -- , lrrYStats = LMVSK 638 | -- {lmvskCount = 100 639 | -- , lmvskMean = 158.5 640 | -- , lmvskVariance = 7499.25 641 | -- , lmvskSkewness = 0.0 642 | -- , lmvskKurtosis = -1.2002400240024003} 643 | -- } 644 | -- 645 | -- >>> F.fold fastLinearReg $ map (\x -> (x,0.005*x*x+3*x+7)) [1..100] 646 | -- LinRegResult 647 | -- {lrrSlope = 3.5049999999999994 648 | -- , lrrIntercept = -1.5849999999999795 649 | -- , lrrCorrelation = 0.9993226275740273 650 | -- , lrrXStats = LMVSK 651 | -- {lmvskCount = 100 652 | -- , lmvskMean = 50.5 653 | -- , lmvskVariance = 833.25 654 | -- , lmvskSkewness = 0.0 655 | -- , lmvskKurtosis = -1.2002400240024003} 656 | -- , lrrYStats = LMVSK 657 | -- {lmvskCount = 100 658 | -- , lmvskMean = 175.4175 659 | -- , lmvskVariance = 10250.37902625 660 | -- , lmvskSkewness = 9.862971188165422e-2 661 | -- , lmvskKurtosis = -1.1923628437011482} 662 | -- } 663 | -- 664 | -- /Since: 0.1.1.0/ 665 | {-# INLINE fastLinearReg #-} 666 | fastLinearReg :: Fold (Double,Double) LinRegResult 667 | fastLinearReg = getLinRegResult <$> foldLinRegState 668 | 669 | -- | Produces the slope, Y intercept, correlation and LMVSK stats from a 670 | -- `LinRegState`.
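--
-- Because `LinRegState` is a `Monoid`, states folded over separate chunks of
-- data can be combined before conversion, e.g. (a sketch, with @as@ and @bs@
-- two lists of @(x,y)@ pairs):
--
-- > getLinRegResult (F.fold foldLinRegState as <> F.fold foldLinRegState bs)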
671 | --
672 | -- /Since: 0.1.4.0/
673 | {-# INLINE getLinRegResult #-}
674 | getLinRegResult :: LinRegState -> LinRegResult
675 | getLinRegResult (LinRegState vx@(LMVSKState vx') vy s_xy) = LinRegResult slope intercept correl statsx statsy where
676 |   n = lmvskCount vx'
677 |   ndm1 = fromIntegral (n-1)
678 |   -- slope = S_xy / (x_stats.Variance()*(n - 1.0));
679 |   -- in an `LMVSKState`, 'lmvskVariance' has not yet been divided
680 |   -- by (n-1), so that division is not necessary here
681 |   slope = s_xy / lmvskVariance vx'
682 |   intercept = yMean - slope*xMean
683 |   t = sqrt xVar * sqrt yVar -- stddev x * stddev y
684 |   correl = s_xy / (ndm1 * t)
685 |   -- Need the unbiased variance here, or the correlation may fall outside ±1
686 |   statsx@(LMVSK _ xMean xVar _ _) = getLMVSKu vx
687 |   statsy@(LMVSK _ yMean yVar _ _) = getLMVSKu vy
688 | 
689 | 
690 | -- | Performs the heavy lifting for `fastLinearReg`. Exposed because `LinRegState`
691 | -- is a `Monoid`, allowing statistics to be computed on datasets in parallel
692 | -- and combined afterwards.
693 | --
694 | -- /Since: 0.1.4.0/
695 | {-# INLINE foldLinRegState #-}
696 | foldLinRegState :: Fold (Double,Double) LinRegState
697 | foldLinRegState = Fold step (LinRegState (LMVSKState lmvsk0) (LMVSKState lmvsk0) 0) id where
698 |   step (LinRegState vx@(LMVSKState vx') vy s_xy) (x,y) = LinRegState vx2 vy2 s_xy' where
699 |     n = lmvskCount vx'
700 |     nd = fromIntegral n
701 |     nd1 = fromIntegral (n+1)
702 |     s_xy' = s_xy + (xMean - x)*(yMean - y)*nd/nd1
703 |     xMean = lmvskMean (getLMVSK vx)
704 |     yMean = lmvskMean (getLMVSK vy)
705 |     vx2 = stepLMVSKState vx x
706 |     vy2 = stepLMVSKState vy y
707 | 
708 | 
709 | -- | Computes the correlation between the @x@ and @y@ series. The first
710 | -- argument is the pair of means and the second the pair of standard
711 | -- deviations of the two series. The result may be more accurate than
712 | -- the correlation computed by `fastLinearReg`.
713 | correlation :: (Double, Double) -> (Double, Double) -> Fold (Double,Double) Double
714 | correlation (m1,m2) (s1,s2) = Fold step (TS zero 0) final where
715 |   step (TS s n) (x1,x2) = TS (add s $ ((x1 - m1)/s1) * ((x2 - m2)/s2)) (n+1)
716 |   final (TS s n) = kbn s / fromIntegral (n-1)
717 | 
718 | 
719 | -- $references
720 | --
721 | -- * Chan, T. F.; Golub, G.H.; LeVeque, R.J. (1979) Updating formulae
722 | --   and a pairwise algorithm for computing sample
723 | --   variances. Technical Report STAN-CS-79-773, Department of
724 | --   Computer Science, Stanford
725 | --   University.
726 | --
727 | -- * Knuth, D.E. (1998) The art of computer programming, volume 2:
728 | --   seminumerical algorithms, 3rd ed., p. 232.
729 | --
730 | -- * Welford, B.P. (1962) Note on a method for calculating corrected
731 | --   sums of squares and products. /Technometrics/
732 | --   4(3):419–420.
733 | --
734 | -- * West, D.H.D. (1979) Updating mean and variance estimates: an
735 | --   improved method. /Communications of the ACM/
736 | --   22(9):532–535.
737 | --
738 | -- * John D. Cook. Computing skewness and kurtosis in one pass.
739 | --   <https://www.johndcook.com/blog/skewness_kurtosis/>
740 | 
741 | 
742 | 
743 | -- The (^) operator from Prelude is slow; these rules unroll small integer powers.
744 | (^^^) :: Double -> Int -> Double
745 | x ^^^ 1 = x
746 | x ^^^ n = x * (x ^^^ (n-1))
747 | {-# INLINE[2] (^^^) #-}
748 | {-# RULES
749 | "pow 2" forall x. x ^^^ 2 = x * x
750 | "pow 3" forall x. x ^^^ 3 = x * x * x
751 | "pow 4" forall x. x ^^^ 4 = x * x * x * x
752 | "pow 5" forall x. x ^^^ 5 = x * x * x * x * x
753 | "pow 6" forall x. x ^^^ 6 = x * x * x * x * x * x
754 | "pow 7" forall x. x ^^^ 7 = x * x * x * x * x * x * x
755 | "pow 8" forall x.
x ^^^ 8 = x * x * x * x * x * x * x * x 756 | "pow 9" forall x. x ^^^ 9 = x * x * x * x * x * x * x * x * x 757 | "pow 10" forall x. x ^^^ 10 = x * x * x * x * x * x * x * x * x * x 758 | 759 | #-} 760 | -------------------------------------------------------------------------------- /stack-6.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | 3 | # Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2) 4 | resolver: lts-6.35 5 | 6 | # Local packages, usually specified by relative directory name 7 | packages: 8 | - '.' 9 | 10 | # Packages to be pulled from upstream that are not in the resolver (e.g., acme-missiles-0.3) 11 | extra-deps: [] 12 | 13 | # Override default flag values for local packages and extra-deps 14 | flags: {} 15 | 16 | # Extra package databases containing global packages 17 | extra-package-dbs: [] 18 | 19 | # Control whether we use the GHC we find on the path 20 | # system-ghc: true 21 | 22 | # Require a specific version of stack, using version ranges 23 | # require-stack-version: -any # Default 24 | # require-stack-version: >= 1.0.0 25 | 26 | # Override the architecture used by stack, especially useful on Windows 27 | # arch: i386 28 | # arch: x86_64 29 | 30 | # Extra directories used by stack for building 31 | # extra-include-dirs: [/path/to/dir] 32 | # extra-lib-dirs: [/path/to/dir] 33 | -------------------------------------------------------------------------------- /stack-7.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | 3 | # Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2) 4 | resolver: lts-7.0 5 | 6 | # Local packages, usually specified by relative directory name 7 | packages: 8 | - '.' 9 | 10 | # Packages to be pulled from upstream that are not in the resolver (e.g., acme-missiles-0.3) 11 | extra-deps: [] 12 | 13 | # Override default flag values for local packages and extra-deps 14 | flags: {} 15 | 16 | # Extra package databases containing global packages 17 | extra-package-dbs: [] 18 | 19 | # Control whether we use the GHC we find on the path 20 | # system-ghc: true 21 | 22 | # Require a specific version of stack, using version ranges 23 | # require-stack-version: -any # Default 24 | # require-stack-version: >= 1.0.0 25 | 26 | # Override the architecture used by stack, especially useful on Windows 27 | # arch: i386 28 | # arch: x86_64 29 | 30 | # Extra directories used by stack for building 31 | # extra-include-dirs: [/path/to/dir] 32 | # extra-lib-dirs: [/path/to/dir] 33 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | 3 | # Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2) 4 | resolver: lts-10.3 5 | 6 | # Local packages, usually specified by relative directory name 7 | packages: 8 | - '.' 
9 | 10 | # Packages to be pulled from upstream that are not in the resolver (e.g., acme-missiles-0.3) 11 | extra-deps: [] 12 | 13 | # Override default flag values for local packages and extra-deps 14 | flags: {} 15 | 16 | # Extra package databases containing global packages 17 | extra-package-dbs: [] 18 | 19 | # Control whether we use the GHC we find on the path 20 | # system-ghc: true 21 | 22 | # Require a specific version of stack, using version ranges 23 | # require-stack-version: -any # Default 24 | # require-stack-version: >= 1.0.0 25 | 26 | # Override the architecture used by stack, especially useful on Windows 27 | # arch: i386 28 | # arch: x86_64 29 | 30 | # Extra directories used by stack for building 31 | # extra-include-dirs: [/path/to/dir] 32 | # extra-lib-dirs: [/path/to/dir] 33 | -------------------------------------------------------------------------------- /test/Spec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | 3 | import Test.Tasty 4 | -- import Test.Tasty.SmallCheck as SC 5 | import Test.Tasty.QuickCheck ((==>)) 6 | import qualified Test.Tasty.QuickCheck as QC 7 | 8 | #if MIN_VERSION_foldl(1,2,2) 9 | import qualified Control.Foldl as F hiding (mean, variance) 10 | #else 11 | import qualified Control.Foldl as F 12 | #endif 13 | 14 | import Control.Foldl.Statistics hiding (length) 15 | 16 | import qualified Data.Vector.Unboxed as U 17 | import Test.QuickCheck.Instances () 18 | 19 | import qualified Statistics.Sample as S 20 | 21 | import Data.Profunctor 22 | 23 | import Data.Function (on) 24 | 25 | import Data.Semigroup ((<>)) 26 | #if !MIN_VERSION_base(4,8,0) 27 | import Control.Applicative 28 | import Data.Monoid (mappend) 29 | #endif 30 | 31 | 32 | 33 | toV :: [Double] -> U.Vector Double 34 | toV = U.fromList 35 | 36 | 37 | onVec :: String -> (U.Vector Double -> QC.Property) -> TestTree 38 | onVec str f = QC.testProperty str (f . toV) 39 | 40 | onVec2 :: String -> (U.Vector (Double,Double) -> QC.Property) -> TestTree 41 | onVec2 str f = QC.testProperty str (f . 
U.fromList) 42 | 43 | 44 | testLMVSK :: Double -> Fold Double LMVSK 45 | testLMVSK m = LMVSK 46 | <$> F.length 47 | <*> mean 48 | <*> variance m 49 | <*> skewness m 50 | <*> kurtosis m 51 | 52 | precision :: Double 53 | precision = 10e-9 54 | 55 | cmpLMVSK :: Double -> LMVSK -> LMVSK -> Bool 56 | cmpLMVSK prec a b = let 57 | t f = on (withinPCT prec) f a b 58 | in a == b || 59 | ( t lmvskMean 60 | && t lmvskVariance 61 | && t lmvskKurtosis 62 | && t lmvskSkewness 63 | && ((==) `on` lmvskCount) a b 64 | ) 65 | 66 | diffLMVSK :: LMVSK -> LMVSK -> LMVSK 67 | diffLMVSK a b = LMVSK 68 | (t lmvskCount) 69 | (t lmvskMean) 70 | (t lmvskVariance) 71 | (t lmvskSkewness) 72 | (t lmvskKurtosis) 73 | where t f = f a - f b 74 | 75 | 76 | main :: IO () 77 | main = defaultMain $ 78 | testGroup "Results match Statistics.Sample" 79 | [ testGroup "Without pre-computed mean" 80 | [ testGroup "Statistics of location" 81 | [ onVec "mean" $ \vec -> 82 | not (U.null vec) ==> F.fold mean (U.toList vec) == S.mean vec 83 | , onVec2 "meanWeighted" $ \vec -> 84 | not (U.null vec) ==> F.fold meanWeighted (U.toList vec) == S.meanWeighted vec 85 | , onVec "welfordMean" $ \vec -> 86 | not (U.null vec) ==> F.fold welfordMean (U.toList vec) == S.welfordMean vec 87 | , onVec "harmonicMean" $ \vec -> 88 | not (U.null vec) ==> F.fold harmonicMean (U.toList vec) == S.harmonicMean vec 89 | , onVec "geometricMean" $ \vec -> 90 | not (U.null vec) ==> 91 | let vec' = U.map abs vec 92 | res = S.geometricMean vec' 93 | in isNaN res || F.fold geometricMean (U.toList vec') == res 94 | ] 95 | 96 | , testGroup "Single-pass functions" $ 97 | [ onVec "fastVariance" $ \vec -> 98 | not (U.null vec) ==> F.fold fastVariance (U.toList vec) == S.fastVariance vec 99 | , onVec "fastVarianceUnbiased" $ \vec -> 100 | not (U.null vec) ==> F.fold fastVarianceUnbiased (U.toList vec) == S.fastVarianceUnbiased vec 101 | , onVec "fastStdDev" $ \vec -> 102 | not (U.null vec) ==> F.fold fastStdDev (U.toList vec) == S.fastStdDev vec 103 | , let 104 | -- TODO: Known failure when using 105 | -- --quickcheck-replay '39 TFGenR A6EB566E901D554AAA13826C088B8831192E813D893D082A85F8A27C86D569E0 0 65535 16 0' 106 | in onVec ("fastLMVSK within " ++ show precision ++ " %") $ \vec -> 107 | U.length vec > 3 && U.sum (U.map abs vec) > 0.0 ==> let 108 | m = F.fold mean $ U.toList vec 109 | fast = F.fold fastLMVSK $ U.toList vec 110 | reference = F.fold (testLMVSK m) $ U.toList vec 111 | in QC.counterexample (unlines ["",show fast,show reference, "Diff:", show (diffLMVSK fast reference)]) $ 112 | cmpLMVSK precision fast reference 113 | , QC.testProperty "LMVSKSemigroup" $ \v1 v2 -> 114 | U.length v1 > 2 && U.length v2 > 2 && U.sum (mappend v1 v1) /= U.product (mappend v1 v1) ==> let 115 | sep = getLMVSK $ F.fold foldLMVSKState (U.toList v1) <> F.fold foldLMVSKState (U.toList v2) 116 | tog = F.fold fastLMVSK (U.toList v1 ++ U.toList v2) 117 | in QC.counterexample (unlines ["",show sep,show tog, "Diff:", show (diffLMVSK sep tog)]) 118 | $ cmpLMVSK precision sep tog 119 | || isNaN (lmvskKurtosis sep) 120 | || isNaN (lmvskKurtosis tog) 121 | ] 122 | ] 123 | 124 | , testGroup "With pre-computed mean" 125 | [ testGroup "Functions requiring the mean to be known" 126 | [ onVec "variance" $ \vec -> 127 | not (U.null vec) ==> let m = F.fold mean (U.toList vec) 128 | in F.fold (variance m) (U.toList vec) == S.variance vec 129 | , onVec "varianceUnbiased" $ \vec -> 130 | not (U.null vec) ==> let m = F.fold mean (U.toList vec) 131 | in F.fold (varianceUnbiased m) (U.toList vec) == 
S.varianceUnbiased vec
132 |       , onVec "stdDev" $ \vec ->
133 |           not (U.null vec) ==> let m = F.fold mean (U.toList vec)
134 |                                in F.fold (stdDev m) (U.toList vec) == S.stdDev vec
135 |       , onVec2 "varianceWeighted" $ \vec ->
136 |           not (U.null vec) ==> let m = F.fold meanWeighted (U.toList vec)
137 |                                in F.fold (varianceWeighted m) (U.toList vec) == S.varianceWeighted vec
138 |       ]
139 | 
140 |     , testGroup "Functions over central moments"
141 |       [ onVec "skewness" $ \vec ->
142 |           U.length vec > 3 ==> let m = F.fold mean (U.toList vec)
143 |                                in F.fold (skewness m) (U.toList vec) == S.skewness vec
144 |       , onVec "kurtosis" $ \vec ->
145 |           U.length vec > 4 ==> let m = F.fold mean (U.toList vec)
146 |                                in F.fold (kurtosis m) (U.toList vec) == S.kurtosis vec
147 |       , onVec "centralMoment 2" $ \vec ->
148 |           U.length vec > 2 ==> let m = F.fold mean (U.toList vec)
149 |                                in F.fold (centralMoment 2 m) (U.toList vec) == S.centralMoment 2 vec
150 |       , onVec "centralMoment 3" $ \vec ->
151 |           U.length vec > 3 ==> let m = F.fold mean (U.toList vec)
152 |                                in F.fold (centralMoment 3 m) (U.toList vec) == S.centralMoment 3 vec
153 |       , onVec "centralMoment 4" $ \vec ->
154 |           U.length vec > 4 ==> let m = F.fold mean (U.toList vec)
155 |                                in F.fold (centralMoment 4 m) (U.toList vec) == S.centralMoment 4 vec
156 |       , onVec "centralMoment 7" $ \vec ->
157 |           U.length vec > 7 ==> let m = F.fold mean (U.toList vec)
158 |                                in F.fold (centralMoment 7 m) (U.toList vec) == S.centralMoment 7 vec
159 |       , onVec "centralMoments 4 9" $ \vec ->
160 |           U.length vec > 7 ==> let m = F.fold mean (U.toList vec)
161 |                                in F.fold (centralMoments 4 9 m) (U.toList vec) == S.centralMoments 4 9 vec
162 |       -- Cannot test this because we do not have an equivalent implementation
163 |       -- from the statistics package.
164 |       -- , onVec "centralMoments' 4 9" $ \vec -> length lst > 7 ==>
165 |       --     let m = F.fold mean lst
166 |       --         (f1,f2) = (F.fold (centralMoments' 4 9 m) lst)
167 |       --         (s1,s2) = (S.centralMoments 4 9 vec)
168 |       --     in within 3 f1 s1 && within 3 f2 s2
169 |       ]
170 |     , testGroup "Correlation"
171 |       [ onVec2 "correlation between [-1,1]" $ \vec ->
172 |           U.length vec > 2 && U.any (/= (0.0,0.0)) vec ==>
173 |           let m1 = F.fold mean (U.toList $ U.map fst vec)
174 |               m2 = F.fold mean (U.toList $ U.map snd vec)
175 |               s1 = F.fold (stdDev m1) (U.toList $ U.map fst vec)
176 |               s2 = F.fold (stdDev m2) (U.toList $ U.map snd vec)
177 |           in between (-1,1) $
178 |              F.fold (correlation (m1,m2) (s1,s2)) (U.toList vec)
179 |       , onVec2 "correlation between [-1,1] fastStdDev" $ \vec ->
180 | 
181 |           let (m1,m2) = F.fold ((,) <$> lmap fst mean <*> lmap snd mean)
182 |                                (U.toList vec)
183 |               (s1,s2) = F.fold ((,) <$> lmap fst (stdDev m1) <*> lmap snd (stdDev m2))
184 |                                (U.toList vec)
185 |               corr = F.fold (correlation (m1,m2) (s1,s2)) (U.toList vec)
186 |           in U.length vec > 2 && s1 /= 0.0 && s2 /= 0.0 ==>
187 |              QC.counterexample ("Correlation: " ++ show corr ++ " Stats: " ++ show (m1,m2,s1,s2)) $
188 |              between (-1,1) corr || isNaN corr
189 |       , QC.testProperty "LinRegState Semigroup" $ \v1 v2 ->
190 |           U.length v1 > 3 && U.length v2 > 3
191 |           && U.any (/=0.0) (U.map fst v1) && U.any (/=0.0) (U.map fst v2)
192 |           && U.any (/=0.0) (U.map snd v1) && U.any (/=0.0) (U.map snd v2) ==> let
193 |           sep = getLinRegResult $ F.fold foldLinRegState (U.toList v1) <> F.fold foldLinRegState (U.toList v2)
194 |           tog = F.fold fastLinearReg (U.toList v1 ++ U.toList v2)
195 |           in (cmpLMVSK precision (lrrXStats sep) (lrrXStats tog)
196 |               && cmpLMVSK precision (lrrYStats sep) (lrrYStats tog))
197 |              || isNaN 
(lmvskKurtosis (lrrXStats sep)) 198 | || isNaN (lmvskKurtosis (lrrYStats sep)) 199 | || isNaN (lmvskKurtosis (lrrXStats tog)) 200 | || isNaN (lmvskKurtosis (lrrYStats tog)) 201 | ] 202 | ] 203 | ] 204 | 205 | between :: (Double,Double) -> Double -> Bool 206 | between (lo,hi) = \x -> lo <= x && x <= hi 207 | 208 | 209 | withinPCT :: Double -> Double -> Double -> Bool 210 | withinPCT pct a b = abs ((a - b) * 100 / abs b) < pct 211 | --------------------------------------------------------------------------------