├── .github └── workflows │ └── haskell-ci.yml ├── .gitignore ├── .travis.yml.bak ├── CHANGELOG.md ├── LICENSE ├── README.md ├── Setup.hs ├── bench └── Main.hs ├── foldl-statistics.cabal ├── src └── Control │ └── Foldl │ └── Statistics.hs ├── stack-6.yaml ├── stack-7.yaml ├── stack.yaml └── test └── Spec.hs /.github/workflows/haskell-ci.yml: -------------------------------------------------------------------------------- 1 | # This GitHub workflow config has been generated by a script via 2 | # 3 | # haskell-ci 'github' 'cabal.project' 4 | # 5 | # To regenerate the script (for example after adjusting tested-with) run 6 | # 7 | # haskell-ci regenerate 8 | # 9 | # For more information, see https://github.com/haskell-CI/haskell-ci 10 | # 11 | # version: 0.18.1 12 | # 13 | # REGENDATA ("0.18.1",["github","cabal.project"]) 14 | # 15 | name: Haskell-CI 16 | on: 17 | - push 18 | - pull_request 19 | jobs: 20 | linux: 21 | name: Haskell-CI - Linux - ${{ matrix.compiler }} 22 | runs-on: ubuntu-20.04 23 | timeout-minutes: 24 | 60 25 | container: 26 | image: buildpack-deps:bionic 27 | continue-on-error: ${{ matrix.allow-failure }} 28 | strategy: 29 | matrix: 30 | include: 31 | - compiler: ghc-9.8.2 32 | compilerKind: ghc 33 | compilerVersion: 9.8.2 34 | setup-method: ghcup 35 | allow-failure: false 36 | - compiler: ghc-9.6.4 37 | compilerKind: ghc 38 | compilerVersion: 9.6.4 39 | setup-method: ghcup 40 | allow-failure: false 41 | - compiler: ghc-9.4.8 42 | compilerKind: ghc 43 | compilerVersion: 9.4.8 44 | setup-method: ghcup 45 | allow-failure: false 46 | - compiler: ghc-9.2.8 47 | compilerKind: ghc 48 | compilerVersion: 9.2.8 49 | setup-method: ghcup 50 | allow-failure: false 51 | - compiler: ghc-9.0.2 52 | compilerKind: ghc 53 | compilerVersion: 9.0.2 54 | setup-method: ghcup 55 | allow-failure: false 56 | - compiler: ghc-8.6.1 57 | compilerKind: ghc 58 | compilerVersion: 8.6.1 59 | setup-method: hvr-ppa 60 | allow-failure: false 61 | - compiler: ghc-8.4.3 62 | compilerKind: ghc 63 | compilerVersion: 8.4.3 64 | setup-method: hvr-ppa 65 | allow-failure: false 66 | - compiler: ghc-8.2.2 67 | compilerKind: ghc 68 | compilerVersion: 8.2.2 69 | setup-method: hvr-ppa 70 | allow-failure: false 71 | - compiler: ghc-8.0.2 72 | compilerKind: ghc 73 | compilerVersion: 8.0.2 74 | setup-method: hvr-ppa 75 | allow-failure: false 76 | - compiler: ghc-7.10.3 77 | compilerKind: ghc 78 | compilerVersion: 7.10.3 79 | setup-method: hvr-ppa 80 | allow-failure: false 81 | - compiler: ghc-7.8.4 82 | compilerKind: ghc 83 | compilerVersion: 7.8.4 84 | setup-method: hvr-ppa 85 | allow-failure: false 86 | fail-fast: false 87 | steps: 88 | - name: apt 89 | run: | 90 | apt-get update 91 | apt-get install -y --no-install-recommends gnupg ca-certificates dirmngr curl git software-properties-common libtinfo5 92 | if [ "${{ matrix.setup-method }}" = ghcup ]; then 93 | mkdir -p "$HOME/.ghcup/bin" 94 | curl -sL https://downloads.haskell.org/ghcup/0.1.20.0/x86_64-linux-ghcup-0.1.20.0 > "$HOME/.ghcup/bin/ghcup" 95 | chmod a+x "$HOME/.ghcup/bin/ghcup" 96 | "$HOME/.ghcup/bin/ghcup" install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false) 97 | "$HOME/.ghcup/bin/ghcup" install cabal 3.10.2.0 || (cat "$HOME"/.ghcup/logs/*.* && false) 98 | else 99 | apt-add-repository -y 'ppa:hvr/ghc' 100 | apt-get update 101 | apt-get install -y "$HCNAME" 102 | mkdir -p "$HOME/.ghcup/bin" 103 | curl -sL https://downloads.haskell.org/ghcup/0.1.20.0/x86_64-linux-ghcup-0.1.20.0 > "$HOME/.ghcup/bin/ghcup" 104 | chmod a+x "$HOME/.ghcup/bin/ghcup" 105 | 
"$HOME/.ghcup/bin/ghcup" install cabal 3.10.2.0 || (cat "$HOME"/.ghcup/logs/*.* && false) 106 | fi 107 | env: 108 | HCKIND: ${{ matrix.compilerKind }} 109 | HCNAME: ${{ matrix.compiler }} 110 | HCVER: ${{ matrix.compilerVersion }} 111 | - name: Set PATH and environment variables 112 | run: | 113 | echo "$HOME/.cabal/bin" >> $GITHUB_PATH 114 | echo "LANG=C.UTF-8" >> "$GITHUB_ENV" 115 | echo "CABAL_DIR=$HOME/.cabal" >> "$GITHUB_ENV" 116 | echo "CABAL_CONFIG=$HOME/.cabal/config" >> "$GITHUB_ENV" 117 | HCDIR=/opt/$HCKIND/$HCVER 118 | if [ "${{ matrix.setup-method }}" = ghcup ]; then 119 | HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER") 120 | HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#') 121 | HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#') 122 | echo "HC=$HC" >> "$GITHUB_ENV" 123 | echo "HCPKG=$HCPKG" >> "$GITHUB_ENV" 124 | echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV" 125 | echo "CABAL=$HOME/.ghcup/bin/cabal-3.10.2.0 -vnormal+nowrap" >> "$GITHUB_ENV" 126 | else 127 | HC=$HCDIR/bin/$HCKIND 128 | echo "HC=$HC" >> "$GITHUB_ENV" 129 | echo "HCPKG=$HCDIR/bin/$HCKIND-pkg" >> "$GITHUB_ENV" 130 | echo "HADDOCK=$HCDIR/bin/haddock" >> "$GITHUB_ENV" 131 | echo "CABAL=$HOME/.ghcup/bin/cabal-3.10.2.0 -vnormal+nowrap" >> "$GITHUB_ENV" 132 | fi 133 | 134 | HCNUMVER=$(${HC} --numeric-version|perl -ne '/^(\d+)\.(\d+)\.(\d+)(\.(\d+))?$/; print(10000 * $1 + 100 * $2 + ($3 == 0 ? $5 != 1 : $3))') 135 | echo "HCNUMVER=$HCNUMVER" >> "$GITHUB_ENV" 136 | echo "ARG_TESTS=--enable-tests" >> "$GITHUB_ENV" 137 | echo "ARG_BENCH=--enable-benchmarks" >> "$GITHUB_ENV" 138 | echo "HEADHACKAGE=false" >> "$GITHUB_ENV" 139 | echo "ARG_COMPILER=--$HCKIND --with-compiler=$HC" >> "$GITHUB_ENV" 140 | echo "GHCJSARITH=0" >> "$GITHUB_ENV" 141 | env: 142 | HCKIND: ${{ matrix.compilerKind }} 143 | HCNAME: ${{ matrix.compiler }} 144 | HCVER: ${{ matrix.compilerVersion }} 145 | - name: env 146 | run: | 147 | env 148 | - name: write cabal config 149 | run: | 150 | mkdir -p $CABAL_DIR 151 | cat >> $CABAL_CONFIG <> $CABAL_CONFIG < cabal-plan.xz 184 | echo 'f62ccb2971567a5f638f2005ad3173dba14693a45154c1508645c52289714cb2 cabal-plan.xz' | sha256sum -c - 185 | xz -d < cabal-plan.xz > $HOME/.cabal/bin/cabal-plan 186 | rm -f cabal-plan.xz 187 | chmod a+x $HOME/.cabal/bin/cabal-plan 188 | cabal-plan --version 189 | - name: checkout 190 | uses: actions/checkout@v3 191 | with: 192 | path: source 193 | - name: initial cabal.project for sdist 194 | run: | 195 | touch cabal.project 196 | echo "packages: $GITHUB_WORKSPACE/source/." 
>> cabal.project 197 | cat cabal.project 198 | - name: sdist 199 | run: | 200 | mkdir -p sdist 201 | $CABAL sdist all --output-dir $GITHUB_WORKSPACE/sdist 202 | - name: unpack 203 | run: | 204 | mkdir -p unpacked 205 | find sdist -maxdepth 1 -type f -name '*.tar.gz' -exec tar -C $GITHUB_WORKSPACE/unpacked -xzvf {} \; 206 | - name: generate cabal.project 207 | run: | 208 | PKGDIR_foldl_statistics="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/foldl-statistics-[0-9.]*')" 209 | echo "PKGDIR_foldl_statistics=${PKGDIR_foldl_statistics}" >> "$GITHUB_ENV" 210 | rm -f cabal.project cabal.project.local 211 | touch cabal.project 212 | touch cabal.project.local 213 | echo "packages: ${PKGDIR_foldl_statistics}" >> cabal.project 214 | if [ $((HCNUMVER >= 80200)) -ne 0 ] ; then echo "package foldl-statistics" >> cabal.project ; fi 215 | if [ $((HCNUMVER >= 80200)) -ne 0 ] ; then echo " ghc-options: -Werror=missing-methods" >> cabal.project ; fi 216 | cat >> cabal.project <> cabal.project.local 219 | cat cabal.project 220 | cat cabal.project.local 221 | - name: dump install plan 222 | run: | 223 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dry-run all 224 | cabal-plan 225 | - name: restore cache 226 | uses: actions/cache/restore@v3 227 | with: 228 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 229 | path: ~/.cabal/store 230 | restore-keys: ${{ runner.os }}-${{ matrix.compiler }}- 231 | - name: install dependencies 232 | run: | 233 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks --dependencies-only -j2 all 234 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dependencies-only -j2 all 235 | - name: build w/o tests 236 | run: | 237 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 238 | - name: build 239 | run: | 240 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --write-ghc-environment-files=always 241 | - name: tests 242 | run: | 243 | $CABAL v2-test $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --test-show-details=direct 244 | - name: cabal check 245 | run: | 246 | cd ${PKGDIR_foldl_statistics} || false 247 | ${CABAL} -vnormal check 248 | - name: haddock 249 | run: | 250 | $CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all 251 | - name: unconstrained build 252 | run: | 253 | rm -f cabal.project.local 254 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 255 | - name: save cache 256 | uses: actions/cache/save@v3 257 | if: always() 258 | with: 259 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 260 | path: ~/.cabal/store 261 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | cabal-dev 3 | *.o 4 | *.hi 5 | *.chi 6 | *.chs.h 7 | *.dyn_o 8 | *.dyn_hi 9 | .hpc 10 | .hsenv 11 | .cabal-sandbox/ 12 | cabal.sandbox.config 13 | *.prof 14 | *.aux 15 | *.hp 16 | .stack-work/ 17 | -------------------------------------------------------------------------------- /.travis.yml.bak: -------------------------------------------------------------------------------- 1 | # This Travis job script has been generated by a script via 2 | # 3 | # runghc make_travis_yml_2.hs 'foldl-statistics.cabal' 4 | # 5 | # For more information, see https://github.com/haskell-CI/haskell-ci 6 | # 7 | language: c 8 | sudo: false 9 | 10 | git: 11 | submodules: false # whether to recursively clone 
submodules 12 | 13 | cache: 14 | directories: 15 | - $HOME/.cabal/packages 16 | - $HOME/.cabal/store 17 | 18 | before_cache: 19 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/build-reports.log 20 | # remove files that are regenerated by 'cabal update' 21 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/00-index.* 22 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/*.json 23 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.cache 24 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.tar 25 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.tar.idx 26 | 27 | - rm -rfv $HOME/.cabal/packages/head.hackage 28 | 29 | matrix: 30 | include: 31 | - compiler: "ghc-8.6.1" 32 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 33 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-8.6.1], sources: [hvr-ghc]}} 34 | - compiler: "ghc-8.4.3" 35 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 36 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-2.2,ghc-8.4.3], sources: [hvr-ghc]}} 37 | - compiler: "ghc-8.2.2" 38 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 39 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-2.2,ghc-8.2.2], sources: [hvr-ghc]}} 40 | - compiler: "ghc-8.0.2" 41 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 42 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-2.2,ghc-8.0.2], sources: [hvr-ghc]}} 43 | - compiler: "ghc-7.10.2" 44 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 45 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-2.2,ghc-7.10.2], sources: [hvr-ghc]}} 46 | - compiler: "ghc-7.8.4" 47 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 48 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-2.2,ghc-7.8.4], sources: [hvr-ghc]}} 49 | 50 | before_install: 51 | - HC=${CC} 52 | - HCPKG=${HC/ghc/ghc-pkg} 53 | - unset CC 54 | - ROOTDIR=$(pwd) 55 | - mkdir -p $HOME/.local/bin 56 | - "PATH=/opt/ghc/bin:/opt/ghc-ppa-tools/bin:$HOME/local/bin:$PATH" 57 | - HCNUMVER=$(( $(${HC} --numeric-version|sed -E 's/([0-9]+)\.([0-9]+)\.([0-9]+).*/\1 * 10000 + \2 * 100 + \3/') )) 58 | - echo $HCNUMVER 59 | 60 | install: 61 | - cabal --version 62 | - echo "$(${HC} --version) [$(${HC} --print-project-git-commit-id 2> /dev/null || echo '?')]" 63 | - BENCH=${BENCH---enable-benchmarks} 64 | - TEST=${TEST---enable-tests} 65 | - HADDOCK=${HADDOCK-true} 66 | - UNCONSTRAINED=${UNCONSTRAINED-true} 67 | - NOINSTALLEDCONSTRAINTS=${NOINSTALLEDCONSTRAINTS-false} 68 | - GHCHEAD=${GHCHEAD-false} 69 | - travis_retry cabal update -v 70 | - "sed -i.bak 's/^jobs:/-- jobs:/' ${HOME}/.cabal/config" 71 | - rm -fv cabal.project cabal.project.local 72 | - grep -Ev -- '^\s*--' ${HOME}/.cabal/config | grep -Ev '^\s*$' 73 | - "printf 'packages: \".\"\\n' > cabal.project" 74 | - touch cabal.project.local 75 | - "if ! $NOINSTALLEDCONSTRAINTS; then for pkg in $($HCPKG list --simple-output); do echo $pkg | grep -vw -- foldl-statistics | sed 's/^/constraints: /' | sed 's/-[^-]*$/ installed/' >> cabal.project.local; done; fi" 76 | - cat cabal.project || true 77 | - cat cabal.project.local || true 78 | - if [ -f "./configure.ac" ]; then 79 | (cd "." 
&& autoreconf -i); 80 | fi 81 | - rm -f cabal.project.freeze 82 | - cabal new-build -w ${HC} ${TEST} ${BENCH} --project-file="cabal.project" --dep -j2 all 83 | - cabal new-build -w ${HC} --disable-tests --disable-benchmarks --project-file="cabal.project" --dep -j2 all 84 | - rm -rf .ghc.environment.* "."/dist 85 | - DISTDIR=$(mktemp -d /tmp/dist-test.XXXX) 86 | 87 | # Here starts the actual work to be performed for the package under test; 88 | # any command which exits with a non-zero exit code causes the build to fail. 89 | script: 90 | # test that source-distributions can be generated 91 | - (cd "." && cabal sdist) 92 | - mv "."/dist/foldl-statistics-*.tar.gz ${DISTDIR}/ 93 | - cd ${DISTDIR} || false 94 | - find . -maxdepth 1 -name '*.tar.gz' -exec tar -xvf '{}' \; 95 | - "printf 'packages: foldl-statistics-*/*.cabal\\n' > cabal.project" 96 | - touch cabal.project.local 97 | - "if ! $NOINSTALLEDCONSTRAINTS; then for pkg in $($HCPKG list --simple-output); do echo $pkg | grep -vw -- foldl-statistics | sed 's/^/constraints: /' | sed 's/-[^-]*$/ installed/' >> cabal.project.local; done; fi" 98 | - cat cabal.project || true 99 | - cat cabal.project.local || true 100 | # this builds all libraries and executables (without tests/benchmarks) 101 | - cabal new-build -w ${HC} --disable-tests --disable-benchmarks all 102 | 103 | # build & run tests, build benchmarks 104 | - cabal new-build -w ${HC} ${TEST} ${BENCH} all 105 | - if [ "x$TEST" = "x--enable-tests" ]; then cabal new-test -w ${HC} ${TEST} ${BENCH} all; fi 106 | 107 | # cabal check 108 | - (cd foldl-statistics-* && cabal check) 109 | 110 | # haddock 111 | - rm -rf ./dist-newstyle 112 | - if $HADDOCK; then cabal new-haddock -w ${HC} ${TEST} ${BENCH} all; else echo "Skipping haddock generation";fi 113 | 114 | # Build without installed constraints for packages in global-db 115 | - if $UNCONSTRAINED; then rm -f cabal.project.local; echo cabal new-build -w ${HC} --disable-tests --disable-benchmarks all; else echo "Not building without installed constraints"; fi 116 | 117 | # REGENDATA ["foldl-statistics.cabal"] 118 | # EOF 119 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # 0.1.5.1 2 | 3 | - GHC 8.6 compatibility (Thanks @gwils) 4 | 5 | # 0.1.5.0 6 | 7 | - Added `histogram`, `histogram'` and `ordersOfMagnitude`. 8 | 9 | # 0.1.4.6 10 | 11 | - Relax bounds on tasty-quickcheck 12 | 13 | # 0.1.4.5 14 | 15 | - Fix dependencies for GHC < 8.0 16 | 17 | # 0.1.4.4 18 | 19 | - Update to foldl < 1.4 20 | 21 | # 0.1.4.3 22 | 23 | - Update to statistics 0.14 24 | 25 | # 0.1.4.1 26 | 27 | - foldl >= 1.2.2 exports `mean` and `variance`, so hide them.
28 | - export `lrrCount` 29 | 30 | # 0.1.4.0 31 | 32 | - Added monoidal interface to linear regression 33 | 34 | # 0.1.3.0 35 | 36 | - Added unbiased versions of LMVSK functions 37 | 38 | # 0.1.2.0 39 | 40 | - Exposed monoidal LMVSKState 41 | - Improved testing, including for fastLMVSK 42 | 43 | # 0.1.1.0 44 | 45 | - Add fastLMVSK (length, mean, variance, skewness and kurtosis) 46 | - Add fastLinearReg (count, slope, (Y) intercept and correlation of `(x,y)`) 47 | 48 | 49 | # 0.1.0.0 50 | 51 | - Initial release 52 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2009, 2010 Bryan O'Sullivan 2 | Copyright (c) 2016, Commonwealth Scientific and Industrial Research Organisation 3 | (CSIRO) ABN 41 687 119 230. 4 | 5 | All rights reserved. CSIRO is willing to grant you a license to this 6 | software on the following terms, except where otherwise indicated for 7 | third party material. 8 | 9 | Redistribution and use of this software in source and binary forms, with or 10 | without modification, are permitted provided that the following conditions are 11 | met: 12 | 13 | * Redistributions of source code must retain the above copyright notice, this 14 | list of conditions and the following disclaimer. 15 | 16 | * Redistributions in binary form must reproduce the above copyright notice, this 17 | list of conditions and the following disclaimer in the documentation and/or 18 | other materials provided with the distribution. 19 | 20 | * Neither the name of CSIRO nor the names of its contributors may be used to 21 | endorse or promote products derived from this software without specific prior 22 | written permission of CSIRO. 23 | 24 | EXCEPT AS EXPRESSLY STATED IN THIS AGREEMENT AND TO THE FULL EXTENT PERMITTED BY 25 | APPLICABLE LAW, THE SOFTWARE IS PROVIDED "AS-IS". CSIRO MAKES NO 26 | REPRESENTATIONS, WARRANTIES OR CONDITIONS OF ANY KIND, EXPRESS OR IMPLIED, 27 | INCLUDING BUT NOT LIMITED TO ANY REPRESENTATIONS, WARRANTIES OR CONDITIONS 28 | REGARDING THE CONTENTS OR ACCURACY OF THE SOFTWARE, OR OF TITLE, 29 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, NON-INFRINGEMENT, THE ABSENCE 30 | OF LATENT OR OTHER DEFECTS, OR THE PRESENCE OR ABSENCE OF ERRORS, WHETHER OR NOT 31 | DISCOVERABLE. 32 | 33 | TO THE FULL EXTENT PERMITTED BY APPLICABLE LAW, IN NO EVENT SHALL CSIRO BE 34 | LIABLE ON ANY LEGAL THEORY (INCLUDING, WITHOUT LIMITATION, IN AN ACTION FOR 35 | BREACH OF CONTRACT, NEGLIGENCE OR OTHERWISE) FOR ANY CLAIM, LOSS, DAMAGES OR 36 | OTHER LIABILITY HOWSOEVER INCURRED. WITHOUT LIMITING THE SCOPE OF THE PREVIOUS 37 | SENTENCE THE EXCLUSION OF LIABILITY SHALL INCLUDE: LOSS OF PRODUCTION OR 38 | OPERATION TIME, LOSS, DAMAGE OR CORRUPTION OF DATA OR RECORDS; OR LOSS OF 39 | ANTICIPATED SAVINGS, OPPORTUNITY, REVENUE, PROFIT OR GOODWILL, OR OTHER ECONOMIC 40 | LOSS; OR ANY SPECIAL, INCIDENTAL, INDIRECT, CONSEQUENTIAL, PUNITIVE OR EXEMPLARY 41 | DAMAGES, ARISING OUT OF OR IN CONNECTION WITH THIS AGREEMENT, ACCESS OF THE 42 | SOFTWARE OR ANY OTHER DEALINGS WITH THE SOFTWARE, EVEN IF CSIRO HAS BEEN ADVISED 43 | OF THE POSSIBILITY OF SUCH CLAIM, LOSS, DAMAGES OR OTHER LIABILITY.
44 | 45 | APPLICABLE LEGISLATION SUCH AS THE AUSTRALIAN CONSUMER LAW MAY APPLY 46 | REPRESENTATIONS, WARRANTIES, OR CONDITIONS, OR IMPOSES OBLIGATIONS OR LIABILITY 47 | ON CSIRO THAT CANNOT BE EXCLUDED, RESTRICTED OR MODIFIED TO THE FULL EXTENT SET 48 | OUT IN THE EXPRESS TERMS OF THIS CLAUSE ABOVE "CONSUMER GUARANTEES". TO THE 49 | EXTENT THAT SUCH CONSUMER GUARANTEES CONTINUE TO APPLY, THEN TO THE FULL EXTENT 50 | PERMITTED BY THE APPLICABLE LEGISLATION, THE LIABILITY OF CSIRO UNDER THE 51 | RELEVANT CONSUMER GUARANTEE IS LIMITED (WHERE PERMITTED AT CSIRO'S OPTION) TO 52 | ONE OF FOLLOWING REMEDIES OR SUBSTANTIALLY EQUIVALENT REMEDIES: 53 | 54 | (a) THE REPLACEMENT OF THE SOFTWARE, THE SUPPLY OF EQUIVALENT 55 | SOFTWARE, OR SUPPLYING RELEVANT SERVICES AGAIN; 56 | (b) THE REPAIR OF THE SOFTWARE; 57 | (c) THE PAYMENT OF THE COST OF REPLACING THE 58 | SOFTWARE, OF ACQUIRING EQUIVALENT SOFTWARE, HAVING THE 59 | RELEVANT SERVICES SUPPLIED AGAIN, OR HAVING THE SOFTWARE 60 | REPAIRED. 61 | 62 | IN THIS CLAUSE, CSIRO INCLUDES ANY THIRD PARTY AUTHOR OR OWNER OF ANY PART OF 63 | THE SOFTWARE OR MATERIAL DISTRIBUTED WITH IT. CSIRO MAY ENFORCE ANY RIGHTS ON 64 | BEHALF OF THE RELEVANT THIRD PARTY. 65 | 66 | Third Party Components 67 | 68 | The following third party components are distributed with the Software. You 69 | agree to comply with the license terms for these components as part of accessing 70 | the Software. Other third party software may also be identified in separate 71 | files distributed with the Software. 72 | 73 | ___________________________________________________________________ 74 | ___________________________________________________________________ 75 | The following Haskell library dependencies may be obtained from 76 | https://hackage.haskell.org/packages/ 77 | 78 | StateVar 1.1.0.4 79 | array 0.5.1.0 80 | base 4.8.2.0 81 | base-orphans 0.5.4 82 | bifunctors 5.2 83 | binary 0.7.5.0 84 | bytestring 0.10.6.0 85 | comonad 4.2.7.2 86 | containers 0.5.6.2 87 | contravariant 1.4 88 | deepseq 1.4.1.1 89 | distributive 0.5.0.2 90 | erf 2.0.0.0 91 | foldl 1.2.1 92 | foldl-statistics 0.1.0.0 93 | ghc-prim 0.4.0.0 94 | hashable 1.2.4.0 95 | integer-gmp 1.0.0.0 96 | math-functions 0.1.7.0 97 | mwc-random 0.13.4.0 98 | primitive 0.6.1.0 99 | profunctors 5.2 100 | semigroups 0.18.1 101 | stm 2.4.4.1 102 | tagged 0.8.4 103 | template-haskell 2.10.0.0 104 | text 1.2.2.1 105 | time 1.5.0.1 106 | transformers 0.4.2.0 107 | transformers-compat 0.4.0.4 108 | unordered-containers 0.2.7.1 109 | vector 0.11.0.0 110 | vector-th-unbox 0.2.1.6 111 | void 0.7.1 112 | ___________________________________________________________________ 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # foldl-statistics [![Build Status](https://travis-ci.org/data61/foldl-statistics.svg?branch=master)](https://travis-ci.org/data61/foldl-statistics) 2 | A reimplementation of the [Statistics.Sample](https://hackage.haskell.org/package/statistics/docs/Statistics-Sample.html) 3 | module using the [foldl](https://www.stackage.org/lts-5.1/package/foldl) package. 4 | The intention of this package is to allow these algorithms to be used on a much broader set of data input types, 5 | including lists and streaming libraries such as `conduit` and `pipes`, and any other type which is `Foldable`. 
6 | 7 | All statistics in this package can be computed with no more than two passes over the data - once to compute the mean and once to compute 8 | any statistics which require the mean. This is achieved because foldl `Fold`s are `Applicative`, which means that to compute, for example, four central moments (the 2nd through 5th) as well as the count, the following could be used: 9 | 10 | ```haskell 11 | import Control.Foldl as F 12 | 13 | ... 14 | 15 | dataseries :: [Double] 16 | dataseries = ... 17 | 18 | ... 19 | let m = F.fold mean dataseries 20 | (c2,c3,c4,c5,n) = flip F.fold dataseries $ 21 | (\(c2,c3) (c4,c5) n -> (c2,c3,c4,c5,n)) 22 | <$> centralMoments 2 3 m 23 | <*> centralMoments 4 5 m 24 | <*> F.length 25 | ``` 26 | 27 | which traverses the data twice, once to compute the mean `m`, and once to compute all the central moments and the count concurrently. This also brings with it, for free, the ability to compute streaming statistics, such as the mean of all data seen so far, using `foldl`'s `scan` function (see the example at the end of this README). 28 | 29 | Where possible, care has been taken to ensure the numerical stability of the computation of statistics. 30 | 31 | Several algorithms require the mean of the data to be known before computing the statistic, such as `skewness`, `kurtosis` and other `centralMoment`s. 32 | There are 'fast' implementations for calculating the variance, unbiased variance and standard deviation, which can be computed without knowing the mean 33 | *a priori*, but which may produce less accurate results. 34 | 35 | ## Performance & Correctness 36 | Benchmarks are included comparing performance to the [statistics](https://hackage.haskell.org/package/statistics) package. In nearly all cases, the implementations in this package perform better than those in `statistics` on the same inputs, and in several cases, performing two passes (to compute the mean and another statistic) is faster than the equivalent `statistics` implementation. 37 | 38 | This speed has not come at the cost of correctness; all `Fold`s are tested against their `statistics` counterparts to ensure the results are identical. 39 | 40 | These results can be confirmed by running 41 | 42 | stack build --test --bench --benchmark-arguments "--output bench.html" 43 | 44 | which will print out the results of the tests against `statistics` and then run the benchmark (this may take several minutes and is best run on a "quiet" machine which is doing very little other than running the benchmark). The results of the benchmarking are then available in the file `bench.html`.
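## Streaming statistics

Because all of the statistics here are ordinary `foldl` `Fold`s, a running statistic over every prefix of the data can be obtained with `foldl`'s `scan` function. A minimal sketch, reusing the hypothetical `dataseries` from above:

```haskell
-- The mean of each successive prefix of the series; the first entry,
-- corresponding to the empty prefix, is NaN (0/0).
runningMeans :: [Double]
runningMeans = F.scan mean dataseries
```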
45 | 46 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /bench/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | 3 | module Main where 4 | 5 | import Control.Monad.ST (runST) 6 | import Criterion.Main 7 | import qualified Data.Vector.Unboxed as U 8 | import qualified Statistics.Sample as S 9 | -- import Statistics.Transform 10 | import System.Random.MWC 11 | #if MIN_VERSION_foldl(1,2,2) 12 | import Control.Foldl as F hiding (mean, variance) 13 | #else 14 | import Control.Foldl as F 15 | #endif 16 | 17 | import Control.Foldl.Statistics 18 | 19 | 20 | -- Test sample 21 | {-# NOINLINE sample #-} 22 | sample :: U.Vector Double 23 | #if MIN_VERSION_mwc_random(0,15,0) 24 | sample = runST $ do 25 | g <- create 26 | U.replicateM 10000 (uniformRM (-10.0,10.0) g) 27 | #else 28 | sample = runST $ flip uniformVector 10000 =<< create 29 | #endif 30 | 31 | {-# NOINLINE sample2 #-} 32 | sample2 :: U.Vector (Double,Double) 33 | #if MIN_VERSION_mwc_random(0,15,0) 34 | sample2 = runST $ do 35 | g <- create 36 | U.replicateM 10000 ((,) <$> (uniformRM (-10.0,10.0) g) <*> (uniformRM (0.2,5.0) g)) 37 | #else 38 | sample2 = runST $ flip uniformVector 10000 =<< create 39 | #endif 40 | 41 | {-# NOINLINE absSample #-} 42 | absSample :: U.Vector Double 43 | absSample = U.map abs sample 44 | 45 | -- Weighted test sample 46 | {-# NOINLINE sampleW #-} 47 | sampleW :: U.Vector (Double,Double) 48 | sampleW = U.zip sample (U.reverse sample) 49 | 50 | m, mw :: Double 51 | m = F.fold mean (U.toList sample) 52 | 53 | mw = F.fold meanWeighted (U.toList sampleW) 54 | 55 | 56 | 57 | 58 | main :: IO () 59 | main = defaultMain 60 | [ bgroup "Statistics of location" 61 | [ bgroup "mean" 62 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold mean (U.toList vec)) sample 63 | , bench "Statistics.Sample" $ nf S.mean sample 64 | ] 65 | , bgroup "meanWeighted" 66 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold meanWeighted (U.toList vec)) sampleW 67 | , bench "Statistics.Sample" $ nf S.meanWeighted sampleW 68 | ] 69 | , bgroup "welfordMean" 70 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold welfordMean (U.toList vec)) sample 71 | , bench "Statistics.Sample" $ nf S.welfordMean sample 72 | ] 73 | , bgroup "harmonicMean" 74 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold harmonicMean (U.toList vec)) sample 75 | , bench "Statistics.Sample" $ nf S.harmonicMean sample 76 | ] 77 | , bgroup "geometricMean" 78 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold geometricMean (U.toList vec)) absSample 79 | , bench "Statistics.Sample" $ nf S.geometricMean absSample 80 | ] 81 | ] 82 | , bgroup "Single-pass" 83 | [ bgroup "fastVariance" 84 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold fastVariance (U.toList vec)) sample 85 | , bench "Statistics.Sample" $ nf S.fastVariance sample 86 | ] 87 | , bgroup "fastVarianceUnbiased" 88 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold fastVarianceUnbiased (U.toList vec)) sample 89 | , bench "Statistics.Sample" $ nf S.fastVarianceUnbiased sample 90 | ] 91 | , bgroup "fastStdDev" 92 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold fastStdDev (U.toList vec)) sample 93 | , bench "Statistics.Sample" $ nf S.fastStdDev sample 94 | ] 95 | , bgroup "fastLMVSK" 96 | -- T4 is strict in all arguments, 
so WHNF ok here 97 | [bench "C.F.Statistics" $ whnf (\vec -> F.fold fastLMVSK (U.toList vec)) sample 98 | ] 99 | , bgroup "fastLinearReg" 100 | [bench "fastLinearReg" $ whnf (\vec -> F.fold fastLinearReg (U.toList vec)) sample2 101 | ] 102 | ] 103 | 104 | , bgroup "requiring the mean" 105 | [ bgroup "variance" 106 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (variance m) (U.toList vec)) sample 107 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (variance (F.fold mean (U.toList vec))) (U.toList vec)) sample 108 | , bench "Statistics.Sample" $ nf S.variance sample 109 | ] 110 | , bgroup "varianceUnbiased" 111 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (varianceUnbiased m) (U.toList vec)) sample 112 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (varianceUnbiased (F.fold mean (U.toList vec))) (U.toList vec)) sample 113 | , bench "Statistics.Sample" $ nf S.varianceUnbiased sample 114 | ] 115 | , bgroup "stdDev" 116 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (stdDev m) (U.toList vec)) sample 117 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (stdDev (F.fold mean (U.toList vec))) (U.toList vec)) sample 118 | , bench "Statistics.Sample" $ nf S.stdDev sample 119 | ] 120 | , bgroup "varianceWeighted" 121 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (varianceWeighted m) (U.toList vec)) sampleW 122 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (varianceWeighted (F.fold meanWeighted (U.toList vec))) (U.toList vec)) sampleW 123 | , bench "Statistics.Sample" $ nf S.varianceWeighted sampleW 124 | ] 125 | ] 126 | 127 | , bgroup "over central moments" 128 | [ bgroup "skewness" 129 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (skewness m) (U.toList vec)) sample 130 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (skewness (F.fold mean (U.toList vec))) (U.toList vec)) sample 131 | , bench "Statistics.Sample" $ nf S.skewness sample 132 | ] 133 | , bgroup "kurtosis" 134 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (kurtosis m) (U.toList vec)) sample 135 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (kurtosis (F.fold mean (U.toList vec))) (U.toList vec)) sample 136 | , bench "Statistics.Sample" $ nf S.kurtosis sample 137 | ] 138 | , bgroup "centralMoment 2" 139 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (centralMoment 2 m) (U.toList vec)) sample 140 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoment 2 (F.fold mean (U.toList vec))) (U.toList vec)) sample 141 | , bench "Statistics.Sample" $ nf (S.centralMoment 2) sample 142 | ] 143 | , bgroup "centralMoment 3" 144 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (centralMoment 3 m) (U.toList vec)) sample 145 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoment 3 (F.fold mean (U.toList vec))) (U.toList vec)) sample 146 | , bench "Statistics.Sample" $ nf (S.centralMoment 3) sample 147 | ] 148 | , bgroup "centralMoment 4" 149 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (centralMoment 4 m) (U.toList vec)) sample 150 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoment 4 (F.fold mean (U.toList vec))) (U.toList vec)) sample 151 | , bench "Statistics.Sample" $ nf (S.centralMoment 4) sample 152 | ] 153 | , bgroup "centralMoment 7" 154 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (centralMoment 7 m) (U.toList vec)) sample 155 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoment 7 (F.fold mean (U.toList vec))) (U.toList vec)) sample 156 | , bench "Statistics.Sample" $ nf (S.centralMoment 7) sample 157 | ] 158 | , bgroup "centralMoments 4 9" 159 | [ bench "C.F.Statistics" $ 
nf (\vec -> F.fold (centralMoments 4 9 m) (U.toList vec)) sample 160 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoments 4 9 (F.fold mean (U.toList vec))) (U.toList vec)) sample 161 | , bench "Statistics.Sample" $ nf (S.centralMoments 4 9) sample 162 | ] 163 | , bgroup "centralMoments' 4 9" 164 | [ bench "C.F.Statistics" $ nf (\vec -> F.fold (centralMoments' 4 9 m) (U.toList vec)) sample 165 | , bench "C.F.S(comp mean)" $ nf (\vec -> F.fold (centralMoments' 4 9 (F.fold mean (U.toList vec))) (U.toList vec)) sample 166 | ] 167 | ] 168 | ] 169 | -------------------------------------------------------------------------------- /foldl-statistics.cabal: -------------------------------------------------------------------------------- 1 | name: foldl-statistics 2 | version: 0.1.5.1 3 | synopsis: Statistical functions from the statistics package implemented as 4 | Folds. 5 | description: The use of this package allows statistics to be computed using at most two 6 | passes over the input data, one to compute a mean and one to compute a further 7 | statistic such as variance and /n/th central moments. All algorithms are the 8 | obvious implementation of Bryan O\'Sullivan\'s 9 | <https://hackage.haskell.org/package/statistics statistics> package implemented 10 | as `Fold's from the 11 | <https://hackage.haskell.org/package/foldl foldl> package. 12 | homepage: http://github.com/Data61/foldl-statistics#readme 13 | license: BSD3 14 | license-file: LICENSE 15 | author: Alex Mason 16 | maintainer: Alex.Mason@data61.csiro.au 17 | copyright: 2016 Data61 (CSIRO) 18 | category: Math, Statistics 19 | build-type: Simple 20 | extra-source-files: CHANGELOG.md, README.md 21 | cabal-version: >=1.10 22 | tested-with: GHC == 7.8.4, 23 | GHC == 7.10.3, 24 | GHC == 8.0.2, 25 | GHC == 8.2.2, 26 | GHC == 8.4.3, 27 | GHC == 8.6.1, 28 | GHC == 9.0.2, 29 | GHC == 9.2.8, 30 | GHC == 9.4.8, 31 | GHC == 9.6.4, 32 | GHC == 9.8.2 33 | 34 | library 35 | hs-source-dirs: src 36 | exposed-modules: Control.Foldl.Statistics 37 | default-language: Haskell2010 38 | build-depends: base >= 4.7 && < 5 39 | , foldl >= 1.1 && < 1.5 40 | , math-functions >= 0.1 && < 0.4 41 | , profunctors >= 5.2 && < 5.7 42 | , containers >= 0.1.0.0 && < 0.8 43 | , unordered-containers >= 0.1.0.0 && < 0.3 44 | , hashable >=1.0.1.1 && < 1.5 45 | if impl(ghc < 8.0) 46 | build-depends: semigroups >= 0.18 && < 1.0 47 | 48 | 49 | test-suite foldl-statistics-test 50 | type: exitcode-stdio-1.0 51 | hs-source-dirs: test 52 | main-is: Spec.hs 53 | ghc-options: -threaded -rtsopts -with-rtsopts=-N 54 | default-language: Haskell2010 55 | build-depends: base 56 | , foldl-statistics 57 | , foldl 58 | , statistics >= 0.13 && < 0.17 59 | , tasty >= 0.11 && < 1.6 60 | , tasty-quickcheck >= 0.8 && < 0.11 61 | , vector >= 0.11 && < 0.14 62 | , quickcheck-instances >= 0.3 && < 0.4 63 | , profunctors 64 | if impl(ghc < 8.0) 65 | build-depends: semigroups 66 | 67 | Benchmark bench-folds 68 | type: exitcode-stdio-1.0 69 | hs-source-dirs: bench 70 | main-is: Main.hs 71 | default-language: Haskell2010 72 | build-depends: base 73 | , foldl-statistics 74 | , foldl 75 | , statistics >= 0.13 && < 0.17 76 | , criterion >= 1.1 && < 1.7 77 | , vector >= 0.10 && < 1.0 78 | , mwc-random >= 0.13 && < 0.16 79 | if impl(ghc < 8.0) 80 | build-depends: semigroups 81 | 82 | source-repository head 83 | type: git 84 | location: https://github.com/Data61/foldl-statistics 85 | -------------------------------------------------------------------------------- /src/Control/Foldl/Statistics.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 |
3 | -- | 4 | -- Module : Control.Foldl.Statistics 5 | -- Copyright : (c) 2011 Bryan O'Sullivan, 2016 National ICT Australia, 2018 CSIRO 6 | -- License : BSD3 7 | -- 8 | -- Maintainer : Alex.Mason@data61.csiro.au 9 | -- Stability : experimental 10 | -- Portability : portable 11 | -- 12 | 13 | module Control.Foldl.Statistics ( 14 | -- * Introduction 15 | -- $intro 16 | -- * Descriptive functions 17 | range 18 | , sum' 19 | , histogram 20 | , histogram' 21 | , ordersOfMagnitude 22 | 23 | -- * Statistics of location 24 | , mean 25 | , welfordMean 26 | , meanWeighted 27 | , harmonicMean 28 | , geometricMean 29 | 30 | -- * Statistics of dispersion 31 | -- $variance 32 | 33 | -- ** Functions over central moments 34 | , centralMoment 35 | , centralMoments 36 | , centralMoments' 37 | , skewness 38 | , kurtosis 39 | 40 | -- ** Functions requiring the mean to be known (numerically robust) 41 | -- $robust 42 | , variance 43 | , varianceUnbiased 44 | , stdDev 45 | , varianceWeighted 46 | 47 | -- ** Single-pass functions (faster, less safe) 48 | -- $cancellation 49 | , fastVariance 50 | , fastVarianceUnbiased 51 | , fastStdDev 52 | , fastLMVSK 53 | , fastLMVSKu 54 | , LMVSK(..) 55 | , LMVSKState 56 | , foldLMVSKState 57 | , getLMVSK 58 | , getLMVSKu 59 | 60 | -- ** Linear Regression 61 | , fastLinearReg 62 | , foldLinRegState 63 | , getLinRegResult 64 | , LinRegResult(..) 65 | , LinRegState 66 | , lrrCount 67 | , correlation 68 | 69 | -- * References 70 | -- $references 71 | , module Control.Foldl 72 | 73 | ) where 74 | 75 | #if MIN_VERSION_foldl(1,2,2) 76 | import Control.Foldl as F hiding (mean, variance) 77 | #else 78 | import Control.Foldl as F 79 | #endif 80 | 81 | import qualified Control.Foldl 82 | import Data.Profunctor 83 | import Data.Semigroup 84 | 85 | #if !MIN_VERSION_base(4,8,0) 86 | import Control.Applicative 87 | #endif 88 | 89 | import Data.Hashable (Hashable) 90 | import qualified Data.HashMap.Strict as Hash 91 | import qualified Data.Map.Strict as Map 92 | 93 | import Numeric.Sum (KBNSum, add, kbn, zero) 94 | 95 | data T = T {-# UNPACK #-}!Double {-# UNPACK #-}!Int 96 | data TS = TS {-# UNPACK #-}!KBNSum {-# UNPACK #-}!Int 97 | data T1 = T1 {-# UNPACK #-}!Int {-# UNPACK #-}!Double {-# UNPACK #-}!Double 98 | data V = V {-# UNPACK #-}!Double {-# UNPACK #-}!Double 99 | data V1 = V1 {-# UNPACK #-}!Double {-# UNPACK #-}!Double {-# UNPACK #-}!Int 100 | data V1S = V1S {-# UNPACK #-}!KBNSum {-# UNPACK #-}!KBNSum {-# UNPACK #-}!Int 101 | 102 | 103 | -- $intro 104 | -- Statistical functions from the 105 | -- <https://hackage.haskell.org/package/statistics/docs/Statistics-Sample.html Statistics.Sample> 106 | -- module of the 107 | -- <https://hackage.haskell.org/package/statistics statistics> package by 108 | -- Bryan O'Sullivan, implemented as `Control.Foldl.Fold's from the 109 | -- <https://hackage.haskell.org/package/foldl foldl> package. 110 | -- 111 | -- This allows many statistics to be computed concurrently with at most 112 | -- two passes over the data, usually by computing the `mean' first, and 113 | -- passing it to further `Fold's. 114 | 115 | -- | A numerically stable sum using Kahan-Babuška-Neumaier 116 | -- summation from "Numeric.Sum" 117 | {-# INLINE sum' #-} 118 | sum' :: Fold Double Double 119 | sum' = Fold (add :: KBNSum -> Double -> KBNSum) 120 | (zero :: KBNSum) 121 | kbn 122 | 123 | 124 | -- | The difference between the largest and smallest 125 | -- elements of a sample. 126 | {-# INLINE range #-} 127 | range :: Fold Double Double 128 | range = (\(Just lo) (Just hi) -> hi - lo) 129 | <$> F.minimum 130 | <*> F.maximum 131 | 132 | -- | Create a histogram of each value of type a.
Useful for folding over 133 | -- categorical values, for example, a CSV where you have a data type for a 134 | -- selection of categories. 135 | -- 136 | -- It should not be used for continuous values which would lead to a high number 137 | -- of keys. One way to avoid this is to use the `Profunctor` instance for `Fold` 138 | -- to break your values into categories. For an example of doing this, see 139 | -- `ordersOfMagnitude`. 140 | histogram :: Ord a => Fold a (Map.Map a Int) 141 | histogram = Fold step Map.empty id where 142 | step m a = Map.insertWith (+) a 1 m 143 | 144 | -- | Like `histogram`, but for use when hashmaps would be more efficient for the 145 | -- particular type @a@. 146 | histogram' :: (Hashable a, Eq a) => Fold a (Hash.HashMap a Int) 147 | histogram' = Fold step Hash.empty id where 148 | step m a = Hash.insertWith (+) a 1 m 149 | 150 | -- | Provides a histogram of the orders of magnitude of the values in a series. 151 | -- Negative values are placed in the @0.0@ category due to the behaviour of 152 | -- `logBase`. It may be useful to use @lmap abs@ on this Fold to get a histogram 153 | -- of the absolute magnitudes. 154 | 155 | -- TODO: logBase 10 1000000 /= 6 but 5, fix this 156 | ordersOfMagnitude :: Fold Double (Map.Map Double Int) 157 | ordersOfMagnitude = 158 | dimap 159 | ((floor :: Double -> Int) . logBase 10) 160 | (Map.mapKeysMonotonic (10^^)) 161 | histogram 162 | 163 | -- | Arithmetic mean. This uses Kahan-Babuška-Neumaier 164 | -- summation, so is more accurate than 'welfordMean' unless the input 165 | -- values are very large. 166 | -- 167 | -- Since foldl-1.2.2, 'Control.Foldl' exports a `mean` function, so you will 168 | -- have to hide one. 169 | {-# INLINE mean #-} 170 | mean :: Fold Double Double 171 | mean = Fold step (TS zero 0) final where 172 | step (TS s n) x = TS (add s x) (n+1) 173 | final (TS s n) = kbn s / fromIntegral n 174 | 175 | 176 | -- | Arithmetic mean. This uses Welford's algorithm to provide 177 | -- numerical stability, using a single pass over the sample data. 178 | -- 179 | -- Compared to 'mean', this loses a surprising amount of precision 180 | -- unless the inputs are very large. 181 | {-# INLINE welfordMean #-} 182 | welfordMean :: Fold Double Double 183 | welfordMean = Fold step (T 0 0) final where 184 | final (T m _) = m 185 | step (T m n) x = T m' n' where 186 | m' = m + (x - m) / fromIntegral n' 187 | n' = n + 1 188 | 189 | 190 | -- | Arithmetic mean for weighted sample. It uses a single-pass 191 | -- algorithm analogous to the one used by 'welfordMean'. 192 | {-# INLINE meanWeighted #-} 193 | meanWeighted :: Fold (Double,Double) Double 194 | meanWeighted = Fold step (V 0 0) final 195 | where 196 | final (V a _) = a 197 | step (V m w) (x,xw) = V m' w' 198 | where m' | w' == 0 = 0 199 | | otherwise = m + xw * (x - m) / w' 200 | w' = w + xw 201 | 202 | -- | Harmonic mean. 203 | {-# INLINE harmonicMean #-} 204 | harmonicMean :: Fold Double Double 205 | harmonicMean = Fold step (T 0 0) final 206 | where 207 | final (T b a) = fromIntegral a / b 208 | step (T x y) n = T (x + (1/n)) (y+1) 209 | 210 | -- | Geometric mean of a sample containing no negative values. 211 | {-# INLINE geometricMean #-} 212 | geometricMean :: Fold Double Double 213 | geometricMean = dimap log exp mean 214 | 215 | -- | Compute the /k/th central moment of a sample. The central moment 216 | -- is also known as the moment about the mean. 217 | -- 218 | -- This function requires the mean of the data to compute the central moment.
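--
-- A typical two-pass use, with @xs@ standing for any 'Foldable' of 'Double's
-- (a sketch):
--
-- > let m = F.fold mean xs
-- > in F.fold (centralMoment 4 m) xs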
219 | -- 220 | -- For samples containing many values very close to the mean, this 221 | -- function is subject to inaccuracy due to catastrophic cancellation. 222 | {-# INLINE centralMoment #-} 223 | centralMoment :: Int -> Double -> Fold Double Double 224 | centralMoment a m 225 | | a < 0 = error "Statistics.Sample.centralMoment: negative input" 226 | | a == 0 = 1 227 | | a == 1 = 0 228 | | otherwise = Fold step (TS zero 0) final where 229 | step (TS s n) x = TS (add s $ go x) (n+1) 230 | final (TS s n) = kbn s / fromIntegral n 231 | go x = (x - m) ^^^ a 232 | 233 | -- | Compute the /k/th and /j/th central moments of a sample. 234 | -- 235 | -- This fold requires the mean of the data to be known. 236 | -- 237 | -- For samples containing many values very close to the mean, this 238 | -- function is subject to inaccuracy due to catastrophic cancellation. 239 | {-# INLINE centralMoments #-} 240 | centralMoments :: Int -> Int -> Double -> Fold Double (Double, Double) 241 | centralMoments a b m 242 | | a < 2 || b < 2 = (,) <$> centralMoment a m <*> centralMoment b m 243 | | otherwise = Fold step (V1 0 0 0) final 244 | where final (V1 i j n) = (i / fromIntegral n , j / fromIntegral n) 245 | step (V1 i j n) x = V1 (i + d^^^a) (j + d^^^b) (n+1) 246 | where d = x - m 247 | 248 | 249 | -- | Compute the /k/th and /j/th central moments of a sample. 250 | -- 251 | -- This fold requires the mean of the data to be known. 252 | -- 253 | -- This variation of `centralMoments' uses Kahan-Babuška-Neumaier 254 | -- summation to attempt to improve the accuracy of results, which may 255 | -- make computation slower. 256 | {-# INLINE centralMoments' #-} 257 | centralMoments' :: Int -> Int -> Double -> Fold Double (Double, Double) 258 | centralMoments' a b m 259 | | a < 2 || b < 2 = (,) <$> centralMoment a m <*> centralMoment b m 260 | | otherwise = Fold step (V1S zero zero 0) final 261 | where final (V1S i j n) = (kbn i / fromIntegral n , kbn j / fromIntegral n) 262 | step (V1S i j n) x = V1S (add i $ d^^^a) (add j $ d^^^b) (n+1) 263 | where d = x - m 264 | 265 | -- | Compute the skewness of a sample. This is a measure of the 266 | -- asymmetry of its distribution. 267 | -- 268 | -- A sample with negative skew is said to be /left-skewed/. Most of 269 | -- its mass is on the right of the distribution, with the tail on the 270 | -- left. 271 | -- 272 | -- > let xs = [1,100,101,102,103] in F.fold (skewness (F.fold mean xs)) xs 273 | -- > ==> -1.497681449918257 274 | -- 275 | -- A sample with positive skew is said to be /right-skewed/. 276 | -- 277 | -- > let xs = [1,2,3,4,100] in F.fold (skewness (F.fold mean xs)) xs 278 | -- > ==> 1.4975367033335198 279 | -- 280 | -- A sample's skewness is not defined if its 'variance' is zero. 281 | -- 282 | -- This fold requires the mean of the data to be known. 283 | -- 284 | -- For samples containing many values very close to the mean, this 285 | -- function is subject to inaccuracy due to catastrophic cancellation. 286 | {-# INLINE skewness #-} 287 | skewness :: Double -> Fold Double Double 288 | skewness m = (\(c3, c2) -> c3 * c2 ** (-1.5)) <$> centralMoments 3 2 m 289 | 290 | 291 | -- | Compute the excess kurtosis of a sample. This is a measure of 292 | -- the \"peakedness\" of its distribution. A high kurtosis indicates 293 | -- that more of the sample's variance is due to infrequent severe 294 | -- deviations, rather than more frequent modest deviations. 295 | -- 296 | -- A sample's excess kurtosis is not defined if its 'variance' is 297 | -- zero. 298 | -- 299 | -- This fold requires the mean of the data to be known.
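--
-- Since 'Fold's are 'Applicative', the skewness and kurtosis can share the
-- mean-computing pass, e.g. (a sketch, @xs@ being the sample):
--
-- > let m = F.fold mean xs
-- > in F.fold ((,) <$> skewness m <*> kurtosis m) xs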
300 | -- 301 | -- For samples containing many values very close to the mean, this 302 | -- function is subject to inaccuracy due to catastrophic cancellation. 303 | {-# INLINE kurtosis #-} 304 | kurtosis :: Double -> Fold Double Double 305 | kurtosis m = (\(c4,c2) -> c4 / (c2 * c2) - 3) <$> centralMoments 4 2 m 306 | 307 | 308 | -- $variance 309 | -- 310 | -- The variance—and hence the standard deviation—of a 311 | -- sample of fewer than two elements are both defined to be zero. 312 | -- 313 | -- Many of these Folds take the mean as an argument for constructing 314 | -- the variance, and as such require two passes over the data. 315 | 316 | -- $robust 317 | -- 318 | -- These functions use the compensated summation algorithm of Chan et 319 | -- al. for numerical robustness, but require two passes over the 320 | -- sample data as a result. 321 | 322 | 323 | -- Multiply a number by itself. 324 | {-# INLINE square #-} 325 | square :: Double -> Double 326 | square x = x * x 327 | 328 | {-# INLINE robustSumVar #-} 329 | robustSumVar :: Double -> Fold Double TS 330 | robustSumVar m = Fold step (TS zero 0) id where 331 | step (TS s n) x = TS (add s . square . subtract m $ x) (n+1) 332 | 333 | -- | Maximum likelihood estimate of a sample's variance. Also known 334 | -- as the population variance, where the denominator is /n/. 335 | {-# INLINE variance #-} 336 | variance :: Double -> Fold Double Double 337 | variance m = 338 | (\(TS sv n) -> if n > 1 then kbn sv / fromIntegral n else 0) 339 | <$> robustSumVar m 340 | 341 | -- | Unbiased estimate of a sample's variance. Also known as the 342 | -- sample variance, where the denominator is /n/-1. 343 | {-# INLINE varianceUnbiased #-} 344 | varianceUnbiased :: Double -> Fold Double Double 345 | varianceUnbiased m = 346 | (\(TS sv n) -> if n > 1 then kbn sv / fromIntegral (n-1) else 0) 347 | <$> robustSumVar m 348 | 349 | 350 | -- | Standard deviation. This is simply the square root of the 351 | -- unbiased estimate of the variance. 352 | {-# INLINE stdDev #-} 353 | stdDev :: Double -> Fold Double Double 354 | stdDev m = sqrt (varianceUnbiased m) 355 | 356 | 357 | {-# INLINE robustSumVarWeighted #-} 358 | robustSumVarWeighted :: Double -> Fold (Double,Double) V1 359 | robustSumVarWeighted m = Fold step (V1 0 0 0) id 360 | where 361 | step (V1 s w n) (x,xw) = V1 (s + xw*d*d) (w + xw) (n+1) 362 | where d = x - m 363 | 364 | -- | Weighted variance. This is biased estimation. Requires the 365 | -- weighted mean of the input data. 366 | {-# INLINE varianceWeighted #-} 367 | varianceWeighted :: Double -> Fold (Double,Double) Double 368 | varianceWeighted m = 369 | (\(V1 s w n) -> if n > 1 then s / w else 0) 370 | <$> robustSumVarWeighted m 371 | 372 | -- $cancellation 373 | -- 374 | -- The functions prefixed with the name @fast@ below perform a single 375 | -- pass over the sample data using Knuth's algorithm. They usually 376 | -- work well, but see below for caveats. These functions are subject 377 | -- to fusion and do not require the mean to be passed. 378 | -- 379 | -- /Note/: in cases where most sample data is close to the sample's 380 | -- mean, Knuth's algorithm gives inaccurate results due to 381 | -- catastrophic cancellation. 382 | 383 | {-# INLINE fastVar #-} 384 | fastVar :: Fold Double T1 385 | fastVar = Fold step (T1 0 0 0) id 386 | where 387 | step (T1 n m s) x = T1 n' m' s' 388 | where n' = n + 1 389 | m' = m + d / fromIntegral n' 390 | s' = s + d * (x - m') 391 | d = x - m 392 | 393 | -- | Maximum likelihood estimate of a sample's variance. 
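--
-- For example:
--
-- > F.fold fastVariance [1,2,3,4,5]
-- > ==> 2.0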
394 | {-# INLINE fastVariance #-} 395 | fastVariance :: Fold Double Double 396 | fastVariance = final <$> fastVar 397 | where final (T1 n _m s) 398 | | n > 1 = s / fromIntegral n 399 | | otherwise = 0 400 | 401 | 402 | -- | Unbiased estimate of a sample's variance, where the denominator is /n/-1. 403 | {-# INLINE fastVarianceUnbiased #-} 404 | fastVarianceUnbiased :: Fold Double Double 405 | fastVarianceUnbiased = final <$> fastVar 406 | where final (T1 n _m s) 407 | | n > 1 = s / fromIntegral (n-1) 408 | | otherwise = 0 409 | 410 | 411 | -- | Standard deviation. This is simply the square root of the 412 | -- maximum likelihood estimate of the variance. 413 | {-# INLINE fastStdDev #-} 414 | fastStdDev :: Fold Double Double 415 | fastStdDev = sqrt fastVariance 416 | 417 | 418 | 419 | -- | When returned by `fastLMVSK`, contains the count, mean, 420 | -- variance, skewness and kurtosis of a series of samples. 421 | -- 422 | -- /Since: 0.1.1.0/ 423 | data LMVSK = LMVSK 424 | { lmvskCount :: {-# UNPACK #-}!Int 425 | , lmvskMean :: {-# UNPACK #-}!Double 426 | , lmvskVariance :: {-# UNPACK #-}!Double 427 | , lmvskSkewness :: {-# UNPACK #-}!Double 428 | , lmvskKurtosis :: {-# UNPACK #-}!Double 429 | } deriving (Show, Eq) 430 | 431 | newtype LMVSKState = LMVSKState LMVSK 432 | 433 | instance Monoid LMVSKState where 434 | {-# INLINE mempty #-} 435 | mempty = LMVSKState lmvsk0 436 | {-# INLINE mappend #-} 437 | mappend = (<>) 438 | 439 | instance Semigroup LMVSKState where 440 | {-# INLINE (<>) #-} 441 | (LMVSKState (LMVSK an am1 am2 am3 am4)) <> (LMVSKState (LMVSK bn bm1 bm2 bm3 bm4)) 442 | = LMVSKState (LMVSK n m1 m2 m3 m4) where 443 | fi :: Int -> Double 444 | fi = fromIntegral 445 | -- combined.n = a.n + b.n; 446 | n = an+bn 447 | n2 = n*n 448 | nd = fi n 449 | nda = fi an 450 | ndb = fi bn 451 | -- delta = b.M1 - a.M1; 452 | delta = bm1 - am1 453 | -- delta2 = delta*delta; 454 | delta2 = delta*delta 455 | -- delta3 = delta*delta2; 456 | delta3 = delta*delta2 457 | -- delta4 = delta2*delta2; 458 | delta4 = delta2*delta2 459 | -- combined.M1 = (a.n*a.M1 + b.n*b.M1) / combined.n; 460 | m1 = (nda*am1 + ndb*bm1 ) / nd 461 | -- combined.M2 = a.M2 + b.M2 + delta2*a.n*b.n / combined.n; 462 | m2 = am2 + bm2 + delta2*nda*ndb / nd 463 | -- combined.M3 = a.M3 + b.M3 + delta3*a.n*b.n* (a.n - b.n)/(combined.n*combined.n); 464 | m3 = am3 + bm3 + delta3*nda*ndb* fi( an - bn )/ fi n2 465 | -- combined.M3 += 3.0*delta * (a.n*b.M2 - b.n*a.M2) / combined.n; 466 | + 3.0*delta * (nda*bm2 - ndb*am2 ) / nd 467 | -- 468 | -- combined.M4 = a.M4 + b.M4 + delta4*a.n*b.n * (a.n*a.n - a.n*b.n + b.n*b.n) /(combined.n*combined.n*combined.n); 469 | m4 = am4 + bm4 + delta4*nda*ndb *fi(an*an - an*bn + bn*bn ) / fi (n*n*n) 470 | -- combined.M4 += 6.0*delta2 * (a.n*a.n*b.M2 + b.n*b.n*a.M2)/(combined.n*combined.n) + 471 | + 6.0*delta2 * (nda*nda*bm2 + ndb*ndb*am2) / fi n2 472 | -- 4.0*delta*(a.n*b.M3 - b.n*a.M3) / combined.n; 473 | + 4.0*delta*(nda*bm3 - ndb*am3) / nd 474 | 475 | -- | Efficiently compute the 476 | -- __length, mean, variance, skewness and kurtosis__ with a single pass. 477 | -- 478 | -- /Since: 0.1.1.0/ 479 | {-# INLINE fastLMVSK #-} 480 | fastLMVSK :: Fold Double LMVSK 481 | fastLMVSK = getLMVSK <$> foldLMVSKState 482 | 483 | -- | Efficiently compute the 484 | -- __length, mean, unbiased variance, skewness and kurtosis__ with a single pass.
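--
-- For example, @F.fold fastLMVSKu [1,2,3,4,5]@ yields a count of 5, a mean
-- of 3 and an unbiased variance of 2.5 (a quick sketch; the skewness and
-- kurtosis are computed in the same pass).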
485 | -- 486 | -- /Since: 0.1.3.0/ 487 | {-# INLINE fastLMVSKu #-} 488 | fastLMVSKu :: Fold Double LMVSK 489 | fastLMVSKu = getLMVSKu <$> foldLMVSKState 490 | 491 | {-# INLINE lmvsk0 #-} 492 | lmvsk0 :: LMVSK 493 | lmvsk0 = LMVSK 0 0 0 0 0 494 | 495 | -- | Performs the heavy lifting of fastLMVSK. This is exposed 496 | -- because the internal `LMVSKState` is monoidal, allowing you 497 | -- to run these statistics in parallel over datasets which are 498 | -- split and then combine the results. 499 | -- 500 | -- /Since: 0.1.2.0/ 501 | {-# INLINE foldLMVSKState #-} 502 | foldLMVSKState :: Fold Double LMVSKState 503 | foldLMVSKState = Fold stepLMVSKState (LMVSKState lmvsk0) id 504 | 505 | {-# INLINE stepLMVSKState #-} 506 | stepLMVSKState :: LMVSKState -> Double -> LMVSKState 507 | stepLMVSKState (LMVSKState (LMVSK n1 m1 m2 m3 m4)) x = LMVSKState $ LMVSK n m1' m2' m3' m4' where 508 | fi :: Int -> Double 509 | fi = fromIntegral 510 | -- long long n1 = n; 511 | -- n++; 512 | n = n1+1 513 | -- delta = x - M1; 514 | delta = x - m1 515 | -- delta_n = delta / n; 516 | delta_n = delta / fi n 517 | -- delta_n2 = delta_n * delta_n; 518 | delta_n2 = delta_n * delta_n 519 | -- term1 = delta * delta_n * n1; 520 | term1 = delta * delta_n * fi n1 521 | -- M1 += delta_n; 522 | m1' = m1 + delta_n 523 | -- M4 += term1 * delta_n2 * (n*n - 3*n + 3) + 6 * delta_n2 * M2 - 4 * delta_n * M3; 524 | m4' = m4 + term1 * delta_n2 * fi (n*n - 3*n + 3) + 6 * delta_n2 * m2 - 4 * delta_n * m3 525 | -- M3 += term1 * delta_n * (n - 2) - 3 * delta_n * M2; 526 | m3' = m3 + term1 * delta_n * fi (n - 2) - 3 * delta_n * m2 527 | -- M2 += term1; 528 | m2' = m2 + term1 529 | 530 | -- | Returns the stats which have been computed in a LMVSKState. 531 | -- 532 | -- /Since: 0.1.2.0/ 533 | getLMVSK :: LMVSKState -> LMVSK 534 | getLMVSK (LMVSKState (LMVSK n m1 m2 m3 m4)) = LMVSK n m1 m2' m3' m4' where 535 | nd = fromIntegral n 536 | -- M2/n (biased variance) 537 | m2' = m2 / nd 538 | -- sqrt(double(n)) * M3/ pow(M2, 1.5) 539 | m3' = sqrt nd * m3 / (m2 ** 1.5) 540 | -- double(n)*M4 / (M2*M2) - 3.0 541 | m4' = nd*m4 / (m2*m2) - 3.0 542 | 543 | -- | Returns the stats which have been computed in a LMVSKState, 544 | -- with the unbiased variance. 545 | -- 546 | -- /Since: 0.1.3.0/ 547 | getLMVSKu :: LMVSKState -> LMVSK 548 | getLMVSKu (LMVSKState (LMVSK n m1 m2 m3 m4)) = LMVSK n m1 m2' m3' m4' where 549 | nd = fromIntegral n 550 | -- M2/(n-1.0) 551 | m2' = m2 / (nd-1) 552 | -- sqrt(double(n)) * M3/ pow(M2, 1.5) 553 | m3' = sqrt nd * m3 / (m2 ** 1.5) 554 | -- double(n)*M4 / (M2*M2) - 3.0 555 | m4' = nd*m4 / (m2*m2) - 3.0 556 | 557 | 558 | -- | When returned by `fastLinearReg`, contains the count, 559 | -- slope, intercept and correlation of combining @(x,y)@ pairs. 560 | -- 561 | -- /Since: 0.1.1.0/ 562 | data LinRegResult = LinRegResult 563 | {lrrSlope :: {-# UNPACK #-}!Double 564 | ,lrrIntercept :: {-# UNPACK #-}!Double 565 | ,lrrCorrelation :: {-# UNPACK #-}!Double 566 | ,lrrXStats :: {-# UNPACK #-}!LMVSK 567 | ,lrrYStats :: {-# UNPACK #-}!LMVSK 568 | } deriving (Show, Eq) 569 | 570 | -- | The number of elements which make up this 'LinRegResult'. 571 | -- /Since: 0.1.4.1/ 572 | lrrCount :: LinRegResult -> Int 573 | lrrCount = lmvskCount . lrrXStats 574 | 575 | -- | The Monoidal state used to compute linear regression, see `fastLinearReg`.
576 | -- 577 | -- /Since: 0.1.4.0/ 578 | data LinRegState = LinRegState 579 | {-# UNPACK #-}!LMVSKState 580 | {-# UNPACK #-}!LMVSKState 581 | {-# UNPACK #-}!Double 582 | 583 | 584 | {- 585 | RunningRegression operator+(const RunningRegression a, const RunningRegression b) 586 | { 587 | RunningRegression combined; 588 | 589 | combined.x_stats = a.x_stats + b.x_stats; 590 | combined.y_stats = a.y_stats + b.y_stats; 591 | combined.n = a.n + b.n; 592 | 593 | double delta_x = b.x_stats.Mean() - a.x_stats.Mean(); 594 | double delta_y = b.y_stats.Mean() - a.y_stats.Mean(); 595 | combined.S_xy = a.S_xy + b.S_xy + 596 | double(a.n*b.n)*delta_x*delta_y/double(combined.n); 597 | 598 | return combined; 599 | } 600 | -} 601 | instance Semigroup LinRegState where 602 | {-# INLINE (<>) #-} 603 | (LinRegState ax@(LMVSKState ax') ay a_xy) 604 | <> (LinRegState bx@(LMVSKState bx') by b_xy) 605 | = LinRegState x y s_xy where 606 | an = lmvskCount ax' 607 | bn = lmvskCount bx' 608 | x = ax <> bx 609 | y = ay <> by 610 | delta_x = lmvskMean (getLMVSK bx) - lmvskMean (getLMVSK ax) 611 | delta_y = lmvskMean (getLMVSK by) - lmvskMean (getLMVSK ay) 612 | s_xy = a_xy+b_xy + fromIntegral (an*bn) * delta_x * delta_y/fromIntegral (an+bn) 613 | 614 | 615 | instance Monoid LinRegState where 616 | {-# INLINE mempty #-} 617 | mempty = LinRegState mempty mempty 0 618 | {-# INLINE mappend #-} 619 | mappend = (<>) 620 | 621 | 622 | 623 | -- | Computes the __slope, (Y) intercept and correlation__ of @(x,y)@ 624 | -- pairs, as well as the `LMVSK` stats for both the x and y series. 625 | -- 626 | -- >>> F.fold fastLinearReg $ map (\x -> (x,3*x+7)) [1..100] 627 | -- LinRegResult 628 | -- {lrrSlope = 3.0 629 | -- , lrrIntercept = 7.0 630 | -- , lrrCorrelation = 1.0 631 | -- , lrrXStats = LMVSK 632 | -- {lmvskCount = 100 633 | -- , lmvskMean = 50.5 634 | -- , lmvskVariance = 833.25 635 | -- , lmvskSkewness = 0.0 636 | -- , lmvskKurtosis = -1.2002400240024003} 637 | -- , lrrYStats = LMVSK 638 | -- {lmvskCount = 100 639 | -- , lmvskMean = 158.5 640 | -- , lmvskVariance = 7499.25 641 | -- , lmvskSkewness = 0.0 642 | -- , lmvskKurtosis = -1.2002400240024003} 643 | -- } 644 | -- 645 | -- >>> F.fold fastLinearReg $ map (\x -> (x,0.005*x*x+3*x+7)) [1..100] 646 | -- LinRegResult 647 | -- {lrrSlope = 3.5049999999999994 648 | -- , lrrIntercept = -1.5849999999999795 649 | -- , lrrCorrelation = 0.9993226275740273 650 | -- , lrrXStats = LMVSK 651 | -- {lmvskCount = 100 652 | -- , lmvskMean = 50.5 653 | -- , lmvskVariance = 833.25 654 | -- , lmvskSkewness = 0.0 655 | -- , lmvskKurtosis = -1.2002400240024003} 656 | -- , lrrYStats = LMVSK 657 | -- {lmvskCount = 100 658 | -- , lmvskMean = 175.4175 659 | -- , lmvskVariance = 10250.37902625 660 | -- , lmvskSkewness = 9.862971188165422e-2 661 | -- , lmvskKurtosis = -1.1923628437011482} 662 | -- } 663 | -- 664 | -- /Since: 0.1.1.0/ 665 | {-# INLINE fastLinearReg #-} 666 | fastLinearReg :: Fold (Double,Double) LinRegResult 667 | fastLinearReg = getLinRegResult <$> foldLinRegState 668 | 669 | -- | Produces the slope, Y intercept, correlation and LMVSK stats from a 670 | -- `LinRegState`.
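--
-- Because `LinRegState` is a `Monoid`, states folded over separate chunks of
-- data can be combined before conversion, e.g. (a sketch, with @as@ and @bs@
-- two lists of @(x,y)@ pairs):
--
-- > getLinRegResult (F.fold foldLinRegState as <> F.fold foldLinRegState bs)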
671 | --
672 | -- /Since: 0.1.4.0/
673 | {-# INLINE getLinRegResult #-}
674 | getLinRegResult :: LinRegState -> LinRegResult
675 | getLinRegResult (LinRegState vx@(LMVSKState vx') vy s_xy) = LinRegResult slope intercept correl statsx statsy where
676 |   n = lmvskCount vx'
677 |   ndm1 = fromIntegral (n-1)
678 |   -- slope = S_xy / (x_stats.Variance()*(n - 1.0));
679 |   -- in an `LMVSKState`, 'lmvskVariance' has not yet been divided
680 |   -- by (n-1), so that division is not necessary here
681 |   slope = s_xy / lmvskVariance vx'
682 |   intercept = yMean - slope*xMean
683 |   t = sqrt xVar * sqrt yVar -- stddev x * stddev y
684 |   correl = s_xy / (ndm1 * t)
685 |   -- Need the unbiased variance here, or the correlation may fall outside ±1
686 |   statsx@(LMVSK _ xMean xVar _ _) = getLMVSKu vx
687 |   statsy@(LMVSK _ yMean yVar _ _) = getLMVSKu vy
688 | 
689 | 
690 | -- | Performs the heavy lifting for `fastLinearReg`. Exposed because `LinRegState`
691 | -- is a `Monoid`, allowing statistics to be computed on datasets in parallel
692 | -- and combined afterwards.
693 | --
694 | -- /Since: 0.1.4.0/
695 | {-# INLINE foldLinRegState #-}
696 | foldLinRegState :: Fold (Double,Double) LinRegState
697 | foldLinRegState = Fold step (LinRegState (LMVSKState lmvsk0) (LMVSKState lmvsk0) 0) id where
698 |   step (LinRegState vx@(LMVSKState vx') vy s_xy) (x,y) = LinRegState vx2 vy2 s_xy' where
699 |     n = lmvskCount vx'
700 |     nd = fromIntegral n
701 |     nd1 = fromIntegral (n+1)
702 |     s_xy' = s_xy + (xMean - x)*(yMean - y)*nd/nd1
703 |     xMean = lmvskMean (getLMVSK vx)
704 |     yMean = lmvskMean (getLMVSK vy)
705 |     vx2 = stepLMVSKState vx x
706 |     vy2 = stepLMVSKState vy y
707 | 
708 | 
709 | -- | Computes the correlation between the @x@ and @y@ series. The first
710 | -- argument is the pair of means and the second the pair of standard
711 | -- deviations of the two series. The result may be more accurate than
712 | -- the correlation computed by `fastLinearReg`.
713 | correlation :: (Double, Double) -> (Double, Double) -> Fold (Double,Double) Double
714 | correlation (m1,m2) (s1,s2) = Fold step (TS zero 0) final where
715 |   step (TS s n) (x1,x2) = TS (add s $ ((x1 - m1)/s1) * ((x2 - m2)/s2)) (n+1)
716 |   final (TS s n) = kbn s / fromIntegral (n-1)
717 | 
718 | 
719 | -- $references
720 | --
721 | -- * Chan, T. F.; Golub, G.H.; LeVeque, R.J. (1979) Updating formulae
722 | --   and a pairwise algorithm for computing sample
723 | --   variances. Technical Report STAN-CS-79-773, Department of
724 | --   Computer Science, Stanford
725 | --   University.
726 | --
727 | -- * Knuth, D.E. (1998) The art of computer programming, volume 2:
728 | --   seminumerical algorithms, 3rd ed., p. 232.
729 | --
730 | -- * Welford, B.P. (1962) Note on a method for calculating corrected
731 | --   sums of squares and products. /Technometrics/
732 | --   4(3):419–420.
733 | --
734 | -- * West, D.H.D. (1979) Updating mean and variance estimates: an
735 | --   improved method. /Communications of the ACM/
736 | --   22(9):532–535.
737 | --
738 | -- * John D. Cook. Computing skewness and kurtosis in one pass.
739 | --   <https://www.johndcook.com/blog/skewness_kurtosis/>
740 | 
741 | 
742 | 
743 | -- The (^) operator from Prelude is slow; these rules unroll small integer powers.
744 | (^^^) :: Double -> Int -> Double
745 | x ^^^ 1 = x
746 | x ^^^ n = x * (x ^^^ (n-1))
747 | {-# INLINE[2] (^^^) #-}
748 | {-# RULES
749 | "pow 2" forall x. x ^^^ 2 = x * x
750 | "pow 3" forall x. x ^^^ 3 = x * x * x
751 | "pow 4" forall x. x ^^^ 4 = x * x * x * x
752 | "pow 5" forall x. x ^^^ 5 = x * x * x * x * x
753 | "pow 6" forall x. x ^^^ 6 = x * x * x * x * x * x
754 | "pow 7" forall x. x ^^^ 7 = x * x * x * x * x * x * x
755 | "pow 8" forall x.
x ^^^ 8 = x * x * x * x * x * x * x * x 756 | "pow 9" forall x. x ^^^ 9 = x * x * x * x * x * x * x * x * x 757 | "pow 10" forall x. x ^^^ 10 = x * x * x * x * x * x * x * x * x * x 758 | 759 | #-} 760 | -------------------------------------------------------------------------------- /stack-6.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | 3 | # Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2) 4 | resolver: lts-6.35 5 | 6 | # Local packages, usually specified by relative directory name 7 | packages: 8 | - '.' 9 | 10 | # Packages to be pulled from upstream that are not in the resolver (e.g., acme-missiles-0.3) 11 | extra-deps: [] 12 | 13 | # Override default flag values for local packages and extra-deps 14 | flags: {} 15 | 16 | # Extra package databases containing global packages 17 | extra-package-dbs: [] 18 | 19 | # Control whether we use the GHC we find on the path 20 | # system-ghc: true 21 | 22 | # Require a specific version of stack, using version ranges 23 | # require-stack-version: -any # Default 24 | # require-stack-version: >= 1.0.0 25 | 26 | # Override the architecture used by stack, especially useful on Windows 27 | # arch: i386 28 | # arch: x86_64 29 | 30 | # Extra directories used by stack for building 31 | # extra-include-dirs: [/path/to/dir] 32 | # extra-lib-dirs: [/path/to/dir] 33 | -------------------------------------------------------------------------------- /stack-7.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | 3 | # Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2) 4 | resolver: lts-7.0 5 | 6 | # Local packages, usually specified by relative directory name 7 | packages: 8 | - '.' 9 | 10 | # Packages to be pulled from upstream that are not in the resolver (e.g., acme-missiles-0.3) 11 | extra-deps: [] 12 | 13 | # Override default flag values for local packages and extra-deps 14 | flags: {} 15 | 16 | # Extra package databases containing global packages 17 | extra-package-dbs: [] 18 | 19 | # Control whether we use the GHC we find on the path 20 | # system-ghc: true 21 | 22 | # Require a specific version of stack, using version ranges 23 | # require-stack-version: -any # Default 24 | # require-stack-version: >= 1.0.0 25 | 26 | # Override the architecture used by stack, especially useful on Windows 27 | # arch: i386 28 | # arch: x86_64 29 | 30 | # Extra directories used by stack for building 31 | # extra-include-dirs: [/path/to/dir] 32 | # extra-lib-dirs: [/path/to/dir] 33 | -------------------------------------------------------------------------------- /stack.yaml: -------------------------------------------------------------------------------- 1 | # For more information, see: https://github.com/commercialhaskell/stack/blob/release/doc/yaml_configuration.md 2 | 3 | # Specifies the GHC version and set of packages available (e.g., lts-3.5, nightly-2015-09-21, ghc-7.10.2) 4 | resolver: lts-10.3 5 | 6 | # Local packages, usually specified by relative directory name 7 | packages: 8 | - '.' 
9 | 10 | # Packages to be pulled from upstream that are not in the resolver (e.g., acme-missiles-0.3) 11 | extra-deps: [] 12 | 13 | # Override default flag values for local packages and extra-deps 14 | flags: {} 15 | 16 | # Extra package databases containing global packages 17 | extra-package-dbs: [] 18 | 19 | # Control whether we use the GHC we find on the path 20 | # system-ghc: true 21 | 22 | # Require a specific version of stack, using version ranges 23 | # require-stack-version: -any # Default 24 | # require-stack-version: >= 1.0.0 25 | 26 | # Override the architecture used by stack, especially useful on Windows 27 | # arch: i386 28 | # arch: x86_64 29 | 30 | # Extra directories used by stack for building 31 | # extra-include-dirs: [/path/to/dir] 32 | # extra-lib-dirs: [/path/to/dir] 33 | -------------------------------------------------------------------------------- /test/Spec.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE CPP #-} 2 | 3 | import Test.Tasty 4 | -- import Test.Tasty.SmallCheck as SC 5 | import Test.Tasty.QuickCheck ((==>)) 6 | import qualified Test.Tasty.QuickCheck as QC 7 | 8 | #if MIN_VERSION_foldl(1,2,2) 9 | import qualified Control.Foldl as F hiding (mean, variance) 10 | #else 11 | import qualified Control.Foldl as F 12 | #endif 13 | 14 | import Control.Foldl.Statistics hiding (length) 15 | 16 | import qualified Data.Vector.Unboxed as U 17 | import Test.QuickCheck.Instances () 18 | 19 | import qualified Statistics.Sample as S 20 | 21 | import Data.Profunctor 22 | 23 | import Data.Function (on) 24 | 25 | import Data.Semigroup ((<>)) 26 | #if !MIN_VERSION_base(4,8,0) 27 | import Control.Applicative 28 | import Data.Monoid (mappend) 29 | #endif 30 | 31 | 32 | 33 | toV :: [Double] -> U.Vector Double 34 | toV = U.fromList 35 | 36 | 37 | onVec :: String -> (U.Vector Double -> QC.Property) -> TestTree 38 | onVec str f = QC.testProperty str (f . toV) 39 | 40 | onVec2 :: String -> (U.Vector (Double,Double) -> QC.Property) -> TestTree 41 | onVec2 str f = QC.testProperty str (f . 
U.fromList) 42 | 43 | 44 | testLMVSK :: Double -> Fold Double LMVSK 45 | testLMVSK m = LMVSK 46 | <$> F.length 47 | <*> mean 48 | <*> variance m 49 | <*> skewness m 50 | <*> kurtosis m 51 | 52 | precision :: Double 53 | precision = 10e-9 54 | 55 | cmpLMVSK :: Double -> LMVSK -> LMVSK -> Bool 56 | cmpLMVSK prec a b = let 57 | t f = on (withinPCT prec) f a b 58 | in a == b || 59 | ( t lmvskMean 60 | && t lmvskVariance 61 | && t lmvskKurtosis 62 | && t lmvskSkewness 63 | && ((==) `on` lmvskCount) a b 64 | ) 65 | 66 | diffLMVSK :: LMVSK -> LMVSK -> LMVSK 67 | diffLMVSK a b = LMVSK 68 | (t lmvskCount) 69 | (t lmvskMean) 70 | (t lmvskVariance) 71 | (t lmvskSkewness) 72 | (t lmvskKurtosis) 73 | where t f = f a - f b 74 | 75 | 76 | main :: IO () 77 | main = defaultMain $ 78 | testGroup "Results match Statistics.Sample" 79 | [ testGroup "Without pre-computed mean" 80 | [ testGroup "Statistics of location" 81 | [ onVec "mean" $ \vec -> 82 | not (U.null vec) ==> F.fold mean (U.toList vec) == S.mean vec 83 | , onVec2 "meanWeighted" $ \vec -> 84 | not (U.null vec) ==> F.fold meanWeighted (U.toList vec) == S.meanWeighted vec 85 | , onVec "welfordMean" $ \vec -> 86 | not (U.null vec) ==> F.fold welfordMean (U.toList vec) == S.welfordMean vec 87 | , onVec "harmonicMean" $ \vec -> 88 | not (U.null vec) ==> F.fold harmonicMean (U.toList vec) == S.harmonicMean vec 89 | , onVec "geometricMean" $ \vec -> 90 | not (U.null vec) ==> 91 | let vec' = U.map abs vec 92 | res = S.geometricMean vec' 93 | in isNaN res || F.fold geometricMean (U.toList vec') == res 94 | ] 95 | 96 | , testGroup "Single-pass functions" $ 97 | [ onVec "fastVariance" $ \vec -> 98 | not (U.null vec) ==> F.fold fastVariance (U.toList vec) == S.fastVariance vec 99 | , onVec "fastVarianceUnbiased" $ \vec -> 100 | not (U.null vec) ==> F.fold fastVarianceUnbiased (U.toList vec) == S.fastVarianceUnbiased vec 101 | , onVec "fastStdDev" $ \vec -> 102 | not (U.null vec) ==> F.fold fastStdDev (U.toList vec) == S.fastStdDev vec 103 | , let 104 | -- TODO: Known failure when using 105 | -- --quickcheck-replay '39 TFGenR A6EB566E901D554AAA13826C088B8831192E813D893D082A85F8A27C86D569E0 0 65535 16 0' 106 | in onVec ("fastLMVSK within " ++ show precision ++ " %") $ \vec -> 107 | U.length vec > 3 && U.sum (U.map abs vec) > 0.0 ==> let 108 | m = F.fold mean $ U.toList vec 109 | fast = F.fold fastLMVSK $ U.toList vec 110 | reference = F.fold (testLMVSK m) $ U.toList vec 111 | in QC.counterexample (unlines ["",show fast,show reference, "Diff:", show (diffLMVSK fast reference)]) $ 112 | cmpLMVSK precision fast reference 113 | , QC.testProperty "LMVSKSemigroup" $ \v1 v2 -> 114 | U.length v1 > 2 && U.length v2 > 2 && U.sum (mappend v1 v1) /= U.product (mappend v1 v1) ==> let 115 | sep = getLMVSK $ F.fold foldLMVSKState (U.toList v1) <> F.fold foldLMVSKState (U.toList v2) 116 | tog = F.fold fastLMVSK (U.toList v1 ++ U.toList v2) 117 | in QC.counterexample (unlines ["",show sep,show tog, "Diff:", show (diffLMVSK sep tog)]) 118 | $ cmpLMVSK precision sep tog 119 | || isNaN (lmvskKurtosis sep) 120 | || isNaN (lmvskKurtosis tog) 121 | ] 122 | ] 123 | 124 | , testGroup "With pre-computed mean" 125 | [ testGroup "Functions requiring the mean to be known" 126 | [ onVec "variance" $ \vec -> 127 | not (U.null vec) ==> let m = F.fold mean (U.toList vec) 128 | in F.fold (variance m) (U.toList vec) == S.variance vec 129 | , onVec "varianceUnbiased" $ \vec -> 130 | not (U.null vec) ==> let m = F.fold mean (U.toList vec) 131 | in F.fold (varianceUnbiased m) (U.toList vec) == 
S.varianceUnbiased vec
132 |       , onVec "stdDev" $ \vec ->
133 |           not (U.null vec) ==> let m = F.fold mean (U.toList vec)
134 |                                in F.fold (stdDev m) (U.toList vec) == S.stdDev vec
135 |       , onVec2 "varianceWeighted" $ \vec ->
136 |           not (U.null vec) ==> let m = F.fold meanWeighted (U.toList vec)
137 |                                in F.fold (varianceWeighted m) (U.toList vec) == S.varianceWeighted vec
138 |       ]
139 | 
140 |     , testGroup "Functions over central moments"
141 |       [ onVec "skewness" $ \vec ->
142 |           U.length vec > 3 ==> let m = F.fold mean (U.toList vec)
143 |                                in F.fold (skewness m) (U.toList vec) == S.skewness vec
144 |       , onVec "kurtosis" $ \vec ->
145 |           U.length vec > 4 ==> let m = F.fold mean (U.toList vec)
146 |                                in F.fold (kurtosis m) (U.toList vec) == S.kurtosis vec
147 |       , onVec "centralMoment 2" $ \vec ->
148 |           U.length vec > 2 ==> let m = F.fold mean (U.toList vec)
149 |                                in F.fold (centralMoment 2 m) (U.toList vec) == S.centralMoment 2 vec
150 |       , onVec "centralMoment 3" $ \vec ->
151 |           U.length vec > 3 ==> let m = F.fold mean (U.toList vec)
152 |                                in F.fold (centralMoment 3 m) (U.toList vec) == S.centralMoment 3 vec
153 |       , onVec "centralMoment 4" $ \vec ->
154 |           U.length vec > 4 ==> let m = F.fold mean (U.toList vec)
155 |                                in F.fold (centralMoment 4 m) (U.toList vec) == S.centralMoment 4 vec
156 |       , onVec "centralMoment 7" $ \vec ->
157 |           U.length vec > 7 ==> let m = F.fold mean (U.toList vec)
158 |                                in F.fold (centralMoment 7 m) (U.toList vec) == S.centralMoment 7 vec
159 |       , onVec "centralMoments 4 9" $ \vec ->
160 |           U.length vec > 7 ==> let m = F.fold mean (U.toList vec)
161 |                                in F.fold (centralMoments 4 9 m) (U.toList vec) == S.centralMoments 4 9 vec
162 |       -- Cannot test this because we do not have an equivalent implementation
163 |       -- from the statistics package.
164 |       -- , onVec "centralMoments' 4 9" $ \vec -> length lst > 7 ==>
165 |       --     let m = F.fold mean lst
166 |       --         (f1,f2) = (F.fold (centralMoments' 4 9 m) lst)
167 |       --         (s1,s2) = (S.centralMoments 4 9 vec)
168 |       --     in within 3 f1 s1 && within 3 f2 s2
169 |       ]
170 |     , testGroup "Correlation"
171 |       [ onVec2 "correlation between [-1,1]" $ \vec ->
172 |           U.length vec > 2 && U.any (/= (0.0,0.0)) vec ==>
173 |           let m1 = F.fold mean (U.toList $ U.map fst vec)
174 |               m2 = F.fold mean (U.toList $ U.map snd vec)
175 |               s1 = F.fold (stdDev m1) (U.toList $ U.map fst vec)
176 |               s2 = F.fold (stdDev m2) (U.toList $ U.map snd vec)
177 |           in between (-1,1) $
178 |              F.fold (correlation (m1,m2) (s1,s2)) (U.toList vec)
179 |       , onVec2 "correlation between [-1,1] fastStdDev" $ \vec ->
180 | 
181 |           let (m1,m2) = F.fold ((,) <$> lmap fst mean <*> lmap snd mean)
182 |                                (U.toList vec)
183 |               (s1,s2) = F.fold ((,) <$> lmap fst (stdDev m1) <*> lmap snd (stdDev m2))
184 |                                (U.toList vec)
185 |               corr = F.fold (correlation (m1,m2) (s1,s2)) (U.toList vec)
186 |           in U.length vec > 2 && s1 /= 0.0 && s2 /= 0.0 ==>
187 |              QC.counterexample ("Correlation: " ++ show corr ++ " Stats: " ++ show (m1,m2,s1,s2)) $
188 |              between (-1,1) corr || isNaN corr
189 |       , QC.testProperty "LinRegState Semigroup" $ \v1 v2 ->
190 |           U.length v1 > 3 && U.length v2 > 3
191 |           && U.any (/=0.0) (U.map fst v1) && U.any (/=0.0) (U.map fst v2)
192 |           && U.any (/=0.0) (U.map snd v1) && U.any (/=0.0) (U.map snd v2) ==> let
193 |           sep = getLinRegResult $ F.fold foldLinRegState (U.toList v1) <> F.fold foldLinRegState (U.toList v2)
194 |           tog = F.fold fastLinearReg (U.toList v1 ++ U.toList v2)
195 |           in (cmpLMVSK precision (lrrXStats sep) (lrrXStats tog)
196 |               && cmpLMVSK precision (lrrYStats sep) (lrrYStats tog))
197 |              || isNaN 
(lmvskKurtosis (lrrXStats sep)) 198 | || isNaN (lmvskKurtosis (lrrYStats sep)) 199 | || isNaN (lmvskKurtosis (lrrXStats tog)) 200 | || isNaN (lmvskKurtosis (lrrYStats tog)) 201 | ] 202 | ] 203 | ] 204 | 205 | between :: (Double,Double) -> Double -> Bool 206 | between (lo,hi) = \x -> lo <= x && x <= hi 207 | 208 | 209 | withinPCT :: Double -> Double -> Double -> Bool 210 | withinPCT pct a b = abs ((a - b) * 100 / abs b) < pct 211 | --------------------------------------------------------------------------------