├── .github └── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md ├── .gitignore ├── .travis.yml ├── README.md ├── analyze ├── .gitignore ├── CHANGELOG.md ├── LICENSE.md ├── README.md ├── Setup.hs ├── analyze.cabal ├── bench │ └── Main.hs ├── src │ ├── Analyze.hs │ └── Analyze │ │ ├── Common.hs │ │ ├── Conversions.hs │ │ ├── Csv.hs │ │ ├── Datasets.hs │ │ ├── Decoding.hs │ │ ├── Html.hs │ │ ├── Ops.hs │ │ ├── RFrame.hs │ │ └── Values.hs ├── stack.yaml └── test │ ├── Fixtures.hs │ ├── Generation.hs │ ├── PropTests.hs │ ├── Spec.hs │ └── UnitTests.hs ├── datasets ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── Setup.hs ├── bench │ ├── Main.hs │ └── Numeric │ │ └── Dataloader │ │ └── Benchmark.hs ├── bin │ └── cifar10 │ │ └── Main.hs ├── changelog.md ├── datafiles │ ├── arff │ │ ├── README │ │ ├── ReutersCorn-test.arff │ │ ├── ReutersCorn-train.arff │ │ ├── ReutersGrain-test.arff │ │ ├── ReutersGrain-train.arff │ │ ├── contact-lens.arff │ │ ├── cpu.arff │ │ ├── cpu.with.vendor.arff │ │ ├── diabetes.arff │ │ ├── glass.arff │ │ ├── ionosphere.arff │ │ ├── iris.arff │ │ ├── labor.arff │ │ ├── segment-challenge.arff │ │ ├── segment-test.arff │ │ ├── soybean.arff │ │ ├── supermarket.arff │ │ ├── vote.arff │ │ ├── weather.arff │ │ └── weather.nominal.arff │ ├── iris.data │ ├── michelson.json │ ├── netflix │ │ ├── movies │ │ │ └── movie_titles.txt │ │ ├── test │ │ │ └── qualifying.txt │ │ └── training │ │ │ └── mv_0000001.txt │ ├── nightingale.json │ ├── titanic.tsv │ ├── titanic2_full.tsv │ └── titanic3.csv ├── datasets.cabal ├── src │ ├── Numeric │ │ ├── Dataloader.hs │ │ ├── Datasets.hs │ │ └── Datasets │ │ │ ├── Abalone.hs │ │ │ ├── Adult.hs │ │ │ ├── Anscombe.hs │ │ │ ├── BostonHousing.hs │ │ │ ├── BreastCancerWisconsin.hs │ │ │ ├── CIFAR10.hs │ │ │ ├── CO2.hs │ │ │ ├── Car.hs │ │ │ ├── Coal.hs │ │ │ ├── Diabetes.hs │ │ │ ├── Gapminder.hs │ │ │ ├── Internal │ │ │ ├── ArffParser.hs │ │ │ └── Streaming.hs │ │ │ ├── Iris.hs │ │ │ ├── Michelson.hs │ │ 
│ ├── Mushroom.hs │ │ │ ├── Mushroom.hs~ │ │ │ ├── Netflix.hs │ │ │ ├── Nightingale.hs │ │ │ ├── OldFaithful.hs │ │ │ ├── Quakes.hs │ │ │ ├── States.hs │ │ │ ├── Sunspots.hs │ │ │ ├── Titanic.hs │ │ │ ├── Titanic.hs~ │ │ │ ├── UN.hs │ │ │ ├── Vocabulary.hs │ │ │ ├── Wine.hs │ │ │ └── WineQuality.hs │ └── Streaming │ │ └── Instances.hs └── test │ └── Spec.hs ├── dense-linear-algebra ├── CHANGELOG.md ├── LICENSE ├── README.md ├── Setup.hs ├── bench │ ├── ChronosBench.hs │ └── WeighBench.hs ├── dense-linear-algebra.cabal ├── src │ └── Statistics │ │ ├── Matrix.hs │ │ └── Matrix │ │ ├── Algorithms.hs │ │ ├── Fast.hs │ │ ├── Fast │ │ └── Algorithms.hs │ │ ├── Function.hs │ │ ├── Mutable.hs │ │ └── Types.hs ├── stack.yaml └── test │ ├── AlgorithmsSpec.hs │ ├── Fixtures.hs │ ├── LibSpec.hs │ ├── Spec.hs │ └── Utils.hs ├── dh-core-accelerate ├── README.md ├── Setup.hs ├── core-accelerate.cabal ├── src │ └── Lib.hs └── test │ ├── LibSpec.hs │ └── Spec.hs └── dh-core ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── Setup.hs ├── core.cabal ├── src ├── Core │ ├── Data │ │ ├── Dataset.hs │ │ └── Datum │ │ │ └── Vector.hs │ └── Numeric │ │ ├── BLAS.hs │ │ ├── BLAS │ │ └── Class.hs │ │ └── Statistics │ │ ├── Classification │ │ ├── DecisionTrees.hs │ │ ├── Exceptions.hs │ │ └── Utils.hs │ │ └── InformationTheory.hs └── Lib.hs ├── stack.yaml ├── stack.yaml.lock └── test ├── LibSpec.hs └── Spec.hs /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. 
See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 
21 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | /dist/ 2 | /.cabal-sandbox/ 3 | /cabal.sandbox.config 4 | .stack-work/ 5 | .ionide/ 6 | .vscode/ 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Use new container infrastructure to enable caching 2 | sudo: false 3 | 4 | os: 5 | - linux 6 | - osx 7 | 8 | # Choose a lightweight base image; we provide our own build tools. 9 | language: c 10 | 11 | # GHC depends on GMP. You can add other dependencies here as well. 12 | addons: 13 | apt: 14 | packages: 15 | - libgmp-dev 16 | 17 | # The different configurations we want to test. You could also do things like 18 | # change flags or use --stack-yaml to point to a different file. 19 | env: 20 | global: 21 | - CODECOV_TOKEN="c11b1f41-6dad-495b-a5b0-fa4170eab2c4" 22 | - ANALYZE_V=0.2.0 23 | - DENSE_LINEAR_ALGEBRA_V=0.2.0.0 24 | - DATASETS_V=0.4.0 25 | matrix: 26 | - ARGS="--resolver nightly-2019-02-27" 27 | - ARGS="--resolver nightly-2019-05-10" 28 | 29 | before_install: 30 | # Download and unpack the stack executable 31 | - mkdir -p ~/.local/bin 32 | - export PATH=$HOME/.local/bin:$PATH 33 | - travis_retry curl -sSL https://get.haskellstack.org/ | sh 34 | 35 | # This line does all of the work: installs GHC if necessary, build the library, 36 | # executables, and test suites, and runs the test suites. --no-terminal works 37 | # around some quirks in Travis's terminal implementation. 
38 | script: 39 | - cd dh-core 40 | - stack $ARGS --no-terminal --install-ghc test --haddock --coverage 41 | - stack $ARGS --no-terminal bench analyze:bench 42 | - stack $ARGS --no-terminal build dense-linear-algebra:weigh-bench 43 | - stack $ARGS --no-terminal build dense-linear-algebra:chronos-bench 44 | # - stack $ARGS --no-terminal bench datasets:bench 45 | #- cat .stack-work/logs/analyze-${ANALYZE_V}-test.log 46 | #- cat .stack-work/logs/dense-linear-algebra-${DENSE_LINEAR_ALGEBRA_V}-test.log 47 | #- cat .stack-work/logs/datasets-${DATASETS_V}-test.log 48 | 49 | after_script: 50 | - stack hpc report --all 51 | - travis_retry curl -sSL https://github.com/lehins/stack-hpc-coveralls/releases/download/0.0.5.0/shc.tar.gz | tar xz shc 52 | - ./shc --repo-token=$COVERALLS_REPO_TOKEN combined all 53 | # - stack exec shc --repo-token=$COVERALLS_REPO_TOKEN spec 54 | 55 | # Caching so the next build will be fast too. 56 | cache: 57 | directories: 58 | - $HOME/.stack 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataHaskell/dh-core 2 | 3 | [![Build Status](https://travis-ci.org/DataHaskell/dh-core.png)](https://travis-ci.org/DataHaskell/dh-core) 4 | 5 | 6 | DataHaskell core project monorepo 7 | 8 | 9 | ## Aims 10 | 11 | This project aims to provide a native, end-to-end data science toolkit in Haskell. To achieve this, many types of experience are valuable; engineers, scientists, programmers, visualization experts, data journalists are all welcome to join the discussions and contribute. 12 | Not only should this be a working piece of software, but it should also be intuitive and pleasant to use. 13 | All contributions, big or small, are very welcome and will be acknowledged.
14 | 15 | ## Architecture 16 | 17 | One single repository allows us to experiment with interfaces and move code around much more freely than many single-purpose repositories. Also, it makes it more convenient to track and visualize progress. 18 | 19 | This is the directory structure of the project; the main project lives in the `dh-core` subdirectory: 20 | 21 | dh-core/ 22 | dh-core/ 23 | dh-core-accelerate/ 24 | .... 25 | 26 | 27 | ## Contributed packages 28 | 29 | A number of authors and maintainers agreed to move ownership of their repositories under the `dh-core` umbrella. In some cases, these packages were already published on Hackage and cannot simply disappear from there, nor can this new line of development break downstream packages. 30 | 31 | For this reason, contributed packages will appear as subdirectories to the main `dh-core` project, and will need to retain their original .cabal file. 32 | 33 | The `stack` tool can take care of multi-package projects; its `packages` stanza in the `stack.yaml` file has only its directory as a default, but can contain a list of paths to other Cabal projects; e.g. in our case it could look like: 34 | 35 | packages: 36 | - . 37 | - analyze/ 38 | - datasets/ 39 | 40 | Packages that are listed on Hackage already must be added here as distinct sub-directories. 
Once the migration is complete (PRs merged etc.), add the project to this table : 41 | 42 | 43 | | Package | Description | Original author(s) | First version after merge | 44 | | --- | --- | --- | --- | 45 | | [`analyze`](https://hackage.haskell.org/package/analyze) | Data analysis and manipulation library | [Eric Conlon](https://github.com/ejconlon) | 0.2.0 | 46 | | [`datasets`](https://hackage.haskell.org/package/datasets) | A collection of ready-to-use datasets | [Tom Nielsen](https://github.com/glutamate) | 0.2.6 | 47 | | [`dense-linear-algebra`](https://hackage.haskell.org/package/dense-linear-algebra) | Fast, native dense linear algebra primitives | [Brian O'Sullivan](https://github.com/bos), [Alexey Khudyakov](https://github.com/Shimuuar) | 0.1.0 (a) | 48 | 49 | (a) : To be updated 50 | 51 | NB: Remember to bump version numbers and change web links accordingly when moving in contributed packages. 52 | 53 | 54 | 55 | 56 | 57 | ## Contributing 58 | 59 | 1. Open an issue (https://github.com/DataHaskell/dh-core/issues) with a description of what you want to work on (if it's not already open) 60 | 2. Assign or add yourself to the issue contributors 61 | 3. Pull from `dh-core:master`, start a git branch, add code 62 | 4. Add tests 63 | 5. Update the changelog, describing briefly your changes and their possible effects 64 | 6. 65 | 66 | * If you're working on a contributed package (see next section), increase the version number in the Cabal file accordingly 67 | 68 | * If you bumped version numbers, make sure these are updated accordingly in the Travis CI .yaml file 69 | 70 | 7. Send a pull request with your branch, referencing the issue 71 | 8. 
`dh-core` admins : merge only _after_ another admin has reviewed and approved the PR 72 | 73 | 74 | ### GHC and Stackage compatibility 75 | 76 | Tested against : 77 | 78 | - Stackage nightly-2019-02-27 (GHC 8.6.3) 79 | 80 | 81 | 82 | ## Development information and guidelines 83 | 84 | ### Dependencies 85 | 86 | We use the [`stack`](https://docs.haskellstack.org/en/stable/README/) build tool. 87 | 88 | Some systems /might/ need binaries and headers for these additional libraries: 89 | 90 | * zlib 91 | * curl 92 | 93 | (however if you're unsure, first try building with your current configuration). 94 | 95 | Nix users should set `nix.enable` to `true` in the `dh-core/dh-core/stack.yaml` file. 96 | 97 | 98 | ### Building instructions 99 | 100 | In the `dh-core/dh-core` subdirectory, run 101 | 102 | $ stack build 103 | 104 | and this will re-build the main project and the contributed packages. 105 | 106 | While developing this `stack` command can come in handy : it will trigger a re-build and run the tests every time a file in the project is modified: 107 | 108 | $ stack build --test --ghc-options -Wall --file-watch 109 | 110 | ## Testing 111 | 112 | Example : 113 | 114 | $ stack test core:doctest core:spec 115 | 116 | The `:` pairs determine which tests will be run. 117 | 118 | 119 | ## Continuous Integration (TravisCI) 120 | 121 | Travis builds `dh-core` and its hosted projects every time a commit is pushed to Github. 
122 | Currently the `dh-core/.travis.yml` script uses the following command to install the GHC compiler, build the project and subprojects with `stack`, run the tests and build the Haddock documentation HTMLs: 123 | 124 | - stack $ARGS --no-terminal --install-ghc test core:spec core:doctest dense-linear-algebra:spec --haddock 125 | 126 | 127 | ## Visualizing the dependency tree of a package 128 | 129 | `stack` can produce a .dot file with the dependency graph of a Haskell project, which can then be rendered by the `dot` tool (from the [`graphviz`](https://graphviz.gitlab.io/_pages/Download/Download_source.html) suite). 130 | For example, in the following command the output of `stack dot` will be piped into `dot`, which will produce a SVG file called `deps.svg`: 131 | 132 | `stack dot --external --no-include-base --prune rts,ghc-prim,ghc-boot-th,template-haskell,transformers,containers,deepseq,bytestring,time,primitive,vector,text,hashable | dot -Tsvg > deps.svg` 133 | 134 | 135 | 136 | 137 | -------------------------------------------------------------------------------- /analyze/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/haskell 3 | 4 | ### Haskell ### 5 | dist 6 | dist-* 7 | cabal-dev 8 | *.o 9 | *.hi 10 | *.chi 11 | *.chs.h 12 | *.dyn_o 13 | *.dyn_hi 14 | .hpc 15 | .hsenv 16 | .cabal-sandbox/ 17 | cabal.sandbox.config 18 | *.prof 19 | *.aux 20 | *.hp 21 | *.eventlog 22 | .stack-work/ 23 | cabal.project.local 24 | .HTF/ 25 | 26 | -------------------------------------------------------------------------------- /analyze/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 0.2.0 2 | Integrated within DataHaskell/dh-core 3 | Adapted to latest `tasty-hspec` API 4 | Builds with GHC 8.2.2 and 8.4.3 5 | -------------------------------------------------------------------------------- /analyze/LICENSE.md: 
-------------------------------------------------------------------------------- 1 | Modified BSD License 2 | ==================== 3 | 4 | _Copyright © 2017, Analyze contributors_ 5 | _All rights reserved._ 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | 1. Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 2. Redistributions in binary form must reproduce the above copyright 13 | notice, this list of conditions and the following disclaimer in the 14 | documentation and/or other materials provided with the distribution. 15 | 3. Neither the name of the `` nor the 16 | names of its contributors may be used to endorse or promote products 17 | derived from this software without specific prior written permission. 18 | 19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND 20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 22 | DISCLAIMED. IN NO EVENT SHALL `` BE LIABLE FOR ANY 23 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 26 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
29 | -------------------------------------------------------------------------------- /analyze/README.md: -------------------------------------------------------------------------------- 1 | analyze 2 | ===== 3 | -------------------------------------------------------------------------------- /analyze/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /analyze/analyze.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 1.12 2 | name: analyze 3 | version: 0.2.0 4 | synopsis: Haskell data analysis and manipulation library 5 | description: Haskell data analysis and manipulation library, please see README.md 6 | category: Data Science 7 | homepage: https://github.com/DataHaskell/dh-core/analyze 8 | author: Eric Conlon , Nikita Tchayka 9 | maintainer: Marco Zocca 10 | copyright: 2017-2018 Analyze contributors 11 | license: BSD3 12 | license-file: LICENSE.md 13 | build-type: Simple 14 | extra-source-files: README.md 15 | CHANGELOG.md 16 | tested-with: GHC == 8.2.2, GHC == 8.4.3 17 | 18 | library 19 | exposed-modules: 20 | Analyze 21 | Analyze.Common 22 | Analyze.Conversions 23 | Analyze.Csv 24 | Analyze.Datasets 25 | Analyze.Decoding 26 | Analyze.Html 27 | Analyze.Ops 28 | Analyze.RFrame 29 | Analyze.Values 30 | other-modules: 31 | Paths_analyze 32 | hs-source-dirs: 33 | src 34 | default-extensions: OverloadedStrings 35 | build-depends: 36 | aeson , 37 | base >= 4.9 && < 5, 38 | binary , 39 | bytestring , 40 | cassava , 41 | exceptions , 42 | foldl , 43 | free , 44 | hashable , 45 | lucid , 46 | text , 47 | unordered-containers , 48 | vector , 49 | 50 | -- DEBUG 51 | tasty-hunit, 52 | tasty-quickcheck , 53 | criterion , 54 | mwc-random , 55 | streamly >= 0.6.0 , 56 | weigh 57 | -- QuickCheck, 58 | -- hspec 59 | 60 | default-language: 
Haskell2010 61 | 62 | test-suite spec 63 | type: exitcode-stdio-1.0 64 | main-is: Spec.hs 65 | other-modules: 66 | Fixtures 67 | Generation 68 | Analyze 69 | Analyze.Common 70 | Analyze.Conversions 71 | Analyze.Csv 72 | Analyze.Datasets 73 | Analyze.Decoding 74 | Analyze.Html 75 | Analyze.Ops 76 | Analyze.RFrame 77 | Analyze.Values 78 | Paths_analyze 79 | hs-source-dirs: 80 | test 81 | src 82 | default-extensions: OverloadedStrings 83 | ghc-options: -threaded -rtsopts -with-rtsopts=-N 84 | build-depends: 85 | QuickCheck , 86 | aeson , 87 | base , 88 | binary , 89 | bytestring , 90 | cassava , 91 | exceptions , 92 | foldl , 93 | free , 94 | hashable , 95 | lucid , 96 | mwc-random, 97 | tasty , 98 | tasty-hunit , 99 | tasty-quickcheck , 100 | text , 101 | unordered-containers , 102 | vector 103 | 104 | default-language: Haskell2010 105 | 106 | benchmark bench 107 | type: exitcode-stdio-1.0 108 | main-is: Main.hs 109 | hs-source-dirs: bench 110 | default-extensions: OverloadedStrings 111 | ghc-options: -threaded -rtsopts -with-rtsopts=-N 112 | build-depends: 113 | base, 114 | analyze , 115 | criterion, 116 | exceptions , 117 | mwc-random , 118 | random , 119 | streamly >= 0.6.0 , 120 | -- tasty , 121 | -- tasty-hunit , 122 | -- tasty-quickcheck , 123 | text , 124 | unordered-containers , 125 | weigh, 126 | vector 127 | -------------------------------------------------------------------------------- /analyze/bench/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE GADTs #-} 2 | module Main where 3 | 4 | import Streamly 5 | import qualified Streamly.Prelude as S 6 | 7 | import Control.Monad.Catch (MonadThrow (..)) 8 | import Data.Function ((&)) 9 | import Control.Monad 10 | import System.IO.Unsafe 11 | 12 | import Analyze.Common (Data (..), DuplicateKeyError (..), RowSizeMismatch (..)) 13 | import qualified Analyze as A 14 | import qualified Analyze.Common as AC 15 | 16 | 17 | import qualified Data.Vector as V 18 | 
import Data.Vector (Vector) 19 | 20 | import qualified Data.Text as T 21 | import Data.Text (Text) 22 | 23 | import Data.HashMap.Strict (HashMap) 24 | -- import qualified Data.HashMap.Strict as HM 25 | -- import qualified Data.HashSet as HS 26 | 27 | import System.Random (randomRs, newStdGen) 28 | import qualified System.Random.MWC as M 29 | import qualified Criterion.Main as C 30 | 31 | import qualified Weigh as W 32 | 33 | main :: IO () 34 | main = do 35 | speed 36 | -- weight 37 | 38 | -- implementation details required 39 | data RFrameUpdateS k v m where 40 | RFrameUpdateS :: MonadThrow m => { 41 | _rframeUpdateKeys :: !(Vector k), 42 | _rframeUpdateData :: !(SerialT m (Vector v)) 43 | } -> RFrameUpdateS k v m 44 | 45 | data RFrameS k v m where 46 | 47 | RFrameS :: MonadThrow m => { 48 | _rframeKeys :: !(Vector k), 49 | _rframeLookup :: !(HashMap k Int), 50 | _rframeData :: !(SerialT m (Vector v)) 51 | } -> RFrameS k v m 52 | 53 | 54 | fromUpdate :: (Data k, MonadThrow m) => RFrameUpdateS k v m -> m (RFrameS k v m) 55 | fromUpdate (RFrameUpdateS ks vs) = AC.checkForDupes ks >> pure (RFrameS ks (AC.makeLookup ks) vs) 56 | 57 | update :: (Data k, MonadThrow m) => RFrameUpdateS k v m -> RFrameS k v m -> m (RFrameS k v m) 58 | update (RFrameUpdateS uks uvs) (RFrameS fks _ fvs) = do 59 | fSize <- S.length fvs 60 | uSize <- S.length uvs 61 | if fSize /= uSize 62 | then throwM (RowSizeMismatch fSize uSize) 63 | else do 64 | AC.checkForDupes uks 65 | let kis = AC.mergeKeys fks uks 66 | ks' = (\(k, _, _) -> k) <$> kis 67 | look' = AC.makeLookup ks' 68 | vs' = S.zipWith (AC.runIndexedLookup kis) fvs uvs 69 | return (RFrameS ks' look' vs') 70 | 71 | 72 | -- only concerned with generating data 73 | n :: Int 74 | n = 1500 75 | 76 | testKeys :: Vector Text 77 | testKeys = unsafePerformIO $ V.replicateM n $ liftM (T.pack . take 10 . 
randomRs ('a','z')) newStdGen 78 | 79 | testData :: Vector (Vector Double) 80 | testData = unsafePerformIO $ do 81 | gen <- M.create 82 | V.replicateM n $ M.uniformVector gen n 83 | 84 | testDataS :: SerialT IO (Vector Double) 85 | testDataS = unsafePerformIO $ do 86 | let 87 | vec = testData 88 | return $ S.fromFoldable vec 89 | 90 | -- the actual benchmarks 91 | cmprVec :: IO Bool 92 | cmprVec = do 93 | let 94 | keys = testKeys 95 | dat = testData 96 | upd = A.RFrameUpdate keys dat 97 | frame1 <- A.fromUpdate upd 98 | frame2 <- A.fromUpdate upd 99 | return $ frame1 == frame2 100 | 101 | -- compr fills the same role as (==) but wrapped in IO 102 | cmpr :: RFrameS Text Double IO -> RFrameS Text Double IO -> IO Bool 103 | cmpr f1 f2 = do 104 | let 105 | sameKeys = _rframeKeys f1 == _rframeKeys f2 106 | sameLookup = _rframeLookup f1 == _rframeLookup f2 107 | dat1 = _rframeData f1 108 | dat2 = _rframeData f2 109 | sameDat <- (S.zipWith (==) dat1 dat2 & S.notElem False) 110 | return $ sameKeys && sameLookup && sameDat 111 | 112 | cmprStream :: IO Bool 113 | cmprStream = do 114 | let 115 | keys = testKeys 116 | dat = testDataS 117 | upd = RFrameUpdateS keys dat 118 | frame1 <- fromUpdate upd 119 | frame2 <- fromUpdate upd 120 | cmpr frame1 frame2 121 | 122 | takeRowsVec :: Int -> IO Bool 123 | takeRowsVec m = do 124 | let 125 | keys = testKeys 126 | dat = testData 127 | upd = A.RFrameUpdate keys dat 128 | frame <- A.fromUpdate upd 129 | let 130 | postTake = A.takeRows m frame 131 | return $ (==) frame postTake -- probably returns false, it's just to force evaluation 132 | 133 | -- | Takes first 'n' rows of an 'RFrameS'. 
134 | takeRows :: Int -> RFrameS Text Double IO -> RFrameS Text Double IO 135 | takeRows n (RFrameS ks look srm) = RFrameS ks look (S.take n srm) 136 | 137 | takeRowsS :: Int -> IO Bool 138 | takeRowsS m = do 139 | let 140 | keys = testKeys 141 | dat = testDataS 142 | update = RFrameUpdateS keys dat 143 | frame <- fromUpdate update 144 | let 145 | postTake = takeRows m frame 146 | cmpr frame postTake 147 | 148 | speed :: IO () 149 | speed = do 150 | -- cmprVec -- to actually generate the data 151 | -- cmprStream -- this too 152 | C.defaultMain [ 153 | C.bgroup "Tests" [ 154 | C.bench "Vec" $ C.whnfIO cmprVec 155 | , C.bench "Stream" $ C.whnfIO cmprStream 156 | , C.bench "takeRowsVec 1000" $ C.whnfIO (takeRowsVec 1000) 157 | , C.bench "takeRowsVec 500" $ C.whnfIO (takeRowsVec 500) 158 | , C.bench "takeRowsVec 200" $ C.whnfIO (takeRowsVec 200) 159 | , C.bench "takeRowsVec 100" $ C.whnfIO (takeRowsVec 100) 160 | , C.bench "takeRowsStream 1000" $ C.whnfIO (takeRowsS 1000) 161 | , C.bench "takeRowsStream 500" $ C.whnfIO (takeRowsS 500) 162 | , C.bench "takeRowsStream 200" $ C.whnfIO (takeRowsS 200) 163 | , C.bench "takeRowsStream 100" $ C.whnfIO (takeRowsS 100)]] 164 | 165 | weight :: IO () 166 | weight = do 167 | -- cmprVec 168 | -- cmprStream 169 | W.mainWith ( do 170 | W.action "Vec" cmprVec 171 | W.action "Stream" cmprStream 172 | -- W.io "takeRowsVec 1000" takeRowsVec 1000 173 | W.io "takeRowsVec 500" takeRowsVec 500 174 | W.io "takeRowsVec 200" takeRowsVec 200 175 | W.io "takeRowsVec 100" takeRowsVec 100 176 | -- W.io "takeRowsStream 1000" takeRowsS 1000 177 | W.io "takeRowsStream 500" takeRowsS 500 178 | W.io "takeRowsStream 200" takeRowsS 200 179 | W.io "takeRowsStream 100" takeRowsS 100) 180 | -------------------------------------------------------------------------------- /analyze/src/Analyze.hs: -------------------------------------------------------------------------------- 1 | -- | See "Analyze.RFrame". 
2 | module Analyze 3 | ( module X 4 | ) where 5 | 6 | import Analyze.Common as X 7 | import Analyze.Conversions as X 8 | import Analyze.Csv as X 9 | import Analyze.Datasets as X 10 | import Analyze.Decoding as X 11 | import Analyze.Html as X 12 | import Analyze.Ops as X 13 | import Analyze.RFrame as X 14 | import Analyze.Values as X 15 | -------------------------------------------------------------------------------- /analyze/src/Analyze/Common.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ConstraintKinds #-} 2 | 3 | -- | Common internal things (no other internal deps) 4 | module Analyze.Common where 5 | 6 | import Control.Exception 7 | import Control.Monad (forM_, unless) 8 | import Control.Monad.Catch (MonadThrow (..)) 9 | import Data.Hashable (Hashable) 10 | import Data.HashMap.Strict (HashMap) 11 | import qualified Data.HashMap.Strict as HM 12 | import Data.HashSet (HashSet) 13 | import qualified Data.HashSet as HS 14 | import Data.Typeable (Typeable) 15 | import Data.Vector (Vector) 16 | import qualified Data.Vector as V 17 | 18 | -- | Column keys need to have equality and hashability. 19 | type Data k = (Eq k, Hashable k, Show k, Typeable k) 20 | 21 | -- | flip <$> 22 | (<&>) :: Functor f => f a -> (a -> b) -> f b 23 | (<&>) x f = f <$> x 24 | {-# INLINE (<&>) #-} 25 | infixl 1 <&> 26 | 27 | -- | Exception for when a column is missing from a frame. 28 | data MissingKeyError k = MissingKeyError k deriving (Show, Eq, Typeable) 29 | instance (Show k, Typeable k) => Exception (MissingKeyError k) 30 | 31 | -- | Exception for when a column is duplicated in a frame. 32 | data DuplicateKeyError k = DuplicateKeyError k deriving (Show, Eq, Typeable) 33 | instance (Show k, Typeable k) => Exception (DuplicateKeyError k) 34 | 35 | -- | Exception for when frame column sizes don't match. 
36 | data ColSizeMismatch = ColSizeMismatch Int Int deriving (Show, Eq, Typeable) 37 | instance Exception ColSizeMismatch 38 | 39 | -- | Exception for when frame row sizes don't match. 40 | data RowSizeMismatch = RowSizeMismatch Int Int deriving (Show, Eq, Typeable) 41 | instance Exception RowSizeMismatch 42 | 43 | -- | Throws when duplicate keys are present in a vector. 44 | checkForDupes :: (Data k, MonadThrow m) => Vector k -> m () 45 | checkForDupes vs = go HS.empty (V.toList vs) 46 | where 47 | go _ [] = pure () 48 | go s (k:ks) = 49 | if HS.member k s 50 | then throwM (DuplicateKeyError k) 51 | else go (HS.insert k s) ks 52 | 53 | -- | Throws when one vector is not a reordering of the other. 54 | checkReorder :: (Data k, MonadThrow m) => Vector k -> Vector k -> m () 55 | checkReorder xs ys = 56 | let xSize = V.length xs 57 | ySize = V.length ys 58 | in if xSize /= ySize 59 | then throwM (ColSizeMismatch xSize ySize) 60 | else checkSubset (V.toList xs) (HS.fromList (V.toList ys)) 61 | 62 | -- | Throws when any key is not present in the set. 63 | checkSubset :: (Data k, MonadThrow m) => [k] -> HashSet k -> m () 64 | checkSubset qs ks = forM_ qs (\q -> unless (HS.member q ks) (throwM (MissingKeyError q))) 65 | 66 | -- | Builds a reverse lookup for the vector. 67 | makeLookup :: Data k => Vector k -> HashMap k Int 68 | makeLookup = HM.fromList . flip zip [0..] . V.toList 69 | 70 | -- | Indexes into the vector of values, throwing on key missing or bad index. 71 | runLookup :: (Data k, MonadThrow m) => HashMap k Int -> Vector v -> k -> m v 72 | runLookup look vs k = 73 | case HM.lookup k look >>= (vs V.!?) of 74 | Nothing -> throwM (MissingKeyError k) 75 | Just v -> pure v 76 | 77 | -- | Reorders the vector of values by a new key order and an old lookup. 78 | reorder :: Data k => Vector k -> HashMap k Int -> Vector v -> Vector v 79 | reorder ks look vs = pick <$> ks 80 | where 81 | pick k = vs V.! (look HM.! 
k) 82 | 83 | -- | Merges two key vectors and tags each with its provenance (favoring the second). 84 | mergeKeys :: Data k => Vector k -> Vector k -> Vector (k, Int, Int) 85 | mergeKeys xs ys = 86 | let m = HM.fromList (V.toList (V.imap (\i x -> (x, (0, i))) xs)) 87 | n = HM.fromList (V.toList (V.imap (\i x -> (x, (1, i))) ys)) 88 | -- Ties go to the first argument, in this case favoring the update 89 | o = HM.union n m 90 | p = (\x -> let (a, b) = o HM.! x in (x, a, b)) <$> xs 91 | q = (\x -> let (a, b) = n HM.! x in (x, a, b)) <$> V.filter (\x -> not (HM.member x m)) ys 92 | in p V.++ q 93 | 94 | -- | Uses a merged key vector to select values. 95 | runIndexedLookup :: Vector (k, Int, Int) -> Vector v -> Vector v -> Vector v 96 | runIndexedLookup ks xs ys = (\(k, i, j) -> (if i == 0 then xs else ys) V.! j) <$> ks 97 | -------------------------------------------------------------------------------- /analyze/src/Analyze/Conversions.hs: -------------------------------------------------------------------------------- 1 | -- | Simple structural conversions. 2 | module Analyze.Conversions 3 | ( projectRow 4 | , projectRows 5 | ) where 6 | 7 | import Analyze.Common (Data, MissingKeyError (..), makeLookup) 8 | import Analyze.RFrame (RFrame (..), RFrameUpdate (..), fromUpdate) 9 | import Control.Monad.Catch (MonadThrow (..)) 10 | import Data.HashMap.Strict (HashMap) 11 | import qualified Data.HashMap.Strict as HM 12 | import Data.Vector (Vector) 13 | import qualified Data.Vector as V 14 | 15 | -- | Projects values out of the map according to the given key order. 16 | projectRow :: (Data k, MonadThrow m) => Vector k -> HashMap k v -> m (Vector v) 17 | projectRow ks row = V.mapM f ks 18 | where 19 | f k = 20 | case HM.lookup k row of 21 | Nothing -> throwM (MissingKeyError k) 22 | Just v -> pure v 23 | 24 | -- | Projects an 'RFrame' out of many maps according to the given key order. 
projectRows :: (Data k, MonadThrow m) => Vector k -> Vector (HashMap k v) -> m (RFrame k v)
projectRows ks rs = do
  -- Project every row map against the same key order, then validate keys
  -- (duplicate check) via 'fromUpdate'.
  vs <- V.mapM (projectRow ks) rs
  fromUpdate (RFrameUpdate ks vs)
--------------------------------------------------------------------------------
/analyze/src/Analyze/Csv.hs:
--------------------------------------------------------------------------------
-- | Functions for working with CSV files.
module Analyze.Csv where

import           Analyze.Conversions        (projectRows)
import           Analyze.RFrame             (RFrame (..), RFrameUpdate (..), empty, fromUpdate)
import           Control.Monad.Catch        (Exception, MonadThrow (..))
import qualified Data.Binary.Builder        as B
import qualified Data.ByteString.Lazy       as LBS
import qualified Data.ByteString.Lazy.Char8 as LBS8
import qualified Data.Csv                   as C
import qualified Data.Csv.Builder           as CB
import           Data.Text                  (Text)
import           Data.Text.Encoding         (decodeUtf8, encodeUtf8)
import           Data.Typeable              (Typeable)
import qualified Data.Vector                as V

-- | Exception to wrap Cassava error strings.
data CsvError = CsvError String deriving (Eq, Show, Typeable)
instance Exception CsvError

-- | Decode CSV bytes as an 'RFrame' with a header row.
-- Header cells are decoded as UTF-8; throws 'CsvError' on parse failure.
decodeWithHeader :: MonadThrow m => LBS.ByteString -> m (RFrame Text Text)
decodeWithHeader bs =
  case C.decodeByName bs of
    Left err -> throwM (CsvError err)
    Right (header, rows) -> do
      let ks = decodeUtf8 <$> header
      projectRows ks rows

-- | Decode CSV bytes as an 'RFrame' without a header row.
-- Columns are keyed by 0-based position; an empty input yields 'empty'.
decodeWithoutHeader :: MonadThrow m => LBS.ByteString -> m (RFrame Int Text)
decodeWithoutHeader bs =
  case C.decode C.NoHeader bs of
    Left err -> throwM (CsvError err)
    Right rows ->
      if V.null rows
        then return empty
        else do
          -- Synthesize integer column keys from the width of the first row.
          let ks = V.imap const (V.head rows)
              update = RFrameUpdate ks rows
          fromUpdate update

-- | Encode an 'RFrame' as CSV bytes with a header row.
encodeWithHeader :: RFrame Text Text -> LBS.ByteString
encodeWithHeader (RFrame ks _ vs) =
  let header = CB.encodeHeader (encodeUtf8 <$> ks)
      rows = header `mappend` foldMap (CB.encodeRecord . (encodeUtf8 <$>)) vs
  -- BUG FIX: previously rendered only 'header', silently dropping every
  -- data row from the output.
  in B.toLazyByteString rows

-- | Encode an 'RFrame' as CSV bytes without header row.
encodeWithoutHeader :: RFrame k Text -> LBS.ByteString
encodeWithoutHeader (RFrame _ _ vs) =
  B.toLazyByteString (foldMap (CB.encodeRecord . (encodeUtf8 <$>)) vs)

-- | Read a CSV file (with a header row) into an 'RFrame'.
-- Reads the raw bytes directly so UTF-8 content survives intact, instead of
-- round-tripping through 'String' and 'LBS8.pack' (which truncates each
-- character to its low byte).
loadCSVFileWithHeader :: FilePath -> IO (RFrame Text Text)
loadCSVFileWithHeader fileName = LBS.readFile fileName >>= decodeWithHeader

-- | Read a CSV file (without a header row) into an 'RFrame'.
loadCSVFileWithoutHeader :: FilePath -> IO (RFrame Int Text)
loadCSVFileWithoutHeader fileName = LBS.readFile fileName >>= decodeWithoutHeader
--------------------------------------------------------------------------------
/analyze/src/Analyze/Datasets.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE OverloadedStrings #-}

-- | Functions to work with included datasets.
module Analyze.Datasets where

import           Analyze.Csv
import           Analyze.RFrame       (RFrame)
import           Control.Monad.Catch  (MonadThrow (..))
import qualified Data.ByteString.Lazy as LBS
import           Data.Text            (Text)
import qualified Data.Text            as T
import           Paths_analyze

-- | Load an included dataset.
datasetWithHeader :: Text -> Text -> IO (RFrame Text Text)
datasetWithHeader a b = do
  -- Data files live under datasets/<group>/<name>.csv inside the package's
  -- data directory, resolved via Cabal's 'getDataFileName'.
  let path = "datasets/" ++ T.unpack a ++ "/" ++ T.unpack b ++ ".csv"
  newPath <- getDataFileName path
  bs <- LBS.readFile newPath
  decodeWithHeader bs

-- | Load the "train" partition of the "titanic" dataset.
titanicTrain :: IO (RFrame Text Text)
titanicTrain = datasetWithHeader "titanic" "train"

-- | Load the "test" partition of the "titanic" dataset.
titanicTest :: IO (RFrame Text Text)
titanicTest = datasetWithHeader "titanic" "test"
--------------------------------------------------------------------------------
/analyze/src/Analyze/Decoding.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE DeriveFunctor              #-}
{-# LANGUAGE GeneralizedNewtypeDeriving #-}

-- | Applicative decoding with key-value lookups.
-- Think of a 'Decoder' as a row function that exposes the columns it uses.
module Analyze.Decoding
  ( Arg (..)
  , Decoder (..)
  , decoderKeys
  , fromArg
  , require
  , requireWhere
  , runDecoder
  ) where

import           Analyze.Common           (Data)
import           Control.Applicative.Free (Ap (..), liftAp)
import           Data.Maybe               (fromMaybe)

-- | Pair of a key and an effectful extraction function for its value.
data Arg m k v a = Arg k (v -> m a) deriving (Functor)

-- | Free applicative over 'Arg': a decoder is a static list of column
-- requirements plus a pure way to combine the extracted values, so the
-- required keys can be inspected ('decoderKeys') without running effects.
newtype Decoder m k v a = Decoder (Ap (Arg m k v) a) deriving (Functor, Applicative)

-- | Lifts a single 'Arg' into a 'Decoder'
fromArg :: Arg m k v a -> Decoder m k v a
fromArg = Decoder . liftAp

-- | Simple 'Decoder' that just looks up and returns the value for a given key.
require :: Applicative m => k -> Decoder m k v v
require k = fromArg (Arg k pure)

-- | Shorthand for lookup and transform.
requireWhere :: k -> (k -> v -> m a) -> Decoder m k v a
requireWhere k e = fromArg (Arg k (e k))

-- | List all column names used in the 'Decoder'.
decoderKeys :: Data k => Decoder m k v a -> [k]
decoderKeys (Decoder x) = go x
  where
    -- Walk the free-applicative spine, collecting each 'Arg' key.
    go :: Ap (Arg m k v) a -> [k]
    go (Pure _) = []
    go (Ap (Arg k _) rest) = k : go rest

-- Interpret the free applicative against a row-lookup function, sequencing
-- the lookup and extraction effects left to right.
-- This is pretty sensitive to let bindings
apRow :: (Data k, Monad m) => Ap (Arg m k v) a -> (k -> m v) -> m a
apRow (Pure a) _ = pure a
apRow (Ap (Arg k f) rest) row = do
  v <- row k
  z <- f v
  fz <- apRow rest row
  return (fz z)

-- | Run a 'Decoder' with a lookup function (typically row lookup).
runDecoder :: (Data k, Monad m) => Decoder m k v a -> (k -> m v) -> m a
runDecoder (Decoder x) = apRow x
--------------------------------------------------------------------------------
/analyze/src/Analyze/Html.hs:
--------------------------------------------------------------------------------
-- | Functions for working with HTML.
module Analyze.Html where

import           Analyze.RFrame (RFrame (..))
import           Control.Monad  (forM_)
import qualified Lucid          as L

-- | Renders an 'RFrame' to an HTML table: keys become the @<thead>@ row,
-- each data row becomes a @<tr>@ of @<td>@ cells.
renderHtml :: (L.ToHtml k, L.ToHtml v, Monad m)
           => RFrame k v -> L.HtmlT m ()
renderHtml (RFrame ks _ vs) =
  L.table_ $ do
    L.thead_ $
      L.tr_ $ forM_ ks (L.th_ . L.toHtml)
    L.tbody_ $ forM_ vs $ \v -> L.tr_ (forM_ v (L.td_ . L.toHtml))
--------------------------------------------------------------------------------
/analyze/src/Analyze/Ops.hs:
--------------------------------------------------------------------------------
-- | Various data-sciencey functions.
module Analyze.Ops
  ( oneHot
  ) where

import           Analyze.Common      (Data)
import           Analyze.RFrame      (RFrame (..), RFrameUpdate (..), col, splitCols, update, fromUpdate)
import           Control.Monad.Catch (MonadThrow (..))
import qualified Data.HashSet        as HS
import           Data.Vector         (Vector)
import qualified Data.Vector         as V

-- Deduplicate while preserving first-occurrence order.
-- (Accumulates in reverse, then reverses once at the end.)
uniq :: Data k => Vector k -> Vector k
uniq ks = V.reverse (V.fromList newKsR)
  where
    acc (hs, uks) k =
      if HS.member k hs
        then (hs, uks)
        else (HS.insert k hs, k:uks)
    (_, newKsR) = V.foldl acc (HS.empty, []) ks

-- One indicator row: yesVal where the key equals the target, noVal elsewhere.
match :: Eq k => Vector k -> v -> v -> k -> Vector v
match ks yesVal noVal tk = V.map (\k -> if k == tk then yesVal else noVal) ks

-- | One-hot encode a given column. (See tests for usage.)
-- @combine@ builds each new column name from the original key and a cell value.
-- NOTE(review): the returned frame contains only the one-hot columns; the
-- non-target columns (@cold@) are split off but never used — confirm intent.
oneHot :: (Data k, MonadThrow m, Show v) => (k -> v -> k) -> k -> v -> v -> RFrame k v -> m (RFrame k v)
oneHot combine key yesVal noVal frame = do
  let (target, cold) = splitCols (== key) frame
  rawVs <- col key target
  let cookedKs = V.map (combine key) rawVs
      newKs = uniq cookedKs
      newVs = V.map (match newKs yesVal noVal) cookedKs
      hot = RFrameUpdate newKs newVs

  fromUpdate hot
--------------------------------------------------------------------------------
/analyze/src/Analyze/RFrame.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE DeriveFunctor     #-}
{-# LANGUAGE FlexibleInstances #-}

-- | Core frame types and functions
module Analyze.RFrame where

import           Analyze.Common
import           Analyze.Decoding    (Decoder (..), decoderKeys, runDecoder)
-- import qualified Control.Foldl as F
import           Control.Monad       (join)
import           Control.Monad.Catch (MonadThrow (..))
-- import qualified Data.Aeson as A
import           Data.HashMap.Strict (HashMap)
import qualified Data.HashMap.Strict as HM
-- import Data.HashSet (HashSet)
import qualified Data.HashSet        as HS
-- import Data.Text (Text)
-- import qualified Data.Text as T
import           Data.Vector         (Vector)
import qualified Data.Vector         as V

-- | In-memory row-oriented frame with columns named by `k` and values by `v`
data RFrame k v = RFrame
  { -- | Ordered vector of column names
    _rframeKeys   :: !(Vector k)
  , -- | Quick lookup from column name to column index
    _rframeLookup :: !(HashMap k Int)
  , -- | Vector of rows. Each element should be the length of number of columns.
    _rframeData   :: !(Vector (Vector v))
  } deriving (Eq, Show, Functor)

-- | A simpler 'RFrame' for updates: just keys and rows, no cached lookup.
data RFrameUpdate k v = RFrameUpdate
  { -- | Ordered vector of column names
    _rframeUpdateKeys :: !(Vector k)
  , -- | Vector of rows.
    _rframeUpdateData :: !(Vector (Vector v))
  } deriving (Eq, Show, Functor)

-- | Alias for a function to be applied to each row:
-- (keys, key lookup, row index, row values) -> result.
type RFrameMap k v a = Vector k -> HashMap k Int -> Int -> Vector v -> a

-- | Alias for a row filter
type RFrameFilter k v = RFrameMap k v Bool

-- | Prettier alias for getting the keys of an 'RFrame'
rframeKeys :: RFrame k v -> Vector k
rframeKeys = _rframeKeys

-- | Prettier alias for getting the data matrix of an 'RFrame'
rframeData :: RFrame k v -> Vector (Vector v)
rframeData = _rframeData

-- | An empty frame with no rows or columns
empty :: RFrame k v
empty = RFrame V.empty HM.empty V.empty

-- | Build an 'RFrame' from an 'RFrameUpdate'.
-- Throws on duplicate keys.
fromUpdate :: (Data k, MonadThrow m) => RFrameUpdate k v -> m (RFrame k v)
fromUpdate (RFrameUpdate ks vs) = checkForDupes ks >> pure (RFrame ks (makeLookup ks) vs)

-- | Build an 'RFrameUpdate' from an 'RFrame'.
-- Total inverse of 'fromUpdate': simply drops the cached key lookup.
toUpdate :: Data k => RFrame k v -> RFrameUpdate k v
toUpdate (RFrame ks _ vs) = RFrameUpdate ks vs

-- | Number of columns in an 'RFrame'
numCols :: RFrame k v -> Int
numCols (RFrame ks _ _) = V.length ks

-- | Number of rows in an 'RFrame'
numRows :: RFrame k v -> Int
numRows (RFrame _ _ vs) = V.length vs

-- | Project to the given column.
-- Throws 'MissingKeyError' (via 'runLookup') if the key is absent.
col :: (Data k, MonadThrow m) => k -> RFrame k v -> m (Vector v)
col k (RFrame _ look vs) = V.mapM (\v -> runLookup look v k) vs

-- | Decode by row. Each element of the returned vector may fail on decoding error
-- so flatten manually or use 'flatDecode'.
-- Checks up front that every key the decoder requires exists in the frame;
-- the per-row decoding effects remain deferred inside the returned vector.
decode :: (Data k, MonadThrow m) => Decoder m k v a -> RFrame k v -> m (Vector (m a))
decode decoder (RFrame ks look vs) = checkSubset required keySet >> pure decoded
  where
    keySet = HS.fromList (V.toList ks)
    required = decoderKeys decoder
    decoded = runDecoder decoder . runLookup look <$> vs

-- | An auto-flattened version of 'decode'.
flatDecode :: (Data k, MonadThrow m) => Decoder m k v a -> RFrame k v -> m (Vector a)
flatDecode decoder rframe = join $ sequence <$> decode decoder rframe

-- | Filter an 'RFrame' by row.
-- The predicate sees the keys, the key lookup, the row index, and the row.
filter :: Data k => RFrameFilter k v -> RFrame k v -> RFrame k v
filter p (RFrame ks look vs) = RFrame ks look vs'
  where
    vs' = V.ifilter (p ks look) vs

-- | Update row-wise, adding or replacing values per-column.
-- Retains the existing column order, appending new columns.
-- Throws on row length mismatch or duplicate columns in the update.
update :: (Data k, MonadThrow m) => RFrameUpdate k v -> RFrame k v -> m (RFrame k v)
update (RFrameUpdate uks uvs) (RFrame fks _ fvs) = do
  let fSize = V.length fvs
      uSize = V.length uvs
  if fSize /= uSize
    then throwM (RowSizeMismatch fSize uSize)
    else do
      checkForDupes uks
      -- Merge key sets, tagging each key with which side (frame or update)
      -- supplies its values; on overlap the update side wins (see 'mergeKeys').
      let kis = mergeKeys fks uks
          ks' = (\(k, _, _) -> k) <$> kis
          look' = makeLookup ks'
          vs' = V.zipWith (runIndexedLookup kis) fvs uvs
      return (RFrame ks' look' vs')

-- | Split columns in an 'RFrame' by a predicate.
-- Returns (columns matching p, columns not matching p), each with a
-- freshly built key lookup and reordered row data.
splitCols :: Data k => (k -> Bool) -> RFrame k v -> (RFrame k v, RFrame k v)
splitCols p (RFrame ks look vs) = (RFrame keepKs keepLook keepVs, RFrame dropKs dropLook dropVs)
  where
    (keepKs, dropKs) = V.partition p ks
    keepLook = makeLookup keepKs
    keepVs = reorder keepKs look <$> vs
    dropLook = makeLookup dropKs
    dropVs = reorder dropKs look <$> vs

-- | Drop columns in an 'RFrame' by a predicate.
dropCols :: Data k => (k -> Bool) -> RFrame k v -> RFrame k v
dropCols p frame = snd (splitCols p frame)

-- | Keep columns in an 'RFrame' by a predicate.
keepCols :: Data k => (k -> Bool) -> RFrame k v -> RFrame k v
keepCols p frame = fst (splitCols p frame)

-- | Appends rows to an 'RFrame', retaining column order of the first.
-- Throws on column mismatch.
appendRows :: (Data k, MonadThrow m) => RFrame k v -> RFrame k v -> m (RFrame k v)
appendRows (RFrame ks0 look0 vs0) (RFrame ks1 look1 vs1) = do
  checkReorder ks0 ks1
  -- Rearrange the second frame's rows into the first frame's column order.
  let vs1' = reorder ks0 look1 vs1
  return (RFrame ks0 look0 (vs0 V.++ vs1'))

-- | Appends columns to an 'RFrame', retaining column order of the first.
extendCols :: (Data k, MonadThrow m) => RFrame k v -> RFrame k v -> m (RFrame k v)
extendCols f g = update (toUpdate g) f

-- | Takes first 'n' rows of an 'RFrame'.
takeRows :: Int -> RFrame k v -> RFrame k v
takeRows n (RFrame ks look vs) = RFrame ks look (V.take n vs)

-- | Adds a 'Vector' column to the 'RFrame'.
-- Throws 'RowSizeMismatch' (via 'update') when the vector length differs
-- from the frame's row count.
addColumn :: (Data k, MonadThrow m) => RFrame k v -> k -> Vector v -> m (RFrame k v)
addColumn rf name v = do
  -- Wrap each value as a single-cell row to form a one-column frame.
  c <- newRFrameColumn name $ V.singleton <$> v
  extendCols rf c
  where
    newRFrameColumn rfName = fromUpdate . RFrameUpdate (V.singleton rfName)
--------------------------------------------------------------------------------
/analyze/src/Analyze/Values.hs:
--------------------------------------------------------------------------------
-- | Simple value types and functions.
module Analyze.Values where

import           Analyze.Common      (Data)
import           Control.Monad.Catch (Exception, MonadThrow (..))
import           Data.Text           (Text)
import           Data.Typeable       (Typeable)

-- | Singleton type for value types.
data ValueType =
    ValueTypeText
  | ValueTypeInteger
  | ValueTypeDouble
  | ValueTypeBool
  deriving (Show, Eq, Enum, Bounded)

-- | Union type for values.
data Value =
    ValueText Text
  | ValueInteger Integer
  | ValueDouble Double
  | ValueBool Bool
  deriving (Show, Eq)

-- | Returns the type of the value.
valueToType :: Value -> ValueType
valueToType (ValueText _)    = ValueTypeText
valueToType (ValueInteger _) = ValueTypeInteger
valueToType (ValueDouble _)  = ValueTypeDouble
valueToType (ValueBool _)    = ValueTypeBool

-- | Extracts 'Text' from the 'Value'.
getText :: Value -> Maybe Text
getText (ValueText s) = Just s
getText _             = Nothing

-- | Extracts 'Integer' from the 'Value'.
getInteger :: Value -> Maybe Integer
getInteger (ValueInteger i) = Just i
getInteger _                = Nothing

-- | Extracts 'Double' from the 'Value'.
getDouble :: Value -> Maybe Double
getDouble (ValueDouble d) = Just d
getDouble _               = Nothing

-- | Extracts 'Bool' from the 'Value'.
getBool :: Value -> Maybe Bool
getBool (ValueBool b) = Just b
getBool _             = Nothing

-- | Exception for when we encounter unexpected values.
data ValueTypeError k = ValueTypeError k ValueType Value deriving (Show, Eq, Typeable)
instance (Show k, Typeable k) => Exception (ValueTypeError k)

-- | Use with 'Analyze.Decoding.requireWhere' to read 'Text' values.
textual :: (Data k, MonadThrow m) => k -> Value -> m Text
textual _ (ValueText s) = pure s
textual k v             = throwM (ValueTypeError k ValueTypeText v)

-- | Use with 'Analyze.Decoding.requireWhere' to read 'Integer' values.
integral :: (Data k, MonadThrow m) => k -> Value -> m Integer
integral _ (ValueInteger s) = pure s
integral k v                = throwM (ValueTypeError k ValueTypeInteger v)

-- | Use with 'Analyze.Decoding.requireWhere' to read 'Double' values.
floating :: (Data k, MonadThrow m) => k -> Value -> m Double
floating _ (ValueDouble s) = pure s
floating k v               = throwM (ValueTypeError k ValueTypeDouble v)

-- | Use with 'Analyze.Decoding.requireWhere' to read 'Bool' values.
boolean :: (Data k, MonadThrow m) => k -> Value -> m Bool
boolean _ (ValueBool s) = pure s
boolean k v             = throwM (ValueTypeError k ValueTypeBool v)
--------------------------------------------------------------------------------
/analyze/stack.yaml:
--------------------------------------------------------------------------------
resolver: lts-11.22
packages:
- .
# Dependency packages to be pulled from upstream that are not in the resolver
# using the same syntax as the packages field.
6 | # (e.g., acme-missiles-0.3) 7 | extra-deps: 8 | - streamly-0.6.1 9 | - vector-0.12.0.3 10 | # Override default flag values for local packages and extra-deps 11 | # flags: {} 12 | 13 | # Extra package databases containing global packages 14 | # extra-package-dbs: [] 15 | 16 | # Control whether we use the GHC we find on the path 17 | # system-ghc: true 18 | # 19 | # Require a specific version of stack, using version ranges 20 | # require-stack-version: -any # Default 21 | # require-stack-version: ">=1.9" 22 | # 23 | # Override the architecture used by stack, especially useful on Windows 24 | # arch: i386 25 | # arch: x86_64 26 | # 27 | # Extra directories used by stack for building 28 | # extra-include-dirs: [/path/to/dir] 29 | # extra-lib-dirs: [/path/to/dir] 30 | # 31 | # Allow a newer minor version of GHC than the snapshot specifies 32 | # compiler-check: newer-minor 33 | -------------------------------------------------------------------------------- /analyze/test/Fixtures.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | {-# LANGUAGE TypeSynonymInstances #-} 3 | module Fixtures where 4 | 5 | import Analyze.RFrame (RFrame (..), RFrameUpdate (..)) 6 | import Analyze.Values 7 | import qualified Control.Foldl as F 8 | import Data.HashMap.Strict (HashMap) 9 | import qualified Data.HashMap.Strict as HM 10 | import Data.Text (Text) 11 | import qualified Data.Text as T 12 | import Data.Vector (Vector) 13 | import qualified Data.Vector as V 14 | 15 | exampleDecl :: Vector (Text, ValueType) 16 | exampleDecl = V.fromList 17 | [ ("id", ValueTypeInteger) 18 | , ("name", ValueTypeText) 19 | , ("score", ValueTypeDouble) 20 | ] 21 | 22 | exampleHeader :: Vector Text 23 | exampleHeader = V.fromList 24 | [ "id" 25 | , "name" 26 | , "score" 27 | ] 28 | 29 | exampleObj1 :: Vector (Text, Value) 30 | exampleObj1 = V.fromList 31 | [ ("id", ValueInteger 42) 32 | , ("name", ValueText "foo") 33 | , 
("score", ValueDouble 5.0) 34 | ] 35 | 36 | exampleRecord1 :: Vector Value 37 | exampleRecord1 = V.fromList 38 | [ ValueInteger 42 39 | , ValueText "foo" 40 | , ValueDouble 50.0 41 | ] 42 | 43 | exampleObj2 :: Vector (Text, Value) 44 | exampleObj2 = V.fromList 45 | [ ("id", ValueInteger 43) 46 | , ("name", ValueText "bar") 47 | , ("score", ValueDouble 3.0) 48 | ] 49 | 50 | fullUpdate :: RFrameUpdate Text Value 51 | fullUpdate = RFrameUpdate names values 52 | where 53 | names = V.fromList ["id", "name", "score"] 54 | values = V.fromList 55 | [ V.fromList [ValueInteger 42, ValueText "foo", ValueDouble 5.0] 56 | , V.fromList [ValueInteger 43, ValueText "bar", ValueDouble 3.0] 57 | ] 58 | 59 | noNameUpdate :: RFrameUpdate Text Value 60 | noNameUpdate = RFrameUpdate names values 61 | where 62 | names = V.fromList ["id", "score"] 63 | values = V.fromList 64 | [ V.fromList [ValueInteger 42, ValueDouble 5.0] 65 | , V.fromList [ValueInteger 43, ValueDouble 3.0] 66 | ] 67 | 68 | colorUpdate :: RFrameUpdate Text Value 69 | colorUpdate = RFrameUpdate names values 70 | where 71 | names = V.fromList ["color"] 72 | values = V.fromList 73 | [ V.fromList [ValueText "purple"] 74 | , V.fromList [ValueText "orange"] 75 | ] 76 | 77 | colorOneUpdate :: RFrameUpdate Text Value 78 | colorOneUpdate = RFrameUpdate names values 79 | where 80 | names = V.fromList ["color"] 81 | values = V.fromList 82 | [ V.fromList [ValueText "purple"] 83 | ] 84 | 85 | colorSpanishUpdate :: RFrameUpdate Text Value 86 | colorSpanishUpdate = RFrameUpdate names values 87 | where 88 | names = V.fromList ["color", "spanishColor"] 89 | values = V.fromList 90 | [ V.fromList [ValueText "purple", ValueText "lila"] 91 | , V.fromList [ValueText "orange", ValueText "naranja"] 92 | ] 93 | 94 | colorHotUpdate :: RFrameUpdate Text Value 95 | colorHotUpdate = RFrameUpdate names values 96 | where 97 | names = V.fromList ["color/purple", "color/orange"] 98 | values = V.fromList 99 | [ V.fromList [ValueBool True, ValueBool 
False] 100 | , V.fromList [ValueBool False, ValueBool True] 101 | ] 102 | 103 | fullColorUpdate :: RFrameUpdate Text Value 104 | fullColorUpdate = RFrameUpdate names values 105 | where 106 | names = V.fromList ["id", "name", "score", "color"] 107 | values = V.fromList 108 | [ V.fromList [ValueInteger 42, ValueText "foo", ValueDouble 5.0, ValueText "purple"] 109 | , V.fromList [ValueInteger 43, ValueText "bar", ValueDouble 3.0, ValueText "orange"] 110 | ] 111 | 112 | overlapUpdate :: RFrameUpdate Text Value 113 | overlapUpdate = RFrameUpdate names values 114 | where 115 | names = V.fromList ["color", "score"] 116 | values = V.fromList 117 | [ V.fromList [ValueText "purple", ValueDouble 10.0] 118 | , V.fromList [ValueText "orange", ValueDouble 6.0] 119 | ] 120 | 121 | fullOverlapUpdate :: RFrameUpdate Text Value 122 | fullOverlapUpdate = RFrameUpdate names values 123 | where 124 | names = V.fromList ["id", "name", "score", "color"] 125 | values = V.fromList 126 | [ V.fromList [ValueInteger 42, ValueText "foo", ValueDouble 10.0, ValueText "purple"] 127 | , V.fromList [ValueInteger 43, ValueText "bar", ValueDouble 6.0, ValueText "orange"] 128 | ] 129 | 130 | emptyUpdate :: RFrameUpdate Text Value 131 | emptyUpdate = RFrameUpdate V.empty (V.replicate 2 V.empty) 132 | 133 | fixtures :: HashMap Text (RFrameUpdate Text Value) 134 | fixtures = HM.fromList 135 | [ ("full", fullUpdate) 136 | , ("noName", noNameUpdate) 137 | , ("color", colorUpdate) 138 | , ("colorOne", colorOneUpdate) 139 | , ("colorHot", colorHotUpdate) 140 | , ("colorSpanish", colorSpanishUpdate) 141 | , ("empty", emptyUpdate) 142 | , ("fullColor", fullColorUpdate) 143 | , ("overlap", overlapUpdate) 144 | , ("fullOverlap", fullOverlapUpdate) 145 | ] 146 | 147 | exampleCsv :: Text 148 | exampleCsv = "id,name,score\n" `mappend` "42,foo,5.0\n" `mappend` "43,bar,3.0\n" 149 | -------------------------------------------------------------------------------- /analyze/test/Generation.hs: 
-------------------------------------------------------------------------------- 1 | module Generation where 2 | 3 | import Analyze.Common (Data, makeLookup) 4 | import Analyze.RFrame (RFrame (..),RFrameUpdate (..)) 5 | import Analyze.Values 6 | import Data.HashSet (HashSet) 7 | import qualified Data.HashSet as HS 8 | import Data.Text (Text) 9 | import qualified Data.Text as T 10 | import Data.Vector (Vector) 11 | import qualified Data.Vector as V 12 | import Test.QuickCheck 13 | 14 | distinctGenSized :: Data k => Gen k -> Int -> Gen (HashSet k) 15 | distinctGenSized = go HS.empty 16 | where 17 | go s g i | i <= 0 = pure s 18 | | otherwise = do 19 | k <- g `suchThat` \k' -> not (HS.member k' s) 20 | go (HS.insert k s) g (i - 1) 21 | 22 | distinctGen :: Data k => Gen k -> Gen (HashSet k) 23 | distinctGen = sized . distinctGenSized 24 | 25 | declGenSized :: Data k => Gen k -> Gen t -> Int -> Gen (Vector (k, t)) 26 | declGenSized kg tg i = do 27 | nameSet <- distinctGen kg 28 | let nameVec = V.fromList (HS.toList nameSet) 29 | valueTypeVec <- V.replicateM i tg 30 | pure (V.zip nameVec valueTypeVec) 31 | 32 | declGen :: Data k => Gen k -> Gen t -> Gen (Vector (k, t)) 33 | declGen kg tg = sized (declGenSized kg tg) 34 | 35 | rframeGenSized :: Data k => (t -> Gen v) -> Vector (k, t) -> Int -> Gen (RFrame k v) 36 | rframeGenSized prod decl numRows = gen 37 | where 38 | rowGen = sequenceA (prod . snd <$> decl) 39 | allRowsGen = V.replicateM numRows rowGen 40 | keys = fst <$> decl 41 | gen = RFrame keys (makeLookup keys) <$> allRowsGen 42 | 43 | rframeGen :: Data k => (t -> Gen v) -> Vector (k, t) -> Gen (RFrame k v) 44 | rframeGen prod decl = sized (rframeGenSized prod decl) 45 | 46 | -- needed to generate an updated, copied off the 47 | rframeUpdateGenSized :: Data k => (t -> Gen v) -> Vector (k, t) -> Int -> Gen (RFrameUpdate k v) 48 | rframeUpdateGenSized prod decl numRows = gen 49 | where 50 | rowGen = sequenceA (prod . 
snd <$> decl) 51 | allRowsGen = V.replicateM numRows rowGen 52 | keys = fst <$> decl 53 | gen = RFrameUpdate keys <$> allRowsGen 54 | 55 | -- again some machinery 56 | rframeUpdateGen :: Data k => (t -> Gen v) -> Vector (k, t) -> Gen (RFrameUpdate k v) 57 | rframeUpdateGen prod decl = sized (rframeUpdateGenSized prod decl) 58 | 59 | -- Specifics 60 | 61 | nameGen :: Gen Text 62 | nameGen = T.pack <$> listOf (choose ('a', 'z')) 63 | 64 | valueGen :: ValueType -> Gen Value 65 | valueGen ValueTypeText = ValueText <$> nameGen 66 | valueGen ValueTypeInteger = ValueInteger <$> arbitrary 67 | valueGen ValueTypeDouble = ValueDouble <$> arbitrary 68 | valueGen ValueTypeBool = ValueBool <$> arbitrary 69 | 70 | valueTypeGen :: Gen ValueType 71 | valueTypeGen = arbitraryBoundedEnum 72 | 73 | valueDeclGen :: Gen (Vector (Text, ValueType)) 74 | valueDeclGen = declGen nameGen valueTypeGen 75 | 76 | valueRFrameGen :: Gen (RFrame Text Value) 77 | valueRFrameGen = valueDeclGen >>= rframeGen valueGen 78 | 79 | 80 | -- generates an update 81 | valueRFrameUpdateGen :: Gen (RFrameUpdate Text Value) 82 | valueRFrameUpdateGen = valueDeclGen >>= rframeUpdateGen valueGen 83 | 84 | 85 | -- things down there serve to produce updates with only doubles 86 | -- only ouputs Vector (Text, ValueTypeDouble)) 87 | valueDeclGenDouble :: Gen (Vector (Text, ValueType)) 88 | valueDeclGenDouble = declGen nameGen (elements [ValueTypeDouble]) 89 | 90 | -- the actual generator 91 | doubleRFrameUpdateGen :: Gen (RFrameUpdate Text Value) 92 | doubleRFrameUpdateGen = valueDeclGenDouble >>= rframeUpdateGen valueGen-- a frame generator that will only have Double's as data -------------------------------------------------------------------------------- /analyze/test/Spec.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import qualified PropTests as P 4 | import qualified UnitTests as U 5 | import qualified Test.Tasty as Ts 6 | 7 | groupTest :: 
Ts.TestTree 8 | groupTest = Ts.testGroup "Analyze tests" [P.propTests, U.tests] 9 | 10 | main :: IO () 11 | main = do 12 | Ts.defaultMain groupTest -------------------------------------------------------------------------------- /analyze/test/UnitTests.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleInstances #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE TypeSynonymInstances #-} 4 | module UnitTests where 5 | 6 | import qualified Analyze as A 7 | import Analyze.Common ((<&>)) 8 | import Control.Monad.Catch 9 | import qualified Data.HashMap.Strict as HM 10 | import qualified Data.HashSet as HS 11 | import Data.Monoid ((<>)) 12 | import Data.Text (Text) 13 | import qualified Data.Text as T 14 | import Data.Vector (Vector) 15 | import qualified Data.Vector as V 16 | import Fixtures 17 | import Generation 18 | import Test.QuickCheck 19 | import qualified Test.QuickCheck.Property as P 20 | import Test.Tasty 21 | import Test.Tasty.HUnit 22 | import Test.Tasty.QuickCheck 23 | 24 | -- Boilerplate 25 | 26 | propertyIO :: Assertion -> Property 27 | propertyIO action = ioProperty tester 28 | where 29 | tester :: IO P.Result 30 | tester = catch (action >> return P.succeeded) handler 31 | handler (HUnitFailure _ err) = return P.failed { P.reason = err } 32 | 33 | testPropertyIO :: TestName -> Gen a -> (a -> Assertion) -> TestTree 34 | testPropertyIO name g t = testProperty name (propertyIO . 
t <$> g) 35 | 36 | -- Aux 37 | 38 | getUpdateFixture :: Text -> IO (A.RFrameUpdate Text A.Value) 39 | getUpdateFixture name = 40 | case HM.lookup name fixtures of 41 | Just u -> return u 42 | Nothing -> error ("fixture not found: " ++ T.unpack name) 43 | 44 | 45 | getFrameFixture :: Text -> IO (A.RFrame Text A.Value) 46 | getFrameFixture name = A.fromUpdate =<< getUpdateFixture name 47 | 48 | -- Tests 49 | 50 | testFixture :: TestTree 51 | testFixture = testCase "fixture" $ do 52 | frame <- getFrameFixture "full" 53 | A._rframeKeys frame @?= exampleHeader 54 | A.numRows frame @?= 2 55 | A.numCols frame @?= 3 56 | 57 | testRowDecode :: TestTree 58 | testRowDecode = testCase "rowDecode" $ do 59 | frame <- getFrameFixture "full" 60 | let decoder = A.requireWhere "score" A.floating <&> (*2) 61 | result <- sequenceA =<< A.decode decoder frame 62 | V.fromList [10.0, 6.0] @?= result 63 | 64 | testDrop :: TestTree 65 | testDrop = testCase "drop" $ do 66 | original <- getFrameFixture "full" 67 | expected <- getFrameFixture "noName" 68 | A.numCols original @?= 3 69 | A.numCols expected @?= 2 70 | let names = HS.singleton "name" 71 | let actual = A.dropCols (`HS.member` names) original 72 | A._rframeKeys actual @?= A._rframeKeys expected 73 | 74 | testKeep :: TestTree 75 | testKeep = testCase "keep" $ do 76 | original <- getFrameFixture "full" 77 | expected <- getFrameFixture "noName" 78 | A.numCols original @?= 3 79 | A.numCols expected @?= 2 80 | let names = HS.fromList ["id", "score"] 81 | let actual = A.keepCols (`HS.member` names) original 82 | A._rframeKeys actual @?= A._rframeKeys expected 83 | 84 | testUpdateEmpty :: TestTree 85 | testUpdateEmpty = testCase "update empty" $ do 86 | update <- getUpdateFixture "full" 87 | empty <- A.fromUpdate =<< getUpdateFixture "empty" 88 | expected <- A.fromUpdate update 89 | actual <- A.update update empty 90 | actual @?= expected 91 | 92 | testUpdateEmpty2 :: TestTree 93 | testUpdateEmpty2 = testCase "update empty 2" $ do 94 | 
frame <- getFrameFixture "full" 95 | update <- getUpdateFixture "empty" 96 | actual <- A.update update frame 97 | actual @?= frame 98 | 99 | testUpdateAdd :: TestTree 100 | testUpdateAdd = testCase "update add" $ do 101 | frame <- getFrameFixture "full" 102 | update <- getUpdateFixture "color" 103 | expected <- getFrameFixture "fullColor" 104 | actual <- A.update update frame 105 | actual @?= expected 106 | 107 | testUpdateOverlap :: TestTree 108 | testUpdateOverlap = testCase "update overlap" $ do 109 | frame <- getFrameFixture "full" 110 | update <- getUpdateFixture "overlap" 111 | expected <- getFrameFixture "fullOverlap" 112 | actual <- A.update update frame 113 | actual @?= expected 114 | 115 | testTakeRows :: TestTree 116 | testTakeRows = testCase "takeRows" $ do 117 | before <- getFrameFixture "color" 118 | A.numRows before @?= 2 119 | expected <- getFrameFixture "colorOne" 120 | A.numRows expected @?= 1 121 | let actual = A.takeRows 1 before 122 | --A.numRows actual @?= 1 123 | actual @?= expected 124 | 125 | testAddColumn :: TestTree 126 | testAddColumn = testCase "add column" $ do 127 | before <- getFrameFixture "color" 128 | A.numCols before @?= 1 129 | expected <- getFrameFixture "colorSpanish" 130 | A.numCols expected @?= 2 131 | actual <- A.addColumn before "spanishColor" (V.fromList [ A.ValueText "lila", A.ValueText "naranja"]) 132 | --A.numRows actual @?= 1 133 | actual @?= expected 134 | 135 | testOneHot :: TestTree 136 | testOneHot = testCase "one hot" $ do 137 | color <- getFrameFixture "color" 138 | colorHot <- getFrameFixture "colorHot" 139 | actual <- A.oneHot (\k (A.ValueText v) -> k <> "/" <> v) "color" (A.ValueBool True) (A.ValueBool False) color 140 | actual @?= colorHot 141 | 142 | -- Runner 143 | 144 | tests :: TestTree 145 | tests = testGroup "Unit tests" 146 | [ testFixture 147 | , testRowDecode 148 | , testDrop 149 | , testKeep 150 | , testUpdateEmpty 151 | , testUpdateEmpty2 152 | , testUpdateAdd 153 | , testUpdateOverlap 154 | , 
testTakeRows 155 | , testAddColumn 156 | , testOneHot 157 | ] 158 | 159 | main :: IO () 160 | main = defaultMain tests 161 | -------------------------------------------------------------------------------- /datasets/.gitignore: -------------------------------------------------------------------------------- 1 | # CUSTOM IGNORES 2 | ### Datasets in development ### 3 | cifar-10 4 | mnist 5 | 6 | ### NixOS ### 7 | result 8 | 9 | # Created by https://www.gitignore.io/api/vim,emacs,linux,macos,windows,haskell 10 | # Edit at https://www.gitignore.io/?templates=vim,emacs,linux,macos,windows,haskell 11 | 12 | ### Emacs ### 13 | # -*- mode: gitignore; -*- 14 | *~ 15 | \#*\# 16 | /.emacs.desktop 17 | /.emacs.desktop.lock 18 | *.elc 19 | auto-save-list 20 | tramp 21 | .\#* 22 | 23 | # Org-mode 24 | .org-id-locations 25 | *_archive 26 | 27 | # flymake-mode 28 | *_flymake.* 29 | 30 | # eshell files 31 | /eshell/history 32 | /eshell/lastdir 33 | 34 | # elpa packages 35 | /elpa/ 36 | 37 | # reftex files 38 | *.rel 39 | 40 | # AUCTeX auto folder 41 | /auto/ 42 | 43 | # cask packages 44 | .cask/ 45 | dist/ 46 | 47 | # Flycheck 48 | flycheck_*.el 49 | 50 | # server auth directory 51 | /server/ 52 | 53 | # projectiles files 54 | .projectile 55 | 56 | # directory configuration 57 | .dir-locals.el 58 | 59 | # network security 60 | /network-security.data 61 | 62 | 63 | ### Haskell ### 64 | dist 65 | dist-* 66 | cabal-dev 67 | *.o 68 | *.hi 69 | *.chi 70 | *.chs.h 71 | *.dyn_o 72 | *.dyn_hi 73 | .hpc 74 | .hsenv 75 | .cabal-sandbox/ 76 | cabal.sandbox.config 77 | *.prof 78 | *.aux 79 | *.hp 80 | *.eventlog 81 | .stack-work/ 82 | cabal.project.local 83 | cabal.project.local~ 84 | .HTF/ 85 | .ghc.environment.* 86 | 87 | ### Linux ### 88 | 89 | # temporary files which can be created if a process still has a handle open of a deleted file 90 | .fuse_hidden* 91 | 92 | # KDE directory preferences 93 | .directory 94 | 95 | # Linux trash folder which might appear on any partition or disk 96 | 
.Trash-* 97 | 98 | # .nfs files are created when an open file is removed but is still being accessed 99 | .nfs* 100 | 101 | ### macOS ### 102 | # General 103 | .DS_Store 104 | .AppleDouble 105 | .LSOverride 106 | 107 | # Icon must end with two \r 108 | Icon 109 | 110 | # Thumbnails 111 | ._* 112 | 113 | # Files that might appear in the root of a volume 114 | .DocumentRevisions-V100 115 | .fseventsd 116 | .Spotlight-V100 117 | .TemporaryItems 118 | .Trashes 119 | .VolumeIcon.icns 120 | .com.apple.timemachine.donotpresent 121 | 122 | # Directories potentially created on remote AFP share 123 | .AppleDB 124 | .AppleDesktop 125 | Network Trash Folder 126 | Temporary Items 127 | .apdisk 128 | 129 | ### Vim ### 130 | # Swap 131 | [._]*.s[a-v][a-z] 132 | [._]*.sw[a-p] 133 | [._]s[a-rt-v][a-z] 134 | [._]ss[a-gi-z] 135 | [._]sw[a-p] 136 | 137 | # Session 138 | Session.vim 139 | 140 | # Temporary 141 | .netrwhist 142 | # Auto-generated tag files 143 | tags 144 | # Persistent undo 145 | [._]*.un~ 146 | 147 | ### Windows ### 148 | # Windows thumbnail cache files 149 | Thumbs.db 150 | ehthumbs.db 151 | ehthumbs_vista.db 152 | 153 | # Dump file 154 | *.stackdump 155 | 156 | # Folder config file 157 | [Dd]esktop.ini 158 | 159 | # Recycle Bin used on file shares 160 | $RECYCLE.BIN/ 161 | 162 | # Windows Installer files 163 | *.cab 164 | *.msi 165 | *.msix 166 | *.msm 167 | *.msp 168 | 169 | # Windows shortcuts 170 | *.lnk 171 | 172 | # End of https://www.gitignore.io/api/vim,emacs,linux,macos,windows,haskell 173 | -------------------------------------------------------------------------------- /datasets/.travis.yml: -------------------------------------------------------------------------------- 1 | # This Travis job script has been generated by a script via 2 | # 3 | # make_travis_yml_2.hs 'datasets.cabal' 4 | # 5 | # For more information, see https://github.com/hvr/multi-ghc-travis 6 | # 7 | language: c 8 | sudo: false 9 | 10 | git: 11 | submodules: false # whether to recursively 
clone submodules 12 | 13 | cache: 14 | directories: 15 | - $HOME/.cabal/packages 16 | - $HOME/.cabal/store 17 | 18 | before_cache: 19 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/build-reports.log 20 | # remove files that are regenerated by 'cabal update' 21 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/00-index.* 22 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/*.json 23 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.cache 24 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.tar 25 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.tar.idx 26 | 27 | matrix: 28 | include: 29 | - compiler: "ghc-7.6.3" 30 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 31 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.6.3], sources: [hvr-ghc]}} 32 | - compiler: "ghc-7.8.4" 33 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 34 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.8.4], sources: [hvr-ghc]}} 35 | - compiler: "ghc-7.10.2" 36 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 37 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.10.2], sources: [hvr-ghc]}} 38 | - compiler: "ghc-7.10.3" 39 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 40 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.10.3], sources: [hvr-ghc]}} 41 | - compiler: "ghc-8.0.1" 42 | # env: TEST=--disable-tests BENCH=--disable-benchmarks 43 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-8.0.1], sources: [hvr-ghc]}} 44 | 45 | before_install: 46 | - HC=${CC} 47 | - unset CC 48 | - PATH=/opt/ghc/bin:/opt/ghc-ppa-tools/bin:$PATH 49 | - PKGNAME='datasets' 50 | 51 | install: 52 | - cabal --version 53 | - echo "$(${HC} --version) [$(${HC} --print-project-git-commit-id 2> /dev/null || echo '?')]" 54 | - BENCH=${BENCH---enable-benchmarks} 55 | - TEST=${TEST---enable-tests} 56 | - travis_retry cabal update -v 57 | - sed -i 's/^jobs:/-- jobs:/' 
${HOME}/.cabal/config 58 | - rm -fv cabal.project.local 59 | - "echo 'packages: .' > cabal.project" 60 | - rm -f cabal.project.freeze 61 | - cabal new-build -w ${HC} ${TEST} ${BENCH} --dep -j2 62 | - cabal new-build -w ${HC} --disable-tests --disable-benchmarks --dep -j2 63 | 64 | # Here starts the actual work to be performed for the package under test; 65 | # any command which exits with a non-zero exit code causes the build to fail. 66 | script: 67 | - if [ -f configure.ac ]; then autoreconf -i; fi 68 | - rm -rf dist/ 69 | - cabal sdist # test that a source-distribution can be generated 70 | - cd dist/ 71 | - SRCTAR=(${PKGNAME}-*.tar.gz) 72 | - SRC_BASENAME="${SRCTAR/%.tar.gz}" 73 | - tar -xvf "./$SRC_BASENAME.tar.gz" 74 | - cd "$SRC_BASENAME/" 75 | ## from here on, CWD is inside the extracted source-tarball 76 | - rm -fv cabal.project.local 77 | - "echo 'packages: .' > cabal.project" 78 | # this builds all libraries and executables (without tests/benchmarks) 79 | - rm -f cabal.project.freeze 80 | - cabal new-build -w ${HC} --disable-tests --disable-benchmarks 81 | # this builds all libraries and executables (including tests/benchmarks) 82 | # - rm -rf ./dist-newstyle 83 | - cabal new-build -w ${HC} ${TEST} ${BENCH} 84 | 85 | # there's no 'cabal new-test' yet, so let's emulate for now 86 | - TESTS=( $(awk 'tolower($0) ~ /^test-suite / { print $2 }' *.cabal) ) 87 | - if [ "$TEST" != "--enable-tests" ]; then TESTS=(); fi 88 | - shopt -s globstar; 89 | RC=true; for T in ${TESTS[@]}; do echo "== $T =="; 90 | if dist-newstyle/build/**/$SRC_BASENAME/**/build/$T/$T; then echo "= $T OK ="; 91 | else echo "= $T FAILED ="; RC=false; fi; done; $RC 92 | 93 | # EOF 94 | -------------------------------------------------------------------------------- /datasets/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2016 Tom Nielsen 4 | 5 | Permission is hereby granted, free of charge, to any person 
obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /datasets/README.md: -------------------------------------------------------------------------------- 1 | datasets: data sets for statistics and machine learning, in Haskell 2 | ===== 3 | 4 | [![Hackage](https://img.shields.io/hackage/v/datasets.svg)](https://hackage.haskell.org/package/datasets) [![Build Status](https://secure.travis-ci.org/glutamate/datasets.svg)](http://travis-ci.org/glutamate/datasets) 5 | 6 | This library provides easy access in Haskell to a series of data sets 7 | for Statistics and Machine learning. 
8 | 9 | Most of these datasets come from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/) 10 | ([Mirror](http://mlr.cs.umass.edu/ml/)) 11 | 12 | ## Usage 13 | 14 | ```haskell 15 | 16 | import Numeric.Datasets (getDataset) 17 | import Numeric.Datasets.Iris (iris) 18 | import Numeric.Datasets.Abalone (abalone) 19 | 20 | main = do 21 | 22 | -- The Iris data set is embedded 23 | print (length iris) 24 | print (head iris) 25 | 26 | -- The Abalone dataset is fetched 27 | abas <- getDataset abalone 28 | print (length abas) 29 | print (head abas) 30 | 31 | ``` 32 | -------------------------------------------------------------------------------- /datasets/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /datasets/bench/Main.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import qualified Numeric.Dataloader.Benchmark 4 | 5 | main :: IO () 6 | main = Numeric.Dataloader.Benchmark.main 7 | 8 | -------------------------------------------------------------------------------- /datasets/bench/Numeric/Dataloader/Benchmark.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -Wno-orphans #-} 2 | {-# LANGUAGE DataKinds #-} 3 | {-# LANGUAGE LambdaCase #-} 4 | module Numeric.Dataloader.Benchmark where 5 | 6 | import Control.DeepSeq 7 | import Control.Concurrent 8 | import Numeric.Dataloader 9 | import Numeric.Datasets 10 | import Numeric.Datasets.Abalone 11 | import Streaming (Of) 12 | import qualified Streaming.Prelude as S 13 | import qualified System.Random.MWC as MWC 14 | 15 | -- ImageLoading bench 16 | import Numeric.Datasets.Internal.Streaming 17 | import Numeric.Datasets.CIFAR10 18 | import System.FilePath 19 | import System.Directory 20 | import Codec.Picture 21 | 
import Control.Exception.Safe 22 | import Text.Read 23 | import System.IO.Unsafe 24 | import qualified Data.List.NonEmpty as NonEmpty 25 | 26 | import Criterion.Main 27 | 28 | instance NFData Abalone 29 | instance NFData Sex 30 | instance (NFData a, NFData b) => NFData (Of a b) 31 | 32 | 33 | mkDataloaderWithIx :: Dataset a -> IO (Dataloader a a) 34 | mkDataloaderWithIx ds = MWC.withSystemRandom $ \g -> do 35 | ixl <- uniformIxline ds g 36 | pure $ Dataloader 1 (Just ixl) ds id 37 | 38 | 39 | main :: IO () 40 | main = do 41 | dl <- mkDataloaderWithIx abalone 42 | cifar10l <- cifar10ImageLoader 43 | defaultMain 44 | [ bgroup "Numeric.Dataloader" 45 | [ bench "making an ixline" $ nfIO $ MWC.withSystemRandom (uniformIxline abalone) 46 | , bgroup "testStream" 47 | [ bench "with ixline" . nfIO $ foldStream (S.take 100 $ slow . stream $ dl) 48 | , bench "no ixline" . nfIO $ foldStream (S.take 100 $ slow . stream $ Dataloader 1 Nothing abalone id) 49 | ] 50 | , bench "cifar10 image folder" $ nfIO $ foldStream $ S.take 1000 $ stream cifar10l 51 | , bench "cifar10 batch folder" $ nfIO $ foldStream $ S.take 1 $ batchStream (cifar10l { batchSize = 1000 }) 52 | ] 53 | ] 54 | 55 | slow :: S.Stream (Of a) IO r -> S.Stream (Of a) IO r 56 | slow = S.mapM (\a -> threadDelay 2 >> pure a) 57 | 58 | foldStream :: Show a => S.Stream (Of a) IO () -> IO (Of [a] ()) 59 | foldStream = S.fold (\memo a -> a:memo) [] id 60 | 61 | ------------------------------------------------------------------------------- 62 | -- Image Folder loading 63 | 64 | 65 | 66 | 67 | -- | dataloading the image folder dataset 68 | cifar10ImageLoader :: IO (Dataloader (String, FilePath) CIFARImage) 69 | cifar10ImageLoader = do 70 | xdgCache <- getXdgDirectory XdgCache "datasets-hs" 71 | let imfolder = xdgCache "cifar-10-imagefolder" 72 | pure $ Dataloader 1 Nothing (imgFolderDataset imfolder) load 73 | 74 | where 75 | labelFolders :: NonEmpty.NonEmpty String 76 | labelFolders = show <$> NonEmpty.fromList 
[minBound..maxBound::Label] 77 | 78 | imgFolderDataset :: FilePath -> Dataset (String, FilePath) 79 | imgFolderDataset fp = 80 | Dataset 81 | (ImgFolder fp labelFolders) 82 | Nothing 83 | Nothing 84 | (ImageFolder labelFolders) 85 | 86 | load :: (String, FilePath) -> CIFARImage 87 | load (str, fp) = CIFARImage (img, lbl) 88 | where 89 | lbl :: Label 90 | lbl = either error id $ readEither str 91 | 92 | img :: Image PixelRGB8 93 | img = case unsafePerformIO (readPng fp) of 94 | Left err -> error err 95 | Right (ImageRGB8 i) -> i 96 | Right _ -> error "decoded image was not rgb8" 97 | 98 | -------------------------------------------------------------------------------- /datasets/bin/cifar10/Main.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import Codec.Picture (writePng) 4 | import Numeric.Datasets.Internal.Streaming (streamDataset) 5 | import Numeric.Datasets.CIFAR10 (CIFARImage(..), cifar10) 6 | import System.FilePath 7 | import System.Directory (XdgDirectory(..), getXdgDirectory, createDirectoryIfMissing, listDirectory) 8 | import qualified Streaming.Prelude as S 9 | import System.ProgressBar (newProgressBar, defStyle, Progress(..), incProgress) 10 | 11 | main :: IO () 12 | main = provision 13 | 14 | -- may be required if you don't already have CIFAR10 loaded 15 | provision :: IO () 16 | provision = do 17 | xdgCache <- getXdgDirectory XdgCache "datasets-hs" 18 | let imfolder = xdgCache "cifar-10-imagefolder" 19 | createDirectoryIfMissing True imfolder 20 | -- progress bar 21 | let n = 60000 22 | pb <- newProgressBar defStyle 10 (Progress 0 n ()) 23 | let 24 | go :: FilePath -> CIFARImage -> IO () 25 | go cachefolder (CIFARImage (im, lbl)) = do 26 | let labelfolder = cachefolder show lbl 27 | createDirectoryIfMissing True labelfolder 28 | ix <- length <$> listDirectory labelfolder 29 | let 30 | fname = show lbl ++ "_" ++ show ix ++ ".png" 31 | writePng (labelfolder fname) im 32 | incProgress pb 1 
-- increment progress bar 33 | 34 | -- build the image folder 35 | S.mapM_ (go imfolder) $ streamDataset cifar10 36 | 37 | 38 | -------------------------------------------------------------------------------- /datasets/changelog.md: -------------------------------------------------------------------------------- 1 | 0.4 2 | * Get rid of dependency on 'data-default' (introduced by previous versions of 'req') 3 | 4 | * Bump 'req' dependency to 2.0.0 5 | 0.3 6 | * 'datasets' hosted within the DataHaskell/dh-core project 7 | 8 | * use 'req' for HTTP and HTTPS requests, instead of 'wreq' 9 | 10 | * Mushroom and Titanic datasets 11 | 12 | * Restructured top-level documentation 13 | 14 | * Removed 'csvDatasetPreprocess' and added 'withPreprocess'. Now bytestring preprocessing is more compositional, i.e. 'withPreprocess' can be used with JSON datasets as well. 15 | 16 | 17 | 0.2.5 18 | 19 | * Old Faithful matches R dataset 20 | 21 | 0.2.4 22 | 23 | * Netflix dataset 24 | 25 | 0.2.3 26 | 27 | * Coal dataset 28 | 29 | * New internal API 30 | 31 | * Ord instance for IrisClass 32 | 33 | 0.2.2 34 | 35 | * Enum, Bounded instances for IrisClass 36 | 37 | * Gapminder dataset 38 | 39 | * Use wreq for HTTP and HTTPS requests 40 | 41 | 0.2.1 42 | 43 | * Wine quality datasets 44 | 45 | * Vocabulary, UN, States datasets 46 | 47 | * CO2, Sunspots and Quakes datasets 48 | 49 | 0.2.0.3 50 | 51 | * Further GHC portability 52 | 53 | 0.2.0.2 54 | 55 | * Improve GHC portability 56 | 57 | 0.2.0.1 58 | 59 | * Bugfix: include embedded data files in cabal extra-source-files 60 | 61 | 0.2 62 | 63 | * iris dataset is a pure value (with file-embed) 64 | 65 | * Michelson, Nightingale and BostonHousing datasets 66 | -------------------------------------------------------------------------------- /datasets/datafiles/arff/README: -------------------------------------------------------------------------------- 1 | This folder contains many weka ARFF files used for testing the ARFF parser. 
2 | 3 | The files were obtained from: https://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html -------------------------------------------------------------------------------- /datasets/datafiles/arff/contact-lens.arff: -------------------------------------------------------------------------------- 1 | % 1. Title: Database for fitting contact lenses 2 | % 3 | % 2. Sources: 4 | % (a) Cendrowska, J. "PRISM: An algorithm for inducing modular rules", 5 | % International Journal of Man-Machine Studies, 1987, 27, 349-370 6 | % (b) Donor: Benoit Julien (Julien@ce.cmu.edu) 7 | % (c) Date: 1 August 1990 8 | % 9 | % 3. Past Usage: 10 | % 1. See above. 11 | % 2. Witten, I. H. & MacDonald, B. A. (1988). Using concept 12 | % learning for knowledge acquisition. International Journal of 13 | % Man-Machine Studies, 27, (pp. 349-370). 14 | % 15 | % Notes: This database is complete (all possible combinations of 16 | % attribute-value pairs are represented). 17 | % 18 | % Each instance is complete and correct. 19 | % 20 | % 9 rules cover the training set. 21 | % 22 | % 4. Relevant Information Paragraph: 23 | % The examples are complete and noise free. 24 | % The examples highly simplified the problem. The attributes do not 25 | % fully describe all the factors affecting the decision as to which type, 26 | % if any, to fit. 27 | % 28 | % 5. Number of Instances: 24 29 | % 30 | % 6. Number of Attributes: 4 (all nominal) 31 | % 32 | % 7. Attribute Information: 33 | % -- 3 Classes 34 | % 1 : the patient should be fitted with hard contact lenses, 35 | % 2 : the patient should be fitted with soft contact lenses, 36 | % 1 : the patient should not be fitted with contact lenses. 37 | % 38 | % 1. age of the patient: (1) young, (2) pre-presbyopic, (3) presbyopic 39 | % 2. spectacle prescription: (1) myope, (2) hypermetrope 40 | % 3. astigmatic: (1) no, (2) yes 41 | % 4. tear production rate: (1) reduced, (2) normal 42 | % 43 | % 8. Number of Missing Attribute Values: 0 44 | % 45 | % 9. 
Class Distribution: 46 | % 1. hard contact lenses: 4 47 | % 2. soft contact lenses: 5 48 | % 3. no contact lenses: 15 49 | 50 | @relation contact-lenses 51 | 52 | @attribute age {young, pre-presbyopic, presbyopic} 53 | @attribute spectacle-prescrip {myope, hypermetrope} 54 | @attribute astigmatism {no, yes} 55 | @attribute tear-prod-rate {reduced, normal} 56 | @attribute contact-lenses {soft, hard, none} 57 | 58 | @data 59 | % 60 | % 24 instances 61 | % 62 | young,myope,no,reduced,none 63 | young,myope,no,normal,soft 64 | young,myope,yes,reduced,none 65 | young,myope,yes,normal,hard 66 | young,hypermetrope,no,reduced,none 67 | young,hypermetrope,no,normal,soft 68 | young,hypermetrope,yes,reduced,none 69 | young,hypermetrope,yes,normal,hard 70 | pre-presbyopic,myope,no,reduced,none 71 | pre-presbyopic,myope,no,normal,soft 72 | pre-presbyopic,myope,yes,reduced,none 73 | pre-presbyopic,myope,yes,normal,hard 74 | pre-presbyopic,hypermetrope,no,reduced,none 75 | pre-presbyopic,hypermetrope,no,normal,soft 76 | pre-presbyopic,hypermetrope,yes,reduced,none 77 | pre-presbyopic,hypermetrope,yes,normal,none 78 | presbyopic,myope,no,reduced,none 79 | presbyopic,myope,no,normal,none 80 | presbyopic,myope,yes,reduced,none 81 | presbyopic,myope,yes,normal,hard 82 | presbyopic,hypermetrope,no,reduced,none 83 | presbyopic,hypermetrope,no,normal,soft 84 | presbyopic,hypermetrope,yes,reduced,none 85 | presbyopic,hypermetrope,yes,normal,none 86 | -------------------------------------------------------------------------------- /datasets/datafiles/arff/cpu.arff: -------------------------------------------------------------------------------- 1 | % 2 | % As used by Kilpatrick, D. & Cameron-Jones, M. (1998). Numeric prediction 3 | % using instance-based learning with encoding length selection. In Progress 4 | % in Connectionist-Based Information Systems. Singapore: Springer-Verlag. 
5 | % 6 | % Deleted "vendor" attribute to make data consistent with with what we 7 | % used in the data mining book. 8 | % 9 | @relation 'cpu' 10 | @attribute MYCT real 11 | @attribute MMIN real 12 | @attribute MMAX real 13 | @attribute CACH real 14 | @attribute CHMIN real 15 | @attribute CHMAX real 16 | @attribute class real 17 | @data 18 | 125,256,6000,256,16,128,198 19 | 29,8000,32000,32,8,32,269 20 | 29,8000,32000,32,8,32,220 21 | 29,8000,32000,32,8,32,172 22 | 29,8000,16000,32,8,16,132 23 | 26,8000,32000,64,8,32,318 24 | 23,16000,32000,64,16,32,367 25 | 23,16000,32000,64,16,32,489 26 | 23,16000,64000,64,16,32,636 27 | 23,32000,64000,128,32,64,1144 28 | 400,1000,3000,0,1,2,38 29 | 400,512,3500,4,1,6,40 30 | 60,2000,8000,65,1,8,92 31 | 50,4000,16000,65,1,8,138 32 | 350,64,64,0,1,4,10 33 | 200,512,16000,0,4,32,35 34 | 167,524,2000,8,4,15,19 35 | 143,512,5000,0,7,32,28 36 | 143,1000,2000,0,5,16,31 37 | 110,5000,5000,142,8,64,120 38 | 143,1500,6300,0,5,32,30 39 | 143,3100,6200,0,5,20,33 40 | 143,2300,6200,0,6,64,61 41 | 110,3100,6200,0,6,64,76 42 | 320,128,6000,0,1,12,23 43 | 320,512,2000,4,1,3,69 44 | 320,256,6000,0,1,6,33 45 | 320,256,3000,4,1,3,27 46 | 320,512,5000,4,1,5,77 47 | 320,256,5000,4,1,6,27 48 | 25,1310,2620,131,12,24,274 49 | 25,1310,2620,131,12,24,368 50 | 50,2620,10480,30,12,24,32 51 | 50,2620,10480,30,12,24,63 52 | 56,5240,20970,30,12,24,106 53 | 64,5240,20970,30,12,24,208 54 | 50,500,2000,8,1,4,20 55 | 50,1000,4000,8,1,5,29 56 | 50,2000,8000,8,1,5,71 57 | 50,1000,4000,8,3,5,26 58 | 50,1000,8000,8,3,5,36 59 | 50,2000,16000,8,3,5,40 60 | 50,2000,16000,8,3,6,52 61 | 50,2000,16000,8,3,6,60 62 | 133,1000,12000,9,3,12,72 63 | 133,1000,8000,9,3,12,72 64 | 810,512,512,8,1,1,18 65 | 810,1000,5000,0,1,1,20 66 | 320,512,8000,4,1,5,40 67 | 200,512,8000,8,1,8,62 68 | 700,384,8000,0,1,1,24 69 | 700,256,2000,0,1,1,24 70 | 140,1000,16000,16,1,3,138 71 | 200,1000,8000,0,1,2,36 72 | 110,1000,4000,16,1,2,26 73 | 110,1000,12000,16,1,2,60 74 | 220,1000,8000,16,1,2,71 
75 | 800,256,8000,0,1,4,12 76 | 800,256,8000,0,1,4,14 77 | 800,256,8000,0,1,4,20 78 | 800,256,8000,0,1,4,16 79 | 800,256,8000,0,1,4,22 80 | 125,512,1000,0,8,20,36 81 | 75,2000,8000,64,1,38,144 82 | 75,2000,16000,64,1,38,144 83 | 75,2000,16000,128,1,38,259 84 | 90,256,1000,0,3,10,17 85 | 105,256,2000,0,3,10,26 86 | 105,1000,4000,0,3,24,32 87 | 105,2000,4000,8,3,19,32 88 | 75,2000,8000,8,3,24,62 89 | 75,3000,8000,8,3,48,64 90 | 175,256,2000,0,3,24,22 91 | 300,768,3000,0,6,24,36 92 | 300,768,3000,6,6,24,44 93 | 300,768,12000,6,6,24,50 94 | 300,768,4500,0,1,24,45 95 | 300,384,12000,6,1,24,53 96 | 300,192,768,6,6,24,36 97 | 180,768,12000,6,1,31,84 98 | 330,1000,3000,0,2,4,16 99 | 300,1000,4000,8,3,64,38 100 | 300,1000,16000,8,2,112,38 101 | 330,1000,2000,0,1,2,16 102 | 330,1000,4000,0,3,6,22 103 | 140,2000,4000,0,3,6,29 104 | 140,2000,4000,0,4,8,40 105 | 140,2000,4000,8,1,20,35 106 | 140,2000,32000,32,1,20,134 107 | 140,2000,8000,32,1,54,66 108 | 140,2000,32000,32,1,54,141 109 | 140,2000,32000,32,1,54,189 110 | 140,2000,4000,8,1,20,22 111 | 57,4000,16000,1,6,12,132 112 | 57,4000,24000,64,12,16,237 113 | 26,16000,32000,64,16,24,465 114 | 26,16000,32000,64,8,24,465 115 | 26,8000,32000,0,8,24,277 116 | 26,8000,16000,0,8,16,185 117 | 480,96,512,0,1,1,6 118 | 203,1000,2000,0,1,5,24 119 | 115,512,6000,16,1,6,45 120 | 1100,512,1500,0,1,1,7 121 | 1100,768,2000,0,1,1,13 122 | 600,768,2000,0,1,1,16 123 | 400,2000,4000,0,1,1,32 124 | 400,4000,8000,0,1,1,32 125 | 900,1000,1000,0,1,2,11 126 | 900,512,1000,0,1,2,11 127 | 900,1000,4000,4,1,2,18 128 | 900,1000,4000,8,1,2,22 129 | 900,2000,4000,0,3,6,37 130 | 225,2000,4000,8,3,6,40 131 | 225,2000,4000,8,3,6,34 132 | 180,2000,8000,8,1,6,50 133 | 185,2000,16000,16,1,6,76 134 | 180,2000,16000,16,1,6,66 135 | 225,1000,4000,2,3,6,24 136 | 25,2000,12000,8,1,4,49 137 | 25,2000,12000,16,3,5,66 138 | 17,4000,16000,8,6,12,100 139 | 17,4000,16000,32,6,12,133 140 | 1500,768,1000,0,0,0,12 141 | 1500,768,2000,0,0,0,18 142 | 800,768,2000,0,0,0,20 143 
| 50,2000,4000,0,3,6,27 144 | 50,2000,8000,8,3,6,45 145 | 50,2000,8000,8,1,6,56 146 | 50,2000,16000,24,1,6,70 147 | 50,2000,16000,24,1,6,80 148 | 50,8000,16000,48,1,10,136 149 | 100,1000,8000,0,2,6,16 150 | 100,1000,8000,24,2,6,26 151 | 100,1000,8000,24,3,6,32 152 | 50,2000,16000,12,3,16,45 153 | 50,2000,16000,24,6,16,54 154 | 50,2000,16000,24,6,16,65 155 | 150,512,4000,0,8,128,30 156 | 115,2000,8000,16,1,3,50 157 | 115,2000,4000,2,1,5,40 158 | 92,2000,8000,32,1,6,62 159 | 92,2000,8000,32,1,6,60 160 | 92,2000,8000,4,1,6,50 161 | 75,4000,16000,16,1,6,66 162 | 60,4000,16000,32,1,6,86 163 | 60,2000,16000,64,5,8,74 164 | 60,4000,16000,64,5,8,93 165 | 50,4000,16000,64,5,10,111 166 | 72,4000,16000,64,8,16,143 167 | 72,2000,8000,16,6,8,105 168 | 40,8000,16000,32,8,16,214 169 | 40,8000,32000,64,8,24,277 170 | 35,8000,32000,64,8,24,370 171 | 38,16000,32000,128,16,32,510 172 | 48,4000,24000,32,8,24,214 173 | 38,8000,32000,64,8,24,326 174 | 30,16000,32000,256,16,24,510 175 | 112,1000,1000,0,1,4,8 176 | 84,1000,2000,0,1,6,12 177 | 56,1000,4000,0,1,6,17 178 | 56,2000,6000,0,1,8,21 179 | 56,2000,8000,0,1,8,24 180 | 56,4000,8000,0,1,8,34 181 | 56,4000,12000,0,1,8,42 182 | 56,4000,16000,0,1,8,46 183 | 38,4000,8000,32,16,32,51 184 | 38,4000,8000,32,16,32,116 185 | 38,8000,16000,64,4,8,100 186 | 38,8000,24000,160,4,8,140 187 | 38,4000,16000,128,16,32,212 188 | 200,1000,2000,0,1,2,25 189 | 200,1000,4000,0,1,4,30 190 | 200,2000,8000,64,1,5,41 191 | 250,512,4000,0,1,7,25 192 | 250,512,4000,0,4,7,50 193 | 250,1000,16000,1,1,8,50 194 | 160,512,4000,2,1,5,30 195 | 160,512,2000,2,3,8,32 196 | 160,1000,4000,8,1,14,38 197 | 160,1000,8000,16,1,14,60 198 | 160,2000,8000,32,1,13,109 199 | 240,512,1000,8,1,3,6 200 | 240,512,2000,8,1,5,11 201 | 105,2000,4000,8,3,8,22 202 | 105,2000,6000,16,6,16,33 203 | 105,2000,8000,16,4,14,58 204 | 52,4000,16000,32,4,12,130 205 | 70,4000,12000,8,6,8,75 206 | 59,4000,12000,32,6,12,113 207 | 59,8000,16000,64,12,24,188 208 | 26,8000,24000,32,8,16,173 209 | 
26,8000,32000,64,12,16,248 210 | 26,8000,32000,128,24,32,405 211 | 116,2000,8000,32,5,28,70 212 | 50,2000,32000,24,6,26,114 213 | 50,2000,32000,48,26,52,208 214 | 50,2000,32000,112,52,104,307 215 | 50,4000,32000,112,52,104,397 216 | 30,8000,64000,96,12,176,915 217 | 30,8000,64000,128,12,176,1150 218 | 180,262,4000,0,1,3,12 219 | 180,512,4000,0,1,3,14 220 | 180,262,4000,0,1,3,18 221 | 180,512,4000,0,1,3,21 222 | 124,1000,8000,0,1,8,42 223 | 98,1000,8000,32,2,8,46 224 | 125,2000,8000,0,2,14,52 225 | 480,512,8000,32,0,0,67 226 | 480,1000,4000,0,0,0,45 -------------------------------------------------------------------------------- /datasets/datafiles/arff/cpu.with.vendor.arff: -------------------------------------------------------------------------------- 1 | % 2 | % As used by Kilpatrick, D. & Cameron-Jones, M. (1998). Numeric prediction 3 | % using instance-based learning with encoding length selection. In Progress 4 | % in Connectionist-Based Information Systems. Singapore: Springer-Verlag. 
5 | % 6 | 7 | @relation 'cpu' 8 | @attribute vendor { adviser, amdahl, apollo, basf, bti, burroughs, c.r.d, cdc, cambex, dec, dg, formation, four-phase, gould, hp, harris, honeywell, ibm, ipl, magnuson, microdata, nas, ncr, nixdorf, perkin-elmer, prime, siemens, sperry, sratus, wang} 9 | @attribute MYCT real 10 | @attribute MMIN real 11 | @attribute MMAX real 12 | @attribute CACH real 13 | @attribute CHMIN real 14 | @attribute CHMAX real 15 | @attribute class real 16 | @data 17 | adviser,125,256,6000,256,16,128,199 18 | amdahl,29,8000,32000,32,8,32,253 19 | amdahl,29,8000,32000,32,8,32,253 20 | amdahl,29,8000,32000,32,8,32,253 21 | amdahl,29,8000,16000,32,8,16,132 22 | amdahl,26,8000,32000,64,8,32,290 23 | amdahl,23,16000,32000,64,16,32,381 24 | amdahl,23,16000,32000,64,16,32,381 25 | amdahl,23,16000,64000,64,16,32,749 26 | amdahl,23,32000,64000,128,32,64,1238 27 | apollo,400,1000,3000,0,1,2,23 28 | apollo,400,512,3500,4,1,6,24 29 | basf,60,2000,8000,65,1,8,70 30 | basf,50,4000,16000,65,1,8,117 31 | bti,350,64,64,0,1,4,15 32 | bti,200,512,16000,0,4,32,64 33 | burroughs,167,524,2000,8,4,15,23 34 | burroughs,143,512,5000,0,7,32,29 35 | burroughs,143,1000,2000,0,5,16,22 36 | burroughs,110,5000,5000,142,8,64,124 37 | burroughs,143,1500,6300,0,5,32,35 38 | burroughs,143,3100,6200,0,5,20,39 39 | burroughs,143,2300,6200,0,6,64,40 40 | burroughs,110,3100,6200,0,6,64,45 41 | c.r.d,320,128,6000,0,1,12,28 42 | c.r.d,320,512,2000,4,1,3,21 43 | c.r.d,320,256,6000,0,1,6,28 44 | c.r.d,320,256,3000,4,1,3,22 45 | c.r.d,320,512,5000,4,1,5,28 46 | c.r.d,320,256,5000,4,1,6,27 47 | cdc,25,1310,2620,131,12,24,102 48 | cdc,25,1310,2620,131,12,24,102 49 | cdc,50,2620,10480,30,12,24,74 50 | cdc,50,2620,10480,30,12,24,74 51 | cdc,56,5240,20970,30,12,24,138 52 | cdc,64,5240,20970,30,12,24,136 53 | cdc,50,500,2000,8,1,4,23 54 | cdc,50,1000,4000,8,1,5,29 55 | cdc,50,2000,8000,8,1,5,44 56 | cambex,50,1000,4000,8,3,5,30 57 | cambex,50,1000,8000,8,3,5,41 58 | cambex,50,2000,16000,8,3,5,74 59 | 
cambex,50,2000,16000,8,3,6,74 60 | cambex,50,2000,16000,8,3,6,74 61 | dec,133,1000,12000,9,3,12,54 62 | dec,133,1000,8000,9,3,12,41 63 | dec,810,512,512,8,1,1,18 64 | dec,810,1000,5000,0,1,1,28 65 | dec,320,512,8000,4,1,5,36 66 | dec,200,512,8000,8,1,8,38 67 | dg,700,384,8000,0,1,1,34 68 | dg,700,256,2000,0,1,1,19 69 | dg,140,1000,16000,16,1,3,72 70 | dg,200,1000,8000,0,1,2,36 71 | dg,110,1000,4000,16,1,2,30 72 | dg,110,1000,12000,16,1,2,56 73 | dg,220,1000,8000,16,1,2,42 74 | formation,800,256,8000,0,1,4,34 75 | formation,800,256,8000,0,1,4,34 76 | formation,800,256,8000,0,1,4,34 77 | formation,800,256,8000,0,1,4,34 78 | formation,800,256,8000,0,1,4,34 79 | four-phase,125,512,1000,0,8,20,19 80 | gould,75,2000,8000,64,1,38,75 81 | gould,75,2000,16000,64,1,38,113 82 | gould,75,2000,16000,128,1,38,157 83 | hp,90,256,1000,0,3,10,18 84 | hp,105,256,2000,0,3,10,20 85 | hp,105,1000,4000,0,3,24,28 86 | hp,105,2000,4000,8,3,19,33 87 | hp,75,2000,8000,8,3,24,47 88 | hp,75,3000,8000,8,3,48,54 89 | hp,175,256,2000,0,3,24,20 90 | harris,300,768,3000,0,6,24,23 91 | harris,300,768,3000,6,6,24,25 92 | harris,300,768,12000,6,6,24,52 93 | harris,300,768,4500,0,1,24,27 94 | harris,300,384,12000,6,1,24,50 95 | harris,300,192,768,6,6,24,18 96 | harris,180,768,12000,6,1,31,53 97 | honeywell,330,1000,3000,0,2,4,23 98 | honeywell,300,1000,4000,8,3,64,30 99 | honeywell,300,1000,16000,8,2,112,73 100 | honeywell,330,1000,2000,0,1,2,20 101 | honeywell,330,1000,4000,0,3,6,25 102 | honeywell,140,2000,4000,0,3,6,28 103 | honeywell,140,2000,4000,0,4,8,29 104 | honeywell,140,2000,4000,8,1,20,32 105 | honeywell,140,2000,32000,32,1,20,175 106 | honeywell,140,2000,8000,32,1,54,57 107 | honeywell,140,2000,32000,32,1,54,181 108 | honeywell,140,2000,32000,32,1,54,181 109 | honeywell,140,2000,4000,8,1,20,32 110 | ibm,57,4000,16000,1,6,12,82 111 | ibm,57,4000,24000,64,12,16,171 112 | ibm,26,16000,32000,64,16,24,361 113 | ibm,26,16000,32000,64,8,24,350 114 | ibm,26,8000,32000,0,8,24,220 115 | 
ibm,26,8000,16000,0,8,16,113 116 | ibm,480,96,512,0,1,1,15 117 | ibm,203,1000,2000,0,1,5,21 118 | ibm,115,512,6000,16,1,6,35 119 | ibm,1100,512,1500,0,1,1,18 120 | ibm,1100,768,2000,0,1,1,20 121 | ibm,600,768,2000,0,1,1,20 122 | ibm,400,2000,4000,0,1,1,28 123 | ibm,400,4000,8000,0,1,1,45 124 | ibm,900,1000,1000,0,1,2,18 125 | ibm,900,512,1000,0,1,2,17 126 | ibm,900,1000,4000,4,1,2,26 127 | ibm,900,1000,4000,8,1,2,28 128 | ibm,900,2000,4000,0,3,6,28 129 | ibm,225,2000,4000,8,3,6,31 130 | ibm,225,2000,4000,8,3,6,31 131 | ibm,180,2000,8000,8,1,6,42 132 | ibm,185,2000,16000,16,1,6,76 133 | ibm,180,2000,16000,16,1,6,76 134 | ibm,225,1000,4000,2,3,6,26 135 | ibm,25,2000,12000,8,1,4,59 136 | ibm,25,2000,12000,16,3,5,65 137 | ibm,17,4000,16000,8,6,12,101 138 | ibm,17,4000,16000,32,6,12,116 139 | ibm,1500,768,1000,0,0,0,18 140 | ibm,1500,768,2000,0,0,0,20 141 | ibm,800,768,2000,0,0,0,20 142 | ipl,50,2000,4000,0,3,6,30 143 | ipl,50,2000,8000,8,3,6,44 144 | ipl,50,2000,8000,8,1,6,44 145 | ipl,50,2000,16000,24,1,6,82 146 | ipl,50,2000,16000,24,1,6,82 147 | ipl,50,8000,16000,48,1,10,128 148 | magnuson,100,1000,8000,0,2,6,37 149 | magnuson,100,1000,8000,24,2,6,46 150 | magnuson,100,1000,8000,24,3,6,46 151 | magnuson,50,2000,16000,12,3,16,80 152 | magnuson,50,2000,16000,24,6,16,88 153 | magnuson,50,2000,16000,24,6,16,88 154 | microdata,150,512,4000,0,8,128,33 155 | nas,115,2000,8000,16,1,3,46 156 | nas,115,2000,4000,2,1,5,29 157 | nas,92,2000,8000,32,1,6,53 158 | nas,92,2000,8000,32,1,6,53 159 | nas,92,2000,8000,4,1,6,41 160 | nas,75,4000,16000,16,1,6,86 161 | nas,60,4000,16000,32,1,6,95 162 | nas,60,2000,16000,64,5,8,107 163 | nas,60,4000,16000,64,5,8,117 164 | nas,50,4000,16000,64,5,10,119 165 | nas,72,4000,16000,64,8,16,120 166 | nas,72,2000,8000,16,6,8,48 167 | nas,40,8000,16000,32,8,16,126 168 | nas,40,8000,32000,64,8,24,266 169 | nas,35,8000,32000,64,8,24,270 170 | nas,38,16000,32000,128,16,32,426 171 | nas,48,4000,24000,32,8,24,151 172 | nas,38,8000,32000,64,8,24,267 173 | 
nas,30,16000,32000,256,16,24,603 174 | ncr,112,1000,1000,0,1,4,19 175 | ncr,84,1000,2000,0,1,6,21 176 | ncr,56,1000,4000,0,1,6,26 177 | ncr,56,2000,6000,0,1,8,35 178 | ncr,56,2000,8000,0,1,8,41 179 | ncr,56,4000,8000,0,1,8,47 180 | ncr,56,4000,12000,0,1,8,62 181 | ncr,56,4000,16000,0,1,8,78 182 | ncr,38,4000,8000,32,16,32,80 183 | ncr,38,4000,8000,32,16,32,80 184 | ncr,38,8000,16000,64,4,8,142 185 | ncr,38,8000,24000,160,4,8,281 186 | ncr,38,4000,16000,128,16,32,190 187 | nixdorf,200,1000,2000,0,1,2,21 188 | nixdorf,200,1000,4000,0,1,4,25 189 | nixdorf,200,2000,8000,64,1,5,67 190 | perkin-elmer,250,512,4000,0,1,7,24 191 | perkin-elmer,250,512,4000,0,4,7,24 192 | perkin-elmer,250,1000,16000,1,1,8,64 193 | prime,160,512,4000,2,1,5,25 194 | prime,160,512,2000,2,3,8,20 195 | prime,160,1000,4000,8,1,14,29 196 | prime,160,1000,8000,16,1,14,43 197 | prime,160,2000,8000,32,1,13,53 198 | siemens,240,512,1000,8,1,3,19 199 | siemens,240,512,2000,8,1,5,22 200 | siemens,105,2000,4000,8,3,8,31 201 | siemens,105,2000,6000,16,6,16,41 202 | siemens,105,2000,8000,16,4,14,47 203 | siemens,52,4000,16000,32,4,12,99 204 | siemens,70,4000,12000,8,6,8,67 205 | siemens,59,4000,12000,32,6,12,81 206 | siemens,59,8000,16000,64,12,24,149 207 | siemens,26,8000,24000,32,8,16,183 208 | siemens,26,8000,32000,64,12,16,275 209 | siemens,26,8000,32000,128,24,32,382 210 | sperry,116,2000,8000,32,5,28,56 211 | sperry,50,2000,32000,24,6,26,182 212 | sperry,50,2000,32000,48,26,52,227 213 | sperry,50,2000,32000,112,52,104,341 214 | sperry,50,4000,32000,112,52,104,360 215 | sperry,30,8000,64000,96,12,176,919 216 | sperry,30,8000,64000,128,12,176,978 217 | sperry,180,262,4000,0,1,3,24 218 | sperry,180,512,4000,0,1,3,24 219 | sperry,180,262,4000,0,1,3,24 220 | sperry,180,512,4000,0,1,3,24 221 | sperry,124,1000,8000,0,1,8,37 222 | sperry,98,1000,8000,32,2,8,50 223 | sratus,125,2000,8000,0,2,14,41 224 | wang,480,512,8000,32,0,0,47 225 | wang,480,1000,4000,0,0,0,25 
-------------------------------------------------------------------------------- /datasets/datafiles/arff/weather.arff: -------------------------------------------------------------------------------- 1 | @relation weather 2 | 3 | @attribute outlook {sunny, overcast, rainy} 4 | @attribute temperature real 5 | @attribute humidity real 6 | @attribute windy {TRUE, FALSE} 7 | @attribute play {yes, no} 8 | 9 | @data 10 | sunny,85,85,FALSE,no 11 | sunny,80,90,TRUE,no 12 | overcast,83,86,FALSE,yes 13 | rainy,70,96,FALSE,yes 14 | rainy,68,80,FALSE,yes 15 | rainy,65,70,TRUE,no 16 | overcast,64,65,TRUE,yes 17 | sunny,72,95,FALSE,no 18 | sunny,69,70,FALSE,yes 19 | rainy,75,80,FALSE,yes 20 | sunny,75,70,TRUE,yes 21 | overcast,72,90,TRUE,yes 22 | overcast,81,75,FALSE,yes 23 | rainy,71,91,TRUE,no -------------------------------------------------------------------------------- /datasets/datafiles/arff/weather.nominal.arff: -------------------------------------------------------------------------------- 1 | @relation weather.symbolic 2 | 3 | @attribute outlook {sunny, overcast, rainy} 4 | @attribute temperature {hot, mild, cool} 5 | @attribute humidity {high, normal} 6 | @attribute windy {TRUE, FALSE} 7 | @attribute play {yes, no} 8 | 9 | @data 10 | sunny,hot,high,FALSE,no 11 | sunny,hot,high,TRUE,no 12 | overcast,hot,high,FALSE,yes 13 | rainy,mild,high,FALSE,yes 14 | rainy,cool,normal,FALSE,yes 15 | rainy,cool,normal,TRUE,no 16 | overcast,cool,normal,TRUE,yes 17 | sunny,mild,high,FALSE,no 18 | sunny,cool,normal,FALSE,yes 19 | rainy,mild,normal,FALSE,yes 20 | sunny,mild,normal,TRUE,yes 21 | overcast,mild,high,TRUE,yes 22 | overcast,hot,normal,FALSE,yes 23 | rainy,mild,high,TRUE,no -------------------------------------------------------------------------------- /datasets/datafiles/iris.data: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 
4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 
6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 
6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica 151 | 152 | -------------------------------------------------------------------------------- /datasets/datafiles/michelson.json: -------------------------------------------------------------------------------- 1 | 
[[299.85,299.74,299.9,300.07,299.93,299.85,299.95,299.98,299.98,299.88,300,299.98,299.93,299.65,299.76,299.81,300,300,299.96,299.96],[299.96,299.94,299.96,299.94,299.88,299.8,299.85,299.88,299.9,299.84,299.83,299.79,299.81,299.88,299.88,299.83,299.8,299.79,299.76,299.8],[299.88,299.88,299.88,299.86,299.72,299.72,299.62,299.86,299.97,299.95,299.88,299.91,299.85,299.87,299.84,299.84,299.85,299.84,299.84,299.84],[299.89,299.81,299.81,299.82,299.8,299.77,299.76,299.74,299.75,299.76,299.91,299.92,299.89,299.86,299.88,299.72,299.84,299.85,299.85,299.78],[299.89,299.84,299.78,299.81,299.76,299.81,299.79,299.81,299.82,299.85,299.87,299.87,299.81,299.74,299.81,299.94,299.95,299.8,299.81,299.87]] 2 | -------------------------------------------------------------------------------- /datasets/datafiles/netflix/movies/movie_titles.txt: -------------------------------------------------------------------------------- 1 | 597,1977,The Brady Bunch Variety Hour 2 | 598,1973,Bobby Darin: Mack is Back 3 | -------------------------------------------------------------------------------- /datasets/datafiles/netflix/test/qualifying.txt: -------------------------------------------------------------------------------- 1 | 1: 2 | 1046323,2005-12-19 3 | 1080030,2005-12-23 4 | 2127527,2005-12-04 5 | 1944918,2005-10-05 6 | 1057066,2005-11-07 7 | 954049,2005-12-20 8 | 10: 9 | 12868,2004-10-19 10 | 627923,2005-12-16 11 | 690763,2005-12-13 12 | -------------------------------------------------------------------------------- /datasets/datafiles/netflix/training/mv_0000001.txt: -------------------------------------------------------------------------------- 1 | 1: 2 | 1488844,3,2005-09-06 3 | 822109,5,2005-05-13 4 | 885013,4,2005-10-19 5 | 30878,4,2005-12-26 6 | 823519,3,2004-05-03 7 | -------------------------------------------------------------------------------- /datasets/datafiles/nightingale.json: -------------------------------------------------------------------------------- 1 | 
[{"date":"1854-04-01T07:00:00.000Z","army_size":8571,"disease":1,"wounds":0,"other":5},{"date":"1854-05-01T07:00:00.000Z","army_size":23333,"disease":12,"wounds":0,"other":9},{"date":"1854-06-01T07:00:00.000Z","army_size":28333,"disease":11,"wounds":0,"other":6},{"date":"1854-07-01T07:00:00.000Z","army_size":28722,"disease":359,"wounds":0,"other":23},{"date":"1854-08-01T07:00:00.000Z","army_size":30246,"disease":828,"wounds":1,"other":30},{"date":"1854-09-01T07:00:00.000Z","army_size":30290,"disease":788,"wounds":81,"other":70},{"date":"1854-10-01T07:00:00.000Z","army_size":30643,"disease":503,"wounds":132,"other":128},{"date":"1854-11-01T07:00:00.000Z","army_size":29736,"disease":844,"wounds":287,"other":106},{"date":"1854-12-01T08:00:00.000Z","army_size":32779,"disease":1725,"wounds":114,"other":131},{"date":"1855-01-01T08:00:00.000Z","army_size":32393,"disease":2761,"wounds":83,"other":324},{"date":"1855-02-01T08:00:00.000Z","army_size":30919,"disease":2120,"wounds":42,"other":361},{"date":"1855-03-01T08:00:00.000Z","army_size":30107,"disease":1205,"wounds":32,"other":172},{"date":"1855-04-01T07:00:00.000Z","army_size":32252,"disease":477,"wounds":48,"other":57},{"date":"1855-05-01T07:00:00.000Z","army_size":35473,"disease":508,"wounds":49,"other":37},{"date":"1855-06-01T07:00:00.000Z","army_size":38863,"disease":802,"wounds":209,"other":31},{"date":"1855-07-01T07:00:00.000Z","army_size":42647,"disease":382,"wounds":134,"other":33},{"date":"1855-08-01T07:00:00.000Z","army_size":44614,"disease":483,"wounds":164,"other":25},{"date":"1855-09-01T07:00:00.000Z","army_size":47751,"disease":189,"wounds":276,"other":20},{"date":"1855-10-01T07:00:00.000Z","army_size":46852,"disease":128,"wounds":53,"other":18},{"date":"1855-11-01T07:00:00.000Z","army_size":37853,"disease":178,"wounds":33,"other":32},{"date":"1855-12-01T08:00:00.000Z","army_size":43217,"disease":91,"wounds":18,"other":28},{"date":"1856-01-01T08:00:00.000Z","army_size":44212,"disease":42,"wounds":2,"other":
48},{"date":"1856-02-01T08:00:00.000Z","army_size":43485,"disease":24,"wounds":0,"other":19},{"date":"1856-03-01T08:00:00.000Z","army_size":46140,"disease":15,"wounds":0,"other":35}] 2 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Dataloader.hs: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Numeric.Dataloader 4 | -- Stability : experimental 5 | -- Portability: non-portable 6 | -- 7 | -- A Dataloader is an extension of a Dataset and is primarily intended for 8 | -- compute-intensive, batch loading interfaces. When used with ImageFolder 9 | -- representations of Datasets, it shuffles the order of files to be loaded 10 | -- and leverages the async library when possible. 11 | -- 12 | -- Concurrent loading primarily takes place in 'batchStream'. 'stream' exists 13 | -- primarily to provide a unified API with training that is not batch-oriented. 14 | ------------------------------------------------------------------------------- 15 | {-# LANGUAGE ScopedTypeVariables #-} 16 | {-# OPTIONS_GHC -Wno-unused-top-binds -Wno-name-shadowing #-} 17 | module Numeric.Dataloader 18 | ( Dataloader(..) 
19 | , uniformIxline 20 | 21 | , stream 22 | , batchStream 23 | ) where 24 | 25 | import Control.Monad ((>=>)) 26 | import Control.Monad.IO.Class (MonadIO, liftIO) 27 | import Data.Vector (Vector) 28 | import Streaming (Stream, Of(..)) 29 | import System.Random.MWC (GenIO) 30 | import qualified Data.Vector as V 31 | import qualified Streaming as S 32 | import qualified Streaming.Prelude as S 33 | import qualified System.Random.MWC.Distributions as MWC 34 | import Control.Exception.Safe (MonadThrow) 35 | import Streaming.Instances () 36 | 37 | import Control.Parallel.Strategies 38 | 39 | import Numeric.Datasets 40 | 41 | -- * Configuring data loaders 42 | 43 | 44 | -- | Options for a data loading functions. 45 | data Dataloader a b = Dataloader 46 | { batchSize :: Int -- ^ Batch size used with 'batchStream'. 47 | , shuffle :: Maybe (Vector Int) -- ^ Optional shuffle order (forces the dataset to be loaded in memory if it wasn't already). 48 | , dataset :: Dataset a -- ^ Dataset associated with the dataloader. 49 | , transform :: a -> b -- ^ Transformation associated with the dataloader which will be run in parallel. If using an 50 | -- ImageFolder, this is where you would transform image filepaths to an image (or other 51 | -- compute-optimized form). Additionally, this is where you should perform any 52 | -- static normalization. 53 | } 54 | 55 | 56 | -- | Generate a uniformly random index line from a dataset and a generator. 57 | uniformIxline 58 | :: Dataset a 59 | -> GenIO 60 | -> IO (Vector Int) 61 | uniformIxline ds gen = do 62 | len <- V.length <$> getDatavec ds 63 | MWC.uniformPermutation len gen 64 | 65 | -- * Data loading functions 66 | 67 | 68 | -- | Stream a dataset in-memory, applying a transformation function. 
69 | stream 70 | :: (MonadThrow io, MonadIO io) 71 | => Dataloader a b 72 | -> Stream (Of b) io () 73 | stream dl = S.maps (\(a:>b) -> (transform dl a `using` rpar) :> b) (sourceStream dl) 74 | 75 | 76 | -- | Stream batches of a dataset, concurrently processing each element 77 | -- 78 | -- NOTE: Run with @-threaded -rtsopts@ to concurrently load data in-memory. 79 | batchStream 80 | :: (MonadThrow io, MonadIO io, NFData b) 81 | => Dataloader a b 82 | -> Stream (Of [b]) io () 83 | batchStream dl 84 | = S.mapsM (S.toList >=> liftIO . firstOfM go) 85 | $ S.chunksOf (batchSize dl) 86 | $ sourceStream dl 87 | where 88 | go as = fmap (transform dl) as `usingIO` parList rdeepseq 89 | 90 | 91 | -- * helper functions (not for export) 92 | 93 | 94 | -- | Stream a dataset in-memory 95 | sourceStream 96 | :: (MonadThrow io, MonadIO io) 97 | => Dataloader a b 98 | -> Stream (Of a) io () 99 | sourceStream loader 100 | = permute loader <$> getDatavec (dataset loader) 101 | >>= S.each 102 | where 103 | -- Use a dataloader's shuffle order to return a permuted vector (or return the 104 | -- identity vector). 
105 | permute :: Dataloader a b -> Vector a -> Vector a 106 | permute loader va = maybe va (V.backpermute va) (shuffle loader) 107 | 108 | 109 | -- | Monadic, concrete version of Control.Arrow.first for @Of@ 110 | firstOfM :: Monad m => (a -> m b) -> Of a c -> m (Of b c) 111 | firstOfM fm (a:>c) = do 112 | b <- fm a 113 | pure (b:>c) 114 | 115 | 116 | 117 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Abalone.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | 3 | {-| 4 | 5 | Abalone data set 6 | 7 | UCI ML Repository link 8 | 9 | -} 10 | 11 | module Numeric.Datasets.Abalone where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Network.HTTP.Req ((/:), http, Scheme(..)) 18 | 19 | data Sex = M | F | I 20 | deriving (Show, Read, Eq, Generic, Bounded, Enum) 21 | 22 | instance FromField Sex where 23 | parseField = parseReadField 24 | 25 | data Abalone = Abalone 26 | { sex :: Sex 27 | , abaloneLength :: Double 28 | , diameter :: Double 29 | , height :: Double 30 | , wholeWeight :: Double 31 | , shuckedWeight :: Double 32 | , visceraWeight :: Double 33 | , shellWeight :: Double 34 | , rings :: Int 35 | } deriving (Show, Read, Generic) 36 | 37 | instance FromRecord Abalone 38 | 39 | abalone :: Dataset Abalone 40 | abalone = csvDataset $ URL $ uciMLDB /: "abalone" /: "abalone.data" 41 | 42 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Adult.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | 3 | {-| 4 | 5 | Adult (AKA Census Income) dataset. 
6 | 7 | UCI ML Repository link 8 | 9 | -} 10 | 11 | module Numeric.Datasets.Adult where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Control.Applicative 18 | import Data.Text (Text, strip) 19 | import Network.HTTP.Req ((/:), Scheme(..)) 20 | 21 | data WorkClass = Private | SelfEmpNotInc | SelfEmpInc | FederalGov 22 | | LocalGov | StateGov | WithoutPay | NeverWorked 23 | deriving (Show, Read, Eq, Generic, Bounded, Enum) 24 | 25 | instance FromField WorkClass where 26 | parseField = parseDashToCamelField 27 | 28 | 29 | data MaritalStatus = MarriedCivSpouse | Divorced | NeverMarried 30 | | Separated | Widowed | MarriedSpouseAbsent | MarriedAFSpouse 31 | deriving (Show, Read, Eq, Generic, Bounded, Enum) 32 | 33 | instance FromField MaritalStatus where 34 | -- parseField "Married-AF-spouse" = pure MarriedAFSpouse 35 | parseField = parseDashToCamelField 36 | 37 | data Occupation = TechSupport | CraftRepair | OtherService | Sales | ExecManagerial | ProfSpecialty 38 | | HandlersCleaners | MachineOpInspct | AdmClerical | FarmingFishing | TransportMoving 39 | | PrivHouseServ | ProtectiveServ | ArmedForces 40 | deriving (Show, Read, Eq, Generic, Bounded, Enum) 41 | 42 | instance FromField Occupation where 43 | parseField = parseDashToCamelField 44 | 45 | data Relationship = Wife | OwnChild | Husband | NotInFamily | OtherRelative | Unmarried 46 | deriving (Show, Read, Eq, Generic, Bounded, Enum) 47 | 48 | instance FromField Relationship where 49 | parseField = parseDashToCamelField 50 | 51 | data Race = White | AsianPacIslander | AmerIndianEskimo | Other | Black 52 | deriving (Show, Read, Eq, Generic, Bounded, Enum) 53 | 54 | instance FromField Race where 55 | parseField = parseDashToCamelField 56 | 57 | data Sex = Female | Male 58 | deriving (Show, Read, Eq, Generic, Bounded, Enum) 59 | 60 | instance FromField Sex where 61 | parseField = parseDashToCamelField 62 | 63 | data Income = GT50K | LE50K 64 | deriving (Show, Read, 
Eq, Generic, Bounded, Enum) 65 | 66 | instance FromField Income where 67 | parseField " >50K" = pure GT50K 68 | parseField " <=50K" = pure LE50K 69 | parseField " >50K." = pure GT50K 70 | parseField " <=50K." = pure LE50K 71 | parseField ">50K" = pure GT50K 72 | parseField "<=50K" = pure LE50K 73 | parseField _ = fail "unknown income" 74 | 75 | data Adult = Adult 76 | { age :: Int 77 | , workClass :: Maybe WorkClass 78 | , finalWeight :: Int 79 | , education :: Text 80 | , educationNum :: Int 81 | , maritalStatus :: MaritalStatus 82 | , occupation :: Maybe Occupation 83 | , relationship :: Relationship 84 | , race :: Race 85 | , sex :: Sex 86 | , capitalGain :: Int 87 | , capitalLoss :: Int 88 | , hoursPerWeek :: Int 89 | , nativeCountry :: Text 90 | , income :: Income 91 | } deriving (Show, Read, Generic) 92 | 93 | instance FromRecord Adult where 94 | parseRecord v = Adult <$> v .! 0 <*> (v.! 1 <|> return Nothing) <*> v.!2 <*> (strip <$> v.!3) 95 | <*> v.!4 <*> v.!5 <*> (v.!6 <|> return Nothing) <*> v.!7 <*> v.!8 96 | <*> v.!9 <*> v.!10 <*> v.!11 <*> v.!12 <*> v.!13 <*> v.!14 97 | 98 | adult :: Dataset Adult 99 | adult = csvDataset $ URL $ uciMLDB /: "adult" /: "adult.data" 100 | 101 | adultTestSet :: Dataset Adult 102 | adultTestSet = withPreprocess (dropLines 1) $ csvDataset $ URL $ umassMLDB /: "adult" /: "adult.test" 103 | 104 | -- "http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test" 105 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Anscombe.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-} 2 | 3 | {-| 4 | 5 | Anscombe's quartet 6 | 7 | Four datasets with nearly identical statistical properties 8 | 9 | Wikipedia article: 10 | 11 | -} 12 | 13 | module Numeric.Datasets.Anscombe where 14 | 15 | anscombe :: [[(Double,Double)]] 16 | anscombe = [anscombe1, anscombe2, anscombe3, anscombe4] 17 
| 18 | anscombe1, anscombe2, anscombe3, anscombe4 :: [(Double, Double)] 19 | anscombe1 = [ 20 | (10,8.04), 21 | (8,6.95), 22 | (13,7.58), 23 | (9,8.81), 24 | (11,8.33), 25 | (14,9.96), 26 | (6,7.24), 27 | (4,4.26), 28 | (12,10.84), 29 | (7,4.82), 30 | (5,5.68) 31 | ] 32 | 33 | anscombe2 = [ 34 | (10,9.14), 35 | (8,8.14), 36 | (13,8.74), 37 | (9,8.77), 38 | (11,9.26), 39 | (14,8.1), 40 | (6,6.13), 41 | (4,3.1), 42 | (12,9.13), 43 | (7,7.26), 44 | (5,4.74) 45 | ] 46 | 47 | anscombe3 = [ 48 | (10,7.46), 49 | (8,6.77), 50 | (13,12.74), 51 | (9,7.11), 52 | (11,7.81), 53 | (14,8.84), 54 | (6,6.08), 55 | (4,5.39), 56 | (12,8.15), 57 | (7,6.42), 58 | (5,5.73) 59 | ] 60 | 61 | anscombe4 = [ 62 | (8,6.58), 63 | (8,5.76), 64 | (8,7.71), 65 | (8,8.84), 66 | (8,8.47), 67 | (8,7.04), 68 | (8,5.25), 69 | (19,12.5), 70 | (8,5.56), 71 | (8,7.91), 72 | (8,6.89) 73 | ] 74 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/BostonHousing.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-} 3 | 4 | {-| 5 | 6 | BostonHousing Data set 7 | 8 | scikit-learn calls this "boston" and UCI calls it "Housing" 9 | 10 | UCI ML Repository link 11 | 12 | -} 13 | 14 | module Numeric.Datasets.BostonHousing where 15 | 16 | import Numeric.Datasets 17 | 18 | import Data.Csv 19 | import GHC.Generics 20 | import Control.Applicative 21 | import Network.HTTP.Req ((/:), http, Scheme(..)) 22 | 23 | 24 | data BostonHousing = BostonHousing 25 | { crimeRate :: Double 26 | , zoned :: Double 27 | , industrial :: Double 28 | , charlesRiver :: Bool 29 | , nitricOxides :: Double 30 | , rooms :: Double 31 | , age :: Double 32 | , distance :: Double 33 | , radialHwy :: Double 34 | , tax :: Double 35 | , ptRatio :: Double 36 | , b :: Double 37 | , lowerStatus :: Double 38 | , medianValue :: Double 39 | } deriving 
(Show, Read, Generic) 40 | 41 | instance FromRecord BostonHousing where 42 | parseRecord v = BostonHousing <$> 43 | v .! 0 <*> 44 | v .! 1 <*> 45 | v .! 2 <*> 46 | (intToBool <$> v .! 3) <*> 47 | v .! 4 <*> 48 | v .! 5 <*> 49 | v .! 6 <*> 50 | v .! 7 <*> 51 | v .! 8 <*> 52 | v .! 9 <*> 53 | v .! 10 <*> 54 | v .! 11 <*> 55 | v .! 12 <*> 56 | v .! 13 57 | where intToBool :: Int -> Bool 58 | intToBool 0 = False 59 | intToBool 1 = True 60 | intToBool _ = error "intToBool" 61 | 62 | bostonHousing :: Dataset BostonHousing 63 | bostonHousing = withPreprocess fixedWidthToCSV $ 64 | csvDataset $ URL $ uciMLDB /: "housing" /: "housing.data" 65 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/BreastCancerWisconsin.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | 3 | {-| 4 | 5 | Breast Cancer Wisconsin (Diagnostic) Data Set 6 | 7 | Repository link: 8 | 9 | -} 10 | 11 | module Numeric.Datasets.BreastCancerWisconsin where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Control.Applicative 18 | import Network.HTTP.Req ((/:), Scheme(..)) 19 | 20 | data Diagnosis = Malignant | Benign deriving (Show, Read, Eq, Generic, Bounded, Enum) 21 | 22 | data Prognosis = Recurrent | Nonrecurrent deriving (Show, Read, Eq, Generic, Bounded, Enum) 23 | 24 | intToDiagnosis :: Int -> Diagnosis 25 | intToDiagnosis 2 = Benign 26 | intToDiagnosis 4 = Malignant 27 | intToDiagnosis _ = error "unknown diagnosis code" 28 | 29 | data BreastCancerEntry = BreastCancerEntry 30 | { sampleCodeNumber :: Int 31 | , clumpThickness :: Int 32 | , uniformityCellSize :: Int 33 | , uniformityCellShape :: Int 34 | , marginalAdhesion :: Int 35 | , singleEpithelialCellSize :: Int 36 | , bareNuclei :: Maybe Int 37 | , blandChromatin :: Int 38 | , normalNucleoli :: Int 39 | , mitosis :: Int 40 | , sampleClass 
:: Diagnosis 41 | } deriving (Show, Read, Generic) 42 | 43 | instance FromRecord BreastCancerEntry where 44 | parseRecord v = BreastCancerEntry <$> v .! 0 <*> v .! 1 <*> v .! 2 <*> v .! 3 <*> v .! 4 <*> v .! 5 <*> (v .! 6 <|> return Nothing) <*> v .! 7 <*> v .! 8 <*> v .! 9 <*> (intToDiagnosis <$> v .! 10) 45 | 46 | breastCancerDatabase :: Dataset BreastCancerEntry 47 | breastCancerDatabase = csvDataset 48 | $ URL $ umassMLDB /: "breast-cancer-wisconsin" /: "breast-cancer-wisconsin.data" 49 | 50 | data DiagnosticBreastCancer = DiagnosticBreastCancer 51 | { diagnosticID :: Int 52 | , diagnosis :: Diagnosis 53 | , diagnosticCells :: CellFeatures 54 | } deriving (Show, Read, Generic) 55 | 56 | data PrognosticBreastCancer = PrognosticBreastCancer 57 | { prognosticID :: Int 58 | , prognosis :: Prognosis 59 | , prognosticCells :: CellFeatures 60 | } deriving (Show, Read, Generic) 61 | 62 | data CellFeatures = CellFeatures 63 | { radius :: Double 64 | , perimeter :: Double 65 | , area :: Double 66 | , smoothness :: Double 67 | , compactness :: Double 68 | , concavity :: Double 69 | , concavePoints :: Double 70 | , symmetry :: Double 71 | , fractalDimension :: Double 72 | } deriving (Show, Read, Generic) 73 | 74 | charToDiagnosis :: String -> Diagnosis 75 | charToDiagnosis "M" = Malignant 76 | charToDiagnosis "B" = Benign 77 | charToDiagnosis _ = error "unknown diagnosis" 78 | 79 | charToPrognosis :: String -> Prognosis 80 | charToPrognosis "N" = Nonrecurrent 81 | charToPrognosis "R" = Recurrent 82 | charToPrognosis _ = error "unknown diagnosis" 83 | 84 | instance FromRecord DiagnosticBreastCancer where 85 | parseRecord v = DiagnosticBreastCancer <$> v .! 0 <*> (charToDiagnosis <$> v .! 1) <*> parseRecord v 86 | 87 | instance FromRecord PrognosticBreastCancer where 88 | parseRecord v = PrognosticBreastCancer <$> v .! 0 <*> (charToPrognosis <$> v .! 1) <*> parseRecord v 89 | 90 | instance FromRecord CellFeatures where 91 | parseRecord v = CellFeatures <$> v .! 2 <*> v .! 
3 <*> v .! 4 <*> v .! 5 <*> v .! 6 <*> v .! 7 <*> v .! 8 <*> v .! 9 <*> v .! 10 92 | 93 | diagnosticBreastCancer :: Dataset DiagnosticBreastCancer 94 | diagnosticBreastCancer = csvDataset 95 | $ URL $ umassMLDB /: "breast-cancer-wisconsin" /: "wdbc.data" 96 | 97 | prognosticBreastCancer :: Dataset PrognosticBreastCancer 98 | prognosticBreastCancer = csvDataset 99 | $ URL $ umassMLDB /: "breast-cancer-wisconsin" /: "wpbc.data" 100 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/CIFAR10.hs: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | -- | 3 | -- Module : Numeric.Datasets.CIFAR10 4 | -- License : BSD-3-Clause 5 | -- Stability : experimental 6 | -- Portability: non-portable 7 | -- 8 | -- The binary version contains the files data_batch_1.bin, data_batch_2.bin, 9 | -- ..., data_batch_5.bin, as well as test_batch.bin. Each of these files is 10 | -- formatted as follows: 11 | -- 12 | -- <1 x label><3072 x pixel> 13 | -- ... 14 | -- <1 x label><3072 x pixel> 15 | -- 16 | -- In other words, the first byte is the label of the first image, which is a 17 | -- number in the range 0-9. The next 3072 bytes are the values of the pixels of 18 | -- the image. The first 1024 bytes are the red channel values, the next 1024 19 | -- the green, and the final 1024 the blue. The values are stored in row-major 20 | -- order, so the first 32 bytes are the red channel values of the first row of 21 | -- the image. 
22 | ------------------------------------------------------------------------------- 23 | {-# LANGUAGE CPP #-} 24 | {-# LANGUAGE DataKinds #-} 25 | {-# LANGUAGE DeriveAnyClass #-} 26 | {-# LANGUAGE DeriveGeneric #-} 27 | #if MIN_VERSION_GLASGOW_HASKELL(8,2,0,0) 28 | {-# LANGUAGE DerivingStrategies #-} 29 | #endif 30 | {-# LANGUAGE GeneralizedNewtypeDeriving #-} 31 | {-# LANGUAGE OverloadedStrings #-} 32 | {-# LANGUAGE ScopedTypeVariables #-} 33 | {-# LANGUAGE TupleSections #-} 34 | #if MIN_VERSION_JuicyPixels(3,3,0) 35 | #else 36 | {-# LANGUAGE UndecidableInstances #-} 37 | #endif 38 | module Numeric.Datasets.CIFAR10 39 | ( Label(..) 40 | , CIFARImage(..), height, width, image, label 41 | , cifarURL 42 | , cifar10 43 | , parseCifar 44 | ) where 45 | 46 | import Codec.Picture (Image, PixelRGB8(PixelRGB8), Pixel8, writePixel) 47 | import Codec.Picture.Types (newMutableImage, freezeImage) 48 | import Control.DeepSeq 49 | import Control.Exception (throw) 50 | import Control.Monad.ST (runST) 51 | import Data.List (zipWith4) 52 | import GHC.Generics (Generic) 53 | import Network.HTTP.Req (Url, (/:), https, Scheme(..)) 54 | import qualified Codec.Archive.Tar as Tar 55 | import qualified Codec.Compression.GZip as GZip 56 | import qualified Data.Attoparsec.ByteString.Lazy as Atto 57 | import qualified Data.ByteString as BS 58 | import qualified Data.ByteString.Lazy as BL 59 | 60 | import Numeric.Datasets 61 | 62 | #if MIN_VERSION_JuicyPixels(3,3,0) 63 | #else 64 | import Foreign.Storable (Storable) 65 | import qualified Codec.Picture as Compat 66 | #endif 67 | -- ========================================================================= -- 68 | 69 | -- | labels of CIFAR-10 dataset. Enum corresponds to binary-based uint8 label. 
70 | data Label 71 | = Airplane 72 | | Automobile 73 | | Bird 74 | | Cat 75 | | Deer 76 | | Dog 77 | | Frog 78 | | Horse 79 | | Ship 80 | | Truck 81 | deriving (Show, Eq, Generic, Bounded, Enum, Read, NFData) 82 | 83 | #if MIN_VERSION_JuicyPixels(3,3,0) 84 | #else 85 | instance (Eq (Compat.PixelBaseComponent a), Storable (Compat.PixelBaseComponent a)) 86 | => Eq (Image a) where 87 | a == b = Compat.imageWidth a == Compat.imageWidth b && 88 | Compat.imageHeight a == Compat.imageHeight b && 89 | Compat.imageData a == Compat.imageData b 90 | #endif 91 | 92 | -- | Data representation of a CIFAR image is a 32x32 RGB image 93 | newtype CIFARImage = CIFARImage { getXY :: (Image PixelRGB8, Label) } 94 | #if MIN_VERSION_GLASGOW_HASKELL(8,2,0,0) 95 | deriving newtype (Eq, NFData) 96 | #else 97 | deriving (Eq, Generic, NFData) 98 | #endif 99 | 100 | instance Show CIFARImage where 101 | show im = "CIFARImage{Height: 32, Width: 32, Pixel: RGB8, Label: " ++ show (label im) ++ "}" 102 | 103 | -- | height of 'CIFARImage' 104 | height :: Int 105 | height = 32 106 | 107 | -- | width of 'CIFARImage' 108 | width :: Int 109 | width = 32 110 | 111 | -- | extract the JuicyPixel representation from a CIFAR datapoint 112 | image :: CIFARImage -> Image PixelRGB8 113 | image = fst . getXY 114 | 115 | -- | extract the label from a CIFAR datapoint 116 | label :: CIFARImage -> Label 117 | label = snd . 
getXY 118 | 119 | -- | Source URL for cifar-10 and cifar-100 120 | cifarURL :: Url 'Https 121 | cifarURL = https "www.cs.toronto.edu" /: "~kriz" 122 | 123 | ------------------------------------------------------------------------------- 124 | tempdir :: Maybe FilePath 125 | tempdir = Nothing 126 | 127 | -- | Define a dataset from a source for a CSV file 128 | cifar10 :: Dataset CIFARImage 129 | cifar10 = Dataset 130 | (URL $ cifarURL /: "cifar-10-binary.tar.gz") 131 | tempdir 132 | (Just unzipCifar) 133 | (Parsable parseCifar) 134 | 135 | -- cifar10Sha256 = "c4a38c50a1bc5f3a1c5537f2155ab9d68f9f25eb1ed8d9ddda3db29a59bca1dd" 136 | 137 | -- | parser for a cifar binary 138 | parseCifar :: Atto.Parser CIFARImage 139 | parseCifar = do 140 | lbl :: Label <- toEnum . fromIntegral <$> Atto.anyWord8 141 | rs :: [Pixel8] <- BS.unpack <$> Atto.take 1024 142 | gs :: [Pixel8] <- BS.unpack <$> Atto.take 1024 143 | bs :: [Pixel8] <- BS.unpack <$> Atto.take 1024 144 | let ipixels = zipWith4 (\ix r g b -> (ix, PixelRGB8 r g b)) ixs rs gs bs 145 | pure $ CIFARImage (newImage ipixels, lbl) 146 | where 147 | newImage :: [((Int, Int), PixelRGB8)] -> Image PixelRGB8 148 | newImage ipixels = runST $ do 149 | mim <- newMutableImage width height -- JuicyPixels takes width before height (both 32 here) 150 | mapM_ (\((row, col), rgb) -> writePixel mim col row rgb) ipixels -- ixs pairs are (row, col); writePixel takes x (column) first, so pass col as x to avoid transposing the image 151 | freezeImage mim 152 | 153 | ixs :: [(Int, Int)] 154 | ixs = concat $ zipWith (\(row::Int) cols -> (row,) <$> cols) [0..] (replicate height [0..width - 1]) 155 | 156 | -- | how to unpack the tarball 157 | -- 158 | -- FIXME: this should be in MonadThrow 159 | unzipCifar :: BL.ByteString -> BL.ByteString 160 | unzipCifar zipbs = do 161 | either (throw . 
fst) (BL.concat) $ Tar.foldlEntries go [] entries 162 | where 163 | entries :: Tar.Entries Tar.FormatError 164 | entries = Tar.read $ GZip.decompress zipbs 165 | 166 | go :: [BL.ByteString] -> Tar.Entry -> [BL.ByteString] 167 | go agg entry = 168 | case Tar.entryContent entry of 169 | Tar.NormalFile ps fs -> 170 | -- Each file is exactly 30730000 bytes long. All other files are metadata. See https://www.cs.toronto.edu/~kriz/cifar.html 171 | if fs == 30730000 172 | then ps:agg -- NOTE(review): prepending reverses archive order (test_batch would come first); confirm batch order doesn't matter downstream 173 | else agg 174 | _ -> agg 175 | 176 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/CO2.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | 3 | {-| 4 | 5 | Mauna Loa CO2 time-series 6 | 7 | 8 | Listed as co2 here: http://vincentarelbundock.github.io/Rdatasets/datasets.html 9 | 10 | See 11 | 12 | -} 13 | 14 | module Numeric.Datasets.CO2 where 15 | 16 | import Numeric.Datasets 17 | 18 | import Data.Csv 19 | import GHC.Generics 20 | import Network.HTTP.Req ((/:), https, Scheme(..)) 21 | 22 | data CO2 = CO2 23 | { time :: Double 24 | , value :: Double 25 | } deriving (Show, Read, Generic) 26 | 27 | instance FromNamedRecord CO2 28 | 29 | maunaLoaCO2 :: Dataset CO2 30 | maunaLoaCO2 = csvHdrDataset 31 | $ URL $ https "raw.githubusercontent.com" /: "vincentarelbundock" /: "Rdatasets" /: "master" /: "csv" /: "datasets" /: "co2.csv" -- lowercase co2.csv is the Mauna Loa time/value series; uppercase CO2.csv is the plant-uptake dataset with different headers 32 | 33 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Car.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-} 3 | 4 | {-| 5 | 6 | Car dataset 7 | 8 | UCI ML Repository link 9 | 10 | -} 11 | 12 | module Numeric.Datasets.Car where 13 | 14 | import Numeric.Datasets 15 | 16 | import Data.Csv 17 | 
import GHC.Generics 18 | import Control.Applicative 19 | import Network.HTTP.Req ((/:), http, Scheme(..)) 20 | 21 | data RelScore = Low | Med | High | VeryHigh deriving (Show, Read, Eq, Generic, Bounded, Enum) 22 | 23 | instance FromField RelScore where 24 | parseField "vhigh" = pure VeryHigh 25 | parseField "high" = pure High 26 | parseField "med" = pure Med 27 | parseField "low" = pure Low 28 | parseField _ = fail "unknown relative score" 29 | 30 | data RelSize = Small | Medium | Big deriving (Show, Read, Eq, Generic, Bounded, Enum) 31 | 32 | instance FromField RelSize where 33 | parseField "small" = pure Small 34 | parseField "med" = pure Medium 35 | parseField "big" = pure Big 36 | parseField _ = fail "unknown relative size" 37 | 38 | data Acceptability = Unacceptable | Acceptable | Good | VeryGood deriving (Show, Read, Eq, Generic, Bounded, Enum) 39 | 40 | instance FromField Acceptability where 41 | parseField "unacc" = pure Unacceptable 42 | parseField "acc" = pure Acceptable 43 | parseField "good" = pure Good 44 | parseField "vgood" = pure VeryGood 45 | parseField _ = fail "unknown acceptability" 46 | 47 | data Count = N Int | NOrMore Int | More deriving (Show, Read, Eq, Generic) 48 | 49 | instance FromField Count where 50 | parseField "more" = pure More 51 | parseField "5more" = pure (NOrMore 5) 52 | parseField "2" = pure (N 2) 53 | parseField "3" = pure (N 3) 54 | parseField "4" = pure (N 4) 55 | parseField _ = fail "unknown count" 56 | 57 | data Car = Car 58 | { buying :: RelScore 59 | , maintenance :: RelScore 60 | , doors :: Count 61 | , persons :: Count 62 | , luggageBoot :: RelSize 63 | , safety :: RelScore 64 | , acceptability:: Acceptability 65 | } deriving (Show, Read, Generic) 66 | 67 | instance FromRecord Car 68 | 69 | car :: Dataset Car 70 | car = csvDataset 71 | $ URL $ uciMLDB /: "car" /: "car.data" 72 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Coal.hs: 
-------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | -- {-# LANGUAGE GADTs, QuasiQuotes, ViewPatterns, FlexibleContexts #-} 3 | -- {-# OPTIONS_GHC -fno-warn-unused-imports #-} 4 | 5 | {-| 6 | 7 | Coal data set 8 | 9 | Dates of mining disasters, from the `coal` dataset in the R package `boot`. 10 | 11 | For further information, see 12 | 13 | -} 14 | 15 | module Numeric.Datasets.Coal ( Coal, coal, date ) where 16 | 17 | import Numeric.Datasets 18 | 19 | import Data.Csv 20 | import GHC.Generics 21 | -- import Control.Applicative 22 | import Network.HTTP.Req ((/:), http, Scheme(..)) 23 | 24 | newtype Coal = Coal 25 | { date :: Double 26 | } deriving (Show, Read, Generic) 27 | 28 | instance FromRecord Coal where 29 | parseRecord v = Coal <$> v .! 1 30 | 31 | coal :: Dataset Coal 32 | coal = let src = URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "boot" /: "coal.csv" 33 | in Dataset src Nothing Nothing $ CSVRecord HasHeader defaultDecodeOptions 34 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Diabetes.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings, TemplateHaskell #-} 2 | 3 | {- | 4 | Module : Numeric.Datasets.Diabetes 5 | Description : Diabetes dataset created from an ARFF input file 6 | Copyright : (c) Arvind Devarajan 7 | License : BSD-3-Clause 8 | 9 | Maintainer : arvindd 10 | Stability : experimental 11 | Portability : portable 12 | -} 13 | 14 | module Numeric.Datasets.Diabetes 15 | ( DiabetesClass 16 | , PimaDiabetesEntry 17 | , pimaDiabetes 18 | ) where 19 | 20 | import Numeric.Datasets 21 | import Data.FileEmbed 22 | import qualified Data.ByteString.Lazy as BL (fromStrict, ByteString) 23 | import Numeric.Datasets.Internal.ArffParser 24 | 25 | data DiabetesClass = TestedNegative | TestedPositive | 
UnknownClass deriving (Show) 26 | 27 | data PimaDiabetesEntry = PimaDiabetesEntry 28 | { preg :: !Double 29 | , plas :: !Double 30 | , pres :: !Double 31 | , skin :: !Double 32 | , insu :: !Double 33 | , mass :: !Double 34 | , pedi :: !Double 35 | , age :: !Double 36 | , diabetesClass :: !DiabetesClass 37 | } deriving (Show) 38 | 39 | -- | Diabetes dataset, containing a list of Pima Indian diabetes entries 40 | pimaDiabetes :: [PimaDiabetesEntry] 41 | pimaDiabetes = toPimaDiabetes records 42 | where records = readArff (BL.fromStrict $(embedFile "datafiles/arff/diabetes.arff")) 43 | 44 | -- | Converts each ARFF record into a Pima diabetes entry 45 | toPimaDiabetes :: [ArffRecord] -> [PimaDiabetesEntry] 46 | toPimaDiabetes recs = 47 | let toPD :: ArffRecord -> PimaDiabetesEntry 48 | toPD = PimaDiabetesEntry <$> dblval 0 49 | <*> dblval 1 50 | <*> dblval 2 51 | <*> dblval 3 52 | <*> dblval 4 53 | <*> dblval 5 54 | <*> dblval 6 55 | <*> dblval 7 56 | <*> diabClass 8 57 | 58 | dblval :: Int -> ArffRecord -> Double 59 | dblval idx r = value (\_->0) idx r 60 | 61 | strval :: Int -> ArffRecord -> BL.ByteString 62 | strval idx r = value (\_->"") idx r 63 | 64 | diabClass :: Int -> ArffRecord -> DiabetesClass 65 | diabClass i r = 66 | let s = strval i r 67 | in case s of 68 | "tested_negative" -> TestedNegative 69 | "tested_positive" -> TestedPositive 70 | _ -> UnknownClass 71 | 72 | in fmap toPD recs -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Gapminder.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-} 3 | 4 | {-| 5 | 6 | Gapminder dataset - Life expectancy, GDP, population every five years per country 7 | 8 | Source: 9 | 10 | More information: https://cran.r-project.org/web/packages/gapminder/gapminder.pdf 11 | 12 | -} 13 | 14 | module 
Numeric.Datasets.Gapminder where 15 | 16 | import Numeric.Datasets 17 | 18 | import Data.Csv 19 | import GHC.Generics 20 | import Control.Applicative 21 | import Data.Text (Text) 22 | import Network.HTTP.Req ((/:), https, Scheme(..)) 23 | 24 | data Gapminder = Gapminder 25 | { country :: Text 26 | , year :: Int 27 | , pop :: Integer 28 | , continent :: Text 29 | , lifeExp :: Double 30 | , gdpPercap :: Double 31 | } deriving (Show, Read, Generic) 32 | 33 | instance FromNamedRecord Gapminder where 34 | parseNamedRecord m = Gapminder <$> 35 | m .: "country" <*> 36 | m .: "year" <*> 37 | (roundIt <$> m .: "pop") <*> 38 | m .: "continent" <*> 39 | m .: "lifeExp" <*> 40 | m .: "gdpPercap" 41 | where roundIt :: Double -> Integer 42 | roundIt = round 43 | 44 | gapminder :: Dataset Gapminder 45 | gapminder = csvHdrDataset 46 | $ URL $ https "raw.githubusercontent.com" /: "plotly" /: "datasets" /: "master" /: "gapminderDataFiveYear.csv" 47 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Internal/Streaming.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE FlexibleContexts #-} 2 | {-# LANGUAGE GADTs #-} 3 | {-# LANGUAGE LambdaCase #-} 4 | {-# LANGUAGE OverloadedStrings #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# OPTIONS_GHC -Wno-unused-top-binds -Wno-name-shadowing #-} 7 | module Numeric.Datasets.Internal.Streaming 8 | ( streamDataset 9 | , streamByteString 10 | ) where 11 | 12 | import Control.Exception.Safe (MonadThrow, Exception, throwString, throwM) 13 | import Control.Monad.Error.Class (MonadError) 14 | import Control.Monad.IO.Class (MonadIO, liftIO) 15 | import Data.Attoparsec.ByteString.Lazy (Parser) 16 | import Data.Maybe (fromMaybe) 17 | import Data.Text.Encoding (decodeUtf8) 18 | import Data.List.NonEmpty (NonEmpty, toList) 19 | import Streaming (Stream, Of((:>)), lift) 20 | import qualified Data.ByteString.Streaming as BS (fromLazy, 
ByteString, null) 21 | import qualified Data.ByteString as B' (pack) 22 | import qualified Data.ByteString.Lazy as B (ByteString, concat) 23 | import qualified Data.List as L (intercalate) 24 | import qualified Data.Attoparsec.ByteString.Streaming as Atto (parse) 25 | import qualified Data.Attoparsec.ByteString.Lazy as Atto (anyWord8) 26 | import qualified Streaming as S (hoist, unfold) 27 | import qualified Streaming.Cassava as S (decodeWith, decodeByNameWith, CsvParseException) 28 | import qualified Streaming.Prelude as S (print, maps) 29 | 30 | import Numeric.Datasets 31 | import Streaming.Instances () 32 | 33 | -- | Stream a dataset 34 | streamDataset 35 | :: forall io a . (MonadThrow io, MonadIO io) 36 | => Dataset a 37 | -> Stream (Of a) io () 38 | streamDataset ds = do 39 | folder <- liftIO $ defaultTempDir (temporaryDirectory ds) 40 | files <- liftIO $ getFileFromSource folder (source ds) 41 | streamByteString (readAs ds) (fromMaybe id (preProcess ds) <$> files) 42 | 43 | 44 | -- | Stream a ByteString into a Haskell value 45 | streamByteString 46 | :: forall m a 47 | . (MonadThrow m) 48 | => ReadAs a 49 | -> NonEmpty B.ByteString 50 | -> Stream (Of a) m () 51 | streamByteString ra bs = _streamDataset ra (BS.fromLazy $ B.concat $ toList bs) 52 | 53 | 54 | -- private function which uses the streaming interface of bytestring 55 | _streamDataset 56 | :: forall mt a e 57 | . 
(MonadThrow mt, Exception e) 58 | => (MonadError S.CsvParseException (Either e)) 59 | => ReadAs a 60 | -> BS.ByteString (Either e) () 61 | -> Stream (Of a) mt () 62 | _streamDataset ra bs = 63 | case ra of 64 | JSON -> lift $ throwString "Not implemented: JSON streaming" 65 | CSVRecord hhdr opts -> S.hoist either2Throw $ S.decodeWith opts hhdr bs 66 | CSVNamedRecord opts -> S.hoist either2Throw $ S.decodeByNameWith opts bs 67 | MultiRecordParsable _ -> lift $ throwString "Not implemented: MultiRecord streaming" 68 | Parsable psr -> parseStream psr (S.hoist either2Throw bs) 69 | ImageFolder _ -> lift $ throwString "Not implemented: Image Folder streaming, use Dataloader" 70 | where 71 | either2Throw :: MonadThrow m => (forall x e . Exception e => Either e x -> m x) 72 | either2Throw = \case 73 | Left e -> throwM e 74 | Right r -> pure r 75 | 76 | 77 | -- private function to generate a stream from a parser 78 | parseStream 79 | :: forall m a . MonadThrow m => Parser a -> BS.ByteString m () -> Stream (Of a) m () 80 | parseStream psr = S.unfold go 81 | where 82 | go :: BS.ByteString m () -> m (Either () (Of a (BS.ByteString m ()))) 83 | go bs = do 84 | (eea, rest) <- Atto.parse psr bs 85 | BS.null rest >>= \(empty :> _) -> 86 | if empty 87 | then pure $ Left () 88 | else case eea of 89 | Left (es, lst) -> throwString (lst ++ "\n" ++ L.intercalate "\n" es) 90 | Right a -> pure $ Right (a :> rest) 91 | 92 | 93 | -- make this a real test 94 | test :: IO () 95 | test = do 96 | S.print $ S.maps render $ parseStream Atto.anyWord8 (BS.fromLazy "1") 97 | S.print $ S.maps render $ parseStream Atto.anyWord8 (BS.fromLazy "1 2 3 4") 98 | where 99 | render (a:>b) = (decodeUtf8 (B'.pack [a]) :> b) 100 | 101 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Iris.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, TemplateHaskell #-} 2 | 3 
| {-| 4 | 5 | The classical Iris dataset, due to R.A. Fisher. 6 | 7 | UCI ML Repository link 8 | 9 | -} 10 | 11 | module Numeric.Datasets.Iris where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Data.FileEmbed 18 | import Data.ByteString.Lazy (fromStrict) 19 | 20 | 21 | data IrisClass = Setosa | Versicolor | Virginica 22 | deriving (Show, Read, Eq, Ord, Generic, Enum, Bounded) 23 | 24 | instance FromField IrisClass where 25 | parseField "Iris-setosa" = return Setosa 26 | parseField "Iris-versicolor" = return Versicolor 27 | parseField "Iris-virginica" = return Virginica 28 | parseField _ = fail "unknown iris class" 29 | 30 | data Iris = Iris 31 | { sepalLength :: Double 32 | , sepalWidth :: Double 33 | , petalLength :: Double 34 | , petalWidth :: Double 35 | , irisClass :: IrisClass 36 | } deriving (Show, Read, Generic) 37 | 38 | instance FromRecord Iris 39 | 40 | iris :: [Iris] 41 | iris = readDataset csvRecord (fromStrict $(embedFile "datafiles/iris.data")) 42 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Michelson.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, TemplateHaskell #-} 2 | 3 | {-| 4 | 5 | Michelson's speed of light dataset - five repeated measurements of the speed of light. 6 | 7 | Data from 8 | 9 | The embedded dataset is Copyright (c) 2015 The Compute.io Authors. 
10 | 11 | -} 12 | 13 | module Numeric.Datasets.Michelson where 14 | 15 | import Numeric.Datasets 16 | import Data.FileEmbed 17 | import Data.ByteString.Lazy (fromStrict) 18 | 19 | 20 | michelson :: [[Double]] 21 | michelson = readDataset JSON (fromStrict $(embedFile "datafiles/michelson.json")) 22 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Mushroom.hs~: -------------------------------------------------------------------------------- 1 | module Numeric.Datasets.Mushroom where 2 | 3 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Nightingale.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, TemplateHaskell #-} 2 | 3 | {-| 4 | 5 | Florence Nightingale's count of injuries in the Crimean War, used for her rose plots 6 | 7 | Data from 8 | 9 | The embedded dataset is Copyright (c) 2015 The Compute.io Authors. 
10 | 11 | -} 12 | 13 | module Numeric.Datasets.Nightingale where 14 | 15 | import Numeric.Datasets 16 | import Data.FileEmbed 17 | import Data.ByteString.Lazy (fromStrict) 18 | import Data.Aeson hiding (parseJSON) 19 | import Data.Time (UTCTime) 20 | import GHC.Generics 21 | 22 | 23 | data Nightingale = Nightingale 24 | { date :: UTCTime 25 | , army_size :: Int 26 | , disease :: Int 27 | , wounds :: Int 28 | , other :: Int 29 | } deriving (Show, Read, Generic) 30 | 31 | instance FromJSON Nightingale 32 | 33 | nightingale :: [Nightingale] 34 | nightingale = readDataset JSON $ fromStrict $(embedFile "datafiles/nightingale.json") 35 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/OldFaithful.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-} 3 | 4 | {-| 5 | 6 | Old Faithful Geyser Eruption data 7 | 8 | Article: http://web.pdx.edu/~jfreder/M212/oldfaithful.pdf 9 | 10 | These data from: 11 | 12 | For more data, see 13 | 14 | -} 15 | 16 | module Numeric.Datasets.OldFaithful where 17 | 18 | import Numeric.Datasets 19 | 20 | import Data.Csv 21 | import Control.Applicative 22 | import Network.HTTP.Req ((/:), https, Scheme(..)) 23 | 24 | 25 | data OldFaithful = OldFaithful 26 | { waiting :: Double -- ^ waiting time until next eruption 27 | , duration :: Double -- ^ duration of eruption in minutes 28 | } deriving Show 29 | 30 | instance FromRecord OldFaithful where 31 | parseRecord v = OldFaithful <$> v .! 2 <*> v.! 
1 32 | 33 | oldFaithful :: Dataset OldFaithful 34 | oldFaithful 35 | = let src = URL $ https "raw.githubusercontent.com" /: "vincentarelbundock" /: "Rdatasets" /: "master" /: "csv" /: "datasets" /: "faithful.csv" 36 | in Dataset src Nothing Nothing $ CSVRecord HasHeader defaultDecodeOptions 37 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Quakes.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | 3 | {-| 4 | 5 | Locations of Earthquakes off Fiji 6 | 7 | Listed as quakes here: http://vincentarelbundock.github.io/Rdatasets/datasets.html 8 | 9 | 10 | -} 11 | 12 | module Numeric.Datasets.Quakes where 13 | 14 | import Numeric.Datasets 15 | 16 | import Data.Csv 17 | import GHC.Generics 18 | import Network.HTTP.Req ((/:), http, Scheme(..)) 19 | 20 | data Quake = Quake 21 | { lat :: Double 22 | , long :: Double 23 | , depth :: Double 24 | , mag :: Double 25 | , stations :: Int 26 | } deriving (Show, Read, Generic) 27 | 28 | instance FromNamedRecord Quake 29 | 30 | quakes :: Dataset Quake 31 | quakes = csvHdrDataset 32 | $ URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "datasets" /: "quakes.csv" 33 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/States.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-} 3 | 4 | {-| 5 | 6 | Data on education in US states 7 | 8 | 9 | 10 | -} 11 | 12 | module Numeric.Datasets.States where 13 | 14 | import Numeric.Datasets 15 | 16 | import Data.Csv 17 | import GHC.Generics 18 | import Control.Applicative 19 | import Network.HTTP.Req ((/:), http, Scheme(..)) 20 | 21 | data StateEdu = StateEdu 22 | { state :: String 23 | , region :: String 24 | , 
population :: Int 25 | , satVerbal :: Int 26 | , satMath :: Int 27 | , satPercent :: Int 28 | , dollarSpend :: Double 29 | , teacherPay :: Int 30 | } deriving (Show, Read, Generic) 31 | 32 | instance FromNamedRecord StateEdu where 33 | parseNamedRecord m = StateEdu <$> 34 | m .: "" <*> 35 | m .: "region" <*> 36 | m .: "pop" <*> 37 | m .: "SATV" <*> 38 | m .: "SATM" <*> 39 | m .: "percent" <*> 40 | m .: "dollars" <*> 41 | m .: "pay" 42 | 43 | states :: Dataset StateEdu 44 | states = csvHdrDataset 45 | $ URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "car" /: "States.csv" 46 | 47 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Sunspots.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-} 3 | 4 | {-| 5 | 6 | Monthly sunspots from 1749 7 | 8 | Listed as sunspot.month here: http://vincentarelbundock.github.io/Rdatasets/datasets.html 9 | 10 | See 11 | 12 | -} 13 | 14 | module Numeric.Datasets.Sunspots where 15 | 16 | import Numeric.Datasets 17 | 18 | import Data.Csv 19 | import GHC.Generics 20 | import Control.Applicative 21 | import Network.HTTP.Req ((/:), http, Scheme(..)) 22 | 23 | data Sunspot = Sunspot 24 | { time :: Double 25 | , sunspotMonth :: Double 26 | } deriving (Show, Read, Generic) 27 | 28 | instance FromNamedRecord Sunspot where 29 | parseNamedRecord m = Sunspot <$> 30 | m .: "time" <*> 31 | m .: "value" 32 | 33 | sunspots :: Dataset Sunspot 34 | sunspots = csvHdrDataset 35 | $ URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "datasets" /: "sunspot.month.csv" 36 | 37 | -- "http://vincentarelbundock.github.io/Rdatasets/csv/datasets/sunspot.month.csv" 38 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Titanic.hs: 
-------------------------------------------------------------------------------- 1 | {-# language DeriveGeneric, OverloadedStrings, LambdaCase, DataKinds #-} 2 | {-| 3 | 4 | Titanic Data Set. For each person on board the fatal maiden voyage of the ocean liner SS Titanic, this dataset records Sex, Age (child/adult), Class (Crew, 1st, 2nd, 3rd Class) and whether or not the person survived. 5 | 6 | ``The Titanic survival data seem to become to categorical data analysis what Fisher's Iris data are to discriminant analysis.'' - Buja A.: A word from the editor of JCGS. Statistical Computing & Graphics Newsletter 10 (1), pp.32-33, 1999. 7 | 8 | Retrieved from: . A copy of the dataset can be found in datasets/titanic2_full.tsv . 9 | 10 | Header: 11 | 12 | "PassengerId" "Survived" "Pclass" "Name" "Sex" "Age" "SibSp" "Parch" "Ticket" "Fare" "Cabin" "Embarked" 13 | 14 | Example rows : 15 | 16 | 10 1 2 "Nasser, Mrs. Nicholas (Adele Achem)" "female" 14 1 0 "237736" 30.0708 "" "C" 17 | 29 1 3 "O'Dwyer, Miss. Ellen \"Nellie\"" "female" NA 0 0 "330959" 7.8792 "" "Q" 18 | 19 | -} 20 | module Numeric.Datasets.Titanic (titanicRemote, titanicLocal, TitanicEntry(..), Class(..), Age(..), Sex(..))where 21 | 22 | import Numeric.Datasets 23 | 24 | import Data.Csv 25 | import GHC.Generics 26 | import Network.HTTP.Req ((/:), https, Scheme(..)) 27 | 28 | data TitanicEntry = TitanicEntry { 29 | tClass :: Class 30 | , tAge :: Age 31 | , tSex :: Sex 32 | , tSurvived :: Bool -- ^ Did the passenger survive ? 
33 | } deriving (Eq, Read, Show, Generic) 34 | 35 | 36 | instance FromNamedRecord TitanicEntry where 37 | parseNamedRecord v = TitanicEntry <$> 38 | (parseClass <$> v .: "Pclass") <*> 39 | (v .: "Age") <*> 40 | (parseSex <$> v .: "Sex") <*> 41 | (parseBool <$> v .: "Survived") 42 | 43 | 44 | data Class = First | Second | Third | Crew deriving (Eq, Read, Show, Generic, Enum, Bounded) 45 | 46 | parseClass :: String -> Class 47 | parseClass = \case 48 | "1" -> First 49 | "2" -> Second 50 | "3" -> Third 51 | "Crew" -> Crew 52 | x -> error $ unwords ["Unexpected feature value :", show x] 53 | 54 | 55 | newtype Age = Age (Maybe Double) deriving (Eq, Read, Show, Generic) 56 | 57 | -- | The "Age" field requires a custom FromField instance because its value may be "NA" 58 | instance FromField Age where 59 | parseField s = case s of 60 | "NA" -> pure $ Age Nothing 61 | ss -> case runParser (parseField ss :: Parser Double) of 62 | Left _ -> pure $ Age Nothing 63 | Right x -> pure $ Age $ Just x 64 | 65 | data Sex = Female | Male deriving (Eq, Read, Show, Generic, Enum, Bounded) 66 | 67 | parseSex :: String -> Sex 68 | parseSex = \case 69 | "female" -> Female 70 | "male" -> Male 71 | x -> error $ unwords ["Unexpected feature value :", show x] 72 | 73 | parseBool :: String -> Bool 74 | parseBool = \case 75 | "1" -> True 76 | "0" -> False 77 | x -> error $ unwords ["Unexpected feature value :", show x] 78 | 79 | -- | The Titanic dataset, to be downloaded from 80 | titanicRemote :: Dataset TitanicEntry 81 | titanicRemote = withPreprocess removeEscQuotes $ csvHdrDatasetSep '\t' $ URL $ https "raw.githubusercontent.com" /: "JackStat" /: "6003Data" /: "master" /: "Titanic.txt" 82 | 83 | -- | The Titanic dataset, parsed from a local copy 84 | titanicLocal :: Dataset TitanicEntry 85 | titanicLocal = withPreprocess removeEscQuotes $ csvHdrDatasetSep '\t' $ File "datafiles/titanic2_full.tsv" 86 | 87 | 88 | 89 | 90 | 91 | 
-------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Titanic.hs~: -------------------------------------------------------------------------------- 1 | {-# language DeriveGeneric, OverloadedStrings, TemplateHaskell #-} 2 | {-| 3 | 4 | Titanic Data Set. For each person on board the fatal maiden voyage of the ocean liner SS Titanic, this dataset records Sex, Age (child/adult), Class (Crew, 1st, 2nd, 3rd Class) and whether or not the person survived. 5 | 6 | ``The Titanic survival data seem to become to categorical data analysis what Fisher's Iris data are to discriminant analysis.'' - Buja A.: A word from the editor of JCGS. Statistical Computing & Graphics Newsletter 10 (1), pp.32-33, 1999. 7 | 8 | Retrieved from: 9 | 10 | -} 11 | module Numeric.Datasets.Titanic where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Data.FileEmbed 18 | import Data.ByteString.Lazy (fromStrict) 19 | 20 | 21 | titanic = readDataset $ csvHdrDatasetSep '\t' (fromStrict $(embedFile "datafiles/titanic.tsv")) 22 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/UN.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | 3 | {-| 4 | 5 | GDP and infant mortality 6 | 7 | 8 | 9 | -} 10 | 11 | module Numeric.Datasets.UN where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Control.Applicative 18 | import Network.HTTP.Req ((/:), http, Scheme(..)) 19 | 20 | data GdpMortality = GdpMortality 21 | { country :: String 22 | , infantMortality :: Maybe Int 23 | , gdp :: Maybe Int 24 | } deriving (Show, Read, Generic) 25 | 26 | instance FromNamedRecord GdpMortality where 27 | parseNamedRecord m = GdpMortality <$> 28 | m .: "" <*> 29 | (m .: "infant.mortality" <|> return Nothing) <*> 30 | (m .: 
"gdp" <|> return Nothing) 31 | 32 | gdpMortalityUN :: Dataset GdpMortality 33 | gdpMortalityUN = csvHdrDataset 34 | $ URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "car" /: "UN.csv" 35 | 36 | -- "http://vincentarelbundock.github.io/Rdatasets/csv/car/UN.csv" 37 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Vocabulary.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | 3 | {-| 4 | 5 | Vocabulary and Education 6 | 7 | 8 | 9 | -} 10 | 11 | module Numeric.Datasets.Vocabulary where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Network.HTTP.Req ((/:), http, Scheme(..)) 18 | 19 | data Sex = Female | Male 20 | deriving (Show, Read, Eq, Generic, Bounded, Enum) 21 | 22 | instance FromField Sex where 23 | parseField = parseReadField 24 | 25 | data Vocab = Vocab 26 | { year :: Integer 27 | , sex :: Sex 28 | , education :: Int 29 | , vocabulary :: Int 30 | } deriving (Show, Read, Generic) 31 | 32 | instance FromNamedRecord Vocab 33 | 34 | vocab :: Dataset Vocab 35 | vocab = csvHdrDataset 36 | $ URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "car" /: "Vocab.csv" 37 | -------------------------------------------------------------------------------- /datasets/src/Numeric/Datasets/Wine.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-} 2 | 3 | {-| 4 | 5 | Wine Data set 6 | 7 | UCI ML Repository link 8 | 9 | -} 10 | 11 | module Numeric.Datasets.Wine where 12 | 13 | import Numeric.Datasets 14 | 15 | import Data.Csv 16 | import GHC.Generics 17 | import Network.HTTP.Req ((/:), http, Scheme(..)) 18 | 19 | data Wine = Wine 20 | { wineClass :: Int 21 | , alcohol :: Double 22 | , malicAcid :: Double 23 | , ash :: Double 24 | , 
-- Positional (headerless) parsing: wine.data has no header row, so the
-- generic 'FromRecord' instance maps CSV columns to 'Wine' fields in
-- declaration order.
instance FromRecord Wine

-- | Wine dataset fetched from the UMass ML repository mirror.
-- 'fixAmericanDecimals' is applied to the raw bytes before parsing —
-- presumably it normalises decimal separators; confirm in "Numeric.Datasets".
wine :: Dataset Wine
wine = withPreprocess fixAmericanDecimals $
       csvDataset $ URL $ umassMLDB /: "wine" /: "wine.data"
-- | Red and white wine quality datasets from the UMass ML repository
-- mirror.  Both files carry a header row and use ';' (not ',') as the
-- field separator, hence 'csvHdrDatasetSep'.
redWineQuality, whiteWineQuality :: Dataset WineQuality
redWineQuality = csvHdrDatasetSep ';'
   $ URL $ umassMLDB /: "wine-quality" /: "winequality-red.csv"

whiteWineQuality = csvHdrDatasetSep ';'
   $ URL $ umassMLDB /: "wine-quality" /: "winequality-white.csv"
2010 Bryan O'Sullivan 2 | All rights reserved. 3 | 4 | Redistribution and use in source and binary forms, with or without 5 | modification, are permitted provided that the following conditions 6 | are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 20 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 | -------------------------------------------------------------------------------- /dense-linear-algebra/README.md: -------------------------------------------------------------------------------- 1 | # statistics-dense-linear-algebra 2 | 3 | [![Build Status](https://travis-ci.org/githubuser/statistics-dense-linear-algebra.png)](https://travis-ci.org/githubuser/statistics-dense-linear-algebra) 4 | 5 | The dense linear algebra functionality and related modules, extracted from `statistics-0.14.0.2` 6 | -------------------------------------------------------------------------------- /dense-linear-algebra/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /dense-linear-algebra/bench/ChronosBench.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import qualified Statistics.Matrix as M 4 | import qualified Statistics.Matrix.Fast as F 5 | import qualified Statistics.Matrix.Fast.Algorithms as FA 6 | import Statistics.Matrix (Matrix (..)) 7 | import qualified Statistics.Matrix.Algorithms as A 8 | 9 | import qualified Data.Vector.Unboxed as U 10 | import Data.Vector.Unboxed (Vector) 11 | 12 | import qualified System.Random.MWC as Mwc 13 | 14 | import qualified Chronos.Bench as C 15 | 16 | n :: Int 17 | n = 100 18 | 19 | testVector :: IO (Vector Double) 20 | testVector = do 21 | gen <- Mwc.create 22 | Mwc.uniformVector gen (n*n) 23 | 24 | 25 | testMatrix :: IO Matrix 26 | testMatrix = do 27 | vec <- testVector 28 | return $ Matrix n n vec 29 | 30 | runtimelight :: Vector Double -> Matrix -> IO () 31 | runtimelight v a = do 32 | let 33 | v2 = U.take n v 34 | 35 | C.defaultMainWith (C.defaultConfig {C.timeout = Just 3}) [ 36 | C.bench "norm" M.norm v2, 37 | C.bench "Fast.norm" F.norm v2, 38 | 39 | C.bench "multiplyV" (M.multiplyV a) 
(v2), 40 | C.bench "Fast.multiplyV" (F.multiplyV a) (v2), 41 | 42 | C.bench "transpose" M.transpose a, 43 | C.bench "Fast.transpose" F.transpose a , 44 | C.bench "ident" M.ident n, 45 | C.bench "diag" M.diag v2 46 | ] 47 | 48 | runtimeheavy :: Matrix -> Matrix -> IO () 49 | runtimeheavy a b = do 50 | 51 | C.defaultMainWith (C.defaultConfig {C.timeout = Just 1}) [ 52 | C.bench "multiply" (M.multiply a) b, 53 | C.bench "Fast.multiply" (F.multiply a) b, 54 | C.bench "qr" A.qr a, 55 | C.bench "Fast.qr" FA.qr a 56 | ] 57 | 58 | 59 | main :: IO () 60 | main = do 61 | v <- testVector 62 | a <- testMatrix 63 | b <- testMatrix 64 | 65 | -- 66 | putStrLn "---Benchmarking light operations---" 67 | -- we split heavy and light, we lose some precision in the bar plots from chronos 68 | runtimelight v a 69 | putStrLn "---Benchmarking heavy operations---" 70 | runtimeheavy a b 71 | -------------------------------------------------------------------------------- /dense-linear-algebra/bench/WeighBench.hs: -------------------------------------------------------------------------------- 1 | module Main where 2 | 3 | import qualified Statistics.Matrix as M 4 | import qualified Statistics.Matrix.Fast as F 5 | import qualified Statistics.Matrix.Fast.Algorithms as FA 6 | import Statistics.Matrix (Matrix (..)) 7 | import qualified Statistics.Matrix.Algorithms as A 8 | 9 | import qualified Data.Vector.Unboxed as U 10 | import Data.Vector.Unboxed (Vector) 11 | 12 | import qualified System.Random.MWC as Mwc 13 | 14 | import qualified Weigh as W 15 | 16 | n :: Int 17 | n = 100 18 | 19 | testVector :: IO (Vector Double) 20 | testVector = do 21 | gen <- Mwc.create 22 | Mwc.uniformVector gen (n*n) 23 | 24 | 25 | testMatrix :: IO Matrix 26 | testMatrix = do 27 | vec <- testVector 28 | return $ Matrix n n vec 29 | 30 | 31 | weight :: Vector Double -> Matrix -> Matrix -> IO () 32 | weight v a b = do 33 | let 34 | v2 = U.take n v 35 | W.mainWith (do 36 | W.func "norm" M.norm v2 37 | W.func 
"Fast.norm" F.norm v2 38 | 39 | W.func "multiplyV" (M.multiplyV a) (v2) 40 | W.func "Fast.multiplyV" (F.multiplyV a) (v2) 41 | W.func "transpose" M.transpose a 42 | W.func "Fast.transpose" F.transpose a 43 | W.func "ident" M.ident n 44 | W.func "diag" M.diag v2 45 | 46 | W.func "multiply" (M.multiply a) b 47 | W.func "Fast.multiply" (F.multiply a) b 48 | 49 | W.func "qr" A.qr a 50 | W.func "Fast.qr" FA.qr a 51 | 52 | ) 53 | 54 | 55 | 56 | main :: IO () 57 | main = do 58 | v <- testVector 59 | a <- testMatrix 60 | b <- testMatrix 61 | 62 | putStrLn "---Benchmarking memory consumption---" 63 | weight v a b 64 | -------------------------------------------------------------------------------- /dense-linear-algebra/dense-linear-algebra.cabal: -------------------------------------------------------------------------------- 1 | name: dense-linear-algebra 2 | version: 0.2.0.0 3 | synopsis: Simple and incomplete pure haskell implementation of linear algebra 4 | description: 5 | This library is simply collection of linear-algebra related modules 6 | split from statistics library. 
7 | 8 | license: BSD2 9 | license-file: LICENSE 10 | author: Bryan O'Sullivan 11 | maintainer: Alexey Khudaykov 12 | copyright: 2018 Author name here 13 | category: Math, Statistics, Numeric 14 | build-type: Simple 15 | extra-source-files: README.md 16 | CHANGELOG.md 17 | cabal-version: >=1.10 18 | tested-with: GHC==7.6.3, GHC==7.8.3, GHC==7.10.3, GHC==8.0.1 19 | 20 | library 21 | default-language: Haskell2010 22 | ghc-options: -Wall -O2 23 | hs-source-dirs: src 24 | exposed-modules: Statistics.Matrix 25 | Statistics.Matrix.Algorithms 26 | Statistics.Matrix.Function 27 | Statistics.Matrix.Mutable 28 | Statistics.Matrix.Types 29 | Statistics.Matrix.Fast 30 | Statistics.Matrix.Fast.Algorithms 31 | build-depends: base >= 4.5 && < 5 32 | , deepseq >= 1.1.0.2 33 | , math-functions >= 0.1.7 34 | , primitive >= 0.3 35 | , vector >= 0.10 36 | , vector-algorithms >= 0.4 37 | , vector-th-unbox 38 | , vector-binary-instances >= 0.2.1 39 | , Decimal 40 | 41 | test-suite spec 42 | default-language: Haskell2010 43 | ghc-options: -Wall 44 | type: exitcode-stdio-1.0 45 | hs-source-dirs: test 46 | other-modules: Fixtures, 47 | Utils, 48 | AlgorithmsSpec 49 | main-is: LibSpec.hs 50 | build-depends: base 51 | , dense-linear-algebra 52 | , hspec 53 | , QuickCheck 54 | , vector 55 | , Decimal 56 | 57 | benchmark weigh-bench 58 | type: exitcode-stdio-1.0 59 | main-is: WeighBench.hs 60 | hs-source-dirs: bench 61 | default-extensions: OverloadedStrings 62 | ghc-options: -Wall 63 | build-depends: 64 | base, 65 | dense-linear-algebra, 66 | vector, 67 | mwc-random, 68 | weigh 69 | 70 | benchmark chronos-bench 71 | type: exitcode-stdio-1.0 72 | main-is: ChronosBench.hs 73 | hs-source-dirs: bench 74 | default-extensions: OverloadedStrings 75 | ghc-options: -Wall 76 | build-depends: 77 | base, 78 | dense-linear-algebra, 79 | chronos-bench, 80 | vector, 81 | mwc-random 82 | 83 | source-repository head 84 | type: git 85 | location: https://github.com/bos/statistics 86 | 
-- | Compute the QR decomposition of a matrix via classical
-- Gram–Schmidt orthogonalisation, performed in place on a thawed copy.
-- The result returned is the matrices (/q/,/r/).
--
-- NOTE(review): the original comment claimed /O(r*c)/, but the three
-- nested loops below (columns × later columns × rows) make this
-- O(r*c^2) for an r×c input.
-- NOTE(review): a zero column makes @cn == 0@ and the @(/ cn)@ below
-- divides by zero — the input is presumably assumed full-rank; confirm.
qr :: Matrix -> (Matrix, Matrix)
qr mat = runST $ do
  let (m,n) = dimension mat

  r <- M.replicate n n 0
  a <- M.thaw mat
  for 0 n $ \j -> do
    -- Euclidean norm of column j becomes the diagonal entry R[j,j],
    -- then the column is normalised to unit length.
    cn <- M.immutably a $ \aa -> norm (column aa j)
    M.unsafeWrite r j j cn
    for 0 m $ \i -> M.unsafeModify a i j (/ cn)
    -- Remove the component along (unit) column j from every later
    -- column; the projection coefficient p becomes R[j,jj].
    for (j+1) n $ \jj -> do
      p <- innerProduct a j jj
      M.unsafeWrite r j jj p
      for 0 m $ \i -> do
        aij <- M.unsafeRead a i j
        M.unsafeModify a i jj $ subtract (p * aij)
  (,) <$> M.unsafeFreeze a <*> M.unsafeFreeze r

-- | Dot product of columns j and k of a mutable matrix, summed with
-- Kahan–Babuška ('kbn') compensation for accuracy.
innerProduct :: M.MMatrix s -> Int -> Int -> ST s Double
innerProduct mmat j k = M.immutably mmat $ \mat ->
  sumVector kbn $ U.zipWith (*) (column mat j) (column mat k)
-- | Matrix-matrix multiplication in a more imperative fashion. Matrices must be of compatible
-- sizes (/note: not checked/). Faster but less accurate than Statistics.Matrix.multiply
-- (the inner sum in 'accum' is uncompensated, unlike the kbn-summed version).
multiply :: Matrix -> Matrix -> Matrix
multiply m1@(Matrix r1 _ _) m2@(Matrix _ c2 _) = runST $ do
  m3 <- unsafeNew r1 c2
  -- Column-major fill order over the r1×c2 result.
  for 0 c2 $ \j -> do
    for 0 r1 $ \i -> do
      let
        z = accum i m1 j m2
      unsafeWrite m3 i j z
  unsafeFreeze m3

-- | Dot product of row @ithrow@ of the first matrix with column
-- @jthcol@ of the second, via strict tail recursion over the shared
-- dimension (c1).  Uses 'U.unsafeIndex': no bounds checks, so the
-- caller must guarantee conformable sizes.
accum :: Int -> Matrix -> Int -> Matrix -> Double
accum ithrow (Matrix _ c1 v1) jthcol (Matrix _ c2 v2) = sub 0 0
  where sub !acc !ij | ij == c1 = acc
                     | otherwise = sub ( valRow*valCol + acc ) (ij+1)
          where
            valRow = U.unsafeIndex v1 (ithrow*c1 + ij)
            valCol = U.unsafeIndex v2 (ij*c2+jthcol)
-- | Transpose a matrix.
--
-- Fixed: the previous implementation swapped entries pairwise using the
-- /input/ row stride @c0@ to index the /result/ as well, which is only
-- valid for square matrices.  For an r0×c0 input with c0 > r0 it read
-- past the end of the data vector ('U.unsafeIndex' at
-- @i + (c0-1)*c0 >= r0*c0@), and for any r0 /= c0 it produced a wrong
-- layout.  This version indexes input and output with their own strides
-- and is correct for all rectangular shapes (including r0 == c0).
transpose :: Matrix -> Matrix
transpose (Matrix r0 c0 v0)
  = Matrix c0 r0 $ runST $ do
      vec <- UM.unsafeNew (r0*c0)
      -- The result is c0×r0 row-major, so its element (j,i) lives at
      -- offset j*r0 + i and equals input element (i,j) at i*c0 + j.
      for 0 r0 $ \i ->
        for 0 c0 $ \j ->
          UM.unsafeWrite vec (j * r0 + i) $ v0 `U.unsafeIndex` (i * c0 + j)
      U.unsafeFreeze vec
-- In-place Gram–Schmidt QR, like Statistics.Matrix.Algorithms.qr but
-- with hand-rolled column norms/dot products ('normCol', 'dotCol')
-- instead of materialising column vectors, and plain (uncompensated)
-- accumulation — faster, less accurate.
qr :: Matrix -> (Matrix, Matrix)
qr mat = runST $ do
  let (m,n) = dimension mat

  r <- M.replicate n n 0
  a <- M.thaw mat
  for 0 n $ \j -> do
    -- Column norm (sqrt of 'normCol', which returns the squared norm)
    -- becomes the diagonal entry R[j,j]; the column is then normalised.
    cn <- M.immutably a $ \aa -> sqrt $ normCol j aa
    M.unsafeWrite r j j cn
    for 0 m $ \i -> M.unsafeModify a i j (/ cn)
    -- Orthogonalise every later column against (unit) column j.
    for (j+1) n $ \jj -> do
      p <- innerProduct a j jj
      M.unsafeWrite r j jj p
      for 0 m $ \i -> do
        aij <- M.unsafeRead a i j
        M.unsafeModify a i jj $ subtract (p * aij)
  (,) <$> M.unsafeFreeze a <*> M.unsafeFreeze r

-- | /Squared/ Euclidean norm of column @jthcol@, by strict tail
-- recursion over the rows with unchecked indexing.
normCol :: Int -> Matrix -> Double
normCol jthcol (Matrix r c v) = sub 0 0
  where sub !acc !ij | ij == r = acc
                     | otherwise = sub ( valCol*valCol + acc ) (ij+1)
          where
            valCol = U.unsafeIndex v (ij*c+jthcol)

-- | Dot product of columns j and k of a mutable matrix, computed on a
-- frozen snapshot via 'dotCol'.
innerProduct :: M.MMatrix s -> Int -> Int -> ST s Double
innerProduct mmat j k = M.immutably mmat $ \mat ->
  dotCol j mat k mat

-- | Dot product of column @jthcol@ of the first matrix with column
-- @kthcol@ of the second; iterates over the first matrix's row count,
-- so the matrices must have equal row counts (not checked).
dotCol :: Int -> Matrix -> Int -> Matrix -> Double
dotCol jthcol (Matrix r1 c1 v1) kthcol (Matrix _ c2 v2) = sub 0 0
  where sub !acc !ij | ij == r1 = acc
                     | otherwise = sub ( valColj*valColk + acc ) (ij+1)
          where
            valColk = U.unsafeIndex v2 (ij*c2+kthcol)
            valColj = U.unsafeIndex v1 (ij*c1+jthcol)
-- | Simple for loop.  Counts from /start/ to /end/-1.
--
-- Fixed: the loop previously terminated only on @i == n@, so a start
-- strictly greater than the end looped forever; with @i >= n@ that
-- case is a no-op, and all previously-valid uses behave identically.
for :: Monad m => Int -> Int -> (Int -> m ()) -> m ()
for n0 !n f = loop n0
  where
    loop i | i >= n    = return ()
           | otherwise = f i >> loop (i+1)
{-# INLINE for #-}
42 | unsafeNew :: Int -- ^ Number of row 43 | -> Int -- ^ Number of columns 44 | -> ST s (MMatrix s) 45 | unsafeNew r c 46 | | r < 0 = error "Statistics.Matrix.Mutable.unsafeNew: negative number of rows" 47 | | c < 0 = error "Statistics.Matrix.Mutable.unsafeNew: negative number of columns" 48 | | otherwise = do 49 | vec <- M.new (r*c) 50 | return $ MMatrix r c vec 51 | 52 | unsafeRead :: MMatrix s -> Int -> Int -> ST s Double 53 | unsafeRead mat r c = unsafeBounds mat r c M.unsafeRead 54 | {-# INLINE unsafeRead #-} 55 | 56 | unsafeWrite :: MMatrix s -> Int -> Int -> Double -> ST s () 57 | unsafeWrite mat row col k = unsafeBounds mat row col $ \v i -> 58 | M.unsafeWrite v i k 59 | {-# INLINE unsafeWrite #-} 60 | 61 | unsafeModify :: MMatrix s -> Int -> Int -> (Double -> Double) -> ST s () 62 | unsafeModify mat row col f = unsafeBounds mat row col $ \v i -> do 63 | k <- M.unsafeRead v i 64 | M.unsafeWrite v i (f k) 65 | {-# INLINE unsafeModify #-} 66 | 67 | -- | Given row and column numbers, calculate the offset into the flat 68 | -- row-major vector. 69 | bounds :: MMatrix s -> Int -> Int -> (MVector s -> Int -> r) -> r 70 | bounds (MMatrix rs cs mv) r c k 71 | | r < 0 || r >= rs = error "row out of bounds" 72 | | c < 0 || c >= cs = error "column out of bounds" 73 | | otherwise = k mv $! r * cs + c 74 | {-# INLINE bounds #-} 75 | 76 | -- | Given row and column numbers, calculate the offset into the flat 77 | -- row-major vector, without checking. 78 | unsafeBounds :: MMatrix s -> Int -> Int -> (MVector s -> Int -> r) -> r 79 | unsafeBounds (MMatrix _ cs mv) r c k = k mv $! 
-- | Run a pure function over a frozen view of a mutable matrix.
-- The freeze is 'unsafeFreeze' (no copy), so the result is forced to
-- normal form ('rnf') before returning — otherwise a lazy thunk could
-- observe later mutations of the shared underlying vector.
immutably :: NFData a => MMatrix s -> (Matrix -> a) -> ST s a
immutably mmat f = do
  k <- f <$> unsafeFreeze mmat
  rnf k `seq` return k
{-# INLINE immutably #-}
unwords) . split $ zipWith (++) ldone tdone 54 | hdr0 = show (r,c) ++ " " 55 | hdr = replicate (length hdr0) ' ' 56 | pad plus k xs = replicate (k - length xs) ' ' `plus` xs 57 | ldone = map (pad (++) (longest lstr)) lstr 58 | tdone = map (pad (flip (++)) (longest tstr)) tstr 59 | (lstr, tstr) = unzip . map (break (=='.') . render) . U.toList $ vs 60 | longest = maximum . map length 61 | render k = reverse . dropWhile (=='.') . dropWhile (=='0') . reverse . 62 | showFFloat (Just 4) k $ "" 63 | split [] = [] 64 | split xs = i : split rest where (i, rest) = splitAt c xs 65 | cleanEnd = reverse . dropWhile isSpace . reverse 66 | -------------------------------------------------------------------------------- /dense-linear-algebra/stack.yaml: -------------------------------------------------------------------------------- 1 | resolver: lts-12.14 2 | 3 | packages: 4 | - . 5 | # Dependency packages to be pulled from upstream that are not in the resolver 6 | # using the same syntax as the packages field. 
-- | Multiplying the Q and R factors back together must reproduce the
-- original matrix.
--
-- Fixed: the original compared the reconstructed product with '==',
-- i.e. exact Double equality, which is unreliable in the presence of
-- rounding.  Use the same 8-digit decimal comparison the sibling
-- invariant 'qrFirstOrthoInvariant' relies on (via 'Utils.isOrtho');
-- exact equality implies decimal equality, so no previously-passing
-- case can start failing.
qrDecompositionInvariant :: T.Matrix -> Bool
qrDecompositionInvariant mat =
  Utils.isEqual 8 (M.multiply (fst res) (snd res)) mat
  where
    res = Alg.qr mat

-- | Small 3×3 matrix fixtures shared by the test suite
-- (imported as @F@ in LibSpec and used by AlgorithmsSpec checks).
module Fixtures where

import qualified Statistics.Matrix as M
import qualified Statistics.Matrix.Types as T

-- | Entry generator: f a b = (a + b + 1) / (b + 1), on Doubles.
f :: Int -> Int -> Double
f a b = (c + d + 1) / (d + 1) where
  c = fromIntegral a
  d = fromIntegral b

-- | Entry generator: g a b = (a + b + 1) / (a*b + 1), on Doubles.
g :: Int -> Int -> Double
g a b = (c + d + 1) / (c * d + 1) where
  c = fromIntegral a
  d = fromIntegral b

-- Row-list literals for matC/matD.  Note c is singular (rows are in
-- arithmetic progression); d is a generic full matrix.
c :: [[Double]]
c = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]

d :: [[Double]]
d = [[3.0, 2.5, 1.6], [5.5, 2.3, 6.9], [3.7, 9.1, 10.0]]

matA :: T.Matrix
matA = M.generate 3 3 f

matB :: T.Matrix
matB = M.generate 3 3 g

matC :: T.Matrix
matC = M.fromRowLists c

matD :: T.Matrix
matD = M.fromRowLists d

-- | 3×3 identity, used as the reference for orthogonality checks.
matId :: T.Matrix
matId = M.ident 3
-- | Checks if the n'th row of the matrix has 'n' leading zeros.
-- (n <= 0 or n beyond the row length degrade gracefully: 'U.take'
-- clamps, and an empty prefix is vacuously all-zero.)
--
-- Fixed idiom: the original folded '(&&)' over a mapped vector, which
-- cannot short-circuit; 'U.all' is the direct equivalent and stops at
-- the first non-zero entry.
hasLeadingZeros :: Int -> T.Vector -> Bool
hasLeadingZeros n row = U.all (== 0) (U.take n row)
-- | to check if the given matrix is invertible
--
-- NOTE(review): despite its name, the helper 'isFloatZero' returns
-- True when |det| > eps, i.e. when the determinant is clearly
-- *non*-zero — so this predicate is behaviourally correct, but the
-- helper's name is inverted.  Renaming it (e.g. to isFloatNonZero)
-- would touch both definitions, so it is only flagged here.
isInvertible :: T.Matrix -> Bool
isInvertible mat = isFloatZero (det mat) 0.0000001
(P.head mat)] 86 | listMatrix = M.toRowLists mat 87 | in 88 | detHelper listMatrix 89 | 90 | -- | getRest function simply returns the matrix without the head row (topmost) and without the ith column 91 | getRest :: Int -> [[Double]] -> [[Double]] 92 | getRest col mat = 93 | let 94 | decapitatedMat = P.tail mat 95 | in 96 | P.map (leaveElement col) decapitatedMat 97 | 98 | 99 | ----------------------------- auxiliary helper functions ----------------------------------- 100 | 101 | 102 | -- | function to delete i'th element from list 103 | leaveElement :: Ord a => Int -> [a] -> [a] 104 | leaveElement _ [] = [] 105 | leaveElement i (x:xs) 106 | | i == 0 = xs 107 | | otherwise = x : leaveElement (i-1) xs 108 | 109 | -- | compare a double value to some epsilon 110 | isFloatZero :: Double -> Double -> Bool 111 | isFloatZero n eps = 112 | if (abs n) > (abs eps) 113 | then True 114 | else False 115 | -------------------------------------------------------------------------------- /dh-core-accelerate/README.md: -------------------------------------------------------------------------------- 1 | # core-accelerate 2 | 3 | [![Build Status](https://travis-ci.org/DataHaskell/core-accelerate.png)](https://travis-ci.org/DataHaskell/core-accelerate) 4 | 5 | TODO Description. 
6 | -------------------------------------------------------------------------------- /dh-core-accelerate/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /dh-core-accelerate/core-accelerate.cabal: -------------------------------------------------------------------------------- 1 | name: core-accelerate 2 | version: 0.1.0.0 3 | -- synopsis: 4 | -- description: 5 | homepage: https://github.com/DataHaskell/core-accelerate 6 | license: BSD3 7 | license-file: LICENSE 8 | author: DataHaskell 9 | maintainer: Data-Haskell@haskell.org 10 | copyright: 2018 DataHaskell 11 | category: Data Science 12 | build-type: Simple 13 | extra-source-files: README.md 14 | cabal-version: >=1.10 15 | tested-with: GHC == 8.4.3 16 | 17 | library 18 | default-language: Haskell2010 19 | ghc-options: -Wall 20 | hs-source-dirs: src 21 | exposed-modules: Lib 22 | build-depends: base >= 4.7 && < 5 23 | , accelerate 24 | 25 | -- executable core-accelerate 26 | -- default-language: Haskell2010 27 | -- ghc-options: -threaded -rtsopts -with-rtsopts=-N 28 | -- hs-source-dirs: app 29 | -- main-is: Main.hs 30 | -- build-depends: base 31 | -- , core-accelerate 32 | 33 | test-suite spec 34 | default-language: Haskell2010 35 | ghc-options: -Wall 36 | type: exitcode-stdio-1.0 37 | hs-source-dirs: test 38 | main-is: Spec.hs 39 | build-depends: base 40 | , core-accelerate 41 | , hspec 42 | , QuickCheck 43 | 44 | source-repository head 45 | type: git 46 | location: https://github.com/DataHaskell/core-accelerate 47 | -------------------------------------------------------------------------------- /dh-core-accelerate/src/Lib.hs: -------------------------------------------------------------------------------- 1 | module Lib where 2 | 3 | -------------------------------------------------------------------------------- 
/dh-core-accelerate/test/LibSpec.hs: -------------------------------------------------------------------------------- 1 | module LibSpec where 2 | 3 | import Test.Hspec 4 | import Test.Hspec.QuickCheck 5 | 6 | import Lib 7 | 8 | main :: IO () 9 | main = hspec spec 10 | 11 | spec :: Spec 12 | spec = 13 | describe "Lib" $ do 14 | it "works" $ do 15 | True `shouldBe` True 16 | -- prop "ourAdd is commutative" $ \x y -> 17 | -- ourAdd x y `shouldBe` ourAdd y x 18 | -------------------------------------------------------------------------------- /dh-core-accelerate/test/Spec.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} 2 | -------------------------------------------------------------------------------- /dh-core/.gitignore: -------------------------------------------------------------------------------- 1 | ## Haskell 2 | dist 3 | dist-* 4 | cabal-dev 5 | *.o 6 | *.hi 7 | *.chi 8 | *.chs.h 9 | *.dyn_o 10 | *.dyn_hi 11 | .hpc 12 | .hsenv 13 | .cabal-sandbox/ 14 | cabal.sandbox.config 15 | *.prof 16 | *.aux 17 | *.hp 18 | *.eventlog 19 | .stack-work/ 20 | cabal.project.local 21 | .HTF/ 22 | 23 | -------------------------------------------------------------------------------- /dh-core/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | 0.1.0.0 2 | First version 3 | -------------------------------------------------------------------------------- /dh-core/LICENSE: -------------------------------------------------------------------------------- 1 | Copyright DataHaskell (c) 2018 2 | 3 | All rights reserved. 4 | 5 | Redistribution and use in source and binary forms, with or without 6 | modification, are permitted provided that the following conditions are met: 7 | 8 | * Redistributions of source code must retain the above copyright 9 | notice, this list of conditions and the following disclaimer. 
10 | 11 | * Redistributions in binary form must reproduce the above 12 | copyright notice, this list of conditions and the following 13 | disclaimer in the documentation and/or other materials provided 14 | with the distribution. 15 | 16 | * Neither the name of DataHaskell nor the names of other 17 | contributors may be used to endorse or promote products derived 18 | from this software without specific prior written permission. 19 | 20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | -------------------------------------------------------------------------------- /dh-core/README.md: -------------------------------------------------------------------------------- 1 | # Building 2 | 3 | Developers can use this `stack` command, which will trigger a re-build and run the tests every time a file in the project is modified: 4 | 5 | $ stack build --test --ghc-options -Wall --file-watch 6 | 7 | 8 | # GHC and Stackage compatibility 9 | 10 | Tested against: 11 | 12 | - Stackage LTS-12.13 (GHC 8.4.3) -------------------------------------------------------------------------------- /dh-core/Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /dh-core/core.cabal: -------------------------------------------------------------------------------- 1 | name: core 2 | version: 0.1.0.0 3 | synopsis: DataHaskell core project 4 | description: DataHaskell core project, please see README 5 | homepage: https://github.com/DataHaskell/dh-core 6 | license: BSD3 7 | license-file: LICENSE 8 | author: DataHaskell 9 | maintainer: Marco Zocca (ocramz fripost org), data-haskell@haskell.org 10 | copyright: 2018 DataHaskell 11 | category: Data Science 12 | build-type: Simple 13 | extra-source-files: README.md 14 | CHANGELOG.md 15 | cabal-version: >=1.10 16 | tested-with: GHC == 8.2.2, GHC == 8.4.3 17 | 18 | library 19 | default-language: Haskell2010 20 | ghc-options: -Wall 21 | hs-source-dirs: src 22 | exposed-modules: Lib 23 | Core.Numeric.BLAS 24 | Core.Numeric.BLAS.Class 25 | Core.Numeric.Statistics.Classification.DecisionTrees 26 | Core.Numeric.Statistics.Classification.Utils 27 | Core.Numeric.Statistics.Classification.Exceptions 28 | Core.Numeric.Statistics.InformationTheory 29 | Core.Data.Dataset 30 | Core.Data.Datum.Vector 31 | 32 | build-depends: 33 | base >=4.10 && <5, 34 | bytestring 
>=0.10.8.1, 35 | containers >=0.5.7.1, 36 | exceptions >=0.8.3, 37 | mwc-random, 38 | primitive, 39 | text >=1.2.2.2, 40 | vector >=0.12.0.1, 41 | vector-algorithms >=0.7.0.1, 42 | -- DEBUG 43 | hspec >=2.4.8, 44 | QuickCheck >=2.10.1 45 | 46 | -- executable core 47 | -- default-language: Haskell2010 48 | -- ghc-options: -threaded -rtsopts -with-rtsopts=-N 49 | -- hs-source-dirs: app 50 | -- main-is: Main.hs 51 | -- build-depends: base 52 | -- , core 53 | 54 | test-suite spec 55 | default-language: Haskell2010 56 | ghc-options: -Wall 57 | type: exitcode-stdio-1.0 58 | hs-source-dirs: test 59 | main-is: Spec.hs 60 | other-modules: LibSpec 61 | build-depends: base >=4.10 && < 5, 62 | core -any, 63 | hspec >=2.4.8, 64 | QuickCheck >=2.10.1 65 | 66 | source-repository head 67 | type: git 68 | location: https://github.com/DataHaskell/dh-core 69 | -------------------------------------------------------------------------------- /dh-core/src/Core/Data/Dataset.hs: -------------------------------------------------------------------------------- 1 | {-# language DeriveFunctor, DeriveFoldable, DeriveTraversable #-} 2 | {-# language TypeFamilies #-} 3 | module Core.Data.Dataset where 4 | 5 | import qualified Data.Foldable as F (maximumBy) 6 | import Data.Ord (comparing) 7 | 8 | import qualified Data.Map.Strict as M (Map, empty, fromList, toList, fromListWith, mapWithKey, foldl', foldrWithKey, foldlWithKey', insert) 9 | import qualified Data.Map.Internal.Debug as M (showTree) 10 | -- import qualified Data.IntMap.Strict as IM 11 | -- import qualified Data.Set as S 12 | 13 | import System.Random.MWC 14 | import Control.Monad.Primitive 15 | 16 | import Core.Numeric.Statistics.Classification.Utils (Indexed(..), bootstrapNP) 17 | 18 | -- | Labeled dataset represented as a 'Map'. 
The map keys are the class labels 19 | newtype Dataset k a = Dataset { unDataset :: M.Map k a } deriving (Eq, Show, Functor, Foldable, Traversable) 20 | 21 | showTree :: (Show k, Show a) => Dataset k a -> String 22 | showTree (Dataset mm) = M.showTree mm 23 | 24 | empty :: Dataset k a 25 | empty = Dataset M.empty 26 | 27 | insert :: Ord k => k -> a -> Dataset k a -> Dataset k a 28 | insert k ls (Dataset ds) = Dataset $ M.insert k ls ds 29 | 30 | mapWithKey :: (k -> a -> b) -> Dataset k a -> Dataset k b 31 | mapWithKey f (Dataset ds) = Dataset $ M.mapWithKey f ds 32 | 33 | foldrWithKey :: (k -> a -> b -> b) -> b -> Dataset k a -> b 34 | foldrWithKey f z (Dataset ds) = M.foldrWithKey f z ds 35 | 36 | foldlWithKey' :: (a -> k -> b -> a) -> a -> Dataset k b -> a 37 | foldlWithKey' f z (Dataset ds) = M.foldlWithKey' f z ds 38 | 39 | fromList :: Ord k => [(k, a)] -> Dataset k a 40 | fromList ld = Dataset $ M.fromList ld 41 | 42 | fromListWith :: Ord k => (a -> a -> a) -> [(k, a)] -> Dataset k a 43 | fromListWith f ld = Dataset $ M.fromListWith f ld 44 | 45 | toList :: Dataset k a -> [(k, a)] 46 | toList (Dataset ds) = M.toList ds 47 | 48 | -- lookup :: Ord k => k -> Dataset k a -> Maybe a 49 | -- lookup k (Dataset ds) = M.lookup k ds 50 | 51 | -- | Size of the dataset 52 | size :: Foldable t => Dataset k (t a) -> Int 53 | size (Dataset ds) = M.foldl' (\acc l -> acc + length l) 0 ds 54 | 55 | -- | Maximum likelihood estimate of class label 56 | mlClass :: Dataset k [a] -> k 57 | mlClass = fst . F.maximumBy (comparing f) . toList where 58 | f (_, ll) = length ll 59 | 60 | 61 | -- | Number of items in each class 62 | sizeClasses :: (Foldable t, Num n) => Dataset k (t a) -> M.Map k n 63 | sizeClasses (Dataset ds) = (fromIntegral . length) <$> ds 64 | 65 | -- | Empirical class probabilities i.e. 
for each k, number of items in class k / total number of items 66 | probClasses :: (Fractional prob, Foldable t) => Dataset k (t a) -> M.Map k prob 67 | probClasses ds = (\n -> n / fromIntegral (size ds)) <$> sizeClasses ds 68 | 69 | 70 | -- * Bootstrap 71 | 72 | -- | Nonparametric bootstrap: each class is resampled (i.e. sampled with replacement) 73 | bootstrap :: (Indexed f, PrimMonad m, Ix f ~ Int) => 74 | Dataset k (f a) 75 | -> Int -- ^ Number of samples 76 | -> Int -- ^ Number of bootstrap resamples 77 | -> Gen (PrimState m) 78 | -> m [Dataset k [a]] 79 | bootstrap ds@Dataset{} nsamples nboot gen = do 80 | dss <- traverse (bootstrapNP nsamples nboot gen) ds 81 | pure $ sequenceA dss 82 | -------------------------------------------------------------------------------- /dh-core/src/Core/Data/Datum/Vector.hs: -------------------------------------------------------------------------------- 1 | {-# language DeriveFunctor, DeriveFoldable, DeriveTraversable, GeneralizedNewtypeDeriving #-} 2 | module Core.Data.Datum.Vector where 3 | 4 | import qualified Data.IntMap as IM 5 | -- import qualified Data.Vector.Unboxed as VU 6 | import qualified Data.Vector as V 7 | 8 | import Control.Monad.Catch (MonadThrow(..)) 9 | import Core.Numeric.Statistics.Classification.Exceptions 10 | 11 | newtype FeatureLabels = FeatureLabels (IM.IntMap String) deriving (Eq, Show) 12 | 13 | featureLabels :: MonadThrow m => Int -> [String] -> m FeatureLabels 14 | featureLabels n ls 15 | | length ls == n = pure $ FeatureLabels $ IM.fromList $ zip [0..] ls 16 | | otherwise = throwM $ DimMismatchE "featureLabels" n (length ls) 17 | 18 | lookupFeatureLabelUnsafe :: IM.Key -> FeatureLabels -> String 19 | lookupFeatureLabelUnsafe i (FeatureLabels fl) = fl IM.! i 20 | 21 | 22 | -- | A data point i.e. a vector in R^n 23 | newtype V a = V (V.Vector a) deriving (Eq, Show, Functor, Foldable, Traversable, Applicative, Monad) 24 | 25 | fromListV :: [a] -> V a 26 | fromListV = V . 
V.fromList 27 | {-# inline fromListV #-} 28 | 29 | toListV :: V a -> [a] 30 | toListV (V vv) = V.toList vv 31 | 32 | zipV :: V a -> V b -> V (a, b) 33 | zipV (V v1) (V v2) = V $ V.zip v1 v2 34 | {-# inline zipV #-} 35 | 36 | unzipV :: V (a, b) -> (V a, V b) 37 | unzipV (V vs) = (V v1, V v2) where 38 | (v1, v2) = V.unzip vs 39 | {-# inline unzipV #-} 40 | 41 | mkV :: MonadThrow m => Int -> V.Vector a -> m (V a) 42 | mkV n xs | dxs == n = pure $ V xs 43 | | otherwise = throwM $ DimMismatchE "mkV" n dxs where 44 | dxs = V.length xs 45 | 46 | indexUnsafe :: V a -> Int -> a 47 | (V vv) `indexUnsafe` j = vv V.! j 48 | {-# inline indexUnsafe #-} 49 | 50 | (!) :: MonadThrow m => V a -> Int -> m a 51 | v ! j | j >= 0 && j < d = pure $ v `indexUnsafe` j 52 | | otherwise = throwM $ IndexOobE "(!)" j 0 d where 53 | d = dim v 54 | 55 | dim :: V a -> Int 56 | dim (V vv) = V.length vv 57 | {-# inline dim #-} 58 | 59 | foldrWithKey :: (Int -> a -> b -> b) -> b -> V a -> b 60 | foldrWithKey f z vv = foldr ins z $ zipV (fromListV [0..]) vv where 61 | ins (i, x) acc = f i x acc 62 | {-# inline foldrWithKey #-} 63 | 64 | 65 | dataSplitDecision :: (a -> Bool) -> Int -> (V a -> Bool) 66 | dataSplitDecision p j dat = p (dat `indexUnsafe` j) 67 | 68 | -- allComponents :: V (a -> Bool) -> V a -> Bool 69 | -- allComponents ps dat = all (== True) $ f <$> vps where 70 | -- vps = zipV ps dat 71 | -- f (p, vi) = p vi 72 | -- -- allComponents ps dat = all (== True) $ ps <*> dat 73 | 74 | 75 | 76 | 77 | -- | Vectors with measurable entries 78 | 79 | -- data Measurable a = BoundedBoth a a deriving (Eq, Show) 80 | 81 | -- newtype Xf f a = Xf (V.Vector (f a)) 82 | 83 | -- type XMeas = Xf Measurable 84 | -------------------------------------------------------------------------------- /dh-core/src/Core/Numeric/BLAS.hs: -------------------------------------------------------------------------------- 1 | module Core.Numeric.BLAS where 2 | 3 | 
-------------------------------------------------------------------------------- /dh-core/src/Core/Numeric/Statistics/Classification/DecisionTrees.hs: -------------------------------------------------------------------------------- 1 | {-# language DeriveFunctor, DeriveFoldable, DeriveTraversable #-} 2 | {-# OPTIONS_GHC -Wno-type-defaults #-} 3 | module Core.Numeric.Statistics.Classification.DecisionTrees where 4 | 5 | import qualified Data.Foldable as F 6 | import qualified Data.Set as S 7 | import Data.Ord (comparing) 8 | 9 | import Core.Data.Dataset 10 | import qualified Core.Data.Datum.Vector as XV 11 | -- import Core.Numeric.Statistics.Classification.Utils 12 | import Core.Numeric.Statistics.InformationTheory (entropyR) 13 | 14 | -- | A binary tree. 15 | -- 16 | -- Each leaf carries data of type 'a' and we can attach metadata of type 'd' at each branching point. 17 | data Tree d a = 18 | Node d (Tree d a) (Tree d a) 19 | | Leaf a 20 | deriving (Eq, Show, Functor, Foldable, Traversable) 21 | 22 | unfoldTree :: (t -> Either a (d, t, t)) -> t -> Tree d a 23 | unfoldTree f x = 24 | either Leaf (\(d, l, r) -> Node d (unfoldTree f l) (unfoldTree f r) ) (f x) 25 | 26 | 27 | -- | Tree state : list of candidate dataset cuts (feature #, level) 28 | data TState k a = TState { 29 | tsFeatCuts :: S.Set (Int, a) 30 | , tsDataset :: Dataset k [XV.V a] } 31 | 32 | -- | Tree state + local tree depth 33 | data TSd k a = TSd { tsDepth :: !Int, tState :: TState k a } 34 | 35 | -- | Global options for growing decision trees 36 | data TOptions = TOptions { 37 | toMaxDepth :: !Int -- ^ Max tree depth 38 | , toMinLeafSize :: !Int -- ^ Minimum size of the contents of a leaf 39 | , toOrder :: Order -- ^ Less than | Equal or larger than 40 | } deriving (Eq, Show) 41 | 42 | -- | Tree node metadata 43 | -- 44 | -- For decision trees, at each node we store the decision feature and its decision threshold 45 | data TNData a = TNData { 46 | tJStar :: !Int -- ^ Decision feature index 47 | , 
tTStar :: a -- ^ Decision threshold 48 | } deriving (Eq) 49 | 50 | instance Show a => Show (TNData a) where 51 | show (TNData j t) = unwords ["(j =", show j, ", t =", show t, ")"] 52 | 53 | 54 | -- | Split decision: find feature (value, index) that maximizes the entropy drop (i.e the information gain, or KL divergence between the joint and factored datasets) 55 | -- 56 | -- NB generates empty leaves 57 | treeUnfoldStep :: (Ord a, Ord k) => 58 | TOptions 59 | -> TSd k a 60 | -> Either (Dataset k [XV.V a]) (TNData a, TSd k a, TSd k a) 61 | treeUnfoldStep (TOptions maxdepth minls ord) (TSd depth tst) 62 | | depth >= maxdepth || sizeDs tst <= minls = Left (tsDataset tst) 63 | | sizeDs tsl == 0 = Left (tsDataset tsr) 64 | | sizeDs tsr == 0 = Left (tsDataset tsl) 65 | | otherwise = Right (mdata, tdsl, tdsr) 66 | where 67 | sizeDs = size . tsDataset 68 | mdata = TNData jstar tstar 69 | (jstar, tstar, tsl, tsr) = maxInfoGainSplit ordf tst 70 | ordf = fromOrder ord 71 | d' = depth + 1 72 | tdsl = TSd d' tsl 73 | tdsr = TSd d' tsr 74 | 75 | 76 | 77 | {- | Note (OPTIMIZATIONS maxInfoGainSplit) 78 | 79 | 1. After splitting a dataset, remove the (threshold, feature index) pair corresponding to the succesful split 80 | 81 | 2. " " " " , remove /all/ (threshold, index) pairs that are subsumed by the successful test (e.g in the test ((<), 3.2, 27) , remove all [(t, 27) | t <- [tmin ..], t < 3.2 ] ). This is only a useful optimization for /monotonic/ class boundaries. 82 | -} 83 | 84 | -- | Tabulate the information gain for a number of decision thresholds and return a decision function corresponding to the threshold that yields the maximum information gain. 
85 | maxInfoGainSplit :: (Ord k, Ord a, Eq a) => 86 | (a -> a -> Bool) 87 | -> TState k a 88 | -> (Int, a, TState k a, TState k a) 89 | maxInfoGainSplit decision (TState tjs ds) = (jstar, tstar, TState tjs' dsl, TState tjs' dsr) where 90 | tjs' = S.delete (jstar, tstar) tjs -- See Note (OPTIMIZATIONS maxInfoGainSPlit) 91 | (jstar, tstar, _, dsl, dsr) = F.maximumBy (comparing third5) $ infog `map` S.toList tjs 92 | infog (j, t) = (j, t, h, dsl', dsr') where 93 | (h, dsl', dsr') = infoGainR (decision t) j ds 94 | 95 | 96 | third5 :: (a, b, c, d, e) -> c 97 | third5 (_, _, c, _, _) = c 98 | 99 | -- | Information gain due to a dataset split (regularized, H(0) := 0) 100 | infoGainR :: (Ord k, Ord h, Floating h) => 101 | (a -> Bool) 102 | -> Int 103 | -> Dataset k [XV.V a] 104 | -> (h, Dataset k [XV.V a], Dataset k [XV.V a]) 105 | infoGainR p j ds = (infoGain, dsl, dsr) where 106 | (dsl, pl, dsr, pr) = splitDatasetAtAttr p j ds 107 | (h0, hl, hr) = (entropyR ds, entropyR dsl, entropyR dsr) 108 | infoGain = h0 - (pl * hl + pr * hr) 109 | 110 | 111 | -- | helper function for 'infoGain' and 'infoGainR' 112 | splitDatasetAtAttr :: (Fractional n, Ord k) => 113 | (a -> Bool) 114 | -> Int 115 | -> Dataset k [XV.V a] 116 | -> (Dataset k [XV.V a], n, Dataset k [XV.V a], n) 117 | splitDatasetAtAttr p j ds = (dsl, pl, dsr, pr) where 118 | sz = fromIntegral . size 119 | (dsl, dsr) = partition p j ds 120 | (s0, sl, sr) = (sz ds, sz dsl, sz dsr) 121 | pl = sl / s0 122 | pr = sr / s0 123 | 124 | 125 | -- | Partition a Dataset in two, according to a decision predicate applied to a given feature. 126 | -- 127 | -- e.g. "is the j'th component of datum X_i larger than threshold t ?" 
128 | partition :: (Foldable t, Ord k) => 129 | (a -> Bool) -- ^ Decision function (element-level) 130 | -> Int -- ^ Feature index 131 | -> Dataset k (t (XV.V a)) 132 | -> (Dataset k [XV.V a], Dataset k [XV.V a]) 133 | partition p j ds@Dataset{} = foldrWithKey insf (empty, empty) ds where 134 | insf k lrow (l, r) = (insert k lp l, insert k rp r) where 135 | (lp, rp) = partition1 (XV.dataSplitDecision p j) lrow 136 | 137 | 138 | 139 | -- | Partition a Foldable in two lists according to a predicate 140 | partition1 :: Foldable t => (a -> Bool) -> t a -> ([a], [a]) 141 | partition1 p = foldr ins ([], []) where 142 | ins x (l, r) | p x = (x : l, r) 143 | | otherwise = (l , x : r) 144 | 145 | 146 | 147 | -- | A well-defined Ordering, for strict half-plane separation 148 | data Order = LessThan | GreaterOrEqual deriving (Eq, Ord, Enum, Bounded) 149 | instance Show Order where 150 | show LessThan = "<" 151 | show GreaterOrEqual = ">=" 152 | 153 | fromOrder :: Ord a => Order -> (a -> a -> Bool) 154 | fromOrder o = case o of 155 | LessThan -> (<) 156 | _ -> (>=) 157 | -------------------------------------------------------------------------------- /dh-core/src/Core/Numeric/Statistics/Classification/Exceptions.hs: -------------------------------------------------------------------------------- 1 | module Core.Numeric.Statistics.Classification.Exceptions where 2 | 3 | import Control.Exception 4 | import Data.Typeable 5 | 6 | 7 | -- * Exceptions 8 | 9 | data ValueException = ZeroProbabilityE String deriving (Eq, Show, Typeable) 10 | 11 | instance Exception ValueException 12 | 13 | 14 | data DataException = 15 | -- MissingFeatureE i 16 | IndexOobE String Int Int Int 17 | | DimMismatchE String Int Int 18 | deriving (Eq, Typeable) 19 | 20 | instance Show DataException where 21 | show e = case e of 22 | IndexOobE errMsg ix blo bhi -> unwords [errMsg, ": index", show ix,"out of bounds", show (blo, bhi)] 23 | DimMismatchE errMsg d1 d2 -> unwords [errMsg, ": dimension mismatch : 
expecting", show d1, "but got", show d2] 24 | 25 | instance Exception DataException 26 | -------------------------------------------------------------------------------- /dh-core/src/Core/Numeric/Statistics/Classification/Utils.hs: -------------------------------------------------------------------------------- 1 | {-# language TypeFamilies #-} 2 | module Core.Numeric.Statistics.Classification.Utils where 3 | 4 | -- import qualified Data.Foldable as F (maximumBy, foldl', toList) 5 | import qualified Data.IntSet as SI 6 | -- import qualified Data.Set as S 7 | import qualified Data.IntMap.Strict as IM 8 | 9 | import System.Random.MWC 10 | import System.Random.MWC.Distributions 11 | import Control.Monad.Primitive 12 | 13 | import Control.Monad (foldM, replicateM) 14 | import Data.Maybe (maybeToList) 15 | 16 | -- import Control.Monad.Catch (MonadThrow(..)) 17 | -- import Numeric.Classification.Exceptions 18 | 19 | -- * Bootstrap 20 | 21 | -- | Non-parametric bootstrap 22 | bootstrapNP :: (Indexed f, PrimMonad m, Ix f ~ Int) => 23 | Int -- ^ Number of samples 24 | -> Int -- ^ Number of bootstrap resamples 25 | -> Gen (PrimState m) 26 | -> f a -- ^ Dataset 27 | -> m [[a]] 28 | bootstrapNP nsamples nboot gen mm = replicateM nboot (resample nsamples mm gen) 29 | 30 | -- * 31 | 32 | -- | Sample with replacement 33 | resample :: (Indexed f, PrimMonad m, Ix f ~ Int) => 34 | Int -> f b -> Gen (PrimState m) -> m [b] 35 | resample nsamples im gen = lookups (resampleIxs nsamples gen) im 36 | 37 | -- | Sample without replacement : return a list of at most M unique random samples from an indexed map of size N : O(N) 38 | sample :: (Indexed f, PrimMonad m, Ix f ~ Int) => 39 | Int -> f b -> Gen (PrimState m) -> m [b] 40 | sample nsamples im gen = lookups (sampleIxs nsamples gen) im 41 | 42 | lookups :: (Monad m, Monoid (t b), Traversable t, Indexed f) => 43 | (Int -> m (t (Ix f))) -- ^ Sampling function 44 | -> f b 45 | -> m (t b) 46 | lookups f mm = do 47 | ixs <- f (length mm) 48 | 
pure $ mconcat . maybeToList $ traverse (`ix` mm) ixs 49 | 50 | -- -- | Lookup a random subset and its complement 51 | -- lookups2 :: (Monad m, Monoid (t b), Traversable t, Indexed f) => 52 | -- (Int -> m (t (Ix f), t (Ix f))) 53 | -- -> f b 54 | -- -> m (t b, t b) 55 | -- lookups2 f mm = do 56 | -- (ixl, ixr) <- f (length mm) 57 | -- let lookupf = mconcat . maybeToList . traverse (`ix` mm) 58 | -- pure (lookupf ixl, lookupf ixr) 59 | 60 | 61 | 62 | 63 | resampleIxs :: PrimMonad m => Int -> Gen (PrimState m) -> Int -> m [Int] 64 | resampleIxs nsamples gen n = replicateM nsamples (uniformR (0, n - 1) gen) 65 | 66 | sampleIxs :: PrimMonad m => Int -> Gen (PrimState m) -> Int -> m [Int] 67 | sampleIxs nsamples gen n = SI.toList <$> sampleUniques nsamples gen n 68 | 69 | -- | Random split based on extracting 'm' unique entries from a set of size 'n > m' 70 | randomSplit :: PrimMonad m => 71 | Int -- ^ Number of samples 72 | -> Int -- ^ Size of universe set 73 | -> Gen (PrimState m) 74 | -> m (SI.IntSet, SI.IntSet) 75 | randomSplit nsamples n gen = do 76 | srand <- sampleUniques nsamples gen n 77 | let s0 = SI.fromList [0 .. n - 1] 78 | sDiff = s0 `SI.difference` srand 79 | pure (srand, sDiff) 80 | 81 | -- | Stochastic random split of a set of size 'n', based on a Bernoulli trial of parameter '0 <= p <= 1'; /on average/, m = p * n samples will be inserted in the left set, and n - m will be inserted in the right one. 82 | randomSplitBernoulli :: PrimMonad m => 83 | Double -- ^ Parameter of Bernoulli trial 84 | -> Int 85 | -> Gen (PrimState m) 86 | -> m (SI.IntSet, SI.IntSet) 87 | randomSplitBernoulli p n gen = foldM insf (SI.empty, SI.empty) [0.. 
n-1] where 88 | insf (sl, sr) i = do 89 | c <- bernoulli p gen 90 | pure $ if c then 91 | (SI.insert i sl, sr) 92 | else 93 | (sl, SI.insert i sr) 94 | 95 | 96 | 97 | -- sampleNoReplace iml nsamples gen 98 | -- | nsamples > n = pure $ throwM $ DimMismatchE "sampleIM" n nsamples 99 | -- | otherwise = do 100 | 101 | 102 | -- | Sample without replacement : choose a set S of M unique random samples from a population of size N 103 | sampleUniques :: PrimMonad m => 104 | Int -- ^ # of unique numbers to sample (M) 105 | -> Gen (PrimState m) 106 | -> Int -- ^ Population size (N) 107 | -> m SI.IntSet 108 | sampleUniques nsamples gen n = foldM sample1 SI.empty [p .. n - 1] where 109 | p = n - nsamples + 1 110 | sample1 s j = do 111 | t <- uniformR (0, j) gen 112 | let set' = 113 | if not (SI.member t s) 114 | then 115 | SI.insert t s 116 | else 117 | SI.insert j s 118 | return set' 119 | 120 | -- stest n ntot = withSystemRandom . asGenIO $ \g -> do 121 | -- let set = S.fromList [0..ntot - 1] 122 | -- sampleUniques set n g 123 | 124 | 125 | 126 | 127 | -- | Indexable containers 128 | class Foldable f => Indexed f where 129 | type Ix f :: * 130 | ix :: Ix f -> f a -> Maybe a 131 | 132 | instance Indexed [] where 133 | type Ix [] = Int 134 | ix = indexSafe 135 | 136 | instance Indexed IM.IntMap where 137 | type Ix IM.IntMap = IM.Key 138 | ix = IM.lookup 139 | 140 | 141 | indexSafe :: Int -> [a] -> Maybe a 142 | indexSafe i ll | i < length ll = Just $ ll !! i 143 | | otherwise = Nothing 144 | -------------------------------------------------------------------------------- /dh-core/src/Core/Numeric/Statistics/InformationTheory.hs: -------------------------------------------------------------------------------- 1 | module Core.Numeric.Statistics.InformationTheory where 2 | 3 | import Core.Data.Dataset 4 | 5 | -- | Differential entropy has a singularity at 0 but converges slowly to 0 for small positive values. 
6 | entropyR :: (Foldable t, Ord h, Floating h) => Dataset k (t a) -> h 7 | entropyR = entropyR_ . probClasses 8 | 9 | entropyR_ :: (Foldable t, Functor t, Ord c, Floating c) => t c -> c 10 | entropyR_ ps = negate . sum $ entropyReg <$> ps where 11 | entropyReg p | p > 0 = p * logBase 2 p 12 | | otherwise = 0 13 | 14 | 15 | -- | Gini index (expected error rate) 16 | gini :: (Foldable t, Floating c) => Dataset k (t a) -> c 17 | gini = gini_ . probClasses 18 | 19 | gini_ :: (Foldable t, Functor t, Floating c) => t c -> c 20 | gini_ ps = 1 - sum ((**2) <$> ps) 21 | -------------------------------------------------------------------------------- /dh-core/src/Lib.hs: -------------------------------------------------------------------------------- 1 | module Lib where 2 | 3 | -------------------------------------------------------------------------------- /dh-core/stack.yaml: -------------------------------------------------------------------------------- 1 | # resolver: lts-12.13 # GHC 8.4.3 2 | # resolver: lts-11.22 # GHC 8.2.2 3 | # resolver: lts-13.0 # GHC 8.6.3 4 | resolver: nightly-2019-02-27 5 | # resolver: nightly-2019-05-10 # GHC 8.6.5 6 | 7 | packages: 8 | - . 9 | - ../analyze 10 | - ../dense-linear-algebra 11 | - ../datasets 12 | 13 | # Dependency packages to be pulled from upstream that are not in the resolver 14 | # using the same syntax as the packages field. 
15 | # (e.g., acme-missiles-0.3) 16 | extra-deps: 17 | - statistics-0.14.0.2 18 | - chronos-bench-0.2.0.2 19 | - github: mjarosie/stack-hpc-coveralls 20 | commit: 318262fe4c8b5ee2de30be54a9d6d36f1babefbf 21 | 22 | nix: 23 | enable: false 24 | packages: 25 | - curl 26 | - zlib 27 | 28 | 29 | # Override default flag values for local packages and extra-deps 30 | # flags: {} 31 | 32 | # Extra package databases containing global packages 33 | # extra-package-dbs: [] 34 | 35 | # Control whether we use the GHC we find on the path 36 | # system-ghc: true 37 | # 38 | # Require a specific version of stack, using version ranges 39 | # require-stack-version: -any # Default 40 | # require-stack-version: ">=1.7" 41 | # 42 | # Override the architecture used by stack, especially useful on Windows 43 | # arch: i386 44 | # arch: x86_64 45 | # 46 | # Extra directories used by stack for building 47 | # extra-include-dirs: [/path/to/dir] 48 | # extra-lib-dirs: [/path/to/dir] 49 | # 50 | # Allow a newer minor version of GHC than the snapshot specifies 51 | # compiler-check: newer-minor 52 | -------------------------------------------------------------------------------- /dh-core/stack.yaml.lock: -------------------------------------------------------------------------------- 1 | # This file was autogenerated by Stack. 2 | # You should not edit this file by hand. 
3 | # For more information, please see the documentation at: 4 | # https://docs.haskellstack.org/en/stable/lock_files 5 | 6 | packages: 7 | - completed: 8 | hackage: statistics-0.14.0.2@sha256:23474780ad5ffa28ff3f46fe041a9e3fd309c98b776ce94e8851183bb107a7af,5169 9 | pantry-tree: 10 | size: 5459 11 | sha256: a82e92ae5586361f8ba78c169e3aa28f32294adc67105026595749fbe6ed1ba2 12 | original: 13 | hackage: statistics-0.14.0.2 14 | - completed: 15 | hackage: chronos-bench-0.2.0.2@sha256:dbffaad733d2817f4ead1392f51417310bebb39122d89975ce236971aadbf524,1961 16 | pantry-tree: 17 | size: 481 18 | sha256: 120e1d349c3d0546280b5c567510024dcbff7937a935d0b81d4c690ae358b042 19 | original: 20 | hackage: chronos-bench-0.2.0.2 21 | snapshots: 22 | - completed: 23 | size: 491909 24 | url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/nightly/2019/2/27.yaml 25 | sha256: 024e7b34b021c548cd81acee015d2411c51568f670ef7668832420765fb6c96b 26 | original: nightly-2019-02-27 27 | -------------------------------------------------------------------------------- /dh-core/test/LibSpec.hs: -------------------------------------------------------------------------------- 1 | module LibSpec where 2 | 3 | import Test.Hspec 4 | -- import Test.Hspec.QuickCheck 5 | 6 | -- import Lib 7 | 8 | main :: IO () 9 | main = hspec spec 10 | 11 | spec :: Spec 12 | spec = 13 | describe "Lib" $ do 14 | it "works" $ do 15 | True `shouldBe` True 16 | -- prop "ourAdd is commutative" $ \x y -> 17 | -- ourAdd x y `shouldBe` ourAdd y x 18 | -------------------------------------------------------------------------------- /dh-core/test/Spec.hs: -------------------------------------------------------------------------------- 1 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-} 2 | --------------------------------------------------------------------------------