├── .github
└── ISSUE_TEMPLATE
│ ├── bug_report.md
│ └── feature_request.md
├── .gitignore
├── .travis.yml
├── README.md
├── analyze
├── .gitignore
├── CHANGELOG.md
├── LICENSE.md
├── README.md
├── Setup.hs
├── analyze.cabal
├── bench
│ └── Main.hs
├── src
│ ├── Analyze.hs
│ └── Analyze
│ │ ├── Common.hs
│ │ ├── Conversions.hs
│ │ ├── Csv.hs
│ │ ├── Datasets.hs
│ │ ├── Decoding.hs
│ │ ├── Html.hs
│ │ ├── Ops.hs
│ │ ├── RFrame.hs
│ │ └── Values.hs
├── stack.yaml
└── test
│ ├── Fixtures.hs
│ ├── Generation.hs
│ ├── PropTests.hs
│ ├── Spec.hs
│ └── UnitTests.hs
├── datasets
├── .gitignore
├── .travis.yml
├── LICENSE
├── README.md
├── Setup.hs
├── bench
│ ├── Main.hs
│ └── Numeric
│ │ └── Dataloader
│ │ └── Benchmark.hs
├── bin
│ └── cifar10
│ │ └── Main.hs
├── changelog.md
├── datafiles
│ ├── arff
│ │ ├── README
│ │ ├── ReutersCorn-test.arff
│ │ ├── ReutersCorn-train.arff
│ │ ├── ReutersGrain-test.arff
│ │ ├── ReutersGrain-train.arff
│ │ ├── contact-lens.arff
│ │ ├── cpu.arff
│ │ ├── cpu.with.vendor.arff
│ │ ├── diabetes.arff
│ │ ├── glass.arff
│ │ ├── ionosphere.arff
│ │ ├── iris.arff
│ │ ├── labor.arff
│ │ ├── segment-challenge.arff
│ │ ├── segment-test.arff
│ │ ├── soybean.arff
│ │ ├── supermarket.arff
│ │ ├── vote.arff
│ │ ├── weather.arff
│ │ └── weather.nominal.arff
│ ├── iris.data
│ ├── michelson.json
│ ├── netflix
│ │ ├── movies
│ │ │ └── movie_titles.txt
│ │ ├── test
│ │ │ └── qualifying.txt
│ │ └── training
│ │ │ └── mv_0000001.txt
│ ├── nightingale.json
│ ├── titanic.tsv
│ ├── titanic2_full.tsv
│ └── titanic3.csv
├── datasets.cabal
├── src
│ ├── Numeric
│ │ ├── Dataloader.hs
│ │ ├── Datasets.hs
│ │ └── Datasets
│ │ │ ├── Abalone.hs
│ │ │ ├── Adult.hs
│ │ │ ├── Anscombe.hs
│ │ │ ├── BostonHousing.hs
│ │ │ ├── BreastCancerWisconsin.hs
│ │ │ ├── CIFAR10.hs
│ │ │ ├── CO2.hs
│ │ │ ├── Car.hs
│ │ │ ├── Coal.hs
│ │ │ ├── Diabetes.hs
│ │ │ ├── Gapminder.hs
│ │ │ ├── Internal
│ │ │ ├── ArffParser.hs
│ │ │ └── Streaming.hs
│ │ │ ├── Iris.hs
│ │ │ ├── Michelson.hs
│ │ │ ├── Mushroom.hs
│ │ │ ├── Mushroom.hs~
│ │ │ ├── Netflix.hs
│ │ │ ├── Nightingale.hs
│ │ │ ├── OldFaithful.hs
│ │ │ ├── Quakes.hs
│ │ │ ├── States.hs
│ │ │ ├── Sunspots.hs
│ │ │ ├── Titanic.hs
│ │ │ ├── Titanic.hs~
│ │ │ ├── UN.hs
│ │ │ ├── Vocabulary.hs
│ │ │ ├── Wine.hs
│ │ │ └── WineQuality.hs
│ └── Streaming
│ │ └── Instances.hs
└── test
│ └── Spec.hs
├── dense-linear-algebra
├── CHANGELOG.md
├── LICENSE
├── README.md
├── Setup.hs
├── bench
│ ├── ChronosBench.hs
│ └── WeighBench.hs
├── dense-linear-algebra.cabal
├── src
│ └── Statistics
│ │ ├── Matrix.hs
│ │ └── Matrix
│ │ ├── Algorithms.hs
│ │ ├── Fast.hs
│ │ ├── Fast
│ │ └── Algorithms.hs
│ │ ├── Function.hs
│ │ ├── Mutable.hs
│ │ └── Types.hs
├── stack.yaml
└── test
│ ├── AlgorithmsSpec.hs
│ ├── Fixtures.hs
│ ├── LibSpec.hs
│ ├── Spec.hs
│ └── Utils.hs
├── dh-core-accelerate
├── README.md
├── Setup.hs
├── core-accelerate.cabal
├── src
│ └── Lib.hs
└── test
│ ├── LibSpec.hs
│ └── Spec.hs
└── dh-core
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── Setup.hs
├── core.cabal
├── src
├── Core
│ ├── Data
│ │ ├── Dataset.hs
│ │ └── Datum
│ │ │ └── Vector.hs
│ └── Numeric
│ │ ├── BLAS.hs
│ │ ├── BLAS
│ │ └── Class.hs
│ │ └── Statistics
│ │ ├── Classification
│ │ ├── DecisionTrees.hs
│ │ ├── Exceptions.hs
│ │ └── Utils.hs
│ │ └── InformationTheory.hs
└── Lib.hs
├── stack.yaml
├── stack.yaml.lock
└── test
├── LibSpec.hs
└── Spec.hs
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS: [e.g. iOS]
28 | - Browser [e.g. chrome, safari]
29 | - Version [e.g. 22]
30 |
31 | **Smartphone (please complete the following information):**
32 | - Device: [e.g. iPhone6]
33 | - OS: [e.g. iOS8.1]
34 | - Browser [e.g. stock browser, safari]
35 | - Version [e.g. 22]
36 |
37 | **Additional context**
38 | Add any other context about the problem here.
39 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: ''
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Is your feature request related to a problem? Please describe.**
11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 |
13 | **Describe the solution you'd like**
14 | A clear and concise description of what you want to happen.
15 |
16 | **Describe alternatives you've considered**
17 | A clear and concise description of any alternative solutions or features you've considered.
18 |
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | /dist/
2 | /.cabal-sandbox/
3 | /cabal.sandbox.config
4 | .stack-work/
5 | .ionide/
6 | .vscode/
7 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # Use new container infrastructure to enable caching
2 | sudo: false
3 |
4 | os:
5 | - linux
6 | - osx
7 |
8 | # Choose a lightweight base image; we provide our own build tools.
9 | language: c
10 |
11 | # GHC depends on GMP. You can add other dependencies here as well.
12 | addons:
13 | apt:
14 | packages:
15 | - libgmp-dev
16 |
17 | # The different configurations we want to test. You could also do things like
18 | # change flags or use --stack-yaml to point to a different file.
19 | env:
20 | global:
21 | - CODECOV_TOKEN="c11b1f41-6dad-495b-a5b0-fa4170eab2c4"
22 | - ANALYZE_V=0.2.0
23 | - DENSE_LINEAR_ALGEBRA_V=0.2.0.0
24 | - DATASETS_V=0.4.0
25 | matrix:
26 | - ARGS="--resolver nightly-2019-02-27"
27 | - ARGS="--resolver nightly-2019-05-10"
28 |
29 | before_install:
30 | # Download and unpack the stack executable
31 | - mkdir -p ~/.local/bin
32 | - export PATH=$HOME/.local/bin:$PATH
33 | - travis_retry curl -sSL https://get.haskellstack.org/ | sh
34 |
35 | # This line does all of the work: installs GHC if necessary, build the library,
36 | # executables, and test suites, and runs the test suites. --no-terminal works
37 | # around some quirks in Travis's terminal implementation.
38 | script:
39 | - cd dh-core
40 | - stack $ARGS --no-terminal --install-ghc test --haddock --coverage
41 | - stack $ARGS --no-terminal bench analyze:bench
42 | - stack $ARGS --no-terminal build dense-linear-algebra:weigh-bench
43 | - stack $ARGS --no-terminal build dense-linear-algebra:chronos-bench
44 | # - stack $ARGS --no-terminal bench datasets:bench
45 | #- cat .stack-work/logs/analyze-${ANALYZE_V}-test.log
46 | #- cat .stack-work/logs/dense-linear-algebra-${DENSE_LINEAR_ALGEBRA_V}-test.log
47 | #- cat .stack-work/logs/datasets-${DATASETS_V}-test.log
48 |
49 | after_script:
50 | - stack hpc report --all
51 | - travis_retry curl -sSL https://github.com/lehins/stack-hpc-coveralls/releases/download/0.0.5.0/shc.tar.gz | tar xz shc
 52 |   # NOTE(review): the env block above defines CODECOV_TOKEN, but this uses
 53 |   # COVERALLS_REPO_TOKEN (undefined here) — confirm which coverage service is intended
 54 |   - ./shc --repo-token=$COVERALLS_REPO_TOKEN combined all
53 | # - stack exec shc --repo-token=$COVERALLS_REPO_TOKEN spec
54 |
55 | # Caching so the next build will be fast too.
56 | cache:
57 | directories:
58 | - $HOME/.stack
59 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # DataHaskell/dh-core
2 |
3 | [![Build Status](https://travis-ci.org/DataHaskell/dh-core.svg?branch=master)](https://travis-ci.org/DataHaskell/dh-core)
4 |
5 |
6 | DataHaskell core project monorepo
7 |
8 |
9 | ## Aims
10 |
11 | This project aims to provide a native, end-to-end data science toolkit in Haskell. To achieve this, many types of experience are valuable; engineers, scientists, programmers, visualization experts, data journalists are all welcome to join the discussions and contribute.
12 | Not only this should be a working piece of software, but it should be intuitive and pleasant to use.
13 | All contributions, big or small, are very welcome and will be acknowledged.
14 |
15 | ## Architecture
16 |
17 | One single repository allows us to experiment with interfaces and move code around much more freely than many single-purpose repositories. Also, it makes it more convenient to track and visualize progress.
18 |
19 | This is the directory structure of the project; the main project lives in the `dh-core` subdirectory:
20 |
21 | dh-core/
22 | dh-core/
23 | dh-core-accelerate/
24 | ....
25 |
26 |
27 | ## Contributed packages
28 |
29 | A number of authors and maintainers agreed to move ownership of their repositories under the `dh-core` umbrella. In some cases, these packages were already published on Hackage and cannot simply disappear from there, nor can this new line of development break downstream packages.
30 |
31 | For this reason, contributed packages will appear as subdirectories to the main `dh-core` project, and will need to retain their original .cabal file.
32 |
33 | The `stack` tool can take care of multi-package projects; its `packages` stanza in the `stack.yaml` file has only its directory as a default, but can contain a list of paths to other Cabal projects; e.g. in our case it could look like:
34 |
35 | packages:
36 | - .
37 | - analyze/
38 | - datasets/
39 |
40 | Packages that are listed on Hackage already must be added here as distinct sub-directories. Once the migration is complete (PRs merged etc.), add the project to this table :
41 |
42 |
43 | | Package | Description | Original author(s) | First version after merge |
44 | | --- | --- | --- | --- |
45 | | [`analyze`](https://hackage.haskell.org/package/analyze) | Data analysis and manipulation library | [Eric Conlon](https://github.com/ejconlon) | 0.2.0 |
46 | | [`datasets`](https://hackage.haskell.org/package/datasets) | A collection of ready-to-use datasets | [Tom Nielsen](https://github.com/glutamate) | 0.2.6 |
47 | | [`dense-linear-algebra`](https://hackage.haskell.org/package/dense-linear-algebra) | Fast, native dense linear algebra primitives | [Brian O'Sullivan](https://github.com/bos), [Alexey Khudyakov](https://github.com/Shimuuar) | 0.1.0 (a) |
48 |
49 | (a) : To be updated
50 |
51 | NB: Remember to bump version numbers and change web links accordingly when moving in contributed packages.
52 |
53 |
54 |
55 |
56 |
57 | ## Contributing
58 |
59 | 1. Open an issue (https://github.com/DataHaskell/dh-core/issues) with a description of what you want to work on (if it's not already open)
60 | 2. Assign or add yourself to the issue contributors
61 | 3. Pull from `dh-core:master`, start a git branch, add code
62 | 4. Add tests
63 | 5. Update the changelog, describing briefly your changes and their possible effects
64 | 6. Keep version numbers consistent:
65 |
66 | * If you're working on a contributed package (see next section), increase the version number in the Cabal file accordingly
67 |
68 | * If you bumped version numbers, make sure these are updated accordingly in the Travis CI .yaml file
69 |
70 | 7. Send a pull request with your branch, referencing the issue
71 | 8. `dh-core` admins : merge only _after_ another admin has reviewed and approved the PR
72 |
73 |
74 | ### GHC and Stackage compatibility
75 |
76 | Tested against :
77 |
78 | - Stackage nightly-2019-02-27 (GHC 8.6.3)
79 |
80 |
81 |
82 | ## Development information and guidelines
83 |
84 | ### Dependencies
85 |
86 | We use the [`stack`](https://docs.haskellstack.org/en/stable/README/) build tool.
87 |
88 | Some systems /might/ need binaries and headers for these additional libraries:
89 |
90 | * zlib
91 | * curl
92 |
93 | (however if you're unsure, first try building with your current configuration).
94 |
95 | Nix users should set `nix.enable` to `true` in the `dh-core/dh-core/stack.yaml` file.
96 |
97 |
98 | ### Building instructions
99 |
100 | In the `dh-core/dh-core` subdirectory, run
101 |
102 | $ stack build
103 |
104 | and this will re-build the main project and the contributed packages.
105 |
106 | While developing this `stack` command can come in handy : it will trigger a re-build and run the tests every time a file in the project is modified:
107 |
108 | $ stack build --test --ghc-options -Wall --file-watch
109 |
110 | ## Testing
111 |
112 | Example :
113 |
114 | $ stack test core:doctest core:spec
115 |
116 | The `:` pairs determine which tests will be run.
117 |
118 |
119 | ## Continuous Integration (TravisCI)
120 |
121 | Travis builds `dh-core` and its hosted projects every time a commit is pushed to Github.
122 | Currently the `dh-core/.travis.yml` script uses the following command to install the GHC compiler, build the project and subprojects with `stack`, run the tests and build the Haddock documentation HTMLs:
123 |
124 | - stack $ARGS --no-terminal --install-ghc test core:spec core:doctest dense-linear-algebra:spec --haddock
125 |
126 |
127 | ## Visualizing the dependency tree of a package
128 |
129 | `stack` can produce a .dot file with the dependency graph of a Haskell project, which can then be rendered by the `dot` tool (from the [`graphviz`](https://graphviz.gitlab.io/_pages/Download/Download_source.html) suite).
130 | For example, in the following command the output of `stack dot` will be piped into `dot`, which will produce a SVG file called `deps.svg`:
131 |
132 | `stack dot --external --no-include-base --prune rts,ghc-prim,ghc-boot-th,template-haskell,transformers,containers,deepseq,bytestring,time,primitive,vector,text,hashable | dot -Tsvg > deps.svg`
133 |
134 |
135 |
136 |
137 |
--------------------------------------------------------------------------------
/analyze/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | # Created by https://www.gitignore.io/api/haskell
3 |
4 | ### Haskell ###
5 | dist
6 | dist-*
7 | cabal-dev
8 | *.o
9 | *.hi
10 | *.chi
11 | *.chs.h
12 | *.dyn_o
13 | *.dyn_hi
14 | .hpc
15 | .hsenv
16 | .cabal-sandbox/
17 | cabal.sandbox.config
18 | *.prof
19 | *.aux
20 | *.hp
21 | *.eventlog
22 | .stack-work/
23 | cabal.project.local
24 | .HTF/
25 |
26 |
--------------------------------------------------------------------------------
/analyze/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | 0.2.0
2 | Integrated within DataHaskell/dh-core
3 | Adapted to latest `tasty-hspec` API
4 | Builds with GHC 8.2.2 and 8.4.3
5 |
--------------------------------------------------------------------------------
/analyze/LICENSE.md:
--------------------------------------------------------------------------------
1 | Modified BSD License
2 | ====================
3 |
4 | _Copyright © 2017, Analyze contributors_
5 | _All rights reserved._
6 |
7 | Redistribution and use in source and binary forms, with or without
8 | modification, are permitted provided that the following conditions are met:
9 |
10 | 1. Redistributions of source code must retain the above copyright
11 | notice, this list of conditions and the following disclaimer.
12 | 2. Redistributions in binary form must reproduce the above copyright
13 | notice, this list of conditions and the following disclaimer in the
14 | documentation and/or other materials provided with the distribution.
15 | 3. Neither the name of the Analyze project nor the
16 | names of its contributors may be used to endorse or promote products
17 | derived from this software without specific prior written permission.
18 |
19 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND
20 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
23 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
24 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
25 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
26 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
28 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 |
--------------------------------------------------------------------------------
/analyze/README.md:
--------------------------------------------------------------------------------
1 | analyze
2 | =====
3 |
--------------------------------------------------------------------------------
/analyze/Setup.hs:
--------------------------------------------------------------------------------
1 | import Distribution.Simple
2 | main = defaultMain
3 |
--------------------------------------------------------------------------------
/analyze/analyze.cabal:
--------------------------------------------------------------------------------
1 | cabal-version: 1.12
2 | name: analyze
3 | version: 0.2.0
4 | synopsis: Haskell data analysis and manipulation library
5 | description: Haskell data analysis and manipulation library, please see README.md
6 | category: Data Science
7 | homepage: https://github.com/DataHaskell/dh-core/analyze
8 | author: Eric Conlon , Nikita Tchayka
9 | maintainer: Marco Zocca
10 | copyright: 2017-2018 Analyze contributors
11 | license: BSD3
12 | license-file: LICENSE.md
13 | build-type: Simple
14 | extra-source-files: README.md
15 | CHANGELOG.md
16 | tested-with: GHC == 8.2.2, GHC == 8.4.3
17 |
18 | library
19 | exposed-modules:
20 | Analyze
21 | Analyze.Common
22 | Analyze.Conversions
23 | Analyze.Csv
24 | Analyze.Datasets
25 | Analyze.Decoding
26 | Analyze.Html
27 | Analyze.Ops
28 | Analyze.RFrame
29 | Analyze.Values
30 | other-modules:
31 | Paths_analyze
32 | hs-source-dirs:
33 | src
34 | default-extensions: OverloadedStrings
35 | build-depends:
36 | aeson ,
37 | base >= 4.9 && < 5,
38 | binary ,
39 | bytestring ,
40 | cassava ,
41 | exceptions ,
42 | foldl ,
43 | free ,
44 | hashable ,
45 | lucid ,
46 | text ,
47 | unordered-containers ,
48 | vector ,
49 |
50 | -- DEBUG
51 | tasty-hunit,
52 | tasty-quickcheck ,
53 | criterion ,
54 | mwc-random ,
55 | streamly >= 0.6.0 ,
56 | weigh
57 | -- QuickCheck,
58 | -- hspec
59 |
60 | default-language: Haskell2010
61 |
62 | test-suite spec
63 | type: exitcode-stdio-1.0
64 | main-is: Spec.hs
65 | other-modules:
66 | Fixtures
67 | Generation
68 | Analyze
69 | Analyze.Common
70 | Analyze.Conversions
71 | Analyze.Csv
72 | Analyze.Datasets
73 | Analyze.Decoding
74 | Analyze.Html
75 | Analyze.Ops
76 | Analyze.RFrame
77 | Analyze.Values
78 | Paths_analyze
79 | hs-source-dirs:
80 | test
81 | src
82 | default-extensions: OverloadedStrings
83 | ghc-options: -threaded -rtsopts -with-rtsopts=-N
84 | build-depends:
85 | QuickCheck ,
86 | aeson ,
87 | base ,
88 | binary ,
89 | bytestring ,
90 | cassava ,
91 | exceptions ,
92 | foldl ,
93 | free ,
94 | hashable ,
95 | lucid ,
96 | mwc-random,
97 | tasty ,
98 | tasty-hunit ,
99 | tasty-quickcheck ,
100 | text ,
101 | unordered-containers ,
102 | vector
103 |
104 | default-language: Haskell2010
105 |
106 | benchmark bench
107 | type: exitcode-stdio-1.0
108 | main-is: Main.hs
109 | hs-source-dirs: bench
110 | default-extensions: OverloadedStrings
111 | ghc-options: -threaded -rtsopts -with-rtsopts=-N
112 | build-depends:
113 | base,
114 | analyze ,
115 | criterion,
116 | exceptions ,
117 | mwc-random ,
118 | random ,
119 | streamly >= 0.6.0 ,
120 | -- tasty ,
121 | -- tasty-hunit ,
122 | -- tasty-quickcheck ,
123 | text ,
124 | unordered-containers ,
125 | weigh,
126 | vector
127 |
--------------------------------------------------------------------------------
/analyze/bench/Main.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE GADTs #-}
2 | module Main where
3 |
4 | import Streamly
5 | import qualified Streamly.Prelude as S
6 |
7 | import Control.Monad.Catch (MonadThrow (..))
8 | import Data.Function ((&))
9 | import Control.Monad
10 | import System.IO.Unsafe
11 |
12 | import Analyze.Common (Data (..), DuplicateKeyError (..), RowSizeMismatch (..))
13 | import qualified Analyze as A
14 | import qualified Analyze.Common as AC
15 |
16 |
17 | import qualified Data.Vector as V
18 | import Data.Vector (Vector)
19 |
20 | import qualified Data.Text as T
21 | import Data.Text (Text)
22 |
23 | import Data.HashMap.Strict (HashMap)
24 | -- import qualified Data.HashMap.Strict as HM
25 | -- import qualified Data.HashSet as HS
26 |
27 | import System.Random (randomRs, newStdGen)
28 | import qualified System.Random.MWC as M
29 | import qualified Criterion.Main as C
30 |
31 | import qualified Weigh as W
32 |
-- | Benchmark entry point: runs the Criterion timing benchmarks.
-- Swap in 'weight' here to measure allocations instead.
main :: IO ()
main = speed
37 |
-- implementation details required

-- | Streaming analogue of 'A.RFrameUpdate': a vector of column keys plus a
-- stream of rows (each row a vector of values).
data RFrameUpdateS k v m where
  RFrameUpdateS :: MonadThrow m => {
    -- | column keys, in order
    _rframeUpdateKeys :: !(Vector k),
    -- | stream of rows; each row is expected to match the key vector's length
    _rframeUpdateData :: !(SerialT m (Vector v))
  } -> RFrameUpdateS k v m

-- | Streaming analogue of 'A.RFrame': keys, a reverse lookup from key to
-- column index, and the row stream.
data RFrameS k v m where

  RFrameS :: MonadThrow m => {
    -- | column keys, in order
    _rframeKeys :: !(Vector k),
    -- | key -> column index (built with 'AC.makeLookup')
    _rframeLookup :: !(HashMap k Int),
    -- | stream of rows
    _rframeData :: !(SerialT m (Vector v))
  } -> RFrameS k v m
52 |
53 |
-- | Validate an update's keys (throws 'DuplicateKeyError' on duplicates) and
-- build a streaming frame from it.
fromUpdate :: (Data k, MonadThrow m) => RFrameUpdateS k v m -> m (RFrameS k v m)
fromUpdate (RFrameUpdateS ks vs) = do
  AC.checkForDupes ks
  pure (RFrameS ks (AC.makeLookup ks) vs)
56 |
-- | Merge an update into an existing frame, favoring the update's values for
-- keys present in both.  Throws 'RowSizeMismatch' when the row counts differ
-- and 'DuplicateKeyError' when the update's keys contain duplicates.
update :: (Data k, MonadThrow m) => RFrameUpdateS k v m -> RFrameS k v m -> m (RFrameS k v m)
update (RFrameUpdateS uks uvs) (RFrameS fks _ fvs) = do
  -- NOTE(review): S.length consumes the streams to count rows; this assumes
  -- the streams can be re-evaluated afterwards — confirm for effectful sources.
  fSize <- S.length fvs
  uSize <- S.length uvs
  if fSize /= uSize
    then throwM (RowSizeMismatch fSize uSize)
    else do
      AC.checkForDupes uks
      let kis = AC.mergeKeys fks uks
          -- keep only the merged key order; provenance stays in kis
          ks' = (\(k, _, _) -> k) <$> kis
          look' = AC.makeLookup ks'
          -- zip the two row streams, selecting each cell per merged key
          vs' = S.zipWith (AC.runIndexedLookup kis) fvs uvs
      return (RFrameS ks' look' vs')
70 |
71 |
-- only concerned with generating data

-- | Number of rows (and columns) in the generated benchmark data.
n :: Int
n = 1500

-- | 'n' random 10-character lowercase keys.  unsafePerformIO + CAF: generated
-- once at first use, so every benchmark sees the same key vector.
testKeys :: Vector Text
testKeys = unsafePerformIO $ V.replicateM n $ liftM (T.pack . take 10 . randomRs ('a','z')) newStdGen

-- | n-by-n matrix of uniform random doubles.  M.create uses MWC's fixed
-- default seed, so this data is deterministic across runs.
testData :: Vector (Vector Double)
testData = unsafePerformIO $ do
  gen <- M.create
  V.replicateM n $ M.uniformVector gen n

-- | 'testData' re-exposed as a serial stream for the streaming benchmarks.
testDataS :: SerialT IO (Vector Double)
testDataS = unsafePerformIO $ do
  let
    vec = testData
  return $ S.fromFoldable vec
89 |
-- the actual benchmarks

-- | Build two identical vector-backed frames and compare them with (==).
cmprVec :: IO Bool
cmprVec = do
  frame1 <- A.fromUpdate refresh
  frame2 <- A.fromUpdate refresh
  pure (frame1 == frame2)
  where
    refresh = A.RFrameUpdate testKeys testData
100 |
-- compr fills the same role as (==) but wrapped in IO

-- | Field-wise equality for streaming frames: keys and lookups are compared
-- purely; the row streams are zipped and checked for any differing pair.
cmpr :: RFrameS Text Double IO -> RFrameS Text Double IO -> IO Bool
cmpr f1 f2 = do
  rowsMatch <- S.notElem False (S.zipWith (==) (_rframeData f1) (_rframeData f2))
  pure $ _rframeKeys f1 == _rframeKeys f2
      && _rframeLookup f1 == _rframeLookup f2
      && rowsMatch
111 |
-- | Build two identical streaming frames and compare them with 'cmpr'.
cmprStream :: IO Bool
cmprStream = do
  let upd = RFrameUpdateS testKeys testDataS
  frame1 <- fromUpdate upd
  frame2 <- fromUpdate upd
  cmpr frame1 frame2
121 |
-- | Build a vector-backed frame, take its first @m@ rows, and force the
-- result by comparing against the full frame.
takeRowsVec :: Int -> IO Bool
takeRowsVec m = do
  frame <- A.fromUpdate (A.RFrameUpdate testKeys testData)
  -- the equality is almost certainly False; it exists only to force evaluation
  pure (frame == A.takeRows m frame)
132 |
-- | Takes the first @count@ rows of an 'RFrameS'.
--
-- The parameter was previously named @n@, shadowing the top-level benchmark
-- constant @n = 1500@ (a -Wall warning and easy to misread); renamed locally,
-- which does not affect callers.
takeRows :: Int -> RFrameS Text Double IO -> RFrameS Text Double IO
takeRows count (RFrameS ks look srm) = RFrameS ks look (S.take count srm)
136 |
-- | Streaming counterpart of 'takeRowsVec': take @m@ rows of a streaming
-- frame and force the result via 'cmpr' against the full frame.
takeRowsS :: Int -> IO Bool
takeRowsS m = do
  frame <- fromUpdate (RFrameUpdateS testKeys testDataS)
  cmpr frame (takeRows m frame)
147 |
-- | Criterion timing benchmarks comparing the vector-backed and streaming
-- frame implementations.  (Run 'cmprVec' / 'cmprStream' once beforehand if
-- the test data must be generated outside the timed region.)
speed :: IO ()
speed = C.defaultMain
  [ C.bgroup "Tests" $
      [ C.bench "Vec"    (C.whnfIO cmprVec)
      , C.bench "Stream" (C.whnfIO cmprStream)
      ]
      ++ [ C.bench ("takeRowsVec " ++ show k)    (C.whnfIO (takeRowsVec k)) | k <- sizes ]
      ++ [ C.bench ("takeRowsStream " ++ show k) (C.whnfIO (takeRowsS k))   | k <- sizes ]
  ]
  where
    sizes = [1000, 500, 200, 100]
164 |
-- | Weigh allocation benchmarks for the same comparisons as 'speed'.
-- Currently not called from 'main'; some entries are commented out.
weight :: IO ()
weight = do
  -- cmprVec
  -- cmprStream
  W.mainWith ( do
    W.action "Vec" cmprVec
    W.action "Stream" cmprStream
    -- W.io "takeRowsVec 1000" takeRowsVec 1000
    W.io "takeRowsVec 500" takeRowsVec 500
    W.io "takeRowsVec 200" takeRowsVec 200
    W.io "takeRowsVec 100" takeRowsVec 100
    -- W.io "takeRowsStream 1000" takeRowsS 1000
    W.io "takeRowsStream 500" takeRowsS 500
    W.io "takeRowsStream 200" takeRowsS 200
    W.io "takeRowsStream 100" takeRowsS 100)
180 |
--------------------------------------------------------------------------------
/analyze/src/Analyze.hs:
--------------------------------------------------------------------------------
1 | -- | See "Analyze.RFrame".
2 | module Analyze
3 | ( module X
4 | ) where
5 |
6 | import Analyze.Common as X
7 | import Analyze.Conversions as X
8 | import Analyze.Csv as X
9 | import Analyze.Datasets as X
10 | import Analyze.Decoding as X
11 | import Analyze.Html as X
12 | import Analyze.Ops as X
13 | import Analyze.RFrame as X
14 | import Analyze.Values as X
15 |
--------------------------------------------------------------------------------
/analyze/src/Analyze/Common.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE ConstraintKinds #-}
2 |
3 | -- | Common internal things (no other internal deps)
4 | module Analyze.Common where
5 |
6 | import Control.Exception
7 | import Control.Monad (forM_, unless)
8 | import Control.Monad.Catch (MonadThrow (..))
9 | import Data.Hashable (Hashable)
10 | import Data.HashMap.Strict (HashMap)
11 | import qualified Data.HashMap.Strict as HM
12 | import Data.HashSet (HashSet)
13 | import qualified Data.HashSet as HS
14 | import Data.Typeable (Typeable)
15 | import Data.Vector (Vector)
16 | import qualified Data.Vector as V
17 |
18 | -- | Column keys need to have equality and hashability.
19 | type Data k = (Eq k, Hashable k, Show k, Typeable k)
20 |
-- | 'fmap' with its arguments flipped (the same operator later standardized
-- in "Data.Functor").
(<&>) :: Functor f => f a -> (a -> b) -> f b
(<&>) = flip fmap
{-# INLINE (<&>) #-}
infixl 1 <&>
26 |
-- | Exception for when a column is missing from a frame.
data MissingKeyError k = MissingKeyError k deriving (Show, Eq, Typeable)
instance (Show k, Typeable k) => Exception (MissingKeyError k)

-- | Exception for when a column is duplicated in a frame.
data DuplicateKeyError k = DuplicateKeyError k deriving (Show, Eq, Typeable)
instance (Show k, Typeable k) => Exception (DuplicateKeyError k)

-- | Exception for when frame column sizes don't match; carries the two
-- differing sizes (see 'checkReorder').
data ColSizeMismatch = ColSizeMismatch Int Int deriving (Show, Eq, Typeable)
instance Exception ColSizeMismatch

-- | Exception for when frame row sizes don't match; carries the two
-- differing sizes.
data RowSizeMismatch = RowSizeMismatch Int Int deriving (Show, Eq, Typeable)
instance Exception RowSizeMismatch
42 |
-- | Throws 'DuplicateKeyError' at the first key already seen in the vector.
checkForDupes :: (Data k, MonadThrow m) => Vector k -> m ()
checkForDupes = V.foldM'_ insertFresh HS.empty
  where
    -- accumulate the set of seen keys; abort on a repeat
    insertFresh seen k
      | HS.member k seen = throwM (DuplicateKeyError k)
      | otherwise        = pure (HS.insert k seen)
52 |
-- | Throws when one vector is not a reordering of the other:
-- 'ColSizeMismatch' on differing lengths, otherwise 'MissingKeyError' via
-- 'checkSubset' for any key of @xs@ absent from @ys@.
checkReorder :: (Data k, MonadThrow m) => Vector k -> Vector k -> m ()
checkReorder xs ys
  | xSize /= ySize = throwM (ColSizeMismatch xSize ySize)
  | otherwise      = checkSubset (V.toList xs) (HS.fromList (V.toList ys))
  where
    xSize = V.length xs
    ySize = V.length ys
61 |
-- | Throws 'MissingKeyError' for the first key not present in the set.
checkSubset :: (Data k, MonadThrow m) => [k] -> HashSet k -> m ()
checkSubset qs ks = mapM_ ensurePresent qs
  where
    ensurePresent q
      | HS.member q ks = pure ()
      | otherwise      = throwM (MissingKeyError q)
65 |
-- | Builds a reverse lookup (key -> position) for the vector.
makeLookup :: Data k => Vector k -> HashMap k Int
makeLookup ks = HM.fromList (zip (V.toList ks) [0 ..])
69 |
-- | Indexes into the vector of values, throwing 'MissingKeyError' when the
-- key is absent from the lookup or its index is out of bounds.
runLookup :: (Data k, MonadThrow m) => HashMap k Int -> Vector v -> k -> m v
runLookup look vs k =
  maybe (throwM (MissingKeyError k)) pure (HM.lookup k look >>= (vs V.!?))
76 |
-- | Reorders the vector of values by a new key order and an old lookup.
-- Partial: every key must be present in the lookup with a valid index.
reorder :: Data k => Vector k -> HashMap k Int -> Vector v -> Vector v
reorder ks look vs = V.map (\k -> vs V.! (look HM.! k)) ks
82 |
-- | Merges two key vectors and tags each with its provenance (favoring the second).
--
-- Result triples are (key, source, index) where source 0 means the first
-- vector and source 1 the second.  Keys of @xs@ keep their relative order,
-- followed by keys unique to @ys@; keys present in both are tagged with the
-- second vector's source and index.
mergeKeys :: Data k => Vector k -> Vector k -> Vector (k, Int, Int)
mergeKeys xs ys =
  let m = HM.fromList (V.toList (V.imap (\i x -> (x, (0, i))) xs))
      n = HM.fromList (V.toList (V.imap (\i x -> (x, (1, i))) ys))
      -- Ties go to the first argument of HM.union, in this case favoring the update
      o = HM.union n m
      -- keys of xs, with provenance overridden by ys where the key repeats
      p = (\x -> let (a, b) = o HM.! x in (x, a, b)) <$> xs
      -- keys only in ys, appended after
      q = (\x -> let (a, b) = n HM.! x in (x, a, b)) <$> V.filter (\x -> not (HM.member x m)) ys
  in p V.++ q
93 |
-- | Uses a merged key vector (see 'mergeKeys') to select values.
--
-- Each (key, source, index) triple picks element @index@ from @xs@ when
-- @source == 0@ and from @ys@ otherwise; the key itself is not consulted,
-- so it is bound as @_@ (the previous @k@ binding was unused and triggered
-- a -Wall unused-variable warning).
runIndexedLookup :: Vector (k, Int, Int) -> Vector v -> Vector v -> Vector v
runIndexedLookup ks xs ys = (\(_, i, j) -> (if i == 0 then xs else ys) V.! j) <$> ks
97 |
--------------------------------------------------------------------------------
/analyze/src/Analyze/Conversions.hs:
--------------------------------------------------------------------------------
1 | -- | Simple structural conversions.
2 | module Analyze.Conversions
3 | ( projectRow
4 | , projectRows
5 | ) where
6 |
7 | import Analyze.Common (Data, MissingKeyError (..), makeLookup)
8 | import Analyze.RFrame (RFrame (..), RFrameUpdate (..), fromUpdate)
9 | import Control.Monad.Catch (MonadThrow (..))
10 | import Data.HashMap.Strict (HashMap)
11 | import qualified Data.HashMap.Strict as HM
12 | import Data.Vector (Vector)
13 | import qualified Data.Vector as V
14 |
-- | Projects values out of the map according to the given key order,
-- throwing 'MissingKeyError' for any key absent from the row.
projectRow :: (Data k, MonadThrow m) => Vector k -> HashMap k v -> m (Vector v)
projectRow ks row = V.mapM fetch ks
  where
    fetch k = maybe (throwM (MissingKeyError k)) pure (HM.lookup k row)
23 |
-- | Projects an 'RFrame' out of many maps according to the given key order.
-- Fails if any row is missing a key or the keys contain duplicates.
projectRows :: (Data k, MonadThrow m) => Vector k -> Vector (HashMap k v) -> m (RFrame k v)
projectRows ks rs = V.mapM (projectRow ks) rs >>= fromUpdate . RFrameUpdate ks
29 |
--------------------------------------------------------------------------------
/analyze/src/Analyze/Csv.hs:
--------------------------------------------------------------------------------
1 | -- | Functions for working with CSV files.
2 | module Analyze.Csv where
3 |
4 | import Analyze.Conversions (projectRows)
5 | import Analyze.RFrame (RFrame (..), RFrameUpdate (..), empty, fromUpdate)
6 | import Control.Monad.Catch (Exception, MonadThrow (..))
7 | import qualified Data.Binary.Builder as B
8 | import qualified Data.ByteString.Lazy as LBS
9 | import qualified Data.ByteString.Lazy.Char8 as LBS8
10 | import qualified Data.Csv as C
11 | import qualified Data.Csv.Builder as CB
12 | import Data.Text (Text)
13 | import Data.Text.Encoding (decodeUtf8, encodeUtf8)
14 | import Data.Typeable (Typeable)
15 | import qualified Data.Vector as V
16 |
-- | Exception to wrap Cassava error strings.
-- Cassava reports parse failures as 'String's; this lifts them into
-- 'MonadThrow'-compatible exceptions.
data CsvError = CsvError String deriving (Eq, Show, Typeable)
instance Exception CsvError
20 |
-- | Decode CSV bytes as an 'RFrame' with a header row.
-- Header cells become the frame's keys (decoded as UTF-8).
decodeWithHeader :: MonadThrow m => LBS.ByteString -> m (RFrame Text Text)
decodeWithHeader bs = either (throwM . CsvError) build (C.decodeByName bs)
  where
    build (header, rows) = projectRows (decodeUtf8 <$> header) rows
29 |
-- | Decode CSV bytes as an 'RFrame' without a header row.
-- Columns are keyed by their 0-based position.
decodeWithoutHeader :: MonadThrow m => LBS.ByteString -> m (RFrame Int Text)
decodeWithoutHeader bs =
  case C.decode C.NoHeader bs of
    Left err -> throwM (CsvError err)
    Right rows
      | V.null rows -> return empty
      | otherwise ->
          -- name columns by position, sized from the first row
          let ks = V.imap const (V.head rows)
          in fromUpdate (RFrameUpdate ks rows)
42 |
-- | Encode an 'RFrame' as CSV bytes with a header row.
-- BUG FIX: the previous version built the header-plus-records builder but
-- then rendered only the header, dropping every data row from the output.
encodeWithHeader :: RFrame Text Text -> LBS.ByteString
encodeWithHeader (RFrame ks _ vs) =
  let header = CB.encodeHeader (encodeUtf8 <$> ks)
      body = foldMap (CB.encodeRecord . (encodeUtf8 <$>)) vs
  in B.toLazyByteString (header `mappend` body)
49 |
-- | Encode an 'RFrame' as CSV bytes without header row.
encodeWithoutHeader :: RFrame k Text -> LBS.ByteString
encodeWithoutHeader (RFrame _ _ vs) = B.toLazyByteString body
  where
    body = foldMap (CB.encodeRecord . fmap encodeUtf8) vs
54 |
55 |
-- | Load and decode a CSV file (with header row) from disk.
-- Reads raw bytes with 'LBS.readFile'; the previous 'readFile' + 'LBS8.pack'
-- round-trip decoded via the locale and then kept only the low 8 bits of
-- each 'Char', corrupting any non-Latin-1 content before UTF-8 decoding.
loadCSVFileWithHeader :: FilePath -> IO (RFrame Text Text)
loadCSVFileWithHeader fileName = do
  contents <- LBS.readFile fileName
  decodeWithHeader contents
60 |
-- | Load and decode a CSV file (without header row) from disk.
-- Reads raw bytes with 'LBS.readFile' for the same reason as
-- 'loadCSVFileWithHeader': 'LBS8.pack' mangles non-Latin-1 characters.
loadCSVFileWithoutHeader :: FilePath -> IO (RFrame Int Text)
loadCSVFileWithoutHeader fileName = do
  contents <- LBS.readFile fileName
  decodeWithoutHeader contents
65 |
--------------------------------------------------------------------------------
/analyze/src/Analyze/Datasets.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 |
3 | -- | Functions to work with included datasets.
4 | module Analyze.Datasets where
5 |
6 | import Analyze.Csv
7 | import Analyze.RFrame (RFrame)
8 | import Control.Monad.Catch (MonadThrow (..))
9 | import qualified Data.ByteString.Lazy as LBS
10 | import Data.Text (Text)
11 | import qualified Data.Text as T
12 | import Paths_analyze
13 |
-- | Load an included dataset by collection and partition name.
-- Resolves @datasets/<collection>/<partition>.csv@ via Cabal's data files.
datasetWithHeader :: Text -> Text -> IO (RFrame Text Text)
datasetWithHeader collection partition = do
  let relPath = concat ["datasets/", T.unpack collection, "/", T.unpack partition, ".csv"]
  absPath <- getDataFileName relPath
  contents <- LBS.readFile absPath
  decodeWithHeader contents
21 |
-- | Load the "train" partition of the "titanic" dataset.
-- Reads @datasets/titanic/train.csv@ from the package data files.
titanicTrain :: IO (RFrame Text Text)
titanicTrain = datasetWithHeader "titanic" "train"

-- | Load the "test" partition of the "titanic" dataset.
-- Reads @datasets/titanic/test.csv@ from the package data files.
titanicTest :: IO (RFrame Text Text)
titanicTest = datasetWithHeader "titanic" "test"
29 |
--------------------------------------------------------------------------------
/analyze/src/Analyze/Decoding.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveFunctor #-}
2 | {-# LANGUAGE GeneralizedNewtypeDeriving #-}
3 |
4 | -- | Applicative decoding with key-value lookups.
5 | -- Think of a 'Decoder' as a row function that exposes the columns it uses.
6 | module Analyze.Decoding
7 | ( Arg (..)
8 | , Decoder(..)
9 | , decoderKeys
10 | , fromArg
11 | , require
12 | , requireWhere
13 | , runDecoder
14 | ) where
15 |
16 | import Analyze.Common (Data)
17 | import Control.Applicative.Free (Ap (..), liftAp)
18 | import Data.Maybe (fromMaybe)
19 |
-- | Pair of key and an extraction function.
-- The key names a column; the function interprets that column's raw value
-- in @m@, allowing per-column validation or failure.
data Arg m k v a = Arg k (v -> m a) deriving (Functor)

-- | Free applicative over 'Arg': a composable description of which columns
-- a row function reads and how each one is interpreted.
newtype Decoder m k v a = Decoder (Ap (Arg m k v) a) deriving (Functor, Applicative)
25 |
-- | Lifts a single 'Arg' into a 'Decoder'.
fromArg :: Arg m k v a -> Decoder m k v a
fromArg arg = Decoder (liftAp arg)
29 |
-- | Simple 'Decoder' that just looks up and returns the value for a given key.
require :: Applicative m => k -> Decoder m k v v
require key = fromArg (Arg key pure)
33 |
-- | Shorthand for lookup and transform: the extraction function also
-- receives the key, typically for error reporting.
requireWhere :: k -> (k -> v -> m a) -> Decoder m k v a
requireWhere key extract = fromArg (Arg key (extract key))
37 |
-- | List all column names used in the 'Decoder', in application order.
decoderKeys :: Data k => Decoder m k v a -> [k]
decoderKeys (Decoder d) = collect d
  where
    collect :: Ap (Arg m k v) a -> [k]
    collect (Pure _) = []
    collect (Ap (Arg k _) more) = k : collect more
45 |
-- Interprets the free applicative against a row lookup, sequencing effects
-- left-to-right. This is pretty sensitive to let bindings: keep the explicit
-- do-notation so the evaluation order stays fixed.
apRow :: (Data k, Monad m) => Ap (Arg m k v) a -> (k -> m v) -> m a
apRow (Pure a) _ = pure a
apRow (Ap (Arg k f) rest) row = do
  v <- row k            -- look up the raw value for this column
  z <- f v              -- run the per-column extraction/validation
  fz <- apRow rest row  -- interpret the remaining arguments
  return (fz z)
54 |
-- | Run a 'Decoder' with a lookup function (typically row lookup).
runDecoder :: (Data k, Monad m) => Decoder m k v a -> (k -> m v) -> m a
runDecoder (Decoder d) look = apRow d look
58 |
--------------------------------------------------------------------------------
/analyze/src/Analyze/Html.hs:
--------------------------------------------------------------------------------
1 | -- | Functions for working with HTML.
2 | module Analyze.Html where
3 |
4 | import Analyze.RFrame (RFrame (..))
5 | import Control.Monad (forM_)
6 | import qualified Lucid as L
7 |
-- | Renders an 'RFrame' to an HTML table: keys as a @thead@ row of @th@
-- cells, data as @tbody@ rows of @td@ cells.
renderHtml :: (L.ToHtml k, L.ToHtml v, Monad m)
           => RFrame k v -> L.HtmlT m ()
renderHtml (RFrame ks _ vs) = L.table_ (headerHtml >> bodyHtml)
  where
    headerHtml = L.thead_ (L.tr_ (mapM_ (L.th_ . L.toHtml) ks))
    bodyHtml = L.tbody_ (mapM_ rowHtml vs)
    rowHtml cells = L.tr_ (mapM_ (L.td_ . L.toHtml) cells)
16 |
--------------------------------------------------------------------------------
/analyze/src/Analyze/Ops.hs:
--------------------------------------------------------------------------------
1 | -- | Various data-sciencey functions.
2 | module Analyze.Ops
3 | ( oneHot
4 | ) where
5 |
6 | import Analyze.Common (Data)
7 | import Analyze.RFrame (RFrame (..), RFrameUpdate (..), col, splitCols, update, fromUpdate)
8 | import Control.Monad.Catch (MonadThrow (..))
9 | import qualified Data.HashSet as HS
10 | import Data.Vector (Vector)
11 | import qualified Data.Vector as V
12 |
-- | Deduplicates a key vector, keeping the first occurrence of each key
-- and preserving encounter order.
uniq :: Data k => Vector k -> Vector k
uniq ks = V.fromList (reverse collected)
  where
    (_, collected) = V.foldl step (HS.empty, []) ks
    step (seen, out) k
      | HS.member k seen = (seen, out)
      | otherwise        = (HS.insert k seen, k : out)
21 |
-- | Builds a row marking each key: 'yesVal' where it equals the target,
-- 'noVal' elsewhere.
match :: Eq k => Vector k -> v -> v -> k -> Vector v
match ks yesVal noVal target = (\k -> if k == target then yesVal else noVal) <$> ks
24 |
-- | One-hot encode a given column. (See tests for usage.)
-- Each distinct value of the target column becomes a new column (named via
-- 'combine') holding 'yesVal' where the row had that value and 'noVal'
-- elsewhere.
-- NOTE(review): every column other than 'key' is discarded from the result
-- (the second half of 'splitCols' was bound but never used) -- confirm this
-- is the intended contract; the unit test only covers a one-column frame.
oneHot :: (Data k, MonadThrow m, Show v) => (k -> v -> k) -> k -> v -> v -> RFrame k v -> m (RFrame k v)
oneHot combine key yesVal noVal frame = do
  let (target, _rest) = splitCols (== key) frame
  rawVs <- col key target
  let cookedKs = V.map (combine key) rawVs
      newKs = uniq cookedKs
      newVs = V.map (match newKs yesVal noVal) cookedKs
  fromUpdate (RFrameUpdate newKs newVs)
36 |
--------------------------------------------------------------------------------
/analyze/src/Analyze/RFrame.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveFunctor #-}
2 | {-# LANGUAGE FlexibleInstances #-}
3 |
4 | -- | Core frame types and functions
5 | module Analyze.RFrame where
6 |
7 | import Analyze.Common
8 | import Analyze.Decoding (Decoder (..), decoderKeys, runDecoder)
9 | -- import qualified Control.Foldl as F
10 | import Control.Monad (join)
11 | import Control.Monad.Catch (MonadThrow (..))
12 | -- import qualified Data.Aeson as A
13 | import Data.HashMap.Strict (HashMap)
14 | import qualified Data.HashMap.Strict as HM
15 | -- import Data.HashSet (HashSet)
16 | import qualified Data.HashSet as HS
17 | -- import Data.Text (Text)
18 | -- import qualified Data.Text as T
19 | import Data.Vector (Vector)
20 | import qualified Data.Vector as V
21 |
-- | In-memory row-oriented frame with columns named by `k` and values by `v`
data RFrame k v = RFrame
  { -- | Ordered vector of column names
    _rframeKeys :: !(Vector k)
  , -- | Quick lookup from column name to column index
    -- (invariant: consistent with '_rframeKeys'; maintained by 'fromUpdate')
    _rframeLookup :: !(HashMap k Int)
  , -- | Vector of rows. Each element should be the length of number of columns.
    _rframeData :: !(Vector (Vector v))
  } deriving (Eq, Show, Functor)

-- | A simpler 'RFrame' for updates: keys plus rows, with no derived lookup.
data RFrameUpdate k v = RFrameUpdate
  { -- | Ordered vector of column names
    _rframeUpdateKeys :: !(Vector k)
  , -- | Vector of rows.
    _rframeUpdateData :: !(Vector (Vector v))
  } deriving (Eq, Show, Functor)

-- | Alias for a function to be applied to each row:
-- keys, key lookup, row index, row values.
type RFrameMap k v a = Vector k -> HashMap k Int -> Int -> Vector v -> a

-- | Alias for a row filter (a row-map returning keep/drop).
type RFrameFilter k v = RFrameMap k v Bool
45 |
-- | Prettier alias for getting the keys of an 'RFrame'
rframeKeys :: RFrame k v -> Vector k
rframeKeys = _rframeKeys

-- | Prettier alias for getting the data matrix of an 'RFrame'
rframeData :: RFrame k v -> Vector (Vector v)
rframeData = _rframeData

-- | An empty frame with no rows or columns
empty :: RFrame k v
empty = RFrame V.empty HM.empty V.empty
57 |
-- | Build an 'RFrame' from an 'RFrameUpdate', deriving the key lookup.
-- Throws on duplicate keys.
fromUpdate :: (Data k, MonadThrow m) => RFrameUpdate k v -> m (RFrame k v)
fromUpdate (RFrameUpdate ks vs) = do
  checkForDupes ks
  pure (RFrame ks (makeLookup ks) vs)

-- | Build an 'RFrameUpdate' from an 'RFrame' (the lookup is dropped).
toUpdate :: Data k => RFrame k v -> RFrameUpdate k v
toUpdate frame = RFrameUpdate (_rframeKeys frame) (_rframeData frame)
66 |
-- | Number of columns in an 'RFrame'
numCols :: RFrame k v -> Int
numCols frame = V.length (_rframeKeys frame)

-- | Number of rows in an 'RFrame'
numRows :: RFrame k v -> Int
numRows frame = V.length (_rframeData frame)
74 |
-- | Project to the given column, failing if any row lacks it.
col :: (Data k, MonadThrow m) => k -> RFrame k v -> m (Vector v)
col k (RFrame _ look vs) = V.forM vs (\row -> runLookup look row k)
78 |
-- | Decode by row. Each element of the returned vector may fail on decoding error
-- so flatten manually or use 'flatDecode'. Fails up front if the decoder
-- mentions a key this frame does not have.
decode :: (Data k, MonadThrow m) => Decoder m k v a -> RFrame k v -> m (Vector (m a))
decode decoder (RFrame ks look vs) = do
  checkSubset (decoderKeys decoder) (HS.fromList (V.toList ks))
  pure ((\row -> runDecoder decoder (runLookup look row)) <$> vs)
87 |
-- | An auto-flattened version of 'decode'.
flatDecode :: (Data k, MonadThrow m) => Decoder m k v a -> RFrame k v -> m (Vector a)
flatDecode decoder frame = decode decoder frame >>= sequence
91 |
-- | Filter an 'RFrame' by row; keys and lookup are untouched.
filter :: Data k => RFrameFilter k v -> RFrame k v -> RFrame k v
filter predicate (RFrame ks look vs) =
  RFrame ks look (V.ifilter (predicate ks look) vs)
97 |
-- | Update row-wise, adding or replacing values per-column.
-- Retains the existing column order, appending new columns.
-- Throws on row length mismatch or duplicate columns in the update.
update :: (Data k, MonadThrow m) => RFrameUpdate k v -> RFrame k v -> m (RFrame k v)
update (RFrameUpdate uks uvs) (RFrame fks _ fvs)
  | frameRows /= updateRows = throwM (RowSizeMismatch frameRows updateRows)
  | otherwise = do
      checkForDupes uks
      let merged = mergeKeys fks uks
          ks' = (\(k, _, _) -> k) <$> merged
          vs' = V.zipWith (runIndexedLookup merged) fvs uvs
      return (RFrame ks' (makeLookup ks') vs')
  where
    frameRows = V.length fvs
    updateRows = V.length uvs
114 |
-- | Split columns in an 'RFrame' by a predicate: (matching, non-matching).
splitCols :: Data k => (k -> Bool) -> RFrame k v -> (RFrame k v, RFrame k v)
splitCols p (RFrame ks look vs) = (subframe keepKs, subframe dropKs)
  where
    (keepKs, dropKs) = V.partition p ks
    -- rebuild a frame restricted to the selected keys, reordering each row
    subframe sel = RFrame sel (makeLookup sel) (reorder sel look <$> vs)
124 |
-- | Drop columns in an 'RFrame' by a predicate.
dropCols :: Data k => (k -> Bool) -> RFrame k v -> RFrame k v
dropCols p = snd . splitCols p

-- | Keep columns in an 'RFrame' by a predicate.
keepCols :: Data k => (k -> Bool) -> RFrame k v -> RFrame k v
keepCols p = fst . splitCols p
132 |
-- | Appends rows to an 'RFrame', retaining column order of the first.
-- Throws on column mismatch.
appendRows :: (Data k, MonadThrow m) => RFrame k v -> RFrame k v -> m (RFrame k v)
appendRows (RFrame ks0 look0 vs0) (RFrame ks1 look1 vs1) = do
  checkReorder ks0 ks1
  -- realign the second frame's rows to the first frame's column order
  pure (RFrame ks0 look0 (vs0 V.++ reorder ks0 look1 vs1))
140 |
-- | Appends columns to an 'RFrame', retaining column order of the first.
extendCols :: (Data k, MonadThrow m) => RFrame k v -> RFrame k v -> m (RFrame k v)
extendCols base extra = update (toUpdate extra) base
144 |
-- | Takes first 'n' rows of an 'RFrame'; keys and lookup are untouched.
takeRows :: Int -> RFrame k v -> RFrame k v
takeRows n frame = frame { _rframeData = V.take n (_rframeData frame) }
148 |
-- | Adds a 'Vector' column to the 'RFrame'.
-- Propagates failures from 'fromUpdate'/'extendCols' (e.g. row count mismatch).
addColumn :: (Data k, MonadThrow m) => RFrame k v -> k -> Vector v -> m (RFrame k v)
addColumn rf name vals = do
  -- wrap each value as a one-cell row of a single-column frame
  single <- fromUpdate (RFrameUpdate (V.singleton name) (V.singleton <$> vals))
  extendCols rf single
156 |
--------------------------------------------------------------------------------
/analyze/src/Analyze/Values.hs:
--------------------------------------------------------------------------------
1 | -- | Simple value types and functions.
2 | module Analyze.Values where
3 |
4 | import Analyze.Common (Data)
5 | import Control.Monad.Catch (Exception, MonadThrow (..))
6 | import Data.Text (Text)
7 | import Data.Typeable (Typeable)
8 |
-- | Singleton type for value types (one constructor per 'Value' shape).
data ValueType =
    ValueTypeText
  | ValueTypeInteger
  | ValueTypeDouble
  | ValueTypeBool
  deriving (Show, Eq, Enum, Bounded)

-- | Union type for values.
data Value =
    ValueText Text
  | ValueInteger Integer
  | ValueDouble Double
  | ValueBool Bool
  deriving (Show, Eq)
24 |
-- | Returns the type of the value.
valueToType :: Value -> ValueType
valueToType v = case v of
  ValueText _    -> ValueTypeText
  ValueInteger _ -> ValueTypeInteger
  ValueDouble _  -> ValueTypeDouble
  ValueBool _    -> ValueTypeBool
31 |
-- | Extracts 'Text' from the 'Value', if it holds one.
getText :: Value -> Maybe Text
getText v = case v of
  ValueText s -> Just s
  _           -> Nothing

-- | Extracts 'Integer' from the 'Value', if it holds one.
getInteger :: Value -> Maybe Integer
getInteger v = case v of
  ValueInteger i -> Just i
  _              -> Nothing

-- | Extracts 'Double' from the 'Value', if it holds one.
getDouble :: Value -> Maybe Double
getDouble v = case v of
  ValueDouble d -> Just d
  _             -> Nothing

-- | Extracts 'Bool' from the 'Value', if it holds one.
getBool :: Value -> Maybe Bool
getBool v = case v of
  ValueBool b -> Just b
  _           -> Nothing
51 |
-- | Exception for when we encounter unexpected values:
-- carries the offending key, the expected type, and the actual value.
data ValueTypeError k = ValueTypeError k ValueType Value deriving (Show, Eq, Typeable)
instance (Show k, Typeable k) => Exception (ValueTypeError k)
55 |
-- | Use with 'Analyze.Decoding.requireWhere' to read 'Text' values.
textual :: (Data k, MonadThrow m) => k -> Value -> m Text
textual key value = case value of
  ValueText s -> pure s
  _           -> throwM (ValueTypeError key ValueTypeText value)

-- | Use with 'Analyze.Decoding.requireWhere' to read 'Integer' values.
integral :: (Data k, MonadThrow m) => k -> Value -> m Integer
integral key value = case value of
  ValueInteger i -> pure i
  _              -> throwM (ValueTypeError key ValueTypeInteger value)

-- | Use with 'Analyze.Decoding.requireWhere' to read 'Double' values.
floating :: (Data k, MonadThrow m) => k -> Value -> m Double
floating key value = case value of
  ValueDouble d -> pure d
  _             -> throwM (ValueTypeError key ValueTypeDouble value)

-- | Use with 'Analyze.Decoding.requireWhere' to read 'Bool' values.
boolean :: (Data k, MonadThrow m) => k -> Value -> m Bool
boolean key value = case value of
  ValueBool b -> pure b
  _           -> throwM (ValueTypeError key ValueTypeBool value)
75 |
--------------------------------------------------------------------------------
/analyze/stack.yaml:
--------------------------------------------------------------------------------
1 | resolver: lts-11.22
2 | packages:
3 | - .
4 | # Dependency packages to be pulled from upstream that are not in the resolver
5 | # using the same syntax as the packages field.
6 | # (e.g., acme-missiles-0.3)
7 | extra-deps:
8 | - streamly-0.6.1
9 | - vector-0.12.0.3
10 | # Override default flag values for local packages and extra-deps
11 | # flags: {}
12 |
13 | # Extra package databases containing global packages
14 | # extra-package-dbs: []
15 |
16 | # Control whether we use the GHC we find on the path
17 | # system-ghc: true
18 | #
19 | # Require a specific version of stack, using version ranges
20 | # require-stack-version: -any # Default
21 | # require-stack-version: ">=1.9"
22 | #
23 | # Override the architecture used by stack, especially useful on Windows
24 | # arch: i386
25 | # arch: x86_64
26 | #
27 | # Extra directories used by stack for building
28 | # extra-include-dirs: [/path/to/dir]
29 | # extra-lib-dirs: [/path/to/dir]
30 | #
31 | # Allow a newer minor version of GHC than the snapshot specifies
32 | # compiler-check: newer-minor
33 |
--------------------------------------------------------------------------------
/analyze/test/Fixtures.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings #-}
2 | {-# LANGUAGE TypeSynonymInstances #-}
3 | module Fixtures where
4 |
5 | import Analyze.RFrame (RFrame (..), RFrameUpdate (..))
6 | import Analyze.Values
7 | import qualified Control.Foldl as F
8 | import Data.HashMap.Strict (HashMap)
9 | import qualified Data.HashMap.Strict as HM
10 | import Data.Text (Text)
11 | import qualified Data.Text as T
12 | import Data.Vector (Vector)
13 | import qualified Data.Vector as V
14 |
-- | Column declaration (name and type) for the example frame.
exampleDecl :: Vector (Text, ValueType)
exampleDecl = V.fromList
  [ ("id", ValueTypeInteger)
  , ("name", ValueTypeText)
  , ("score", ValueTypeDouble)
  ]

-- | Expected header (column order) for the example frame.
exampleHeader :: Vector Text
exampleHeader = V.fromList
  [ "id"
  , "name"
  , "score"
  ]
28 |
-- | First example row as key/value pairs.
exampleObj1 :: Vector (Text, Value)
exampleObj1 = V.fromList
  [ ("id", ValueInteger 42)
  , ("name", ValueText "foo")
  , ("score", ValueDouble 5.0)
  ]

-- | First example row as a positional record.
-- NOTE(review): the score here is 50.0 while 'exampleObj1' has 5.0 --
-- confirm whether the mismatch is intentional.
exampleRecord1 :: Vector Value
exampleRecord1 = V.fromList
  [ ValueInteger 42
  , ValueText "foo"
  , ValueDouble 50.0
  ]

-- | Second example row as key/value pairs.
exampleObj2 :: Vector (Text, Value)
exampleObj2 = V.fromList
  [ ("id", ValueInteger 43)
  , ("name", ValueText "bar")
  , ("score", ValueDouble 3.0)
  ]
49 |
-- | Two-row update with id, name, and score columns.
fullUpdate :: RFrameUpdate Text Value
fullUpdate = RFrameUpdate names values
  where
    names = V.fromList ["id", "name", "score"]
    values = V.fromList
      [ V.fromList [ValueInteger 42, ValueText "foo", ValueDouble 5.0]
      , V.fromList [ValueInteger 43, ValueText "bar", ValueDouble 3.0]
      ]

-- | 'fullUpdate' with the name column removed.
noNameUpdate :: RFrameUpdate Text Value
noNameUpdate = RFrameUpdate names values
  where
    names = V.fromList ["id", "score"]
    values = V.fromList
      [ V.fromList [ValueInteger 42, ValueDouble 5.0]
      , V.fromList [ValueInteger 43, ValueDouble 3.0]
      ]

-- | Two-row single-column color update.
colorUpdate :: RFrameUpdate Text Value
colorUpdate = RFrameUpdate names values
  where
    names = V.fromList ["color"]
    values = V.fromList
      [ V.fromList [ValueText "purple"]
      , V.fromList [ValueText "orange"]
      ]

-- | One-row version of 'colorUpdate' (expected result of takeRows 1).
colorOneUpdate :: RFrameUpdate Text Value
colorOneUpdate = RFrameUpdate names values
  where
    names = V.fromList ["color"]
    values = V.fromList
      [ V.fromList [ValueText "purple"]
      ]

-- | 'colorUpdate' with a second, Spanish color column (for addColumn).
colorSpanishUpdate :: RFrameUpdate Text Value
colorSpanishUpdate = RFrameUpdate names values
  where
    names = V.fromList ["color", "spanishColor"]
    values = V.fromList
      [ V.fromList [ValueText "purple", ValueText "lila"]
      , V.fromList [ValueText "orange", ValueText "naranja"]
      ]

-- | One-hot encoding of 'colorUpdate' (expected result of oneHot).
colorHotUpdate :: RFrameUpdate Text Value
colorHotUpdate = RFrameUpdate names values
  where
    names = V.fromList ["color/purple", "color/orange"]
    values = V.fromList
      [ V.fromList [ValueBool True, ValueBool False]
      , V.fromList [ValueBool False, ValueBool True]
      ]
102 |
-- | 'fullUpdate' extended with the color column (expected result of update add).
fullColorUpdate :: RFrameUpdate Text Value
fullColorUpdate = RFrameUpdate names values
  where
    names = V.fromList ["id", "name", "score", "color"]
    values = V.fromList
      [ V.fromList [ValueInteger 42, ValueText "foo", ValueDouble 5.0, ValueText "purple"]
      , V.fromList [ValueInteger 43, ValueText "bar", ValueDouble 3.0, ValueText "orange"]
      ]

-- | Update that both adds a column (color) and overwrites one (score).
overlapUpdate :: RFrameUpdate Text Value
overlapUpdate = RFrameUpdate names values
  where
    names = V.fromList ["color", "score"]
    values = V.fromList
      [ V.fromList [ValueText "purple", ValueDouble 10.0]
      , V.fromList [ValueText "orange", ValueDouble 6.0]
      ]

-- | Expected result of applying 'overlapUpdate' to 'fullUpdate'.
fullOverlapUpdate :: RFrameUpdate Text Value
fullOverlapUpdate = RFrameUpdate names values
  where
    names = V.fromList ["id", "name", "score", "color"]
    values = V.fromList
      [ V.fromList [ValueInteger 42, ValueText "foo", ValueDouble 10.0, ValueText "purple"]
      , V.fromList [ValueInteger 43, ValueText "bar", ValueDouble 6.0, ValueText "orange"]
      ]

-- | Two rows with no columns at all (degenerate shape for update tests).
emptyUpdate :: RFrameUpdate Text Value
emptyUpdate = RFrameUpdate V.empty (V.replicate 2 V.empty)
132 |
-- | All fixtures, addressable by name (see 'UnitTests.getUpdateFixture').
fixtures :: HashMap Text (RFrameUpdate Text Value)
fixtures = HM.fromList
  [ ("full", fullUpdate)
  , ("noName", noNameUpdate)
  , ("color", colorUpdate)
  , ("colorOne", colorOneUpdate)
  , ("colorHot", colorHotUpdate)
  , ("colorSpanish", colorSpanishUpdate)
  , ("empty", emptyUpdate)
  , ("fullColor", fullColorUpdate)
  , ("overlap", overlapUpdate)
  , ("fullOverlap", fullOverlapUpdate)
  ]

-- | CSV text matching 'fullUpdate' (header plus two rows).
exampleCsv :: Text
exampleCsv = "id,name,score\n" `mappend` "42,foo,5.0\n" `mappend` "43,bar,3.0\n"
149 |
--------------------------------------------------------------------------------
/analyze/test/Generation.hs:
--------------------------------------------------------------------------------
1 | module Generation where
2 |
3 | import Analyze.Common (Data, makeLookup)
4 | import Analyze.RFrame (RFrame (..),RFrameUpdate (..))
5 | import Analyze.Values
6 | import Data.HashSet (HashSet)
7 | import qualified Data.HashSet as HS
8 | import Data.Text (Text)
9 | import qualified Data.Text as T
10 | import Data.Vector (Vector)
11 | import qualified Data.Vector as V
12 | import Test.QuickCheck
13 |
-- | Generates a set of @i@ distinct keys by rejection sampling.
distinctGenSized :: Data k => Gen k -> Int -> Gen (HashSet k)
distinctGenSized kg total = go HS.empty total
  where
    go acc remaining
      | remaining <= 0 = pure acc
      | otherwise = do
          fresh <- kg `suchThat` (\k -> not (HS.member k acc))
          go (HS.insert fresh acc) (remaining - 1)

-- | Sized variant of 'distinctGenSized'.
distinctGen :: Data k => Gen k -> Gen (HashSet k)
distinctGen kg = sized (distinctGenSized kg)
24 |
-- | Sized generator pairing distinct names with @i@ generated types.
-- NOTE(review): the name set is sized independently of @i@, so the zip
-- truncates to the shorter of the two -- confirm this is intended.
declGenSized :: Data k => Gen k -> Gen t -> Int -> Gen (Vector (k, t))
declGenSized kg tg i = do
  names <- distinctGen kg
  types <- V.replicateM i tg
  pure (V.zip (V.fromList (HS.toList names)) types)

-- | Sized variant of 'declGenSized'.
declGen :: Data k => Gen k -> Gen t -> Gen (Vector (k, t))
declGen kg tg = sized (declGenSized kg tg)
34 |
-- | Generates an 'RFrame' with @numRows@ rows conforming to a declaration.
rframeGenSized :: Data k => (t -> Gen v) -> Vector (k, t) -> Int -> Gen (RFrame k v)
rframeGenSized prod decl numRows = RFrame names (makeLookup names) <$> rowsGen
  where
    names = fst <$> decl
    oneRow = traverse (prod . snd) decl
    rowsGen = V.replicateM numRows oneRow

-- | Sized variant of 'rframeGenSized'.
rframeGen :: Data k => (t -> Gen v) -> Vector (k, t) -> Gen (RFrame k v)
rframeGen prod decl = sized (rframeGenSized prod decl)
45 |
-- | Generates an 'RFrameUpdate' with @numRows@ rows conforming to a declaration.
rframeUpdateGenSized :: Data k => (t -> Gen v) -> Vector (k, t) -> Int -> Gen (RFrameUpdate k v)
rframeUpdateGenSized prod decl numRows = RFrameUpdate names <$> rowsGen
  where
    names = fst <$> decl
    oneRow = traverse (prod . snd) decl
    rowsGen = V.replicateM numRows oneRow

-- | Sized variant of 'rframeUpdateGenSized'.
rframeUpdateGen :: Data k => (t -> Gen v) -> Vector (k, t) -> Gen (RFrameUpdate k v)
rframeUpdateGen prod decl = sized (rframeUpdateGenSized prod decl)
58 |
-- Specifics

-- | Random lowercase names (possibly empty).
nameGen :: Gen Text
nameGen = fmap T.pack (listOf (choose ('a', 'z')))

-- | Random 'Value' matching the requested 'ValueType'.
valueGen :: ValueType -> Gen Value
valueGen vt = case vt of
  ValueTypeText    -> ValueText <$> nameGen
  ValueTypeInteger -> ValueInteger <$> arbitrary
  ValueTypeDouble  -> ValueDouble <$> arbitrary
  ValueTypeBool    -> ValueBool <$> arbitrary
69 |
-- | Uniformly random 'ValueType'.
valueTypeGen :: Gen ValueType
valueTypeGen = arbitraryBoundedEnum

-- | Declaration generator over random names and value types.
valueDeclGen :: Gen (Vector (Text, ValueType))
valueDeclGen = declGen nameGen valueTypeGen

-- | Frame generator driven by a random declaration.
valueRFrameGen :: Gen (RFrame Text Value)
valueRFrameGen = rframeGen valueGen =<< valueDeclGen

-- | Update generator driven by a random declaration.
valueRFrameUpdateGen :: Gen (RFrameUpdate Text Value)
valueRFrameUpdateGen = rframeUpdateGen valueGen =<< valueDeclGen
83 |
84 |
-- Generators below produce updates containing only doubles.

-- | Declaration generator where every column type is 'ValueTypeDouble'.
-- Uses 'pure' instead of the previous @elements [ValueTypeDouble]@ --
-- choosing from a singleton list is equivalent but obscures the intent.
valueDeclGenDouble :: Gen (Vector (Text, ValueType))
valueDeclGenDouble = declGen nameGen (pure ValueTypeDouble)
89 |
-- | Generator for updates whose data values are all doubles.
doubleRFrameUpdateGen :: Gen (RFrameUpdate Text Value)
doubleRFrameUpdateGen = valueDeclGenDouble >>= rframeUpdateGen valueGen
--------------------------------------------------------------------------------
/analyze/test/Spec.hs:
--------------------------------------------------------------------------------
1 | module Main where
2 |
3 | import qualified PropTests as P
4 | import qualified UnitTests as U
5 | import qualified Test.Tasty as Ts
6 |
-- | All test groups bundled for the tasty runner.
groupTest :: Ts.TestTree
groupTest = Ts.testGroup "Analyze tests" [P.propTests, U.tests]

main :: IO ()
main = Ts.defaultMain groupTest
--------------------------------------------------------------------------------
/analyze/test/UnitTests.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleInstances #-}
2 | {-# LANGUAGE OverloadedStrings #-}
3 | {-# LANGUAGE TypeSynonymInstances #-}
4 | module UnitTests where
5 |
6 | import qualified Analyze as A
7 | import Analyze.Common ((<&>))
8 | import Control.Monad.Catch
9 | import qualified Data.HashMap.Strict as HM
10 | import qualified Data.HashSet as HS
11 | import Data.Monoid ((<>))
12 | import Data.Text (Text)
13 | import qualified Data.Text as T
14 | import Data.Vector (Vector)
15 | import qualified Data.Vector as V
16 | import Fixtures
17 | import Generation
18 | import Test.QuickCheck
19 | import qualified Test.QuickCheck.Property as P
20 | import Test.Tasty
21 | import Test.Tasty.HUnit
22 | import Test.Tasty.QuickCheck
23 |
24 | -- Boilerplate
25 |
-- | Lifts an HUnit assertion into a QuickCheck property, converting a
-- thrown 'HUnitFailure' into a failed result with its message.
propertyIO :: Assertion -> Property
propertyIO action = ioProperty run
  where
    run :: IO P.Result
    run = catch (action >> return P.succeeded) onFailure
    onFailure (HUnitFailure _ msg) = return P.failed { P.reason = msg }
32 |
-- | Builds a property test from a generator and a per-value assertion.
testPropertyIO :: TestName -> Gen a -> (a -> Assertion) -> TestTree
testPropertyIO name gen assertFor = testProperty name (fmap (propertyIO . assertFor) gen)
35 |
36 | -- Aux
37 |
-- | Looks up a named fixture, erroring loudly when it is missing.
getUpdateFixture :: Text -> IO (A.RFrameUpdate Text A.Value)
getUpdateFixture name = maybe missing return (HM.lookup name fixtures)
  where
    missing = error ("fixture not found: " ++ T.unpack name)
43 |
44 |
-- | Looks up a named fixture and materializes it as an 'A.RFrame'.
getFrameFixture :: Text -> IO (A.RFrame Text A.Value)
getFrameFixture name = getUpdateFixture name >>= A.fromUpdate
47 |
48 | -- Tests
49 |
-- | Sanity-checks the "full" fixture: header, row count, column count.
testFixture :: TestTree
testFixture = testCase "fixture" $ do
  frame <- getFrameFixture "full"
  A._rframeKeys frame @?= exampleHeader
  A.numRows frame @?= 2
  A.numCols frame @?= 3

-- | Decodes the "score" column as doubles and doubles each value.
testRowDecode :: TestTree
testRowDecode = testCase "rowDecode" $ do
  frame <- getFrameFixture "full"
  let decoder = A.requireWhere "score" A.floating <&> (*2)
  result <- sequenceA =<< A.decode decoder frame
  V.fromList [10.0, 6.0] @?= result
63 |
-- | Dropping the "name" column should match the "noName" fixture.
-- NOTE(review): only the key vectors are compared, not the row data.
testDrop :: TestTree
testDrop = testCase "drop" $ do
  original <- getFrameFixture "full"
  expected <- getFrameFixture "noName"
  A.numCols original @?= 3
  A.numCols expected @?= 2
  let names = HS.singleton "name"
  let actual = A.dropCols (`HS.member` names) original
  A._rframeKeys actual @?= A._rframeKeys expected

-- | Keeping only "id" and "score" should match the "noName" fixture.
-- NOTE(review): only the key vectors are compared, not the row data.
testKeep :: TestTree
testKeep = testCase "keep" $ do
  original <- getFrameFixture "full"
  expected <- getFrameFixture "noName"
  A.numCols original @?= 3
  A.numCols expected @?= 2
  let names = HS.fromList ["id", "score"]
  let actual = A.keepCols (`HS.member` names) original
  A._rframeKeys actual @?= A._rframeKeys expected
83 |
84 | testUpdateEmpty :: TestTree
85 | testUpdateEmpty = testCase "update empty" $ do
86 | update <- getUpdateFixture "full"
87 | empty <- A.fromUpdate =<< getUpdateFixture "empty"
88 | expected <- A.fromUpdate update
89 | actual <- A.update update empty
90 | actual @?= expected
91 |
-- | Applying an empty update leaves the frame untouched.
testUpdateEmpty2 :: TestTree
testUpdateEmpty2 = testCase "update empty 2" $ do
  frame <- getFrameFixture "full"
  emptyUpdate <- getUpdateFixture "empty"
  result <- A.update emptyUpdate frame
  result @?= frame
98 |
-- | Updating with a disjoint column adds it to the frame.
testUpdateAdd :: TestTree
testUpdateAdd = testCase "update add" $ do
  expected <- getFrameFixture "fullColor"
  base <- getFrameFixture "full"
  colorUpdate <- getUpdateFixture "color"
  result <- A.update colorUpdate base
  result @?= expected
106 |
-- | Updating with overlapping keys overwrites the overlapping columns.
testUpdateOverlap :: TestTree
testUpdateOverlap = testCase "update overlap" $ do
  expected <- getFrameFixture "fullOverlap"
  base <- getFrameFixture "full"
  overlapUpdate <- getUpdateFixture "overlap"
  result <- A.update overlapUpdate base
  result @?= expected
114 |
-- | Taking one row of the two-row "color" fixture matches "colorOne".
testTakeRows :: TestTree
testTakeRows = testCase "takeRows" $ do
  whole <- getFrameFixture "color"
  A.numRows whole @?= 2
  expected <- getFrameFixture "colorOne"
  A.numRows expected @?= 1
  A.takeRows 1 whole @?= expected
124 |
-- | Adding a "spanishColor" column matches the "colorSpanish" fixture.
testAddColumn :: TestTree
testAddColumn = testCase "add column" $ do
  base <- getFrameFixture "color"
  A.numCols base @?= 1
  expected <- getFrameFixture "colorSpanish"
  A.numCols expected @?= 2
  let spanish = V.fromList [A.ValueText "lila", A.ValueText "naranja"]
  result <- A.addColumn base "spanishColor" spanish
  result @?= expected
134 |
-- | One-hot encoding the "color" column matches the "colorHot" fixture.
testOneHot :: TestTree
testOneHot = testCase "one hot" $ do
  color <- getFrameFixture "color"
  expected <- getFrameFixture "colorHot"
  -- derive the new key as "<column>/<value>"; only text values occur here
  let mkKey k (A.ValueText v) = k <> "/" <> v
  encoded <- A.oneHot mkKey "color" (A.ValueBool True) (A.ValueBool False) color
  encoded @?= expected
141 |
142 | -- Runner
143 |
-- | All unit tests, grouped for the tasty runner.
tests :: TestTree
tests = testGroup "Unit tests" unitCases
  where
    unitCases =
      [ testFixture
      , testRowDecode
      , testDrop
      , testKeep
      , testUpdateEmpty
      , testUpdateEmpty2
      , testUpdateAdd
      , testUpdateOverlap
      , testTakeRows
      , testAddColumn
      , testOneHot
      ]
158 |
-- | Test-suite entry point: run all unit tests via tasty's default runner.
main :: IO ()
main = defaultMain tests
161 |
--------------------------------------------------------------------------------
/datasets/.gitignore:
--------------------------------------------------------------------------------
1 | # CUSTOM IGNORES
2 | ### Datasets in development ###
3 | cifar-10
4 | mnist
5 |
6 | ### NixOS ###
7 | result
8 |
9 | # Created by https://www.gitignore.io/api/vim,emacs,linux,macos,windows,haskell
10 | # Edit at https://www.gitignore.io/?templates=vim,emacs,linux,macos,windows,haskell
11 |
12 | ### Emacs ###
13 | # -*- mode: gitignore; -*-
14 | *~
15 | \#*\#
16 | /.emacs.desktop
17 | /.emacs.desktop.lock
18 | *.elc
19 | auto-save-list
20 | tramp
21 | .\#*
22 |
23 | # Org-mode
24 | .org-id-locations
25 | *_archive
26 |
27 | # flymake-mode
28 | *_flymake.*
29 |
30 | # eshell files
31 | /eshell/history
32 | /eshell/lastdir
33 |
34 | # elpa packages
35 | /elpa/
36 |
37 | # reftex files
38 | *.rel
39 |
40 | # AUCTeX auto folder
41 | /auto/
42 |
43 | # cask packages
44 | .cask/
45 | dist/
46 |
47 | # Flycheck
48 | flycheck_*.el
49 |
50 | # server auth directory
51 | /server/
52 |
53 | # projectiles files
54 | .projectile
55 |
56 | # directory configuration
57 | .dir-locals.el
58 |
59 | # network security
60 | /network-security.data
61 |
62 |
63 | ### Haskell ###
64 | dist
65 | dist-*
66 | cabal-dev
67 | *.o
68 | *.hi
69 | *.chi
70 | *.chs.h
71 | *.dyn_o
72 | *.dyn_hi
73 | .hpc
74 | .hsenv
75 | .cabal-sandbox/
76 | cabal.sandbox.config
77 | *.prof
78 | *.aux
79 | *.hp
80 | *.eventlog
81 | .stack-work/
82 | cabal.project.local
83 | cabal.project.local~
84 | .HTF/
85 | .ghc.environment.*
86 |
87 | ### Linux ###
88 |
89 | # temporary files which can be created if a process still has a handle open of a deleted file
90 | .fuse_hidden*
91 |
92 | # KDE directory preferences
93 | .directory
94 |
95 | # Linux trash folder which might appear on any partition or disk
96 | .Trash-*
97 |
98 | # .nfs files are created when an open file is removed but is still being accessed
99 | .nfs*
100 |
101 | ### macOS ###
102 | # General
103 | .DS_Store
104 | .AppleDouble
105 | .LSOverride
106 |
107 | # Icon must end with two \r
108 | Icon
109 |
110 | # Thumbnails
111 | ._*
112 |
113 | # Files that might appear in the root of a volume
114 | .DocumentRevisions-V100
115 | .fseventsd
116 | .Spotlight-V100
117 | .TemporaryItems
118 | .Trashes
119 | .VolumeIcon.icns
120 | .com.apple.timemachine.donotpresent
121 |
122 | # Directories potentially created on remote AFP share
123 | .AppleDB
124 | .AppleDesktop
125 | Network Trash Folder
126 | Temporary Items
127 | .apdisk
128 |
129 | ### Vim ###
130 | # Swap
131 | [._]*.s[a-v][a-z]
132 | [._]*.sw[a-p]
133 | [._]s[a-rt-v][a-z]
134 | [._]ss[a-gi-z]
135 | [._]sw[a-p]
136 |
137 | # Session
138 | Session.vim
139 |
140 | # Temporary
141 | .netrwhist
142 | # Auto-generated tag files
143 | tags
144 | # Persistent undo
145 | [._]*.un~
146 |
147 | ### Windows ###
148 | # Windows thumbnail cache files
149 | Thumbs.db
150 | ehthumbs.db
151 | ehthumbs_vista.db
152 |
153 | # Dump file
154 | *.stackdump
155 |
156 | # Folder config file
157 | [Dd]esktop.ini
158 |
159 | # Recycle Bin used on file shares
160 | $RECYCLE.BIN/
161 |
162 | # Windows Installer files
163 | *.cab
164 | *.msi
165 | *.msix
166 | *.msm
167 | *.msp
168 |
169 | # Windows shortcuts
170 | *.lnk
171 |
172 | # End of https://www.gitignore.io/api/vim,emacs,linux,macos,windows,haskell
173 |
--------------------------------------------------------------------------------
/datasets/.travis.yml:
--------------------------------------------------------------------------------
1 | # This Travis job script has been generated by a script via
2 | #
3 | # make_travis_yml_2.hs 'datasets.cabal'
4 | #
5 | # For more information, see https://github.com/hvr/multi-ghc-travis
6 | #
7 | language: c
8 | sudo: false
9 |
10 | git:
11 | submodules: false # whether to recursively clone submodules
12 |
13 | cache:
14 | directories:
15 | - $HOME/.cabal/packages
16 | - $HOME/.cabal/store
17 |
18 | before_cache:
19 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/build-reports.log
20 | # remove files that are regenerated by 'cabal update'
21 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/00-index.*
22 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/*.json
23 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.cache
24 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.tar
25 | - rm -fv $HOME/.cabal/packages/hackage.haskell.org/01-index.tar.idx
26 |
27 | matrix:
28 | include:
29 | - compiler: "ghc-7.6.3"
30 | # env: TEST=--disable-tests BENCH=--disable-benchmarks
31 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.6.3], sources: [hvr-ghc]}}
32 | - compiler: "ghc-7.8.4"
33 | # env: TEST=--disable-tests BENCH=--disable-benchmarks
34 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.8.4], sources: [hvr-ghc]}}
35 | - compiler: "ghc-7.10.2"
36 | # env: TEST=--disable-tests BENCH=--disable-benchmarks
37 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.10.2], sources: [hvr-ghc]}}
38 | - compiler: "ghc-7.10.3"
39 | # env: TEST=--disable-tests BENCH=--disable-benchmarks
40 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-7.10.3], sources: [hvr-ghc]}}
41 | - compiler: "ghc-8.0.1"
42 | # env: TEST=--disable-tests BENCH=--disable-benchmarks
43 | addons: {apt: {packages: [ghc-ppa-tools,cabal-install-head,ghc-8.0.1], sources: [hvr-ghc]}}
44 |
45 | before_install:
46 | - HC=${CC}
47 | - unset CC
48 | - PATH=/opt/ghc/bin:/opt/ghc-ppa-tools/bin:$PATH
49 | - PKGNAME='datasets'
50 |
51 | install:
52 | - cabal --version
53 | - echo "$(${HC} --version) [$(${HC} --print-project-git-commit-id 2> /dev/null || echo '?')]"
54 | - BENCH=${BENCH---enable-benchmarks}
55 | - TEST=${TEST---enable-tests}
56 | - travis_retry cabal update -v
57 | - sed -i 's/^jobs:/-- jobs:/' ${HOME}/.cabal/config
58 | - rm -fv cabal.project.local
59 | - "echo 'packages: .' > cabal.project"
60 | - rm -f cabal.project.freeze
61 | - cabal new-build -w ${HC} ${TEST} ${BENCH} --dep -j2
62 | - cabal new-build -w ${HC} --disable-tests --disable-benchmarks --dep -j2
63 |
64 | # Here starts the actual work to be performed for the package under test;
65 | # any command which exits with a non-zero exit code causes the build to fail.
66 | script:
67 | - if [ -f configure.ac ]; then autoreconf -i; fi
68 | - rm -rf dist/
69 | - cabal sdist # test that a source-distribution can be generated
70 | - cd dist/
71 | - SRCTAR=(${PKGNAME}-*.tar.gz)
72 | - SRC_BASENAME="${SRCTAR/%.tar.gz}"
73 | - tar -xvf "./$SRC_BASENAME.tar.gz"
74 | - cd "$SRC_BASENAME/"
75 | ## from here on, CWD is inside the extracted source-tarball
76 | - rm -fv cabal.project.local
77 | - "echo 'packages: .' > cabal.project"
78 | # this builds all libraries and executables (without tests/benchmarks)
79 | - rm -f cabal.project.freeze
80 | - cabal new-build -w ${HC} --disable-tests --disable-benchmarks
81 | # this builds all libraries and executables (including tests/benchmarks)
82 | # - rm -rf ./dist-newstyle
83 | - cabal new-build -w ${HC} ${TEST} ${BENCH}
84 |
85 | # there's no 'cabal new-test' yet, so let's emulate for now
86 | - TESTS=( $(awk 'tolower($0) ~ /^test-suite / { print $2 }' *.cabal) )
87 | - if [ "$TEST" != "--enable-tests" ]; then TESTS=(); fi
88 | - shopt -s globstar;
89 | RC=true; for T in ${TESTS[@]}; do echo "== $T ==";
90 | if dist-newstyle/build/**/$SRC_BASENAME/**/build/$T/$T; then echo "= $T OK =";
91 | else echo "= $T FAILED ="; RC=false; fi; done; $RC
92 |
93 | # EOF
94 |
--------------------------------------------------------------------------------
/datasets/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2016 Tom Nielsen
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/datasets/README.md:
--------------------------------------------------------------------------------
1 | datasets: data sets for statistics and machine learning, in Haskell
2 | =====
3 |
4 | [![Hackage](https://img.shields.io/hackage/v/datasets.svg)](https://hackage.haskell.org/package/datasets) [![Build Status](https://travis-ci.org/glutamate/datasets.svg?branch=master)](http://travis-ci.org/glutamate/datasets)
5 |
6 | This library provides easy access in Haskell to a series of data sets
7 | for Statistics and Machine learning.
8 |
9 | Most of these datasets come from the [UCI Machine Learning Repository](http://archive.ics.uci.edu/ml/)
10 | ([Mirror](http://mlr.cs.umass.edu/ml/))
11 |
12 | ## Usage
13 |
14 | ```haskell
15 |
16 | import Numeric.Datasets (getDataset)
17 | import Numeric.Datasets.Iris (iris)
18 | import Numeric.Datasets.Abalone (abalone)
19 |
20 | main = do
21 |
22 | -- The Iris data set is embedded
23 | print (length iris)
24 | print (head iris)
25 |
26 | -- The Abalone dataset is fetched
27 | abas <- getDataset abalone
28 | print (length abas)
29 | print (head abas)
30 |
31 | ```
32 |
--------------------------------------------------------------------------------
/datasets/Setup.hs:
--------------------------------------------------------------------------------
-- Standard Cabal 'Setup.hs': delegate to the Simple build type, no custom hooks.
import Distribution.Simple
main = defaultMain
3 |
--------------------------------------------------------------------------------
/datasets/bench/Main.hs:
--------------------------------------------------------------------------------
module Main where

import qualified Numeric.Dataloader.Benchmark as Benchmark

-- | Delegate to the benchmark suite's entry point.
main :: IO ()
main = Benchmark.main
7 |
8 |
--------------------------------------------------------------------------------
/datasets/bench/Numeric/Dataloader/Benchmark.hs:
--------------------------------------------------------------------------------
1 | {-# OPTIONS_GHC -Wno-orphans #-}
2 | {-# LANGUAGE DataKinds #-}
3 | {-# LANGUAGE LambdaCase #-}
4 | module Numeric.Dataloader.Benchmark where
5 |
6 | import Control.DeepSeq
7 | import Control.Concurrent
8 | import Numeric.Dataloader
9 | import Numeric.Datasets
10 | import Numeric.Datasets.Abalone
11 | import Streaming (Of)
12 | import qualified Streaming.Prelude as S
13 | import qualified System.Random.MWC as MWC
14 |
15 | -- ImageLoading bench
16 | import Numeric.Datasets.Internal.Streaming
17 | import Numeric.Datasets.CIFAR10
18 | import System.FilePath
19 | import System.Directory
20 | import Codec.Picture
21 | import Control.Exception.Safe
22 | import Text.Read
23 | import System.IO.Unsafe
24 | import qualified Data.List.NonEmpty as NonEmpty
25 |
26 | import Criterion.Main
27 |
-- Orphan NFData instances so Criterion can fully force benchmark results.
instance NFData Abalone
instance NFData Sex
instance (NFData a, NFData b) => NFData (Of a b)
31 |
32 |
-- | Build a batch-size-1 loader over the dataset, shuffled with a freshly
-- sampled uniform index line from the system RNG.
mkDataloaderWithIx :: Dataset a -> IO (Dataloader a a)
mkDataloaderWithIx ds = MWC.withSystemRandom $ \gen -> do
  ixline <- uniformIxline ds gen
  return (Dataloader 1 (Just ixline) ds id)
37 |
38 |
-- | Benchmark entry point: dataloader streaming over Abalone (with and
-- without an index line) and CIFAR10 image-folder loading.
main :: IO ()
main = do
  abaloneLoader <- mkDataloaderWithIx abalone
  cifarLoader <- cifar10ImageLoader
  defaultMain
    [ bgroup "Numeric.Dataloader"
        [ bench "making an ixline" (nfIO (MWC.withSystemRandom (uniformIxline abalone)))
        , bgroup "testStream"
            [ bench "with ixline" (nfIO (foldStream (S.take 100 (slow (stream abaloneLoader)))))
            , bench "no ixline" (nfIO (foldStream (S.take 100 (slow (stream (Dataloader 1 Nothing abalone id))))))
            ]
        , bench "cifar10 image folder" (nfIO (foldStream (S.take 1000 (stream cifarLoader))))
        , bench "cifar10 batch folder" (nfIO (foldStream (S.take 1 (batchStream (cifarLoader { batchSize = 1000 })))))
        ]
    ]
54 |
-- | Insert a tiny delay per element to simulate slow item loading.
slow :: S.Stream (Of a) IO r -> S.Stream (Of a) IO r
slow = S.mapM $ \x -> do
  threadDelay 2
  pure x
57 |
-- | Force a stream by accumulating its elements into a (reversed) list.
foldStream :: Show a => S.Stream (Of a) IO () -> IO (Of [a] ())
foldStream = S.fold (flip (:)) [] id
60 |
61 | -------------------------------------------------------------------------------
62 | -- Image Folder loading
63 |
64 |
65 |
66 |
67 | -- | dataloading the image folder dataset
-- | Dataloader over the CIFAR10 image-folder layout under the XDG cache
-- directory (as produced by the provisioning executable): one subfolder
-- per label, each containing PNG images.
cifar10ImageLoader :: IO (Dataloader (String, FilePath) CIFARImage)
cifar10ImageLoader = do
  xdgCache <- getXdgDirectory XdgCache "datasets-hs"
  -- NOTE: '</>' (System.FilePath) joins path components; a bare '>' here
  -- would be a Bool-valued comparison and not type-check as a FilePath.
  let imfolder = xdgCache </> "cifar-10-imagefolder"
  pure $ Dataloader 1 Nothing (imgFolderDataset imfolder) load

  where
    -- One folder name per CIFAR10 label, via each Label's Show instance.
    labelFolders :: NonEmpty.NonEmpty String
    labelFolders = show <$> NonEmpty.fromList [minBound..maxBound::Label]

    imgFolderDataset :: FilePath -> Dataset (String, FilePath)
    imgFolderDataset fp =
      Dataset
        (ImgFolder fp labelFolders)
        Nothing
        Nothing
        (ImageFolder labelFolders)

    -- Decode one (label-string, png-path) pair into a CIFARImage.
    -- Errors (unparsable label, unreadable/non-RGB8 png) are fatal.
    load :: (String, FilePath) -> CIFARImage
    load (str, fp) = CIFARImage (img, lbl)
      where
        lbl :: Label
        lbl = either error id $ readEither str

        -- unsafePerformIO keeps the loader's transform pure; the file read
        -- happens lazily when the image is forced.
        img :: Image PixelRGB8
        img = case unsafePerformIO (readPng fp) of
          Left err -> error err
          Right (ImageRGB8 i) -> i
          Right _ -> error "decoded image was not rgb8"
97 |
98 |
--------------------------------------------------------------------------------
/datasets/bin/cifar10/Main.hs:
--------------------------------------------------------------------------------
1 | module Main where
2 |
3 | import Codec.Picture (writePng)
4 | import Numeric.Datasets.Internal.Streaming (streamDataset)
5 | import Numeric.Datasets.CIFAR10 (CIFARImage(..), cifar10)
6 | import System.FilePath
7 | import System.Directory (XdgDirectory(..), getXdgDirectory, createDirectoryIfMissing, listDirectory)
8 | import qualified Streaming.Prelude as S
9 | import System.ProgressBar (newProgressBar, defStyle, Progress(..), incProgress)
10 |
-- | Entry point: provision the CIFAR10 image folder on disk.
main :: IO ()
main = provision
13 |
14 | -- may be required if you don't already have CIFAR10 loaded
-- | Unpack CIFAR10 into an image-folder layout under the XDG cache
-- directory: one subfolder per label, each image written as a PNG.
-- May be required if you don't already have CIFAR10 loaded.
provision :: IO ()
provision = do
  xdgCache <- getXdgDirectory XdgCache "datasets-hs"
  -- NOTE: '</>' (System.FilePath) joins path components; a bare '>' here
  -- would be a Bool-valued comparison and not type-check as a FilePath.
  let imfolder = xdgCache </> "cifar-10-imagefolder"
  createDirectoryIfMissing True imfolder
  -- progress bar over the full 60000-image dataset
  let n = 60000
  pb <- newProgressBar defStyle 10 (Progress 0 n ())
  let
    go :: FilePath -> CIFARImage -> IO ()
    go cachefolder (CIFARImage (im, lbl)) = do
      let labelfolder = cachefolder </> show lbl
      createDirectoryIfMissing True labelfolder
      -- next free index = number of files already written for this label
      ix <- length <$> listDirectory labelfolder
      let
        fname = show lbl ++ "_" ++ show ix ++ ".png"
      writePng (labelfolder </> fname) im
      incProgress pb 1 -- increment progress bar

  -- build the image folder
  S.mapM_ (go imfolder) $ streamDataset cifar10
36 |
37 |
38 |
--------------------------------------------------------------------------------
/datasets/changelog.md:
--------------------------------------------------------------------------------
1 | 0.4
2 | * Get rid of dependency on 'data-default' (introduced by previous versions of 'req')
3 |
4 | * Bump 'req' dependency to 2.0.0
5 | 0.3
6 | * 'datasets' hosted within the DataHaskell/dh-core project
7 |
8 | * use 'req' for HTTP and HTTPS requests, instead of 'wreq'
9 |
10 | * Mushroom and Titanic datasets
11 |
12 | * Restructured top-level documentation
13 |
14 | * Removed 'csvDatasetPreprocess' and added 'withPreprocess'. Now bytestring preprocessing is more compositional, i.e. 'withPreprocess' can be used with JSON datasets as well.
15 |
16 |
17 | 0.2.5
18 |
19 | * Old Faithful matches R dataset
20 |
21 | 0.2.4
22 |
23 | * Netflix dataset
24 |
25 | 0.2.3
26 |
27 | * Coal dataset
28 |
29 | * New internal API
30 |
31 | * Ord instance for IrisClass
32 |
33 | 0.2.2
34 |
35 | * Enum, Bounded instances for IrisClass
36 |
37 | * Gapminder dataset
38 |
39 | * Use wreq for HTTP and HTTPS requests
40 |
41 | 0.2.1
42 |
43 | * Wine quality datasets
44 |
45 | * Vocabulary, UN, States datasets
46 |
47 | * CO2, Sunspots and Quakes datasets
48 |
49 | 0.2.0.3
50 |
51 | * Further GHC portability
52 |
53 | 0.2.0.2
54 |
55 | * Improve GHC portability
56 |
57 | 0.2.0.1
58 |
59 | * Bugfix: include embedded data files in cabal extra-source-files
60 |
61 | 0.2
62 |
63 | * iris dataset is a pure value (with file-embed)
64 |
65 | * Michelson, Nightingale and BostonHousing datasets
66 |
--------------------------------------------------------------------------------
/datasets/datafiles/arff/README:
--------------------------------------------------------------------------------
1 | This folder contains many weka ARFF files used for testing the ARFF parser.
2 |
3 | The files were obtained from: https://storm.cis.fordham.edu/~gweiss/data-mining/datasets.html
--------------------------------------------------------------------------------
/datasets/datafiles/arff/contact-lens.arff:
--------------------------------------------------------------------------------
1 | % 1. Title: Database for fitting contact lenses
2 | %
3 | % 2. Sources:
4 | % (a) Cendrowska, J. "PRISM: An algorithm for inducing modular rules",
5 | % International Journal of Man-Machine Studies, 1987, 27, 349-370
6 | % (b) Donor: Benoit Julien (Julien@ce.cmu.edu)
7 | % (c) Date: 1 August 1990
8 | %
9 | % 3. Past Usage:
10 | % 1. See above.
11 | % 2. Witten, I. H. & MacDonald, B. A. (1988). Using concept
12 | % learning for knowledge acquisition. International Journal of
13 | % Man-Machine Studies, 27, (pp. 349-370).
14 | %
15 | % Notes: This database is complete (all possible combinations of
16 | % attribute-value pairs are represented).
17 | %
18 | % Each instance is complete and correct.
19 | %
20 | % 9 rules cover the training set.
21 | %
22 | % 4. Relevant Information Paragraph:
23 | % The examples are complete and noise free.
24 | % The examples highly simplified the problem. The attributes do not
25 | % fully describe all the factors affecting the decision as to which type,
26 | % if any, to fit.
27 | %
28 | % 5. Number of Instances: 24
29 | %
30 | % 6. Number of Attributes: 4 (all nominal)
31 | %
32 | % 7. Attribute Information:
33 | % -- 3 Classes
34 | % 1 : the patient should be fitted with hard contact lenses,
35 | % 2 : the patient should be fitted with soft contact lenses,
36 | %      3 : the patient should not be fitted with contact lenses.
37 | %
38 | % 1. age of the patient: (1) young, (2) pre-presbyopic, (3) presbyopic
39 | % 2. spectacle prescription: (1) myope, (2) hypermetrope
40 | % 3. astigmatic: (1) no, (2) yes
41 | % 4. tear production rate: (1) reduced, (2) normal
42 | %
43 | % 8. Number of Missing Attribute Values: 0
44 | %
45 | % 9. Class Distribution:
46 | % 1. hard contact lenses: 4
47 | % 2. soft contact lenses: 5
48 | % 3. no contact lenses: 15
49 |
50 | @relation contact-lenses
51 |
52 | @attribute age {young, pre-presbyopic, presbyopic}
53 | @attribute spectacle-prescrip {myope, hypermetrope}
54 | @attribute astigmatism {no, yes}
55 | @attribute tear-prod-rate {reduced, normal}
56 | @attribute contact-lenses {soft, hard, none}
57 |
58 | @data
59 | %
60 | % 24 instances
61 | %
62 | young,myope,no,reduced,none
63 | young,myope,no,normal,soft
64 | young,myope,yes,reduced,none
65 | young,myope,yes,normal,hard
66 | young,hypermetrope,no,reduced,none
67 | young,hypermetrope,no,normal,soft
68 | young,hypermetrope,yes,reduced,none
69 | young,hypermetrope,yes,normal,hard
70 | pre-presbyopic,myope,no,reduced,none
71 | pre-presbyopic,myope,no,normal,soft
72 | pre-presbyopic,myope,yes,reduced,none
73 | pre-presbyopic,myope,yes,normal,hard
74 | pre-presbyopic,hypermetrope,no,reduced,none
75 | pre-presbyopic,hypermetrope,no,normal,soft
76 | pre-presbyopic,hypermetrope,yes,reduced,none
77 | pre-presbyopic,hypermetrope,yes,normal,none
78 | presbyopic,myope,no,reduced,none
79 | presbyopic,myope,no,normal,none
80 | presbyopic,myope,yes,reduced,none
81 | presbyopic,myope,yes,normal,hard
82 | presbyopic,hypermetrope,no,reduced,none
83 | presbyopic,hypermetrope,no,normal,soft
84 | presbyopic,hypermetrope,yes,reduced,none
85 | presbyopic,hypermetrope,yes,normal,none
86 |
--------------------------------------------------------------------------------
/datasets/datafiles/arff/cpu.arff:
--------------------------------------------------------------------------------
1 | %
2 | % As used by Kilpatrick, D. & Cameron-Jones, M. (1998). Numeric prediction
3 | % using instance-based learning with encoding length selection. In Progress
4 | % in Connectionist-Based Information Systems. Singapore: Springer-Verlag.
5 | %
6 | % Deleted "vendor" attribute to make data consistent with what we
7 | % used in the data mining book.
8 | %
9 | @relation 'cpu'
10 | @attribute MYCT real
11 | @attribute MMIN real
12 | @attribute MMAX real
13 | @attribute CACH real
14 | @attribute CHMIN real
15 | @attribute CHMAX real
16 | @attribute class real
17 | @data
18 | 125,256,6000,256,16,128,198
19 | 29,8000,32000,32,8,32,269
20 | 29,8000,32000,32,8,32,220
21 | 29,8000,32000,32,8,32,172
22 | 29,8000,16000,32,8,16,132
23 | 26,8000,32000,64,8,32,318
24 | 23,16000,32000,64,16,32,367
25 | 23,16000,32000,64,16,32,489
26 | 23,16000,64000,64,16,32,636
27 | 23,32000,64000,128,32,64,1144
28 | 400,1000,3000,0,1,2,38
29 | 400,512,3500,4,1,6,40
30 | 60,2000,8000,65,1,8,92
31 | 50,4000,16000,65,1,8,138
32 | 350,64,64,0,1,4,10
33 | 200,512,16000,0,4,32,35
34 | 167,524,2000,8,4,15,19
35 | 143,512,5000,0,7,32,28
36 | 143,1000,2000,0,5,16,31
37 | 110,5000,5000,142,8,64,120
38 | 143,1500,6300,0,5,32,30
39 | 143,3100,6200,0,5,20,33
40 | 143,2300,6200,0,6,64,61
41 | 110,3100,6200,0,6,64,76
42 | 320,128,6000,0,1,12,23
43 | 320,512,2000,4,1,3,69
44 | 320,256,6000,0,1,6,33
45 | 320,256,3000,4,1,3,27
46 | 320,512,5000,4,1,5,77
47 | 320,256,5000,4,1,6,27
48 | 25,1310,2620,131,12,24,274
49 | 25,1310,2620,131,12,24,368
50 | 50,2620,10480,30,12,24,32
51 | 50,2620,10480,30,12,24,63
52 | 56,5240,20970,30,12,24,106
53 | 64,5240,20970,30,12,24,208
54 | 50,500,2000,8,1,4,20
55 | 50,1000,4000,8,1,5,29
56 | 50,2000,8000,8,1,5,71
57 | 50,1000,4000,8,3,5,26
58 | 50,1000,8000,8,3,5,36
59 | 50,2000,16000,8,3,5,40
60 | 50,2000,16000,8,3,6,52
61 | 50,2000,16000,8,3,6,60
62 | 133,1000,12000,9,3,12,72
63 | 133,1000,8000,9,3,12,72
64 | 810,512,512,8,1,1,18
65 | 810,1000,5000,0,1,1,20
66 | 320,512,8000,4,1,5,40
67 | 200,512,8000,8,1,8,62
68 | 700,384,8000,0,1,1,24
69 | 700,256,2000,0,1,1,24
70 | 140,1000,16000,16,1,3,138
71 | 200,1000,8000,0,1,2,36
72 | 110,1000,4000,16,1,2,26
73 | 110,1000,12000,16,1,2,60
74 | 220,1000,8000,16,1,2,71
75 | 800,256,8000,0,1,4,12
76 | 800,256,8000,0,1,4,14
77 | 800,256,8000,0,1,4,20
78 | 800,256,8000,0,1,4,16
79 | 800,256,8000,0,1,4,22
80 | 125,512,1000,0,8,20,36
81 | 75,2000,8000,64,1,38,144
82 | 75,2000,16000,64,1,38,144
83 | 75,2000,16000,128,1,38,259
84 | 90,256,1000,0,3,10,17
85 | 105,256,2000,0,3,10,26
86 | 105,1000,4000,0,3,24,32
87 | 105,2000,4000,8,3,19,32
88 | 75,2000,8000,8,3,24,62
89 | 75,3000,8000,8,3,48,64
90 | 175,256,2000,0,3,24,22
91 | 300,768,3000,0,6,24,36
92 | 300,768,3000,6,6,24,44
93 | 300,768,12000,6,6,24,50
94 | 300,768,4500,0,1,24,45
95 | 300,384,12000,6,1,24,53
96 | 300,192,768,6,6,24,36
97 | 180,768,12000,6,1,31,84
98 | 330,1000,3000,0,2,4,16
99 | 300,1000,4000,8,3,64,38
100 | 300,1000,16000,8,2,112,38
101 | 330,1000,2000,0,1,2,16
102 | 330,1000,4000,0,3,6,22
103 | 140,2000,4000,0,3,6,29
104 | 140,2000,4000,0,4,8,40
105 | 140,2000,4000,8,1,20,35
106 | 140,2000,32000,32,1,20,134
107 | 140,2000,8000,32,1,54,66
108 | 140,2000,32000,32,1,54,141
109 | 140,2000,32000,32,1,54,189
110 | 140,2000,4000,8,1,20,22
111 | 57,4000,16000,1,6,12,132
112 | 57,4000,24000,64,12,16,237
113 | 26,16000,32000,64,16,24,465
114 | 26,16000,32000,64,8,24,465
115 | 26,8000,32000,0,8,24,277
116 | 26,8000,16000,0,8,16,185
117 | 480,96,512,0,1,1,6
118 | 203,1000,2000,0,1,5,24
119 | 115,512,6000,16,1,6,45
120 | 1100,512,1500,0,1,1,7
121 | 1100,768,2000,0,1,1,13
122 | 600,768,2000,0,1,1,16
123 | 400,2000,4000,0,1,1,32
124 | 400,4000,8000,0,1,1,32
125 | 900,1000,1000,0,1,2,11
126 | 900,512,1000,0,1,2,11
127 | 900,1000,4000,4,1,2,18
128 | 900,1000,4000,8,1,2,22
129 | 900,2000,4000,0,3,6,37
130 | 225,2000,4000,8,3,6,40
131 | 225,2000,4000,8,3,6,34
132 | 180,2000,8000,8,1,6,50
133 | 185,2000,16000,16,1,6,76
134 | 180,2000,16000,16,1,6,66
135 | 225,1000,4000,2,3,6,24
136 | 25,2000,12000,8,1,4,49
137 | 25,2000,12000,16,3,5,66
138 | 17,4000,16000,8,6,12,100
139 | 17,4000,16000,32,6,12,133
140 | 1500,768,1000,0,0,0,12
141 | 1500,768,2000,0,0,0,18
142 | 800,768,2000,0,0,0,20
143 | 50,2000,4000,0,3,6,27
144 | 50,2000,8000,8,3,6,45
145 | 50,2000,8000,8,1,6,56
146 | 50,2000,16000,24,1,6,70
147 | 50,2000,16000,24,1,6,80
148 | 50,8000,16000,48,1,10,136
149 | 100,1000,8000,0,2,6,16
150 | 100,1000,8000,24,2,6,26
151 | 100,1000,8000,24,3,6,32
152 | 50,2000,16000,12,3,16,45
153 | 50,2000,16000,24,6,16,54
154 | 50,2000,16000,24,6,16,65
155 | 150,512,4000,0,8,128,30
156 | 115,2000,8000,16,1,3,50
157 | 115,2000,4000,2,1,5,40
158 | 92,2000,8000,32,1,6,62
159 | 92,2000,8000,32,1,6,60
160 | 92,2000,8000,4,1,6,50
161 | 75,4000,16000,16,1,6,66
162 | 60,4000,16000,32,1,6,86
163 | 60,2000,16000,64,5,8,74
164 | 60,4000,16000,64,5,8,93
165 | 50,4000,16000,64,5,10,111
166 | 72,4000,16000,64,8,16,143
167 | 72,2000,8000,16,6,8,105
168 | 40,8000,16000,32,8,16,214
169 | 40,8000,32000,64,8,24,277
170 | 35,8000,32000,64,8,24,370
171 | 38,16000,32000,128,16,32,510
172 | 48,4000,24000,32,8,24,214
173 | 38,8000,32000,64,8,24,326
174 | 30,16000,32000,256,16,24,510
175 | 112,1000,1000,0,1,4,8
176 | 84,1000,2000,0,1,6,12
177 | 56,1000,4000,0,1,6,17
178 | 56,2000,6000,0,1,8,21
179 | 56,2000,8000,0,1,8,24
180 | 56,4000,8000,0,1,8,34
181 | 56,4000,12000,0,1,8,42
182 | 56,4000,16000,0,1,8,46
183 | 38,4000,8000,32,16,32,51
184 | 38,4000,8000,32,16,32,116
185 | 38,8000,16000,64,4,8,100
186 | 38,8000,24000,160,4,8,140
187 | 38,4000,16000,128,16,32,212
188 | 200,1000,2000,0,1,2,25
189 | 200,1000,4000,0,1,4,30
190 | 200,2000,8000,64,1,5,41
191 | 250,512,4000,0,1,7,25
192 | 250,512,4000,0,4,7,50
193 | 250,1000,16000,1,1,8,50
194 | 160,512,4000,2,1,5,30
195 | 160,512,2000,2,3,8,32
196 | 160,1000,4000,8,1,14,38
197 | 160,1000,8000,16,1,14,60
198 | 160,2000,8000,32,1,13,109
199 | 240,512,1000,8,1,3,6
200 | 240,512,2000,8,1,5,11
201 | 105,2000,4000,8,3,8,22
202 | 105,2000,6000,16,6,16,33
203 | 105,2000,8000,16,4,14,58
204 | 52,4000,16000,32,4,12,130
205 | 70,4000,12000,8,6,8,75
206 | 59,4000,12000,32,6,12,113
207 | 59,8000,16000,64,12,24,188
208 | 26,8000,24000,32,8,16,173
209 | 26,8000,32000,64,12,16,248
210 | 26,8000,32000,128,24,32,405
211 | 116,2000,8000,32,5,28,70
212 | 50,2000,32000,24,6,26,114
213 | 50,2000,32000,48,26,52,208
214 | 50,2000,32000,112,52,104,307
215 | 50,4000,32000,112,52,104,397
216 | 30,8000,64000,96,12,176,915
217 | 30,8000,64000,128,12,176,1150
218 | 180,262,4000,0,1,3,12
219 | 180,512,4000,0,1,3,14
220 | 180,262,4000,0,1,3,18
221 | 180,512,4000,0,1,3,21
222 | 124,1000,8000,0,1,8,42
223 | 98,1000,8000,32,2,8,46
224 | 125,2000,8000,0,2,14,52
225 | 480,512,8000,32,0,0,67
226 | 480,1000,4000,0,0,0,45
--------------------------------------------------------------------------------
/datasets/datafiles/arff/cpu.with.vendor.arff:
--------------------------------------------------------------------------------
1 | %
2 | % As used by Kilpatrick, D. & Cameron-Jones, M. (1998). Numeric prediction
3 | % using instance-based learning with encoding length selection. In Progress
4 | % in Connectionist-Based Information Systems. Singapore: Springer-Verlag.
5 | %
6 |
7 | @relation 'cpu'
8 | @attribute vendor { adviser, amdahl, apollo, basf, bti, burroughs, c.r.d, cdc, cambex, dec, dg, formation, four-phase, gould, hp, harris, honeywell, ibm, ipl, magnuson, microdata, nas, ncr, nixdorf, perkin-elmer, prime, siemens, sperry, sratus, wang}
9 | @attribute MYCT real
10 | @attribute MMIN real
11 | @attribute MMAX real
12 | @attribute CACH real
13 | @attribute CHMIN real
14 | @attribute CHMAX real
15 | @attribute class real
16 | @data
17 | adviser,125,256,6000,256,16,128,199
18 | amdahl,29,8000,32000,32,8,32,253
19 | amdahl,29,8000,32000,32,8,32,253
20 | amdahl,29,8000,32000,32,8,32,253
21 | amdahl,29,8000,16000,32,8,16,132
22 | amdahl,26,8000,32000,64,8,32,290
23 | amdahl,23,16000,32000,64,16,32,381
24 | amdahl,23,16000,32000,64,16,32,381
25 | amdahl,23,16000,64000,64,16,32,749
26 | amdahl,23,32000,64000,128,32,64,1238
27 | apollo,400,1000,3000,0,1,2,23
28 | apollo,400,512,3500,4,1,6,24
29 | basf,60,2000,8000,65,1,8,70
30 | basf,50,4000,16000,65,1,8,117
31 | bti,350,64,64,0,1,4,15
32 | bti,200,512,16000,0,4,32,64
33 | burroughs,167,524,2000,8,4,15,23
34 | burroughs,143,512,5000,0,7,32,29
35 | burroughs,143,1000,2000,0,5,16,22
36 | burroughs,110,5000,5000,142,8,64,124
37 | burroughs,143,1500,6300,0,5,32,35
38 | burroughs,143,3100,6200,0,5,20,39
39 | burroughs,143,2300,6200,0,6,64,40
40 | burroughs,110,3100,6200,0,6,64,45
41 | c.r.d,320,128,6000,0,1,12,28
42 | c.r.d,320,512,2000,4,1,3,21
43 | c.r.d,320,256,6000,0,1,6,28
44 | c.r.d,320,256,3000,4,1,3,22
45 | c.r.d,320,512,5000,4,1,5,28
46 | c.r.d,320,256,5000,4,1,6,27
47 | cdc,25,1310,2620,131,12,24,102
48 | cdc,25,1310,2620,131,12,24,102
49 | cdc,50,2620,10480,30,12,24,74
50 | cdc,50,2620,10480,30,12,24,74
51 | cdc,56,5240,20970,30,12,24,138
52 | cdc,64,5240,20970,30,12,24,136
53 | cdc,50,500,2000,8,1,4,23
54 | cdc,50,1000,4000,8,1,5,29
55 | cdc,50,2000,8000,8,1,5,44
56 | cambex,50,1000,4000,8,3,5,30
57 | cambex,50,1000,8000,8,3,5,41
58 | cambex,50,2000,16000,8,3,5,74
59 | cambex,50,2000,16000,8,3,6,74
60 | cambex,50,2000,16000,8,3,6,74
61 | dec,133,1000,12000,9,3,12,54
62 | dec,133,1000,8000,9,3,12,41
63 | dec,810,512,512,8,1,1,18
64 | dec,810,1000,5000,0,1,1,28
65 | dec,320,512,8000,4,1,5,36
66 | dec,200,512,8000,8,1,8,38
67 | dg,700,384,8000,0,1,1,34
68 | dg,700,256,2000,0,1,1,19
69 | dg,140,1000,16000,16,1,3,72
70 | dg,200,1000,8000,0,1,2,36
71 | dg,110,1000,4000,16,1,2,30
72 | dg,110,1000,12000,16,1,2,56
73 | dg,220,1000,8000,16,1,2,42
74 | formation,800,256,8000,0,1,4,34
75 | formation,800,256,8000,0,1,4,34
76 | formation,800,256,8000,0,1,4,34
77 | formation,800,256,8000,0,1,4,34
78 | formation,800,256,8000,0,1,4,34
79 | four-phase,125,512,1000,0,8,20,19
80 | gould,75,2000,8000,64,1,38,75
81 | gould,75,2000,16000,64,1,38,113
82 | gould,75,2000,16000,128,1,38,157
83 | hp,90,256,1000,0,3,10,18
84 | hp,105,256,2000,0,3,10,20
85 | hp,105,1000,4000,0,3,24,28
86 | hp,105,2000,4000,8,3,19,33
87 | hp,75,2000,8000,8,3,24,47
88 | hp,75,3000,8000,8,3,48,54
89 | hp,175,256,2000,0,3,24,20
90 | harris,300,768,3000,0,6,24,23
91 | harris,300,768,3000,6,6,24,25
92 | harris,300,768,12000,6,6,24,52
93 | harris,300,768,4500,0,1,24,27
94 | harris,300,384,12000,6,1,24,50
95 | harris,300,192,768,6,6,24,18
96 | harris,180,768,12000,6,1,31,53
97 | honeywell,330,1000,3000,0,2,4,23
98 | honeywell,300,1000,4000,8,3,64,30
99 | honeywell,300,1000,16000,8,2,112,73
100 | honeywell,330,1000,2000,0,1,2,20
101 | honeywell,330,1000,4000,0,3,6,25
102 | honeywell,140,2000,4000,0,3,6,28
103 | honeywell,140,2000,4000,0,4,8,29
104 | honeywell,140,2000,4000,8,1,20,32
105 | honeywell,140,2000,32000,32,1,20,175
106 | honeywell,140,2000,8000,32,1,54,57
107 | honeywell,140,2000,32000,32,1,54,181
108 | honeywell,140,2000,32000,32,1,54,181
109 | honeywell,140,2000,4000,8,1,20,32
110 | ibm,57,4000,16000,1,6,12,82
111 | ibm,57,4000,24000,64,12,16,171
112 | ibm,26,16000,32000,64,16,24,361
113 | ibm,26,16000,32000,64,8,24,350
114 | ibm,26,8000,32000,0,8,24,220
115 | ibm,26,8000,16000,0,8,16,113
116 | ibm,480,96,512,0,1,1,15
117 | ibm,203,1000,2000,0,1,5,21
118 | ibm,115,512,6000,16,1,6,35
119 | ibm,1100,512,1500,0,1,1,18
120 | ibm,1100,768,2000,0,1,1,20
121 | ibm,600,768,2000,0,1,1,20
122 | ibm,400,2000,4000,0,1,1,28
123 | ibm,400,4000,8000,0,1,1,45
124 | ibm,900,1000,1000,0,1,2,18
125 | ibm,900,512,1000,0,1,2,17
126 | ibm,900,1000,4000,4,1,2,26
127 | ibm,900,1000,4000,8,1,2,28
128 | ibm,900,2000,4000,0,3,6,28
129 | ibm,225,2000,4000,8,3,6,31
130 | ibm,225,2000,4000,8,3,6,31
131 | ibm,180,2000,8000,8,1,6,42
132 | ibm,185,2000,16000,16,1,6,76
133 | ibm,180,2000,16000,16,1,6,76
134 | ibm,225,1000,4000,2,3,6,26
135 | ibm,25,2000,12000,8,1,4,59
136 | ibm,25,2000,12000,16,3,5,65
137 | ibm,17,4000,16000,8,6,12,101
138 | ibm,17,4000,16000,32,6,12,116
139 | ibm,1500,768,1000,0,0,0,18
140 | ibm,1500,768,2000,0,0,0,20
141 | ibm,800,768,2000,0,0,0,20
142 | ipl,50,2000,4000,0,3,6,30
143 | ipl,50,2000,8000,8,3,6,44
144 | ipl,50,2000,8000,8,1,6,44
145 | ipl,50,2000,16000,24,1,6,82
146 | ipl,50,2000,16000,24,1,6,82
147 | ipl,50,8000,16000,48,1,10,128
148 | magnuson,100,1000,8000,0,2,6,37
149 | magnuson,100,1000,8000,24,2,6,46
150 | magnuson,100,1000,8000,24,3,6,46
151 | magnuson,50,2000,16000,12,3,16,80
152 | magnuson,50,2000,16000,24,6,16,88
153 | magnuson,50,2000,16000,24,6,16,88
154 | microdata,150,512,4000,0,8,128,33
155 | nas,115,2000,8000,16,1,3,46
156 | nas,115,2000,4000,2,1,5,29
157 | nas,92,2000,8000,32,1,6,53
158 | nas,92,2000,8000,32,1,6,53
159 | nas,92,2000,8000,4,1,6,41
160 | nas,75,4000,16000,16,1,6,86
161 | nas,60,4000,16000,32,1,6,95
162 | nas,60,2000,16000,64,5,8,107
163 | nas,60,4000,16000,64,5,8,117
164 | nas,50,4000,16000,64,5,10,119
165 | nas,72,4000,16000,64,8,16,120
166 | nas,72,2000,8000,16,6,8,48
167 | nas,40,8000,16000,32,8,16,126
168 | nas,40,8000,32000,64,8,24,266
169 | nas,35,8000,32000,64,8,24,270
170 | nas,38,16000,32000,128,16,32,426
171 | nas,48,4000,24000,32,8,24,151
172 | nas,38,8000,32000,64,8,24,267
173 | nas,30,16000,32000,256,16,24,603
174 | ncr,112,1000,1000,0,1,4,19
175 | ncr,84,1000,2000,0,1,6,21
176 | ncr,56,1000,4000,0,1,6,26
177 | ncr,56,2000,6000,0,1,8,35
178 | ncr,56,2000,8000,0,1,8,41
179 | ncr,56,4000,8000,0,1,8,47
180 | ncr,56,4000,12000,0,1,8,62
181 | ncr,56,4000,16000,0,1,8,78
182 | ncr,38,4000,8000,32,16,32,80
183 | ncr,38,4000,8000,32,16,32,80
184 | ncr,38,8000,16000,64,4,8,142
185 | ncr,38,8000,24000,160,4,8,281
186 | ncr,38,4000,16000,128,16,32,190
187 | nixdorf,200,1000,2000,0,1,2,21
188 | nixdorf,200,1000,4000,0,1,4,25
189 | nixdorf,200,2000,8000,64,1,5,67
190 | perkin-elmer,250,512,4000,0,1,7,24
191 | perkin-elmer,250,512,4000,0,4,7,24
192 | perkin-elmer,250,1000,16000,1,1,8,64
193 | prime,160,512,4000,2,1,5,25
194 | prime,160,512,2000,2,3,8,20
195 | prime,160,1000,4000,8,1,14,29
196 | prime,160,1000,8000,16,1,14,43
197 | prime,160,2000,8000,32,1,13,53
198 | siemens,240,512,1000,8,1,3,19
199 | siemens,240,512,2000,8,1,5,22
200 | siemens,105,2000,4000,8,3,8,31
201 | siemens,105,2000,6000,16,6,16,41
202 | siemens,105,2000,8000,16,4,14,47
203 | siemens,52,4000,16000,32,4,12,99
204 | siemens,70,4000,12000,8,6,8,67
205 | siemens,59,4000,12000,32,6,12,81
206 | siemens,59,8000,16000,64,12,24,149
207 | siemens,26,8000,24000,32,8,16,183
208 | siemens,26,8000,32000,64,12,16,275
209 | siemens,26,8000,32000,128,24,32,382
210 | sperry,116,2000,8000,32,5,28,56
211 | sperry,50,2000,32000,24,6,26,182
212 | sperry,50,2000,32000,48,26,52,227
213 | sperry,50,2000,32000,112,52,104,341
214 | sperry,50,4000,32000,112,52,104,360
215 | sperry,30,8000,64000,96,12,176,919
216 | sperry,30,8000,64000,128,12,176,978
217 | sperry,180,262,4000,0,1,3,24
218 | sperry,180,512,4000,0,1,3,24
219 | sperry,180,262,4000,0,1,3,24
220 | sperry,180,512,4000,0,1,3,24
221 | sperry,124,1000,8000,0,1,8,37
222 | sperry,98,1000,8000,32,2,8,50
223 | sratus,125,2000,8000,0,2,14,41
224 | wang,480,512,8000,32,0,0,47
225 | wang,480,1000,4000,0,0,0,25
--------------------------------------------------------------------------------
/datasets/datafiles/arff/weather.arff:
--------------------------------------------------------------------------------
1 | @relation weather
2 |
3 | @attribute outlook {sunny, overcast, rainy}
4 | @attribute temperature real
5 | @attribute humidity real
6 | @attribute windy {TRUE, FALSE}
7 | @attribute play {yes, no}
8 |
9 | @data
10 | sunny,85,85,FALSE,no
11 | sunny,80,90,TRUE,no
12 | overcast,83,86,FALSE,yes
13 | rainy,70,96,FALSE,yes
14 | rainy,68,80,FALSE,yes
15 | rainy,65,70,TRUE,no
16 | overcast,64,65,TRUE,yes
17 | sunny,72,95,FALSE,no
18 | sunny,69,70,FALSE,yes
19 | rainy,75,80,FALSE,yes
20 | sunny,75,70,TRUE,yes
21 | overcast,72,90,TRUE,yes
22 | overcast,81,75,FALSE,yes
23 | rainy,71,91,TRUE,no
--------------------------------------------------------------------------------
/datasets/datafiles/arff/weather.nominal.arff:
--------------------------------------------------------------------------------
1 | @relation weather.symbolic
2 |
3 | @attribute outlook {sunny, overcast, rainy}
4 | @attribute temperature {hot, mild, cool}
5 | @attribute humidity {high, normal}
6 | @attribute windy {TRUE, FALSE}
7 | @attribute play {yes, no}
8 |
9 | @data
10 | sunny,hot,high,FALSE,no
11 | sunny,hot,high,TRUE,no
12 | overcast,hot,high,FALSE,yes
13 | rainy,mild,high,FALSE,yes
14 | rainy,cool,normal,FALSE,yes
15 | rainy,cool,normal,TRUE,no
16 | overcast,cool,normal,TRUE,yes
17 | sunny,mild,high,FALSE,no
18 | sunny,cool,normal,FALSE,yes
19 | rainy,mild,normal,FALSE,yes
20 | sunny,mild,normal,TRUE,yes
21 | overcast,mild,high,TRUE,yes
22 | overcast,hot,normal,FALSE,yes
23 | rainy,mild,high,TRUE,no
--------------------------------------------------------------------------------
/datasets/datafiles/iris.data:
--------------------------------------------------------------------------------
1 | 5.1,3.5,1.4,0.2,Iris-setosa
2 | 4.9,3.0,1.4,0.2,Iris-setosa
3 | 4.7,3.2,1.3,0.2,Iris-setosa
4 | 4.6,3.1,1.5,0.2,Iris-setosa
5 | 5.0,3.6,1.4,0.2,Iris-setosa
6 | 5.4,3.9,1.7,0.4,Iris-setosa
7 | 4.6,3.4,1.4,0.3,Iris-setosa
8 | 5.0,3.4,1.5,0.2,Iris-setosa
9 | 4.4,2.9,1.4,0.2,Iris-setosa
10 | 4.9,3.1,1.5,0.1,Iris-setosa
11 | 5.4,3.7,1.5,0.2,Iris-setosa
12 | 4.8,3.4,1.6,0.2,Iris-setosa
13 | 4.8,3.0,1.4,0.1,Iris-setosa
14 | 4.3,3.0,1.1,0.1,Iris-setosa
15 | 5.8,4.0,1.2,0.2,Iris-setosa
16 | 5.7,4.4,1.5,0.4,Iris-setosa
17 | 5.4,3.9,1.3,0.4,Iris-setosa
18 | 5.1,3.5,1.4,0.3,Iris-setosa
19 | 5.7,3.8,1.7,0.3,Iris-setosa
20 | 5.1,3.8,1.5,0.3,Iris-setosa
21 | 5.4,3.4,1.7,0.2,Iris-setosa
22 | 5.1,3.7,1.5,0.4,Iris-setosa
23 | 4.6,3.6,1.0,0.2,Iris-setosa
24 | 5.1,3.3,1.7,0.5,Iris-setosa
25 | 4.8,3.4,1.9,0.2,Iris-setosa
26 | 5.0,3.0,1.6,0.2,Iris-setosa
27 | 5.0,3.4,1.6,0.4,Iris-setosa
28 | 5.2,3.5,1.5,0.2,Iris-setosa
29 | 5.2,3.4,1.4,0.2,Iris-setosa
30 | 4.7,3.2,1.6,0.2,Iris-setosa
31 | 4.8,3.1,1.6,0.2,Iris-setosa
32 | 5.4,3.4,1.5,0.4,Iris-setosa
33 | 5.2,4.1,1.5,0.1,Iris-setosa
34 | 5.5,4.2,1.4,0.2,Iris-setosa
35 | 4.9,3.1,1.5,0.1,Iris-setosa
36 | 5.0,3.2,1.2,0.2,Iris-setosa
37 | 5.5,3.5,1.3,0.2,Iris-setosa
38 | 4.9,3.1,1.5,0.1,Iris-setosa
39 | 4.4,3.0,1.3,0.2,Iris-setosa
40 | 5.1,3.4,1.5,0.2,Iris-setosa
41 | 5.0,3.5,1.3,0.3,Iris-setosa
42 | 4.5,2.3,1.3,0.3,Iris-setosa
43 | 4.4,3.2,1.3,0.2,Iris-setosa
44 | 5.0,3.5,1.6,0.6,Iris-setosa
45 | 5.1,3.8,1.9,0.4,Iris-setosa
46 | 4.8,3.0,1.4,0.3,Iris-setosa
47 | 5.1,3.8,1.6,0.2,Iris-setosa
48 | 4.6,3.2,1.4,0.2,Iris-setosa
49 | 5.3,3.7,1.5,0.2,Iris-setosa
50 | 5.0,3.3,1.4,0.2,Iris-setosa
51 | 7.0,3.2,4.7,1.4,Iris-versicolor
52 | 6.4,3.2,4.5,1.5,Iris-versicolor
53 | 6.9,3.1,4.9,1.5,Iris-versicolor
54 | 5.5,2.3,4.0,1.3,Iris-versicolor
55 | 6.5,2.8,4.6,1.5,Iris-versicolor
56 | 5.7,2.8,4.5,1.3,Iris-versicolor
57 | 6.3,3.3,4.7,1.6,Iris-versicolor
58 | 4.9,2.4,3.3,1.0,Iris-versicolor
59 | 6.6,2.9,4.6,1.3,Iris-versicolor
60 | 5.2,2.7,3.9,1.4,Iris-versicolor
61 | 5.0,2.0,3.5,1.0,Iris-versicolor
62 | 5.9,3.0,4.2,1.5,Iris-versicolor
63 | 6.0,2.2,4.0,1.0,Iris-versicolor
64 | 6.1,2.9,4.7,1.4,Iris-versicolor
65 | 5.6,2.9,3.6,1.3,Iris-versicolor
66 | 6.7,3.1,4.4,1.4,Iris-versicolor
67 | 5.6,3.0,4.5,1.5,Iris-versicolor
68 | 5.8,2.7,4.1,1.0,Iris-versicolor
69 | 6.2,2.2,4.5,1.5,Iris-versicolor
70 | 5.6,2.5,3.9,1.1,Iris-versicolor
71 | 5.9,3.2,4.8,1.8,Iris-versicolor
72 | 6.1,2.8,4.0,1.3,Iris-versicolor
73 | 6.3,2.5,4.9,1.5,Iris-versicolor
74 | 6.1,2.8,4.7,1.2,Iris-versicolor
75 | 6.4,2.9,4.3,1.3,Iris-versicolor
76 | 6.6,3.0,4.4,1.4,Iris-versicolor
77 | 6.8,2.8,4.8,1.4,Iris-versicolor
78 | 6.7,3.0,5.0,1.7,Iris-versicolor
79 | 6.0,2.9,4.5,1.5,Iris-versicolor
80 | 5.7,2.6,3.5,1.0,Iris-versicolor
81 | 5.5,2.4,3.8,1.1,Iris-versicolor
82 | 5.5,2.4,3.7,1.0,Iris-versicolor
83 | 5.8,2.7,3.9,1.2,Iris-versicolor
84 | 6.0,2.7,5.1,1.6,Iris-versicolor
85 | 5.4,3.0,4.5,1.5,Iris-versicolor
86 | 6.0,3.4,4.5,1.6,Iris-versicolor
87 | 6.7,3.1,4.7,1.5,Iris-versicolor
88 | 6.3,2.3,4.4,1.3,Iris-versicolor
89 | 5.6,3.0,4.1,1.3,Iris-versicolor
90 | 5.5,2.5,4.0,1.3,Iris-versicolor
91 | 5.5,2.6,4.4,1.2,Iris-versicolor
92 | 6.1,3.0,4.6,1.4,Iris-versicolor
93 | 5.8,2.6,4.0,1.2,Iris-versicolor
94 | 5.0,2.3,3.3,1.0,Iris-versicolor
95 | 5.6,2.7,4.2,1.3,Iris-versicolor
96 | 5.7,3.0,4.2,1.2,Iris-versicolor
97 | 5.7,2.9,4.2,1.3,Iris-versicolor
98 | 6.2,2.9,4.3,1.3,Iris-versicolor
99 | 5.1,2.5,3.0,1.1,Iris-versicolor
100 | 5.7,2.8,4.1,1.3,Iris-versicolor
101 | 6.3,3.3,6.0,2.5,Iris-virginica
102 | 5.8,2.7,5.1,1.9,Iris-virginica
103 | 7.1,3.0,5.9,2.1,Iris-virginica
104 | 6.3,2.9,5.6,1.8,Iris-virginica
105 | 6.5,3.0,5.8,2.2,Iris-virginica
106 | 7.6,3.0,6.6,2.1,Iris-virginica
107 | 4.9,2.5,4.5,1.7,Iris-virginica
108 | 7.3,2.9,6.3,1.8,Iris-virginica
109 | 6.7,2.5,5.8,1.8,Iris-virginica
110 | 7.2,3.6,6.1,2.5,Iris-virginica
111 | 6.5,3.2,5.1,2.0,Iris-virginica
112 | 6.4,2.7,5.3,1.9,Iris-virginica
113 | 6.8,3.0,5.5,2.1,Iris-virginica
114 | 5.7,2.5,5.0,2.0,Iris-virginica
115 | 5.8,2.8,5.1,2.4,Iris-virginica
116 | 6.4,3.2,5.3,2.3,Iris-virginica
117 | 6.5,3.0,5.5,1.8,Iris-virginica
118 | 7.7,3.8,6.7,2.2,Iris-virginica
119 | 7.7,2.6,6.9,2.3,Iris-virginica
120 | 6.0,2.2,5.0,1.5,Iris-virginica
121 | 6.9,3.2,5.7,2.3,Iris-virginica
122 | 5.6,2.8,4.9,2.0,Iris-virginica
123 | 7.7,2.8,6.7,2.0,Iris-virginica
124 | 6.3,2.7,4.9,1.8,Iris-virginica
125 | 6.7,3.3,5.7,2.1,Iris-virginica
126 | 7.2,3.2,6.0,1.8,Iris-virginica
127 | 6.2,2.8,4.8,1.8,Iris-virginica
128 | 6.1,3.0,4.9,1.8,Iris-virginica
129 | 6.4,2.8,5.6,2.1,Iris-virginica
130 | 7.2,3.0,5.8,1.6,Iris-virginica
131 | 7.4,2.8,6.1,1.9,Iris-virginica
132 | 7.9,3.8,6.4,2.0,Iris-virginica
133 | 6.4,2.8,5.6,2.2,Iris-virginica
134 | 6.3,2.8,5.1,1.5,Iris-virginica
135 | 6.1,2.6,5.6,1.4,Iris-virginica
136 | 7.7,3.0,6.1,2.3,Iris-virginica
137 | 6.3,3.4,5.6,2.4,Iris-virginica
138 | 6.4,3.1,5.5,1.8,Iris-virginica
139 | 6.0,3.0,4.8,1.8,Iris-virginica
140 | 6.9,3.1,5.4,2.1,Iris-virginica
141 | 6.7,3.1,5.6,2.4,Iris-virginica
142 | 6.9,3.1,5.1,2.3,Iris-virginica
143 | 5.8,2.7,5.1,1.9,Iris-virginica
144 | 6.8,3.2,5.9,2.3,Iris-virginica
145 | 6.7,3.3,5.7,2.5,Iris-virginica
146 | 6.7,3.0,5.2,2.3,Iris-virginica
147 | 6.3,2.5,5.0,1.9,Iris-virginica
148 | 6.5,3.0,5.2,2.0,Iris-virginica
149 | 6.2,3.4,5.4,2.3,Iris-virginica
150 | 5.9,3.0,5.1,1.8,Iris-virginica
151 |
152 |
--------------------------------------------------------------------------------
/datasets/datafiles/michelson.json:
--------------------------------------------------------------------------------
1 | [[299.85,299.74,299.9,300.07,299.93,299.85,299.95,299.98,299.98,299.88,300,299.98,299.93,299.65,299.76,299.81,300,300,299.96,299.96],[299.96,299.94,299.96,299.94,299.88,299.8,299.85,299.88,299.9,299.84,299.83,299.79,299.81,299.88,299.88,299.83,299.8,299.79,299.76,299.8],[299.88,299.88,299.88,299.86,299.72,299.72,299.62,299.86,299.97,299.95,299.88,299.91,299.85,299.87,299.84,299.84,299.85,299.84,299.84,299.84],[299.89,299.81,299.81,299.82,299.8,299.77,299.76,299.74,299.75,299.76,299.91,299.92,299.89,299.86,299.88,299.72,299.84,299.85,299.85,299.78],[299.89,299.84,299.78,299.81,299.76,299.81,299.79,299.81,299.82,299.85,299.87,299.87,299.81,299.74,299.81,299.94,299.95,299.8,299.81,299.87]]
2 |
--------------------------------------------------------------------------------
/datasets/datafiles/netflix/movies/movie_titles.txt:
--------------------------------------------------------------------------------
1 | 597,1977,The Brady Bunch Variety Hour
2 | 598,1973,Bobby Darin: Mack is Back
3 |
--------------------------------------------------------------------------------
/datasets/datafiles/netflix/test/qualifying.txt:
--------------------------------------------------------------------------------
1 | 1:
2 | 1046323,2005-12-19
3 | 1080030,2005-12-23
4 | 2127527,2005-12-04
5 | 1944918,2005-10-05
6 | 1057066,2005-11-07
7 | 954049,2005-12-20
8 | 10:
9 | 12868,2004-10-19
10 | 627923,2005-12-16
11 | 690763,2005-12-13
12 |
--------------------------------------------------------------------------------
/datasets/datafiles/netflix/training/mv_0000001.txt:
--------------------------------------------------------------------------------
1 | 1:
2 | 1488844,3,2005-09-06
3 | 822109,5,2005-05-13
4 | 885013,4,2005-10-19
5 | 30878,4,2005-12-26
6 | 823519,3,2004-05-03
7 |
--------------------------------------------------------------------------------
/datasets/datafiles/nightingale.json:
--------------------------------------------------------------------------------
1 | [{"date":"1854-04-01T07:00:00.000Z","army_size":8571,"disease":1,"wounds":0,"other":5},{"date":"1854-05-01T07:00:00.000Z","army_size":23333,"disease":12,"wounds":0,"other":9},{"date":"1854-06-01T07:00:00.000Z","army_size":28333,"disease":11,"wounds":0,"other":6},{"date":"1854-07-01T07:00:00.000Z","army_size":28722,"disease":359,"wounds":0,"other":23},{"date":"1854-08-01T07:00:00.000Z","army_size":30246,"disease":828,"wounds":1,"other":30},{"date":"1854-09-01T07:00:00.000Z","army_size":30290,"disease":788,"wounds":81,"other":70},{"date":"1854-10-01T07:00:00.000Z","army_size":30643,"disease":503,"wounds":132,"other":128},{"date":"1854-11-01T07:00:00.000Z","army_size":29736,"disease":844,"wounds":287,"other":106},{"date":"1854-12-01T08:00:00.000Z","army_size":32779,"disease":1725,"wounds":114,"other":131},{"date":"1855-01-01T08:00:00.000Z","army_size":32393,"disease":2761,"wounds":83,"other":324},{"date":"1855-02-01T08:00:00.000Z","army_size":30919,"disease":2120,"wounds":42,"other":361},{"date":"1855-03-01T08:00:00.000Z","army_size":30107,"disease":1205,"wounds":32,"other":172},{"date":"1855-04-01T07:00:00.000Z","army_size":32252,"disease":477,"wounds":48,"other":57},{"date":"1855-05-01T07:00:00.000Z","army_size":35473,"disease":508,"wounds":49,"other":37},{"date":"1855-06-01T07:00:00.000Z","army_size":38863,"disease":802,"wounds":209,"other":31},{"date":"1855-07-01T07:00:00.000Z","army_size":42647,"disease":382,"wounds":134,"other":33},{"date":"1855-08-01T07:00:00.000Z","army_size":44614,"disease":483,"wounds":164,"other":25},{"date":"1855-09-01T07:00:00.000Z","army_size":47751,"disease":189,"wounds":276,"other":20},{"date":"1855-10-01T07:00:00.000Z","army_size":46852,"disease":128,"wounds":53,"other":18},{"date":"1855-11-01T07:00:00.000Z","army_size":37853,"disease":178,"wounds":33,"other":32},{"date":"1855-12-01T08:00:00.000Z","army_size":43217,"disease":91,"wounds":18,"other":28},{"date":"1856-01-01T08:00:00.000Z","army_size":44212,"disease":42,"wounds":2,"oth
er":48},{"date":"1856-02-01T08:00:00.000Z","army_size":43485,"disease":24,"wounds":0,"other":19},{"date":"1856-03-01T08:00:00.000Z","army_size":46140,"disease":15,"wounds":0,"other":35}]
2 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Dataloader.hs:
--------------------------------------------------------------------------------
1 | -------------------------------------------------------------------------------
2 | -- |
3 | -- Module : Numeric.Dataloader
4 | -- Stability : experimental
5 | -- Portability: non-portable
6 | --
7 | -- A Dataloader is an extension of a Dataset and is primarily intended for
8 | -- compute-intensive, batch loading interfaces. When used with ImageFolder
9 | -- representations of Datasets, it shuffles the order of files to be loaded
10 | -- and leverages the async library when possible.
11 | --
12 | -- Concurrent loading primarily takes place in 'batchStream'. 'stream' exists
13 | -- primarily to provide a unified API with training that is not batch-oriented.
14 | -------------------------------------------------------------------------------
15 | {-# LANGUAGE ScopedTypeVariables #-}
16 | {-# OPTIONS_GHC -Wno-unused-top-binds -Wno-name-shadowing #-}
17 | module Numeric.Dataloader
18 | ( Dataloader(..)
19 | , uniformIxline
20 |
21 | , stream
22 | , batchStream
23 | ) where
24 |
25 | import Control.Monad ((>=>))
26 | import Control.Monad.IO.Class (MonadIO, liftIO)
27 | import Data.Vector (Vector)
28 | import Streaming (Stream, Of(..))
29 | import System.Random.MWC (GenIO)
30 | import qualified Data.Vector as V
31 | import qualified Streaming as S
32 | import qualified Streaming.Prelude as S
33 | import qualified System.Random.MWC.Distributions as MWC
34 | import Control.Exception.Safe (MonadThrow)
35 | import Streaming.Instances ()
36 |
37 | import Control.Parallel.Strategies
38 |
39 | import Numeric.Datasets
40 |
41 | -- * Configuring data loaders
42 |
43 |
-- | Options for data loading functions.
data Dataloader a b = Dataloader
  { batchSize :: Int -- ^ Batch size used with 'batchStream'.
  , shuffle :: Maybe (Vector Int) -- ^ Optional shuffle order (forces the dataset to be loaded in memory if it wasn't already).
  , dataset :: Dataset a -- ^ Dataset associated with the dataloader.
  , transform :: a -> b -- ^ Transformation associated with the dataloader which will be run in parallel. If using an
  -- ImageFolder, this is where you would transform image filepaths to an image (or other
  -- compute-optimized form). Additionally, this is where you should perform any
  -- static normalization.
  }
54 |
55 |
-- | Produce a uniformly random permutation of the dataset's row indices,
-- suitable for use as the 'shuffle' field of a 'Dataloader'. Loads the
-- dataset to learn its length.
uniformIxline
  :: Dataset a
  -> GenIO
  -> IO (Vector Int)
uniformIxline ds gen =
  getDatavec ds >>= \rows -> MWC.uniformPermutation (V.length rows) gen
64 |
65 | -- * Data loading functions
66 |
67 |
-- | Stream a dataset in-memory, applying the dataloader's 'transform' to
-- every element. Each result is sparked with 'rpar' so it may be evaluated
-- in parallel with downstream consumption.
stream
  :: (MonadThrow io, MonadIO io)
  => Dataloader a b
  -> Stream (Of b) io ()
stream dl = S.maps applyTransform (sourceStream dl)
  where
    -- Transform one element, sparking the evaluation of the result.
    applyTransform (a :> rest) = (transform dl a `using` rpar) :> rest
74 |
75 |
-- | Stream batches of a dataset, concurrently processing each element.
--
-- NOTE: Run with @-threaded -rtsopts@ to concurrently load data in-memory.
batchStream
  :: (MonadThrow io, MonadIO io, NFData b)
  => Dataloader a b
  -> Stream (Of [b]) io ()
batchStream dl =
    S.mapsM (\chunk -> S.toList chunk >>= liftIO . firstOfM transformBatch)
      (S.chunksOf (batchSize dl) (sourceStream dl))
  where
    -- Map the dataloader's transform over one batch, forcing every result
    -- to normal form in parallel.
    transformBatch xs = map (transform dl) xs `usingIO` parList rdeepseq
89 |
90 |
91 | -- * helper functions (not for export)
92 |
93 |
-- | Stream a dataset in-memory, in the dataloader's shuffle order if one
-- was provided.
sourceStream
  :: (MonadThrow io, MonadIO io)
  => Dataloader a b
  -> Stream (Of a) io ()
sourceStream loader = do
    rows <- getDatavec (dataset loader)
    S.each (reorder rows)
  where
    -- Apply the loader's shuffle order, if any; otherwise keep the
    -- dataset's natural order.
    reorder :: Vector a -> Vector a
    reorder va = case shuffle loader of
      Nothing -> va
      Just ix -> V.backpermute va ix
107 |
108 |
-- | Monadic, concrete version of Control.Arrow.first for @Of@: run the
-- action on the first component and keep the second unchanged.
firstOfM :: Monad m => (a -> m b) -> Of a c -> m (Of b c)
firstOfM fm (a :> c) = fmap (:> c) (fm a)
114 |
115 |
116 |
117 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Abalone.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 |
3 | {-|
4 |
5 | Abalone data set
6 |
7 | UCI ML Repository link
8 |
9 | -}
10 |
11 | module Numeric.Datasets.Abalone where
12 |
13 | import Numeric.Datasets
14 |
15 | import Data.Csv
16 | import GHC.Generics
17 | import Network.HTTP.Req ((/:), http, Scheme(..))
18 |
-- | Sex of an abalone, as the single-letter code used in the raw file
-- (M, F, or I).
data Sex = M | F | I
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

-- | Parsed from the one-letter CSV field via the derived 'Read' instance.
instance FromField Sex where
  parseField = parseReadField
24 |
-- | One record of the UCI Abalone data set; fields appear in the same
-- order as the columns of @abalone.data@.
data Abalone = Abalone
  { sex :: Sex
  , abaloneLength :: Double
  , diameter :: Double
  , height :: Double
  , wholeWeight :: Double
  , shuckedWeight :: Double
  , visceraWeight :: Double
  , shellWeight :: Double
  , rings :: Int
  } deriving (Show, Read, Generic)

-- | Column order matches the field order above, so the generically
-- derived positional parser suffices.
instance FromRecord Abalone
38 |
-- | The Abalone data set, fetched as CSV from the UCI ML database.
abalone :: Dataset Abalone
abalone = csvDataset $ URL $ uciMLDB /: "abalone" /: "abalone.data"
41 |
42 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Adult.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 |
3 | {-|
4 |
5 | Adult (AKA Census Income) dataset.
6 |
7 | UCI ML Repository link
8 |
9 | -}
10 |
11 | module Numeric.Datasets.Adult where
12 |
13 | import Numeric.Datasets
14 |
15 | import Data.Csv
16 | import GHC.Generics
17 | import Control.Applicative
18 | import Data.Text (Text, strip)
19 | import Network.HTTP.Req ((/:), Scheme(..))
20 |
-- | Employment class; raw dash-separated labels (e.g. @Self-emp-not-inc@)
-- are converted to these CamelCase constructors by 'parseDashToCamelField'.
data WorkClass = Private | SelfEmpNotInc | SelfEmpInc | FederalGov
               | LocalGov | StateGov | WithoutPay | NeverWorked
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField WorkClass where
  parseField = parseDashToCamelField
28 |
-- | Marital status as recorded in the census file; parsed from the
-- dash-separated raw labels.
data MaritalStatus = MarriedCivSpouse | Divorced | NeverMarried
                   | Separated | Widowed | MarriedSpouseAbsent | MarriedAFSpouse
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField MaritalStatus where
  -- parseField "Married-AF-spouse" = pure MarriedAFSpouse
  parseField = parseDashToCamelField
36 |
-- | Occupation category; parsed from the dash-separated raw labels.
data Occupation = TechSupport | CraftRepair | OtherService | Sales | ExecManagerial | ProfSpecialty
                | HandlersCleaners | MachineOpInspct | AdmClerical | FarmingFishing | TransportMoving
                | PrivHouseServ | ProtectiveServ | ArmedForces
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField Occupation where
  parseField = parseDashToCamelField
44 |
-- | Household relationship; parsed from the dash-separated raw labels.
data Relationship = Wife | OwnChild | Husband | NotInFamily | OtherRelative | Unmarried
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField Relationship where
  parseField = parseDashToCamelField
50 |
-- | Race category as used by the census file; parsed from the
-- dash-separated raw labels.
data Race = White | AsianPacIslander | AmerIndianEskimo | Other | Black
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField Race where
  parseField = parseDashToCamelField
56 |
-- | Sex as recorded in the census file.
data Sex = Female | Male
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField Sex where
  parseField = parseDashToCamelField
62 |
-- | Income bracket: more than \$50K\/yr ('GT50K') or at most \$50K\/yr
-- ('LE50K').
data Income = GT50K | LE50K
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

-- | The raw files write the label with a leading space and sometimes a
-- trailing period, so every observed spelling is accepted explicitly.
instance FromField Income where
  parseField " >50K" = pure GT50K
  parseField " <=50K" = pure LE50K
  parseField " >50K." = pure GT50K
  parseField " <=50K." = pure LE50K
  parseField ">50K" = pure GT50K
  parseField "<=50K" = pure LE50K
  parseField _ = fail "unknown income"
74 |
-- | One record of the Adult (Census Income) data set.
data Adult = Adult
  { age :: Int
  , workClass :: Maybe WorkClass -- ^ 'Nothing' when the raw field fails to parse (e.g. missing values)
  , finalWeight :: Int
  , education :: Text
  , educationNum :: Int
  , maritalStatus :: MaritalStatus
  , occupation :: Maybe Occupation -- ^ 'Nothing' when the raw field fails to parse (e.g. missing values)
  , relationship :: Relationship
  , race :: Race
  , sex :: Sex
  , capitalGain :: Int
  , capitalLoss :: Int
  , hoursPerWeek :: Int
  , nativeCountry :: Text
  , income :: Income
  } deriving (Show, Read, Generic)

-- | Fields are read positionally. 'workClass' (index 1) and 'occupation'
-- (index 6) fall back to 'Nothing' instead of failing the whole row, and
-- 'education' (index 3) is whitespace-stripped.
instance FromRecord Adult where
  parseRecord v = Adult <$> v .! 0 <*> (v.! 1 <|> return Nothing) <*> v.!2 <*> (strip <$> v.!3)
                        <*> v.!4 <*> v.!5 <*> (v.!6 <|> return Nothing) <*> v.!7 <*> v.!8
                        <*> v.!9 <*> v.!10 <*> v.!11 <*> v.!12 <*> v.!13 <*> v.!14
97 |
-- | The Adult training set, fetched from the UCI ML database.
adult :: Dataset Adult
adult = csvDataset $ URL $ uciMLDB /: "adult" /: "adult.data"
100 |
-- | The Adult test set (hosted at UMass); the first line of the raw file
-- is dropped before CSV parsing.
adultTestSet :: Dataset Adult
adultTestSet = withPreprocess (dropLines 1) $ csvDataset $ URL $ umassMLDB /: "adult" /: "adult.test"
103 |
104 | -- "http://mlr.cs.umass.edu/ml/machine-learning-databases/adult/adult.test"
105 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Anscombe.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings #-}
2 |
3 | {-|
4 |
5 | Anscombe's quartet
6 |
7 | Four datasets with nearly identical statistical properties
8 |
9 | Wikipedia article:
10 |
11 | -}
12 |
13 | module Numeric.Datasets.Anscombe where
14 |
-- | Anscombe's quartet: the four data sets in order. Each member has
-- nearly identical summary statistics despite very different shapes.
anscombe :: [[(Double,Double)]]
anscombe = [anscombe1, anscombe2, anscombe3, anscombe4]
17 |
-- | The four member data sets of the quartet, each a list of (x, y) points.
anscombe1, anscombe2, anscombe3, anscombe4 :: [(Double, Double)]
anscombe1 = [
  (10,8.04),
  (8,6.95),
  (13,7.58),
  (9,8.81),
  (11,8.33),
  (14,9.96),
  (6,7.24),
  (4,4.26),
  (12,10.84),
  (7,4.82),
  (5,5.68)
  ]
32 |
-- Second data set: same x values as anscombe1, curvilinear relationship.
anscombe2 = [
  (10,9.14),
  (8,8.14),
  (13,8.74),
  (9,8.77),
  (11,9.26),
  (14,8.1),
  (6,6.13),
  (4,3.1),
  (12,9.13),
  (7,7.26),
  (5,4.74)
  ]
46 |
-- Third data set: linear except for a single outlier at (13, 12.74).
anscombe3 = [
  (10,7.46),
  (8,6.77),
  (13,12.74),
  (9,7.11),
  (11,7.81),
  (14,8.84),
  (6,6.08),
  (4,5.39),
  (12,8.15),
  (7,6.42),
  (5,5.73)
  ]
60 |
-- Fourth data set: constant x = 8 apart from a single point at x = 19.
anscombe4 = [
  (8,6.58),
  (8,5.76),
  (8,7.71),
  (8,8.84),
  (8,8.47),
  (8,7.04),
  (8,5.25),
  (19,12.5),
  (8,5.56),
  (8,7.91),
  (8,6.89)
  ]
74 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/BostonHousing.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-}
3 |
4 | {-|
5 |
6 | BostonHousing Data set
7 |
8 | scikit-learn calls this "boston" and UCI calls it "Housing"
9 |
10 | UCI ML Repository link
11 |
12 | -}
13 |
14 | module Numeric.Datasets.BostonHousing where
15 |
16 | import Numeric.Datasets
17 |
18 | import Data.Csv
19 | import GHC.Generics
20 | import Control.Applicative
21 | import Network.HTTP.Req ((/:), http, Scheme(..))
22 |
23 |
-- | One row of the UCI \"Housing\" (Boston housing) data set; fields are
-- in the order of the columns of @housing.data@.
data BostonHousing = BostonHousing
  { crimeRate :: Double
  , zoned :: Double
  , industrial :: Double
  , charlesRiver :: Bool -- ^ decoded from a 0\/1 integer column
  , nitricOxides :: Double
  , rooms :: Double
  , age :: Double
  , distance :: Double
  , radialHwy :: Double
  , tax :: Double
  , ptRatio :: Double
  , b :: Double
  , lowerStatus :: Double
  , medianValue :: Double
  } deriving (Show, Read, Generic)
40 |
-- | Parse one row positionally. Column 3 (the Charles-River indicator) is
-- encoded 0\/1 in the raw file and decoded to 'Bool'. A malformed value
-- now fails the 'Parser' monadically with a descriptive message instead
-- of crashing the whole program via the partial 'error' helper the
-- original used.
instance FromRecord BostonHousing where
  parseRecord v = BostonHousing <$>
      v .! 0 <*>
      v .! 1 <*>
      v .! 2 <*>
      ((v .! 3) >>= intToBool) <*>
      v .! 4 <*>
      v .! 5 <*>
      v .! 6 <*>
      v .! 7 <*>
      v .! 8 <*>
      v .! 9 <*>
      v .! 10 <*>
      v .! 11 <*>
      v .! 12 <*>
      v .! 13
    where
      -- Decode the 0/1 indicator; anything else is a malformed row.
      intToBool :: Int -> Parser Bool
      intToBool 0 = pure False
      intToBool 1 = pure True
      intToBool n = fail ("charlesRiver: expected 0 or 1, got " ++ show n)
61 |
-- | The Boston housing data set from the UCI ML database; the raw file is
-- fixed-width, so it is converted to CSV before parsing.
bostonHousing :: Dataset BostonHousing
bostonHousing = withPreprocess fixedWidthToCSV $
  csvDataset $ URL $ uciMLDB /: "housing" /: "housing.data"
65 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/BreastCancerWisconsin.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 |
3 | {-|
4 |
5 | Breast Cancer Wisconsin (Diagnostic) Data Set
6 |
7 | Repository link:
8 |
9 | -}
10 |
11 | module Numeric.Datasets.BreastCancerWisconsin where
12 |
13 | import Numeric.Datasets
14 |
15 | import Data.Csv
16 | import GHC.Generics
17 | import Control.Applicative
18 | import Network.HTTP.Req ((/:), Scheme(..))
19 |
-- | Tumour diagnosis label used by the diagnostic data sets.
data Diagnosis = Malignant | Benign deriving (Show, Read, Eq, Generic, Bounded, Enum)

-- | Outcome label used by the prognostic data set.
data Prognosis = Recurrent | Nonrecurrent deriving (Show, Read, Eq, Generic, Bounded, Enum)
23 |
-- | Decode the numeric class column of the original breast-cancer
-- database: 2 = benign, 4 = malignant. Partial: any other code raises
-- an error.
intToDiagnosis :: Int -> Diagnosis
intToDiagnosis n = case n of
  2 -> Benign
  4 -> Malignant
  _ -> error "unknown diagnosis code"
28 |
-- | One row of the original @breast-cancer-wisconsin.data@ file.
data BreastCancerEntry = BreastCancerEntry
  { sampleCodeNumber :: Int
  , clumpThickness :: Int
  , uniformityCellSize :: Int
  , uniformityCellShape :: Int
  , marginalAdhesion :: Int
  , singleEpithelialCellSize :: Int
  , bareNuclei :: Maybe Int -- ^ 'Nothing' when the raw field fails to parse (missing values)
  , blandChromatin :: Int
  , normalNucleoli :: Int
  , mitosis :: Int
  , sampleClass :: Diagnosis
  } deriving (Show, Read, Generic)
42 |
-- | Columns follow the field order of 'BreastCancerEntry'. The
-- bare-nuclei column (index 6) falls back to 'Nothing' when unparsable,
-- and the final class code is decoded with 'intToDiagnosis'.
instance FromRecord BreastCancerEntry where
  parseRecord v =
    BreastCancerEntry
      <$> v .! 0
      <*> v .! 1
      <*> v .! 2
      <*> v .! 3
      <*> v .! 4
      <*> v .! 5
      <*> (v .! 6 <|> return Nothing)
      <*> v .! 7
      <*> v .! 8
      <*> v .! 9
      <*> (intToDiagnosis <$> v .! 10)
45 |
-- | The original breast-cancer database, fetched from the UMass mirror.
breastCancerDatabase :: Dataset BreastCancerEntry
breastCancerDatabase = csvDataset
  $ URL $ umassMLDB /: "breast-cancer-wisconsin" /: "breast-cancer-wisconsin.data"
49 |
-- | One record of the diagnostic (WDBC) dataset: an id, the diagnosis,
-- and cell-nucleus features parsed from the remaining columns.
data DiagnosticBreastCancer = DiagnosticBreastCancer
  { diagnosticID :: Int
  , diagnosis :: Diagnosis
  , diagnosticCells :: CellFeatures
  } deriving (Show, Read, Generic)

-- | One record of the prognostic (WPBC) dataset.
data PrognosticBreastCancer = PrognosticBreastCancer
  { prognosticID :: Int
  , prognosis :: Prognosis
  , prognosticCells :: CellFeatures
  } deriving (Show, Read, Generic)

-- | Real-valued cell-nucleus features shared by the WDBC/WPBC records.
-- NOTE(review): the UCI files describe ten features per group (with
-- "texture" between "radius" and "perimeter"); this record has nine fields,
-- so the positional FromRecord instance below may be mis-aligned with the
-- source columns - verify against the .names file.
data CellFeatures = CellFeatures
  { radius :: Double
  , perimeter :: Double
  , area :: Double
  , smoothness :: Double
  , compactness :: Double
  , concavity :: Double
  , concavePoints :: Double
  , symmetry :: Double
  , fractalDimension :: Double
  } deriving (Show, Read, Generic)
73 |
-- | Decode the one-letter diagnosis code used by the WDBC file
-- ("M" = malignant, "B" = benign). The unexpected value is now included
-- in the error message to aid debugging.
charToDiagnosis :: String -> Diagnosis
charToDiagnosis "M" = Malignant
charToDiagnosis "B" = Benign
charToDiagnosis s = error ("unknown diagnosis: " ++ s)
78 |
-- | Decode the one-letter prognosis code used by the WPBC file
-- ("N" = nonrecurrent, "R" = recurrent).
-- Fixes a copy-paste slip: the fallback error previously said
-- "unknown diagnosis" although this decoder handles the prognosis field;
-- the unexpected value is also included now.
charToPrognosis :: String -> Prognosis
charToPrognosis "N" = Nonrecurrent
charToPrognosis "R" = Recurrent
charToPrognosis s = error ("unknown prognosis: " ++ s)
83 |
-- ID in column 0, one-letter code in column 1, features decoded from the
-- same row by the CellFeatures instance below.
instance FromRecord DiagnosticBreastCancer where
  parseRecord v = DiagnosticBreastCancer <$> v .! 0 <*> (charToDiagnosis <$> v .! 1) <*> parseRecord v

instance FromRecord PrognosticBreastCancer where
  parseRecord v = PrognosticBreastCancer <$> v .! 0 <*> (charToPrognosis <$> v .! 1) <*> parseRecord v

-- Reads the nine feature fields positionally from columns 2-10.
-- NOTE(review): WDBC rows contain 30 feature columns (mean/SE/worst groups);
-- only columns 2-10 are consumed here - confirm the intended subset.
instance FromRecord CellFeatures where
  parseRecord v = CellFeatures <$> v .! 2 <*> v .! 3 <*> v .! 4 <*> v .! 5 <*> v .! 6 <*> v .! 7 <*> v .! 8 <*> v .! 9 <*> v .! 10

-- | The diagnostic (WDBC) breast cancer dataset.
diagnosticBreastCancer :: Dataset DiagnosticBreastCancer
diagnosticBreastCancer = csvDataset
  $ URL $ umassMLDB /: "breast-cancer-wisconsin" /: "wdbc.data"

-- | The prognostic (WPBC) breast cancer dataset.
prognosticBreastCancer :: Dataset PrognosticBreastCancer
prognosticBreastCancer = csvDataset
  $ URL $ umassMLDB /: "breast-cancer-wisconsin" /: "wpbc.data"
100 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/CIFAR10.hs:
--------------------------------------------------------------------------------
1 | -------------------------------------------------------------------------------
2 | -- |
3 | -- Module : Numeric.Datasets.CIFAR10
4 | -- License : BSD-3-Clause
5 | -- Stability : experimental
6 | -- Portability: non-portable
7 | --
8 | -- The binary version contains the files data_batch_1.bin, data_batch_2.bin,
9 | -- ..., data_batch_5.bin, as well as test_batch.bin. Each of these files is
10 | -- formatted as follows:
11 | --
12 | -- <1 x label><3072 x pixel>
13 | -- ...
14 | -- <1 x label><3072 x pixel>
15 | --
16 | -- In other words, the first byte is the label of the first image, which is a
17 | -- number in the range 0-9. The next 3072 bytes are the values of the pixels of
18 | -- the image. The first 1024 bytes are the red channel values, the next 1024
19 | -- the green, and the final 1024 the blue. The values are stored in row-major
20 | -- order, so the first 32 bytes are the red channel values of the first row of
21 | -- the image.
22 | -------------------------------------------------------------------------------
23 | {-# LANGUAGE CPP #-}
24 | {-# LANGUAGE DataKinds #-}
25 | {-# LANGUAGE DeriveAnyClass #-}
26 | {-# LANGUAGE DeriveGeneric #-}
27 | #if MIN_VERSION_GLASGOW_HASKELL(8,2,0,0)
28 | {-# LANGUAGE DerivingStrategies #-}
29 | #endif
30 | {-# LANGUAGE GeneralizedNewtypeDeriving #-}
31 | {-# LANGUAGE OverloadedStrings #-}
32 | {-# LANGUAGE ScopedTypeVariables #-}
33 | {-# LANGUAGE TupleSections #-}
34 | #if MIN_VERSION_JuicyPixels(3,3,0)
35 | #else
36 | {-# LANGUAGE UndecidableInstances #-}
37 | #endif
38 | module Numeric.Datasets.CIFAR10
39 | ( Label(..)
40 | , CIFARImage(..), height, width, image, label
41 | , cifarURL
42 | , cifar10
43 | , parseCifar
44 | ) where
45 |
46 | import Codec.Picture (Image, PixelRGB8(PixelRGB8), Pixel8, writePixel)
47 | import Codec.Picture.Types (newMutableImage, freezeImage)
48 | import Control.DeepSeq
49 | import Control.Exception (throw)
50 | import Control.Monad.ST (runST)
51 | import Data.List (zipWith4)
52 | import GHC.Generics (Generic)
53 | import Network.HTTP.Req (Url, (/:), https, Scheme(..))
54 | import qualified Codec.Archive.Tar as Tar
55 | import qualified Codec.Compression.GZip as GZip
56 | import qualified Data.Attoparsec.ByteString.Lazy as Atto
57 | import qualified Data.ByteString as BS
58 | import qualified Data.ByteString.Lazy as BL
59 |
60 | import Numeric.Datasets
61 |
62 | #if MIN_VERSION_JuicyPixels(3,3,0)
63 | #else
64 | import Foreign.Storable (Storable)
65 | import qualified Codec.Picture as Compat
66 | #endif
67 | -- ========================================================================= --
68 |
-- | Labels of the CIFAR-10 dataset. The 'Enum' ordering matches the uint8
-- label byte of the binary format (0 = Airplane .. 9 = Truck).
data Label
  = Airplane
  | Automobile
  | Bird
  | Cat
  | Deer
  | Dog
  | Frog
  | Horse
  | Ship
  | Truck
  deriving (Show, Eq, Generic, Bounded, Enum, Read, NFData)

#if MIN_VERSION_JuicyPixels(3,3,0)
#else
-- Orphan Eq for 'Image' on pre-3.3 JuicyPixels (newer versions provide it):
-- images are equal when dimensions and raw pixel buffers match.
instance (Eq (Compat.PixelBaseComponent a), Storable (Compat.PixelBaseComponent a))
  => Eq (Image a) where
  a == b = Compat.imageWidth a == Compat.imageWidth b &&
    Compat.imageHeight a == Compat.imageHeight b &&
    Compat.imageData a == Compat.imageData b
#endif

-- | Data representation of a CIFAR image is a 32x32 RGB image paired with
-- its class label.
newtype CIFARImage = CIFARImage { getXY :: (Image PixelRGB8, Label) }
#if MIN_VERSION_GLASGOW_HASKELL(8,2,0,0)
  deriving newtype (Eq, NFData)
#else
  deriving (Eq, Generic, NFData)
#endif

-- Compact Show that avoids dumping 3072 pixel bytes; only the label varies.
instance Show CIFARImage where
  show im = "CIFARImage{Height: 32, Width: 32, Pixel: RGB8, Label: " ++ show (label im) ++ "}"
102 |
-- | Height (in pixels) of a 'CIFARImage'; CIFAR images are always 32x32.
height :: Int
height = 32

-- | Width (in pixels) of a 'CIFARImage'.
width :: Int
width = 32

-- | Extract the JuicyPixels RGB8 image from a CIFAR datapoint.
image :: CIFARImage -> Image PixelRGB8
image (CIFARImage (im, _)) = im

-- | Extract the class label from a CIFAR datapoint.
label :: CIFARImage -> Label
label (CIFARImage (_, lbl)) = lbl
118 |
-- | Source URL for cifar-10 and cifar-100
cifarURL :: Url 'Https
cifarURL = https "www.cs.toronto.edu" /: "~kriz"

-------------------------------------------------------------------------------
-- Download/extraction directory; Nothing means the library default is used.
tempdir :: Maybe FilePath
tempdir = Nothing

-- | The CIFAR-10 dataset: a gzipped tarball of binary batch files, unpacked
-- by 'unzipCifar' and decoded image-by-image with 'parseCifar'.
-- (The previous comment claiming a CSV source was a copy-paste error.)
cifar10 :: Dataset CIFARImage
cifar10 = Dataset
  (URL $ cifarURL /: "cifar-10-binary.tar.gz")
  tempdir
  (Just unzipCifar)
  (Parsable parseCifar)
134 |
135 | -- cifar10Sha256 = "c4a38c50a1bc5f3a1c5537f2155ab9d68f9f25eb1ed8d9ddda3db29a59bca1dd"
136 |
-- | Parser for one record of a CIFAR-10 binary batch: a single label byte
-- followed by 1024 red, 1024 green and 1024 blue row-major pixel bytes.
parseCifar :: Atto.Parser CIFARImage
parseCifar = do
  lbl :: Label <- toEnum . fromIntegral <$> Atto.anyWord8
  rs :: [Pixel8] <- BS.unpack <$> Atto.take 1024
  gs :: [Pixel8] <- BS.unpack <$> Atto.take 1024
  bs :: [Pixel8] <- BS.unpack <$> Atto.take 1024
  -- Pair each (row, col) index with its RGB triple, in file order.
  let ipixels = zipWith4 (\ix r g b -> (ix, PixelRGB8 r g b)) ixs rs gs bs
  pure $ CIFARImage (newImage ipixels, lbl)
  where
    -- Build the image by writing each pixel into a mutable buffer.
    -- NOTE(review): JuicyPixels' newMutableImage takes width before height,
    -- and writePixel takes x (column) before y (row), while ixs produces
    -- (row, col) pairs; both are only harmless because the image is square
    -- (32x32) - confirm the intended orientation.
    newImage :: [((Int, Int), PixelRGB8)] -> Image PixelRGB8
    newImage ipixels = runST $ do
      mim <- newMutableImage height width
      mapM_ (\((x, y), rgb) -> writePixel mim x y rgb) ipixels
      freezeImage mim

    -- All (row, col) indices of a height x width image, row-major.
    ixs :: [(Int, Int)]
    ixs = concat $ zipWith (\(row::Int) cols -> (row,) <$> cols) [0..] (replicate height [0..width - 1])
155 |
-- | Unpack the downloaded @cifar-10-binary.tar.gz@ into a single lazy
-- ByteString containing the concatenated batch files.
--
-- Only tar entries of exactly 30730000 bytes (10000 records x 3073 bytes)
-- are kept; all other entries are metadata. See
-- https://www.cs.toronto.edu/~kriz/cifar.html
--
-- FIXME: this should be in MonadThrow; a malformed archive currently
-- surfaces as an imprecise exception via 'throw'.
--
-- Idiom fix: the previous version wrapped the pure body in a redundant
-- 'do' block, which misleadingly suggested monadic sequencing.
unzipCifar :: BL.ByteString -> BL.ByteString
unzipCifar zipbs =
  either (throw . fst) BL.concat $ Tar.foldlEntries go [] entries
  where
    entries :: Tar.Entries Tar.FormatError
    entries = Tar.read $ GZip.decompress zipbs

    -- Accumulate the payload of each batch-sized file.
    -- NOTE(review): consing (ps:agg) yields the batches in reverse archive
    -- order; if record order matters downstream, a 'reverse' may be needed
    -- before concatenation - confirm.
    go :: [BL.ByteString] -> Tar.Entry -> [BL.ByteString]
    go agg entry =
      case Tar.entryContent entry of
        Tar.NormalFile ps fs ->
          if fs == 30730000
            then ps : agg
            else agg
        _ -> agg
175 |
176 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/CO2.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 |
3 | {-|
4 |
5 | Mauna Loa CO2 time-series
6 |
7 |
8 | Listed as co2 here: http://vincentarelbundock.github.io/Rdatasets/datasets.html
9 |
10 | See
11 |
12 | -}
13 |
14 | module Numeric.Datasets.CO2 where
15 |
16 | import Numeric.Datasets
17 |
18 | import Data.Csv
19 | import GHC.Generics
20 | import Network.HTTP.Req ((/:), https, Scheme(..))
21 |
-- | One observation of the CO2 time series: a decimal year and the value.
data CO2 = CO2
  { time :: Double
  , value :: Double
  } deriving (Show, Read, Generic)

-- Derived generically: CSV headers must be exactly "time" and "value".
instance FromNamedRecord CO2
28 |
-- | Mauna Loa atmospheric CO2 concentration time series.
--
-- Fix: fetch @co2.csv@ (the Mauna Loa monthly series, whose "time"/"value"
-- columns match the 'CO2' record) instead of @CO2.csv@, which on Rdatasets
-- is the unrelated "Carbon Dioxide Uptake in Grass Plants" table and cannot
-- be decoded by this record's FromNamedRecord instance.
maunaLoaCO2 :: Dataset CO2
maunaLoaCO2 = csvHdrDataset
  $ URL $ https "raw.githubusercontent.com" /: "vincentarelbundock" /: "Rdatasets" /: "master" /: "csv" /: "datasets" /: "co2.csv"
32 |
33 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Car.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-}
3 |
4 | {-|
5 |
6 | Car dataset
7 |
8 | UCI ML Repository link
9 |
10 | -}
11 |
12 | module Numeric.Datasets.Car where
13 |
14 | import Numeric.Datasets
15 |
16 | import Data.Csv
17 | import GHC.Generics
18 | import Control.Applicative
19 | import Network.HTTP.Req ((/:), http, Scheme(..))
20 |
-- | Relative score used for the buying price, maintenance price and safety.
data RelScore = Low | Med | High | VeryHigh deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField RelScore where
  parseField "vhigh" = pure VeryHigh
  parseField "high" = pure High
  parseField "med" = pure Med
  parseField "low" = pure Low
  parseField _ = fail "unknown relative score"

-- | Relative size of the luggage boot.
data RelSize = Small | Medium | Big deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField RelSize where
  parseField "small" = pure Small
  parseField "med" = pure Medium
  parseField "big" = pure Big
  parseField _ = fail "unknown relative size"

-- | Overall car acceptability - the class attribute of the dataset.
data Acceptability = Unacceptable | Acceptable | Good | VeryGood deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField Acceptability where
  parseField "unacc" = pure Unacceptable
  parseField "acc" = pure Acceptable
  parseField "good" = pure Good
  parseField "vgood" = pure VeryGood
  parseField _ = fail "unknown acceptability"

-- | A possibly open-ended count, matching the "5more"/"more" source values.
data Count = N Int | NOrMore Int | More deriving (Show, Read, Eq, Generic)

-- Only the values 2-4, "5more" and "more" occur in the decoder; anything
-- else fails the record.
instance FromField Count where
  parseField "more" = pure More
  parseField "5more" = pure (NOrMore 5)
  parseField "2" = pure (N 2)
  parseField "3" = pure (N 3)
  parseField "4" = pure (N 4)
  parseField _ = fail "unknown count"

-- | One row of the UCI car evaluation dataset.
data Car = Car
  { buying :: RelScore
  , maintenance :: RelScore
  , doors :: Count
  , persons :: Count
  , luggageBoot :: RelSize
  , safety :: RelScore
  , acceptability:: Acceptability
  } deriving (Show, Read, Generic)

-- Derived generically: fields are decoded positionally, in declaration order.
instance FromRecord Car
68 |
-- | The UCI car evaluation dataset.
car :: Dataset Car
car = csvDataset . URL $ uciMLDB /: "car" /: "car.data"
72 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Coal.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 | -- {-# LANGUAGE GADTs, QuasiQuotes, ViewPatterns, FlexibleContexts #-}
3 | -- {-# OPTIONS_GHC -fno-warn-unused-imports #-}
4 |
5 | {-|
6 |
7 | Coal data set
8 |
9 | Dates of mining disasters, from the `coal` dataset in the R package `boot`.
10 |
11 | For further information, see
12 |
13 | -}
14 |
15 | module Numeric.Datasets.Coal ( Coal, coal, date ) where
16 |
17 | import Numeric.Datasets
18 |
19 | import Data.Csv
20 | import GHC.Generics
21 | -- import Control.Applicative
22 | import Network.HTTP.Req ((/:), http, Scheme(..))
23 |
-- | A single mining disaster, represented by its date as a decimal year.
newtype Coal = Coal
  { date :: Double
  } deriving (Show, Read, Generic)

-- Column 0 of the CSV is the row index, so the date is read from column 1.
instance FromRecord Coal where
  parseRecord v = Coal <$> v .! 1
30 |
-- | Dates of coal mining disasters, from the R package @boot@ via the
-- Rdatasets mirror.
coal :: Dataset Coal
coal = Dataset src Nothing Nothing (CSVRecord HasHeader defaultDecodeOptions)
  where
    src = URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "boot" /: "coal.csv"
34 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Diabetes.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE OverloadedStrings, TemplateHaskell #-}
2 |
3 | {- |
4 | Module : Numeric.Datasets.Diabetes
5 | Description : Diabetes dataset created from an ARFF input file
6 | Copyright : (c) Arvind Devarajan
7 | License : BSD-3-Clause
8 |
9 | Maintainer : arvindd
10 | Stability : experimental
11 | Portability : portable
12 | -}
13 |
14 | module Numeric.Datasets.Diabetes
15 | ( DiabetesClass
16 | , PimaDiabetesEntry
17 | , pimaDiabetes
18 | ) where
19 |
20 | import Numeric.Datasets
21 | import Data.FileEmbed
22 | import qualified Data.ByteString.Lazy as BL (fromStrict, ByteString)
23 | import Numeric.Datasets.Internal.ArffParser
24 |
-- | Diagnostic class of a record; 'UnknownClass' covers any value other
-- than the two class strings recognised by the converter.
data DiabetesClass = TestedNegative | TestedPositive | UnknownClass deriving (Show)

-- | One row of the Pima Indians diabetes ARFF file; field names follow the
-- attribute names declared in the embedded @diabetes.arff@.
data PimaDiabetesEntry = PimaDiabetesEntry
  { preg :: !Double
  , plas :: !Double
  , pres :: !Double
  , skin :: !Double
  , insu :: !Double
  , mass :: !Double
  , pedi :: !Double
  , age :: !Double
  , diabetesClass :: !DiabetesClass
  } deriving (Show)

-- | Diabetes dataset, containing a list of Pima Indian diabetes entries.
-- The ARFF file is embedded at compile time, so no download is needed.
pimaDiabetes :: [PimaDiabetesEntry]
pimaDiabetes = toPimaDiabetes records
  where records = readArff (BL.fromStrict $(embedFile "datafiles/arff/diabetes.arff"))
43 |
-- | Converts each ARFF record into a Pima diabetes entry
toPimaDiabetes :: [ArffRecord] -> [PimaDiabetesEntry]
toPimaDiabetes recs =
  -- toPD is written in the function Applicative: every field extractor
  -- is applied to the same ArffRecord.
  let toPD :: ArffRecord -> PimaDiabetesEntry
      toPD = PimaDiabetesEntry <$> dblval 0
        <*> dblval 1
        <*> dblval 2
        <*> dblval 3
        <*> dblval 4
        <*> dblval 5
        <*> dblval 6
        <*> dblval 7
        <*> diabClass 8

      -- Numeric attribute at the given index; (\_->0) is the fallback
      -- passed to 'value' - presumably used when the cell is not numeric;
      -- confirm against ArffParser.
      dblval :: Int -> ArffRecord -> Double
      dblval idx r = value (\_->0) idx r

      -- String attribute at the given index, with "" as the fallback.
      strval :: Int -> ArffRecord -> BL.ByteString
      strval idx r = value (\_->"") idx r

      -- Class attribute: anything other than the two known class strings
      -- maps to UnknownClass.
      diabClass :: Int -> ArffRecord -> DiabetesClass
      diabClass i r =
        let s = strval i r
        in case s of
          "tested_negative" -> TestedNegative
          "tested_positive" -> TestedPositive
          _ -> UnknownClass

  in fmap toPD recs
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Gapminder.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-}
3 |
4 | {-|
5 |
6 | Gapminder dataset - Life expectancy, GDP, population every five years per country
7 |
8 | Source:
9 |
10 | More information: https://cran.r-project.org/web/packages/gapminder/gapminder.pdf
11 |
12 | -}
13 |
14 | module Numeric.Datasets.Gapminder where
15 |
16 | import Numeric.Datasets
17 |
18 | import Data.Csv
19 | import GHC.Generics
20 | import Control.Applicative
21 | import Data.Text (Text)
22 | import Network.HTTP.Req ((/:), https, Scheme(..))
23 |
-- | One country/year observation of the gapminder dataset.
data Gapminder = Gapminder
  { country :: Text
  , year :: Int
  , pop :: Integer
  , continent :: Text
  , lifeExp :: Double
  , gdpPercap :: Double
  } deriving (Show, Read, Generic)

instance FromNamedRecord Gapminder where
  parseNamedRecord m = Gapminder <$>
    m .: "country" <*>
    m .: "year" <*>
    -- "pop" is parsed as Double and rounded (the source may encode
    -- populations in scientific notation, which Integer parsing rejects).
    (roundIt <$> m .: "pop") <*>
    m .: "continent" <*>
    m .: "lifeExp" <*>
    m .: "gdpPercap"
    where roundIt :: Double -> Integer
          roundIt = round
43 |
-- | The gapminder five-year dataset, fetched from the plotly mirror.
gapminder :: Dataset Gapminder
gapminder =
  csvHdrDataset . URL $
    https "raw.githubusercontent.com" /: "plotly" /: "datasets" /: "master" /: "gapminderDataFiveYear.csv"
47 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Internal/Streaming.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE FlexibleContexts #-}
2 | {-# LANGUAGE GADTs #-}
3 | {-# LANGUAGE LambdaCase #-}
4 | {-# LANGUAGE OverloadedStrings #-}
5 | {-# LANGUAGE ScopedTypeVariables #-}
6 | {-# OPTIONS_GHC -Wno-unused-top-binds -Wno-name-shadowing #-}
7 | module Numeric.Datasets.Internal.Streaming
8 | ( streamDataset
9 | , streamByteString
10 | ) where
11 |
12 | import Control.Exception.Safe (MonadThrow, Exception, throwString, throwM)
13 | import Control.Monad.Error.Class (MonadError)
14 | import Control.Monad.IO.Class (MonadIO, liftIO)
15 | import Data.Attoparsec.ByteString.Lazy (Parser)
16 | import Data.Maybe (fromMaybe)
17 | import Data.Text.Encoding (decodeUtf8)
18 | import Data.List.NonEmpty (NonEmpty, toList)
19 | import Streaming (Stream, Of((:>)), lift)
20 | import qualified Data.ByteString.Streaming as BS (fromLazy, ByteString, null)
21 | import qualified Data.ByteString as B' (pack)
22 | import qualified Data.ByteString.Lazy as B (ByteString, concat)
23 | import qualified Data.List as L (intercalate)
24 | import qualified Data.Attoparsec.ByteString.Streaming as Atto (parse)
25 | import qualified Data.Attoparsec.ByteString.Lazy as Atto (anyWord8)
26 | import qualified Streaming as S (hoist, unfold)
27 | import qualified Streaming.Cassava as S (decodeWith, decodeByNameWith, CsvParseException)
28 | import qualified Streaming.Prelude as S (print, maps)
29 |
30 | import Numeric.Datasets
31 | import Streaming.Instances ()
32 |
-- | Stream a dataset
--
-- Resolves the dataset's temporary directory, fetches the source file(s),
-- applies the dataset's preprocessing step (if any) to each file, and
-- streams the decoded values.
streamDataset
  :: forall io a . (MonadThrow io, MonadIO io)
  => Dataset a
  -> Stream (Of a) io ()
streamDataset ds = do
  folder <- liftIO $ defaultTempDir (temporaryDirectory ds)
  files <- liftIO $ getFileFromSource folder (source ds)
  streamByteString (readAs ds) (fromMaybe id (preProcess ds) <$> files)


-- | Stream a ByteString into a Haskell value
--
-- All chunks are concatenated before decoding begins.
streamByteString
  :: forall m a
  . (MonadThrow m)
  => ReadAs a
  -> NonEmpty B.ByteString
  -> Stream (Of a) m ()
streamByteString ra bs = _streamDataset ra (BS.fromLazy $ B.concat $ toList bs)
52 |
53 |
-- private function which uses the streaming interface of bytestring
--
-- Dispatches on the 'ReadAs' decoder: the CSV variants are decoded with
-- streaming-cassava, attoparsec parsers run through 'parseStream', and the
-- remaining decoders are not implemented for streaming.
_streamDataset
  :: forall mt a e
  . (MonadThrow mt, Exception e)
  => (MonadError S.CsvParseException (Either e))
  => ReadAs a
  -> BS.ByteString (Either e) ()
  -> Stream (Of a) mt ()
_streamDataset ra bs =
  case ra of
    JSON -> lift $ throwString "Not implemented: JSON streaming"
    CSVRecord hhdr opts -> S.hoist either2Throw $ S.decodeWith opts hhdr bs
    CSVNamedRecord opts -> S.hoist either2Throw $ S.decodeByNameWith opts bs
    MultiRecordParsable _ -> lift $ throwString "Not implemented: MultiRecord streaming"
    Parsable psr -> parseStream psr (S.hoist either2Throw bs)
    ImageFolder _ -> lift $ throwString "Not implemented: Image Folder streaming, use Dataloader"
  where
    -- Rethrow a Left as a MonadThrow exception; pass Rights through.
    either2Throw :: MonadThrow m => (forall x e . Exception e => Either e x -> m x)
    either2Throw = \case
      Left e -> throwM e
      Right r -> pure r
75 |
76 |
-- private function to generate a stream from a parser
--
-- Repeatedly runs the parser over the byte stream, yielding one parsed
-- value per unfold step.
parseStream
  :: forall m a . MonadThrow m => Parser a -> BS.ByteString m () -> Stream (Of a) m ()
parseStream psr = S.unfold go
  where
    go :: BS.ByteString m () -> m (Either () (Of a (BS.ByteString m ())))
    go bs = do
      (eea, rest) <- Atto.parse psr bs
      -- NOTE(review): when the remainder is empty this returns Left ()
      -- without inspecting eea, so a value parsed from the final bytes of
      -- the stream appears to be discarded - confirm whether the last
      -- record is dropped.
      BS.null rest >>= \(empty :> _) ->
        if empty
          then pure $ Left ()
          else case eea of
            Left (es, lst) -> throwString (lst ++ "\n" ++ L.intercalate "\n" es)
            Right a -> pure $ Right (a :> rest)
91 |
92 |
-- make this a real test
-- Exercises parseStream with Atto.anyWord8 on a one-byte and a multi-byte
-- input, printing each parsed byte rendered as text.
test :: IO ()
test = do
  S.print $ S.maps render $ parseStream Atto.anyWord8 (BS.fromLazy "1")
  S.print $ S.maps render $ parseStream Atto.anyWord8 (BS.fromLazy "1 2 3 4")
  where
    render (a:>b) = (decodeUtf8 (B'.pack [a]) :> b)
100 |
101 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Iris.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, TemplateHaskell #-}
2 |
3 | {-|
4 |
5 | The classical Iris dataset, due to R.A. Fisher.
6 |
7 | UCI ML Repository link
8 |
9 | -}
10 |
11 | module Numeric.Datasets.Iris where
12 |
13 | import Numeric.Datasets
14 |
15 | import Data.Csv
16 | import GHC.Generics
17 | import Data.FileEmbed
18 | import Data.ByteString.Lazy (fromStrict)
19 |
20 |
-- | The three iris species distinguished by the dataset.
data IrisClass = Setosa | Versicolor | Virginica
  deriving (Show, Read, Eq, Ord, Generic, Enum, Bounded)

-- Decodes the "Iris-..." class strings used in the data file.
instance FromField IrisClass where
  parseField "Iris-setosa" = return Setosa
  parseField "Iris-versicolor" = return Versicolor
  parseField "Iris-virginica" = return Virginica
  parseField _ = fail "unknown iris class"

-- | One iris observation: four measurements and the species.
data Iris = Iris
  { sepalLength :: Double
  , sepalWidth :: Double
  , petalLength :: Double
  , petalWidth :: Double
  , irisClass :: IrisClass
  } deriving (Show, Read, Generic)

-- Derived generically: fields are decoded positionally, in declaration order.
instance FromRecord Iris

-- | The iris dataset; the data file is embedded at compile time, so no
-- download is needed.
iris :: [Iris]
iris = readDataset csvRecord (fromStrict $(embedFile "datafiles/iris.data"))
42 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Michelson.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, TemplateHaskell #-}
2 |
3 | {-|
4 |
5 | Michelson's speed of light dataset - five repeated measurements of the speed of light.
6 |
7 | Data from
8 |
9 | The embedded dataset is Copyright (c) 2015 The Compute.io Authors.
10 |
11 | -}
12 |
13 | module Numeric.Datasets.Michelson where
14 |
15 | import Numeric.Datasets
16 | import Data.FileEmbed
17 | import Data.ByteString.Lazy (fromStrict)
18 |
19 |
-- | Michelson's speed-of-light measurements, decoded from the embedded
-- JSON file as a list of measurement series.
michelson :: [[Double]]
michelson = readDataset JSON (fromStrict $(embedFile "datafiles/michelson.json"))
22 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Mushroom.hs~:
--------------------------------------------------------------------------------
1 | module Numeric.Datasets.Mushroom where
2 |
3 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Nightingale.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, TemplateHaskell #-}
2 |
3 | {-|
4 |
5 | Florence Nightingale's count of injuries in the Crimean War, used for her rose plots
6 |
7 | Data from
8 |
9 | The embedded dataset is Copyright (c) 2015 The Compute.io Authors.
10 |
11 | -}
12 |
13 | module Numeric.Datasets.Nightingale where
14 |
15 | import Numeric.Datasets
16 | import Data.FileEmbed
17 | import Data.ByteString.Lazy (fromStrict)
18 | import Data.Aeson hiding (parseJSON)
19 | import Data.Time (UTCTime)
20 | import GHC.Generics
21 |
22 |
-- | Monthly counts from Nightingale's Crimean War data; field names match
-- the keys of the embedded JSON objects.
data Nightingale = Nightingale
  { date :: UTCTime
  , army_size :: Int
  , disease :: Int
  , wounds :: Int
  , other :: Int
  } deriving (Show, Read, Generic)

-- Derived generically: JSON keys must match the field names above.
instance FromJSON Nightingale

-- | The nightingale dataset; the JSON file is embedded at compile time.
nightingale :: [Nightingale]
nightingale = readDataset JSON $ fromStrict $(embedFile "datafiles/nightingale.json")
35 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/OldFaithful.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-}
3 |
4 | {-|
5 |
6 | Old Faithful Geyser Eruption data
7 |
8 | Article: http://web.pdx.edu/~jfreder/M212/oldfaithful.pdf
9 |
10 | These data from:
11 |
12 | For more data, see
13 |
14 | -}
15 |
16 | module Numeric.Datasets.OldFaithful where
17 |
18 | import Numeric.Datasets
19 |
20 | import Data.Csv
21 | import Control.Applicative
22 | import Network.HTTP.Req ((/:), https, Scheme(..))
23 |
24 |
-- | One eruption observation of the Old Faithful geyser.
data OldFaithful = OldFaithful
  { waiting :: Double -- ^ waiting time until next eruption
  , duration :: Double -- ^ duration of eruption in minutes
  } deriving Show

-- Positional decode: waiting from column 2, duration from column 1
-- (column 0 of the Rdatasets CSV is the row index - verify against the
-- source header).
instance FromRecord OldFaithful where
  parseRecord v = OldFaithful <$> v .! 2 <*> v.! 1
32 |
-- | The Old Faithful eruptions dataset (R's @faithful@), fetched from the
-- Rdatasets mirror.
oldFaithful :: Dataset OldFaithful
oldFaithful = Dataset src Nothing Nothing (CSVRecord HasHeader defaultDecodeOptions)
  where
    src = URL $ https "raw.githubusercontent.com" /: "vincentarelbundock" /: "Rdatasets" /: "master" /: "csv" /: "datasets" /: "faithful.csv"
37 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Quakes.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 |
3 | {-|
4 |
5 | Locations of Earthquakes off Fiji
6 |
7 | Listed as quakes here: http://vincentarelbundock.github.io/Rdatasets/datasets.html
8 |
9 |
10 | -}
11 |
12 | module Numeric.Datasets.Quakes where
13 |
14 | import Numeric.Datasets
15 |
16 | import Data.Csv
17 | import GHC.Generics
18 | import Network.HTTP.Req ((/:), http, Scheme(..))
19 |
-- | One earthquake event off Fiji.
data Quake = Quake
  { lat :: Double
  , long :: Double
  , depth :: Double
  , mag :: Double
  , stations :: Int
  } deriving (Show, Read, Generic)

-- Derived generically: CSV headers must match the field names above.
instance FromNamedRecord Quake

-- | The quakes dataset, fetched from the Rdatasets mirror.
quakes :: Dataset Quake
quakes = csvHdrDataset
  $ URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "datasets" /: "quakes.csv"
33 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/States.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-}
3 |
4 | {-|
5 |
6 | Data on education in US states
7 |
8 |
9 |
10 | -}
11 |
12 | module Numeric.Datasets.States where
13 |
14 | import Numeric.Datasets
15 |
16 | import Data.Csv
17 | import GHC.Generics
18 | import Control.Applicative
19 | import Network.HTTP.Req ((/:), http, Scheme(..))
20 |
-- | Education statistics for one US state.
data StateEdu = StateEdu
  { state :: String
  , region :: String
  , population :: Int
  , satVerbal :: Int
  , satMath :: Int
  , satPercent :: Int
  , dollarSpend :: Double
  , teacherPay :: Int
  } deriving (Show, Read, Generic)

instance FromNamedRecord StateEdu where
  parseNamedRecord m = StateEdu <$>
    -- The state name lives in the unnamed row-name column of the
    -- Rdatasets CSV, hence the empty header string.
    m .: "" <*>
    m .: "region" <*>
    m .: "pop" <*>
    m .: "SATV" <*>
    m .: "SATM" <*>
    m .: "percent" <*>
    m .: "dollars" <*>
    m .: "pay"

-- | The States dataset from the R package @car@, via the Rdatasets mirror.
states :: Dataset StateEdu
states = csvHdrDataset
  $ URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "car" /: "States.csv"
46 |
47 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Sunspots.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-}
3 |
4 | {-|
5 |
6 | Monthly sunspots from 1749
7 |
8 | Listed as sunspot.month here: http://vincentarelbundock.github.io/Rdatasets/datasets.html
9 |
10 | See
11 |
12 | -}
13 |
14 | module Numeric.Datasets.Sunspots where
15 |
16 | import Numeric.Datasets
17 |
18 | import Data.Csv
19 | import GHC.Generics
20 | import Control.Applicative
21 | import Network.HTTP.Req ((/:), http, Scheme(..))
22 |
-- | One monthly sunspot observation.
data Sunspot = Sunspot
  { time :: Double
  , sunspotMonth :: Double
  } deriving (Show, Read, Generic)

-- The CSV names the count column "value"; it becomes sunspotMonth here.
instance FromNamedRecord Sunspot where
  parseNamedRecord m = Sunspot <$>
    m .: "time" <*>
    m .: "value"

-- | Monthly sunspots from 1749, fetched from the Rdatasets mirror.
sunspots :: Dataset Sunspot
sunspots = csvHdrDataset
  $ URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "datasets" /: "sunspot.month.csv"
36 |
37 | -- "http://vincentarelbundock.github.io/Rdatasets/csv/datasets/sunspot.month.csv"
38 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Titanic.hs:
--------------------------------------------------------------------------------
1 | {-# language DeriveGeneric, OverloadedStrings, LambdaCase, DataKinds #-}
2 | {-|
3 |
4 | Titanic Data Set. For each person on board the fatal maiden voyage of the ocean liner SS Titanic, this dataset records Sex, Age (child/adult), Class (Crew, 1st, 2nd, 3rd Class) and whether or not the person survived.
5 |
6 | ``The Titanic survival data seem to become to categorical data analysis what Fisher's Iris data are to discriminant analysis.'' - Buja A.: A word from the editor of JCGS. Statistical Computing & Graphics Newsletter 10 (1), pp.32-33, 1999.
7 |
8 | Retrieved from: . A copy of the dataset can be found in datasets/titanic2_full.tsv .
9 |
10 | Header:
11 |
12 | "PassengerId" "Survived" "Pclass" "Name" "Sex" "Age" "SibSp" "Parch" "Ticket" "Fare" "Cabin" "Embarked"
13 |
14 | Example rows :
15 |
16 | 10 1 2 "Nasser, Mrs. Nicholas (Adele Achem)" "female" 14 1 0 "237736" 30.0708 "" "C"
17 | 29 1 3 "O'Dwyer, Miss. Ellen \"Nellie\"" "female" NA 0 0 "330959" 7.8792 "" "Q"
18 |
19 | -}
20 | module Numeric.Datasets.Titanic (titanicRemote, titanicLocal, TitanicEntry(..), Class(..), Age(..), Sex(..))where
21 |
22 | import Numeric.Datasets
23 |
24 | import Data.Csv
25 | import GHC.Generics
26 | import Network.HTTP.Req ((/:), https, Scheme(..))
27 |
-- | One passenger record, reduced to the four classic Titanic features.
data TitanicEntry = TitanicEntry {
  tClass :: Class
  , tAge :: Age
  , tSex :: Sex
  , tSurvived :: Bool -- ^ Did the passenger survive ?
  } deriving (Eq, Read, Show, Generic)


-- Decodes by header name; Pclass, Sex and Survived are read as strings
-- and mapped through the pure parse* helpers below.
instance FromNamedRecord TitanicEntry where
  parseNamedRecord v = TitanicEntry <$>
    (parseClass <$> v .: "Pclass") <*>
    (v .: "Age") <*>
    (parseSex <$> v .: "Sex") <*>
    (parseBool <$> v .: "Survived")
42 |
43 |
-- | Passenger class: 1st, 2nd, 3rd, or crew.
data Class = First | Second | Third | Crew deriving (Eq, Read, Show, Generic, Enum, Bounded)

-- | Decode the "Pclass" column ("1"/"2"/"3"/"Crew"); any other value is a
-- fatal error.
parseClass :: String -> Class
parseClass = \case
  "1" -> First
  "2" -> Second
  "3" -> Third
  "Crew" -> Crew
  x -> error $ unwords ["Unexpected feature value :", show x]
53 |
54 |
-- | Passenger age, if known.
newtype Age = Age (Maybe Double) deriving (Eq, Read, Show, Generic)

-- | The "Age" field requires a custom FromField instance because its value may be "NA"
-- Note: any other unparseable value is also mapped to 'Age Nothing' rather
-- than failing the whole record.
instance FromField Age where
  parseField s = case s of
    "NA" -> pure $ Age Nothing
    ss -> case runParser (parseField ss :: Parser Double) of
      Left _ -> pure $ Age Nothing
      Right x -> pure $ Age $ Just x
64 |
-- | Passenger sex as recorded in the data file.
data Sex = Female | Male deriving (Eq, Read, Show, Generic, Enum, Bounded)

-- | Decode the "Sex" column ("female"/"male").  Calls 'error' on any
-- other value.
parseSex :: String -> Sex
parseSex s = case s of
  "female" -> Female
  "male"   -> Male
  _        -> error (unwords ["Unexpected feature value :", show s])
72 |
-- | Decode the "Survived" column: "1" is 'True', "0" is 'False'; any
-- other value is an 'error'.
parseBool :: String -> Bool
parseBool s = case s of
  "1" -> True
  "0" -> False
  _   -> error (unwords ["Unexpected feature value :", show s])
78 |
-- | The Titanic dataset, to be downloaded from
-- https://raw.githubusercontent.com/JackStat/6003Data/master/Titanic.txt
-- (tab-separated, with a header row).  'removeEscQuotes' preprocesses
-- the raw bytes — presumably stripping the escaped quotes seen in the
-- Name column; see Numeric.Datasets for its definition.
titanicRemote :: Dataset TitanicEntry
titanicRemote = withPreprocess removeEscQuotes $ csvHdrDatasetSep '\t' $ URL $ https "raw.githubusercontent.com" /: "JackStat" /: "6003Data" /: "master" /: "Titanic.txt"

-- | The Titanic dataset, parsed from a local copy
titanicLocal :: Dataset TitanicEntry
titanicLocal = withPreprocess removeEscQuotes $ csvHdrDatasetSep '\t' $ File "datafiles/titanic2_full.tsv"
86 |
87 |
88 |
89 |
90 |
91 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Titanic.hs~:
--------------------------------------------------------------------------------
1 | {-# language DeriveGeneric, OverloadedStrings, TemplateHaskell #-}
2 | {-|
3 |
4 | Titanic Data Set. For each person on board the fatal maiden voyage of the ocean liner SS Titanic, this dataset records Sex, Age (child/adult), Class (Crew, 1st, 2nd, 3rd Class) and whether or not the person survived.
5 |
6 | ``The Titanic survival data seem to become to categorical data analysis what Fisher's Iris data are to discriminant analysis.'' - Buja A.: A word from the editor of JCGS. Statistical Computing & Graphics Newsletter 10 (1), pp.32-33, 1999.
7 |
8 | Retrieved from:
9 |
10 | -}
11 | module Numeric.Datasets.Titanic where
12 |
13 | import Numeric.Datasets
14 |
15 | import Data.Csv
16 | import GHC.Generics
17 | import Data.FileEmbed
18 | import Data.ByteString.Lazy (fromStrict)
19 |
20 |
-- | The Titanic dataset, embedded at compile time from
-- datafiles/titanic.tsv via Template Haskell ('embedFile') and parsed
-- as a tab-separated file with a header row.
titanic = readDataset $ csvHdrDatasetSep '\t' (fromStrict $(embedFile "datafiles/titanic.tsv"))
22 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/UN.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 |
3 | {-|
4 |
5 | GDP and infant mortality
6 |
7 |
8 |
9 | -}
10 |
11 | module Numeric.Datasets.UN where
12 |
13 | import Numeric.Datasets
14 |
15 | import Data.Csv
16 | import GHC.Generics
17 | import Control.Applicative
18 | import Network.HTTP.Req ((/:), http, Scheme(..))
19 |
-- | One row of the UN GDP / infant-mortality table.
data GdpMortality = GdpMortality
  { country :: String -- ^ Taken from the unnamed row-name column
  , infantMortality :: Maybe Int -- ^ 'Nothing' when the field is missing or unparseable
  , gdp :: Maybe Int -- ^ 'Nothing' when the field is missing or unparseable
  } deriving (Show, Read, Generic)

-- | Hand-written decoder: the country sits in the CSV's unnamed first
-- column (header ""), and the two numeric columns fall back to
-- 'Nothing' via ('<|>') when parsing fails (e.g. on "NA" entries).
instance FromNamedRecord GdpMortality where
  parseNamedRecord m = GdpMortality <$>
    m .: "" <*>
    (m .: "infant.mortality" <|> return Nothing) <*>
    (m .: "gdp" <|> return Nothing)
31 |
-- | GDP and infant-mortality data from the Rdatasets mirror of the R
-- car package's UN table.  NOTE(review): fetched over plain http (the
-- 'http' URL builder is used, not 'https').
gdpMortalityUN :: Dataset GdpMortality
gdpMortalityUN = csvHdrDataset
   $ URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "car" /: "UN.csv"
35 |
36 | -- "http://vincentarelbundock.github.io/Rdatasets/csv/car/UN.csv"
37 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Vocabulary.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 |
3 | {-|
4 |
5 | Vocabulary and Education
6 |
7 |
8 |
9 | -}
10 |
11 | module Numeric.Datasets.Vocabulary where
12 |
13 | import Numeric.Datasets
14 |
15 | import Data.Csv
16 | import GHC.Generics
17 | import Network.HTTP.Req ((/:), http, Scheme(..))
18 |
-- | Respondent sex.  Decoded with 'parseReadField' (from
-- Numeric.Datasets, not shown here) — presumably via the derived 'Read'
-- instance, i.e. the field text must be exactly "Female" or "Male".
data Sex = Female | Male
  deriving (Show, Read, Eq, Generic, Bounded, Enum)

instance FromField Sex where
  parseField = parseReadField
24 |
-- | One survey row relating vocabulary score to education.
data Vocab = Vocab
  { year :: Integer
  , sex :: Sex
  , education :: Int
  , vocabulary :: Int
  } deriving (Show, Read, Generic)

-- Generic decoding: the CSV header names must equal the record field
-- names (year, sex, education, vocabulary).
instance FromNamedRecord Vocab
33 |
-- | Vocabulary data from the Rdatasets mirror of the R car package's
-- Vocab table (fetched over plain http).
vocab :: Dataset Vocab
vocab = csvHdrDataset
   $ URL $ http "vincentarelbundock.github.io" /: "Rdatasets" /: "csv" /: "car" /: "Vocab.csv"
37 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/Wine.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 |
3 | {-|
4 |
5 | Wine Data set
6 |
7 | UCI ML Repository link
8 |
9 | -}
10 |
11 | module Numeric.Datasets.Wine where
12 |
13 | import Numeric.Datasets
14 |
15 | import Data.Csv
16 | import GHC.Generics
17 | import Network.HTTP.Req ((/:), http, Scheme(..))
18 |
-- | One row of the UCI wine data: a class label followed by thirteen
-- physicochemical measurements.
data Wine = Wine
  { wineClass :: Int -- ^ Cultivar label (first column of wine.data)
  , alcohol :: Double
  , malicAcid :: Double
  , ash :: Double
  , ashAlcalinity :: Double
  , magnesium :: Double
  , totalPhenols :: Double
  , flavanoids :: Double
  , nonflavanoidPhenols :: Double
  , proanthocyanins :: Double
  , colorIntensity :: Double
  , hue :: Double
  , dilutedOD280toOD315 :: Double
  , proline :: Int
  } deriving (Show, Read, Generic)

-- Generic positional decoding: wine.data has no header, so fields are
-- read in declaration order.
instance FromRecord Wine
37 |
-- | The UCI wine data, fetched from the UMass mirror ('umassMLDB' is
-- defined in Numeric.Datasets).  'fixAmericanDecimals' preprocesses the
-- raw bytes before parsing — presumably normalising decimal separators;
-- confirm against Numeric.Datasets.
wine :: Dataset Wine
wine = withPreprocess fixAmericanDecimals $
       csvDataset $ URL $ umassMLDB /: "wine" /: "wine.data"
41 |
--------------------------------------------------------------------------------
/datasets/src/Numeric/Datasets/WineQuality.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveGeneric, OverloadedStrings, DataKinds #-}
2 | {-# OPTIONS_GHC -fno-warn-unused-imports #-}
3 |
4 | {-|
5 |
6 | Quality of red and white wines based on physicochemical properties
7 |
8 | See
9 |
10 | -}
11 |
12 | module Numeric.Datasets.WineQuality where
13 |
14 | import Numeric.Datasets
15 |
16 | import Data.Csv
17 | import GHC.Generics
18 | import Control.Applicative
19 | import Network.HTTP.Req ((/:), http, Scheme(..))
20 |
-- | One row of the UCI wine-quality data: eleven physicochemical
-- measurements plus an integer quality score.
data WineQuality = WineQuality
  { fixedAcidity :: Double
  , volatileAcidity :: Double
  , citricAcid :: Double
  , residualSugar :: Double
  , chlorides :: Double
  , freeSulfurDioxide :: Double
  , totalSulfurDioxide :: Double
  , density :: Double
  , pH :: Double
  , sulphates :: Double
  , alcohol :: Double
  , quality :: Int
  } deriving (Show, Read, Generic)
35 |
-- | Hand-written decoder: the CSV header names contain spaces
-- ("fixed acidity", ...), so they cannot match Haskell field names and
-- generic derivation is not usable here.
instance FromNamedRecord WineQuality where
  parseNamedRecord m = WineQuality <$>
    m .: "fixed acidity" <*>
    m .: "volatile acidity" <*>
    m .: "citric acid" <*>
    m .: "residual sugar" <*>
    m .: "chlorides" <*>
    m .: "free sulfur dioxide" <*>
    m .: "total sulfur dioxide" <*>
    m .: "density" <*>
    m .: "pH" <*>
    m .: "sulphates" <*>
    m .: "alcohol" <*>
    m .: "quality"
50 |
-- | Red and white wine-quality tables from the UMass mirror.  The files
-- are semicolon-separated, hence @csvHdrDatasetSep ';'@.
redWineQuality, whiteWineQuality :: Dataset WineQuality
redWineQuality = csvHdrDatasetSep ';'
   $ URL $ umassMLDB /: "wine-quality" /: "winequality-red.csv"

whiteWineQuality = csvHdrDatasetSep ';'
   $ URL $ umassMLDB /: "wine-quality" /: "winequality-white.csv"
57 |
--------------------------------------------------------------------------------
/datasets/src/Streaming/Instances.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE TypeSynonymInstances #-}
2 | {-# LANGUAGE FlexibleInstances #-}
3 | {-# OPTIONS_GHC -Wno-orphans #-}
4 | module Streaming.Instances () where
5 |
6 | import Streaming (Stream, lift)
7 | import Data.Attoparsec.ByteString.Streaming (Errors)
8 | import Control.Monad.Catch (MonadThrow(throwM), Exception)
9 |
-- | Lift 'throwM' through a 'Stream' layer: the exception is thrown in
-- the base monad.  (Orphan instance — hence -Wno-orphans above.)
instance (Functor f, MonadThrow m) => MonadThrow (Stream f m) where
  throwM e = lift (throwM e)

-- | Allow attoparsec-streaming 'Errors' to be thrown as exceptions,
-- relying entirely on the default 'Exception' method implementations.
instance Exception Errors where
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/datasets/test/Spec.hs:
--------------------------------------------------------------------------------
1 | module Main where
2 |
3 | import Test.Hspec
4 | import Test.Hspec.QuickCheck
5 | import Test.QuickCheck
6 |
7 |
-- | Test entry point: hand the spec to Hspec's default runner.
main :: IO ()
main = hspec spec

-- | A single placeholder example confirming the test harness works.
spec :: Spec
spec =
  describe "Spec" $
    it "works" $
      (41 + 1) `shouldBe` 42
16 |
--------------------------------------------------------------------------------
/dense-linear-algebra/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | 0.2.0.0
2 | * Added unit tests
3 |
4 | * Introduced 'Decimal' for floating point comparison
5 |
6 |
--------------------------------------------------------------------------------
/dense-linear-algebra/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright (c) 2009, 2010 Bryan O'Sullivan
2 | All rights reserved.
3 |
4 | Redistribution and use in source and binary forms, with or without
5 | modification, are permitted provided that the following conditions
6 | are met:
7 |
8 | * Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above
12 | copyright notice, this list of conditions and the following
13 | disclaimer in the documentation and/or other materials provided
14 | with the distribution.
15 |
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 |
--------------------------------------------------------------------------------
/dense-linear-algebra/README.md:
--------------------------------------------------------------------------------
1 | # statistics-dense-linear-algebra
2 |
3 | [](https://travis-ci.org/githubuser/statistics-dense-linear-algebra)
4 |
5 | The dense linear algebra functionality and related modules, extracted from `statistics-0.14.0.2`
6 |
--------------------------------------------------------------------------------
/dense-linear-algebra/Setup.hs:
--------------------------------------------------------------------------------
1 | import Distribution.Simple
-- Standard Cabal Simple build with no custom hooks.
main = defaultMain
3 |
--------------------------------------------------------------------------------
/dense-linear-algebra/bench/ChronosBench.hs:
--------------------------------------------------------------------------------
1 | module Main where
2 |
3 | import qualified Statistics.Matrix as M
4 | import qualified Statistics.Matrix.Fast as F
5 | import qualified Statistics.Matrix.Fast.Algorithms as FA
6 | import Statistics.Matrix (Matrix (..))
7 | import qualified Statistics.Matrix.Algorithms as A
8 |
9 | import qualified Data.Vector.Unboxed as U
10 | import Data.Vector.Unboxed (Vector)
11 |
12 | import qualified System.Random.MWC as Mwc
13 |
14 | import qualified Chronos.Bench as C
15 |
-- | Side length of the square benchmark matrices; the raw benchmark
-- vector has length n*n.
n :: Int
n = 100

-- | A length n*n vector of uniform random doubles from a fixed-seed
-- generator ('Mwc.create'), so benchmark runs are reproducible.
testVector :: IO (Vector Double)
testVector = do
  gen <- Mwc.create
  Mwc.uniformVector gen (n*n)


-- | An n-by-n matrix backed by 'testVector'.
testMatrix :: IO Matrix
testMatrix = do
  vec <- testVector
  return $ Matrix n n vec
29 |
-- | Benchmark the cheap operations (norm, matrix-vector product,
-- transpose, ident, diag), comparing Statistics.Matrix against its
-- Statistics.Matrix.Fast variants; 3-second timeout per benchmark.
runtimelight :: Vector Double -> Matrix -> IO ()
runtimelight v a = do
  let
    -- length-n prefix, conformable with the n-by-n matrix
    v2 = U.take n v

  C.defaultMainWith (C.defaultConfig {C.timeout = Just 3}) [
              C.bench "norm" M.norm v2,
              C.bench "Fast.norm" F.norm v2,

              C.bench "multiplyV" (M.multiplyV a) (v2),
              C.bench "Fast.multiplyV" (F.multiplyV a) (v2),

              C.bench "transpose" M.transpose a,
              C.bench "Fast.transpose" F.transpose a ,
              C.bench "ident" M.ident n,
              C.bench "diag" M.diag v2
              ]

-- | Benchmark the expensive operations (matrix multiply and QR),
-- with a shorter 1-second timeout per benchmark.
runtimeheavy :: Matrix -> Matrix -> IO ()
runtimeheavy a b = do

  C.defaultMainWith (C.defaultConfig {C.timeout = Just 1}) [
              C.bench "multiply" (M.multiply a) b,
              C.bench "Fast.multiply" (F.multiply a) b,
              C.bench "qr" A.qr a,
              C.bench "Fast.qr" FA.qr a
              ]
57 |
58 |
-- | Generate the fixed-seed inputs, then run the light and heavy
-- benchmark groups separately; splitting them loses some precision in
-- chronos' bar plots but keeps runtimes manageable.
main :: IO ()
main = do
  v <- testVector
  a <- testMatrix
  b <- testMatrix

  putStrLn "---Benchmarking light operations---"
  runtimelight v a
  putStrLn "---Benchmarking heavy operations---"
  runtimeheavy a b
71 |
--------------------------------------------------------------------------------
/dense-linear-algebra/bench/WeighBench.hs:
--------------------------------------------------------------------------------
1 | module Main where
2 |
3 | import qualified Statistics.Matrix as M
4 | import qualified Statistics.Matrix.Fast as F
5 | import qualified Statistics.Matrix.Fast.Algorithms as FA
6 | import Statistics.Matrix (Matrix (..))
7 | import qualified Statistics.Matrix.Algorithms as A
8 |
9 | import qualified Data.Vector.Unboxed as U
10 | import Data.Vector.Unboxed (Vector)
11 |
12 | import qualified System.Random.MWC as Mwc
13 |
14 | import qualified Weigh as W
15 |
-- | Side length of the square benchmark matrices; the raw benchmark
-- vector has length n*n.
n :: Int
n = 100

-- | A length n*n vector of uniform random doubles from a fixed-seed
-- generator ('Mwc.create'), so runs are reproducible.
testVector :: IO (Vector Double)
testVector = do
  gen <- Mwc.create
  Mwc.uniformVector gen (n*n)


-- | An n-by-n matrix backed by 'testVector'.
testMatrix :: IO Matrix
testMatrix = do
  vec <- testVector
  return $ Matrix n n vec
29 |
30 |
-- | Measure allocation (via the weigh package) for each operation,
-- comparing Statistics.Matrix against its Fast variants.
weight :: Vector Double -> Matrix -> Matrix -> IO ()
weight v a b = do
  let
    -- length-n prefix, conformable with the n-by-n matrix
    v2 = U.take n v
  W.mainWith (do
              W.func "norm" M.norm v2
              W.func "Fast.norm" F.norm v2

              W.func "multiplyV" (M.multiplyV a) (v2)
              W.func "Fast.multiplyV" (F.multiplyV a) (v2)
              W.func "transpose" M.transpose a
              W.func "Fast.transpose" F.transpose a
              W.func "ident" M.ident n
              W.func "diag" M.diag v2

              W.func "multiply" (M.multiply a) b
              W.func "Fast.multiply" (F.multiply a) b

              W.func "qr" A.qr a
              W.func "Fast.qr" FA.qr a

              )
53 |
54 |
55 |
-- | Generate the fixed-seed inputs and run the allocation benchmarks.
main :: IO ()
main = do
  v <- testVector
  a <- testMatrix
  b <- testMatrix

  putStrLn "---Benchmarking memory consumption---"
  weight v a b
64 |
--------------------------------------------------------------------------------
/dense-linear-algebra/dense-linear-algebra.cabal:
--------------------------------------------------------------------------------
1 | name: dense-linear-algebra
2 | version: 0.2.0.0
3 | synopsis: Simple and incomplete pure haskell implementation of linear algebra
4 | description:
5 | This library is simply collection of linear-algebra related modules
6 | split from statistics library.
7 |
8 | license: BSD2
9 | license-file: LICENSE
10 | author: Bryan O'Sullivan
11 | maintainer: Alexey Khudaykov
12 | copyright: 2018 Author name here
13 | category: Math, Statistics, Numeric
14 | build-type: Simple
15 | extra-source-files: README.md
16 | CHANGELOG.md
17 | cabal-version: >=1.10
18 | tested-with: GHC==7.6.3, GHC==7.8.3, GHC==7.10.3, GHC==8.0.1
19 |
20 | library
21 | default-language: Haskell2010
22 | ghc-options: -Wall -O2
23 | hs-source-dirs: src
24 | exposed-modules: Statistics.Matrix
25 | Statistics.Matrix.Algorithms
26 | Statistics.Matrix.Function
27 | Statistics.Matrix.Mutable
28 | Statistics.Matrix.Types
29 | Statistics.Matrix.Fast
30 | Statistics.Matrix.Fast.Algorithms
31 | build-depends: base >= 4.5 && < 5
32 | , deepseq >= 1.1.0.2
33 | , math-functions >= 0.1.7
34 | , primitive >= 0.3
35 | , vector >= 0.10
36 | , vector-algorithms >= 0.4
37 | , vector-th-unbox
38 | , vector-binary-instances >= 0.2.1
39 | , Decimal
40 |
41 | test-suite spec
42 | default-language: Haskell2010
43 | ghc-options: -Wall
44 | type: exitcode-stdio-1.0
45 | hs-source-dirs: test
46 | other-modules: Fixtures,
47 | Utils,
48 | AlgorithmsSpec
49 | main-is: LibSpec.hs
50 | build-depends: base
51 | , dense-linear-algebra
52 | , hspec
53 | , QuickCheck
54 | , vector
55 | , Decimal
56 |
57 | benchmark weigh-bench
58 | type: exitcode-stdio-1.0
59 | main-is: WeighBench.hs
60 | hs-source-dirs: bench
61 | default-extensions: OverloadedStrings
62 | ghc-options: -Wall
63 | build-depends:
64 | base,
65 | dense-linear-algebra,
66 | vector,
67 | mwc-random,
68 | weigh
69 |
70 | benchmark chronos-bench
71 | type: exitcode-stdio-1.0
72 | main-is: ChronosBench.hs
73 | hs-source-dirs: bench
74 | default-extensions: OverloadedStrings
75 | ghc-options: -Wall
76 | build-depends:
77 | base,
78 | dense-linear-algebra,
79 | chronos-bench,
80 | vector,
81 | mwc-random
82 |
83 | source-repository head
84 | type: git
85 | location: https://github.com/bos/statistics
86 |
--------------------------------------------------------------------------------
/dense-linear-algebra/src/Statistics/Matrix/Algorithms.hs:
--------------------------------------------------------------------------------
1 | -- |
2 | -- Module : Statistics.Matrix.Algorithms
3 | -- Copyright : 2014 Bryan O'Sullivan
4 | -- License : BSD3
5 | --
6 | -- Useful matrix functions.
7 |
8 | module Statistics.Matrix.Algorithms
9 | (
10 | qr
11 | ) where
12 |
13 | import Control.Applicative ((<$>), (<*>))
14 | import Control.Monad.ST (ST, runST)
15 | import Prelude hiding (replicate)
16 | import Numeric.Sum (sumVector,kbn)
17 | import Statistics.Matrix (Matrix, column, dimension, for, norm)
18 | import qualified Statistics.Matrix.Mutable as M
19 | import qualified Data.Vector.Unboxed as U
20 |
-- | /O(m*n^2)/ Compute the QR decomposition of a matrix.
-- The result returned is the matrices (/q/,/r/).
--
-- Column-by-column orthogonalisation: each column is normalised and its
-- component is then subtracted from every later column (modified
-- Gram-Schmidt update order).
-- NOTE(review): a zero column yields cn == 0 and a division by zero.
qr :: Matrix -> (Matrix, Matrix)
qr mat = runST $ do
  let (m,n) = dimension mat

  r <- M.replicate n n 0   -- n-by-n upper-triangular factor, zero-initialised
  a <- M.thaw mat          -- mutable working copy; becomes q
  for 0 n $ \j -> do
    cn <- M.immutably a $ \aa -> norm (column aa j)
    M.unsafeWrite r j j cn
    for 0 m $ \i -> M.unsafeModify a i j (/ cn)   -- normalise column j
    for (j+1) n $ \jj -> do
      p <- innerProduct a j jj
      M.unsafeWrite r j jj p
      for 0 m $ \i -> do
        aij <- M.unsafeRead a i j
        M.unsafeModify a i jj $ subtract (p * aij)  -- remove the q_j component
  (,) <$> M.unsafeFreeze a <*> M.unsafeFreeze r
40 |
-- | Dot product of columns j and k of a frozen view of the matrix,
-- summed with Kahan-Babuska-Neumaier compensation ('kbn') for accuracy.
innerProduct :: M.MMatrix s -> Int -> Int -> ST s Double
innerProduct mmat j k = M.immutably mmat $ \mat ->
  sumVector kbn $ U.zipWith (*) (column mat j) (column mat k)
44 |
--------------------------------------------------------------------------------
/dense-linear-algebra/src/Statistics/Matrix/Fast.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns #-}
2 |
3 | module Statistics.Matrix.Fast (
4 | multiply,
5 | norm,
6 | multiplyV,
7 | transpose
8 | ) where
9 |
10 | import Prelude hiding (exponent, map)
11 | import Control.Monad.ST
12 | import qualified Data.Vector.Unboxed as U
13 | import qualified Data.Vector.Unboxed.Mutable as UM
14 |
15 | import Statistics.Matrix (row)
16 | import Statistics.Matrix.Function
17 | import Statistics.Matrix.Types
18 | import Statistics.Matrix.Mutable (unsafeNew,unsafeWrite,unsafeFreeze)
19 |
-- | Matrix-matrix multiplication in a more imperative fashion. Matrices must be of compatible
-- sizes (/note: not checked/). Faster but less accurate than Statistics.Matrix.multiply
-- (plain accumulation via 'accum' instead of compensated summation).
multiply :: Matrix -> Matrix -> Matrix
multiply m1@(Matrix r1 _ _) m2@(Matrix _ c2 _) = runST $ do
  m3 <- unsafeNew r1 c2
  for 0 c2 $ \j -> do
    for 0 r1 $ \i -> do
      let
        z = accum i m1 j m2
      unsafeWrite m3 i j z
  unsafeFreeze m3
31 |
-- | Dot product of row @ithrow@ of the first matrix with column
-- @jthcol@ of the second, by direct (unchecked) indexing into the flat
-- row-major vectors; @c1@, the column count of m1, is the shared inner
-- dimension.
accum :: Int -> Matrix -> Int -> Matrix -> Double
accum ithrow (Matrix _ c1 v1) jthcol (Matrix _ c2 v2) = sub 0 0
  where sub !acc !ij | ij == c1 = acc
                     | otherwise = sub ( valRow*valCol + acc ) (ij+1)
          where
            valRow = U.unsafeIndex v1 (ithrow*c1 + ij)
            valCol = U.unsafeIndex v2 (ij*c2+jthcol)
39 |
-- | Matrix-vector multiplication, with better performances but not as accurate as
-- Statistics.Matrix.multiplyV.  Unlike 'multiply', conformability IS
-- checked here: the vector length must equal the matrix column count.
multiplyV :: Matrix -> Vector -> Vector
multiplyV m v
  | cols m == c = U.generate (rows m) (U.sum . U.zipWith (*) v . row m)
  | otherwise = error $ "matrix/vector unconformable " ++ show (cols m,c)
  where c = U.length v
47 |
-- | Euclidean norm of a vector: the square root of the sum of squared
-- entries.  Faster but less accurate than Statistics.Matrix.norm (plain
-- summation, no compensation).
norm :: Vector -> Double
norm v = sqrt (U.sum (U.map square v))
51 |
-- | Transpose of a matrix.
--
-- Fix: the previous implementation swapped (i,j)/(j,i) pairs over a
-- square index range, which is only correct — and only in bounds, given
-- 'U.unsafeIndex' — for square matrices (for r0 > c0 it indexed past
-- the end of the buffer).  This version maps indices directly and
-- handles any r0-by-c0 matrix; behaviour on square inputs is unchanged.
transpose :: Matrix -> Matrix
transpose (Matrix r0 c0 v0)
  = Matrix c0 r0 $ U.generate (r0*c0) $ \k ->
      -- element (i,j) of the c0-by-r0 result is element (j,i) of the input
      let (i, j) = k `quotRem` r0
      in v0 `U.unsafeIndex` (j*c0 + i)
64 |
--------------------------------------------------------------------------------
/dense-linear-algebra/src/Statistics/Matrix/Fast/Algorithms.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns #-}
2 |
3 | -- |
4 | -- Module : Statistics.Matrix.Fast.Algorithms
5 | -- Copyright : 2019 Magalame
6 | -- License : BSD3
7 | --
8 | -- Useful matrix functions.
9 |
10 | module Statistics.Matrix.Fast.Algorithms
11 | (
12 | qr
13 | ) where
14 |
15 | import Control.Applicative ((<$>), (<*>))
16 | import Control.Monad.ST (ST, runST)
17 | import Prelude hiding (replicate)
18 | import Statistics.Matrix (Matrix (..),dimension, for)
19 | import qualified Statistics.Matrix.Mutable as M
20 | import qualified Data.Vector.Unboxed as U
21 |
-- | /O(m*n^2)/ Compute the QR decomposition of a matrix.
-- The result returned is the matrices (/q/,/r/).
--
-- Same column-by-column scheme as Statistics.Matrix.Algorithms.qr, but
-- with uncompensated direct-indexing inner loops ('normCol', 'dotCol'):
-- faster, less accurate.
-- NOTE(review): a zero column yields cn == 0 and a division by zero.
qr :: Matrix -> (Matrix, Matrix)
qr mat = runST $ do
  let (m,n) = dimension mat

  r <- M.replicate n n 0   -- n-by-n upper-triangular factor
  a <- M.thaw mat          -- mutable working copy; becomes q
  for 0 n $ \j -> do
    cn <- M.immutably a $ \aa -> sqrt $ normCol j aa
    M.unsafeWrite r j j cn
    for 0 m $ \i -> M.unsafeModify a i j (/ cn)   -- normalise column j
    for (j+1) n $ \jj -> do
      p <- innerProduct a j jj
      M.unsafeWrite r j jj p
      for 0 m $ \i -> do
        aij <- M.unsafeRead a i j
        M.unsafeModify a i jj $ subtract (p * aij)  -- remove the q_j component
  (,) <$> M.unsafeFreeze a <*> M.unsafeFreeze r
41 |
-- | Sum of squares of column @jthcol@ — the SQUARED norm; the caller
-- ('qr') applies the final 'sqrt'.  Unchecked flat indexing.
normCol :: Int -> Matrix -> Double
normCol jthcol (Matrix r c v) = sub 0 0
  where sub !acc !ij | ij == r = acc
                     | otherwise = sub ( valCol*valCol + acc ) (ij+1)
          where
            valCol = U.unsafeIndex v (ij*c+jthcol)

-- | Dot product of columns j and k over a frozen view of the mutable
-- matrix (no summation compensation, unlike the Algorithms variant).
innerProduct :: M.MMatrix s -> Int -> Int -> ST s Double
innerProduct mmat j k = M.immutably mmat $ \mat ->
  dotCol j mat k mat

-- | Dot product of column @jthcol@ of the first matrix with column
-- @kthcol@ of the second, by direct (unchecked) flat indexing.
dotCol :: Int -> Matrix -> Int -> Matrix -> Double
dotCol jthcol (Matrix r1 c1 v1) kthcol (Matrix _ c2 v2) = sub 0 0
  where sub !acc !ij | ij == r1 = acc
                     | otherwise = sub ( valColj*valColk + acc ) (ij+1)
          where
            valColk = U.unsafeIndex v2 (ij*c2+kthcol)
            valColj = U.unsafeIndex v1 (ij*c1+jthcol)
--------------------------------------------------------------------------------
/dense-linear-algebra/src/Statistics/Matrix/Function.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE BangPatterns #-}
2 | -- |
3 | module Statistics.Matrix.Function where
4 |
-- | The square of a number.
square :: Double -> Double
square x = x ^ (2 :: Int)
8 |
-- | Simple for loop. Counts from /start/ to /end/-1.
-- The bang on the end bound forces it once, before the loop starts.
for :: Monad m => Int -> Int -> (Int -> m ()) -> m ()
for n0 !n f = loop n0
  where
    loop i | i == n = return ()
           | otherwise = f i >> loop (i+1)
{-# INLINE for #-}
16 |
--------------------------------------------------------------------------------
/dense-linear-algebra/src/Statistics/Matrix/Mutable.hs:
--------------------------------------------------------------------------------
1 | -- |
2 | -- Module : Statistics.Matrix.Mutable
3 | -- Copyright : (c) 2014 Bryan O'Sullivan
4 | -- License : BSD3
5 | --
6 | -- Basic mutable matrix operations.
7 |
8 | module Statistics.Matrix.Mutable
9 | (
10 | MMatrix(..)
11 | , MVector
12 | , replicate
13 | , thaw
14 | , bounds
15 | , unsafeNew
16 | , unsafeFreeze
17 | , unsafeRead
18 | , unsafeWrite
19 | , unsafeModify
20 | , immutably
21 | , unsafeBounds
22 | ) where
23 |
24 | import Control.Applicative ((<$>))
25 | import Control.DeepSeq (NFData(..))
26 | import Control.Monad.ST (ST)
27 | import Statistics.Matrix.Types (Matrix(..), MMatrix(..), MVector)
28 | import qualified Data.Vector.Unboxed as U
29 | import qualified Data.Vector.Unboxed.Mutable as M
30 | import Prelude hiding (replicate)
31 |
-- | A mutable r-by-c matrix with every element set to @k@.
replicate :: Int -> Int -> Double -> ST s (MMatrix s)
replicate r c k = MMatrix r c <$> M.replicate (r*c) k

-- | A mutable copy of an immutable matrix (the backing vector is copied).
thaw :: Matrix -> ST s (MMatrix s)
thaw (Matrix r c v) = MMatrix r c <$> U.thaw v

-- | Freeze without copying: the result shares the mutable buffer, so
-- the source must not be written afterwards.
unsafeFreeze :: MMatrix s -> ST s Matrix
unsafeFreeze (MMatrix r c mv) = Matrix r c <$> U.unsafeFreeze mv

-- | Allocate new matrix. Matrix content is not initialized hence unsafe.
unsafeNew :: Int -- ^ Number of row
          -> Int -- ^ Number of columns
          -> ST s (MMatrix s)
unsafeNew r c
  | r < 0 = error "Statistics.Matrix.Mutable.unsafeNew: negative number of rows"
  | c < 0 = error "Statistics.Matrix.Mutable.unsafeNew: negative number of columns"
  | otherwise = do
      vec <- M.new (r*c)
      return $ MMatrix r c vec
51 |
-- | Read element (r,c); no bounds checking.
unsafeRead :: MMatrix s -> Int -> Int -> ST s Double
unsafeRead mat r c = unsafeBounds mat r c M.unsafeRead
{-# INLINE unsafeRead #-}

-- | Write @k@ at element (row,col); no bounds checking.
unsafeWrite :: MMatrix s -> Int -> Int -> Double -> ST s ()
unsafeWrite mat row col k = unsafeBounds mat row col $ \v i ->
  M.unsafeWrite v i k
{-# INLINE unsafeWrite #-}

-- | Apply @f@ to element (row,col) in place; no bounds checking.
unsafeModify :: MMatrix s -> Int -> Int -> (Double -> Double) -> ST s ()
unsafeModify mat row col f = unsafeBounds mat row col $ \v i -> do
  k <- M.unsafeRead v i
  M.unsafeWrite v i (f k)
{-# INLINE unsafeModify #-}
66 |
-- | Given row and column numbers, calculate the offset into the flat
-- row-major vector, erroring out when either index is out of range.
bounds :: MMatrix s -> Int -> Int -> (MVector s -> Int -> r) -> r
bounds (MMatrix rs cs mv) r c k
  | r < 0 || r >= rs = error "row out of bounds"
  | c < 0 || c >= cs = error "column out of bounds"
  | otherwise = k mv $! r * cs + c
{-# INLINE bounds #-}

-- | Given row and column numbers, calculate the offset into the flat
-- row-major vector, without checking.
unsafeBounds :: MMatrix s -> Int -> Int -> (MVector s -> Int -> r) -> r
unsafeBounds (MMatrix _ cs mv) r c k = k mv $! r * cs + c
{-# INLINE unsafeBounds #-}

-- | Run a pure function over a frozen view of a mutable matrix.  The
-- view comes from 'unsafeFreeze' (no copy); forcing the result to
-- normal form ('rnf') before returning ensures no lazy thunk can
-- observe the buffer after later mutation.
immutably :: NFData a => MMatrix s -> (Matrix -> a) -> ST s a
immutably mmat f = do
  k <- f <$> unsafeFreeze mmat
  rnf k `seq` return k
{-# INLINE immutably #-}
87 |
--------------------------------------------------------------------------------
/dense-linear-algebra/src/Statistics/Matrix/Types.hs:
--------------------------------------------------------------------------------
1 | {-# LANGUAGE DeriveAnyClass #-}
2 | {-# LANGUAGE DeriveGeneric #-}
3 | -- |
4 | -- Module : Statistics.Matrix.Types
5 | -- Copyright : 2014 Bryan O'Sullivan
6 | -- License : BSD3
7 | --
8 | -- Basic matrix operations.
9 | --
10 | -- There isn't a widely used matrix package for Haskell yet, so
11 | -- we implement the necessary minimum here.
12 |
13 | module Statistics.Matrix.Types
14 | (
15 | Vector
16 | , MVector
17 | , Matrix(..)
18 | , MMatrix(..)
19 | , debug
20 | ) where
21 |
22 | import Data.Char (isSpace)
23 | import Numeric (showFFloat)
24 | import qualified Data.Vector.Unboxed as U
25 | import qualified Data.Vector.Unboxed.Mutable as M
26 |
27 | import GHC.Generics (Generic)
28 | import Control.DeepSeq (NFData)
29 |
-- | Immutable unboxed vector of Doubles.
type Vector = U.Vector Double
-- | Mutable unboxed vector of Doubles in state thread @s@.
type MVector s = M.MVector s Double

-- | Two-dimensional matrix, stored in row-major order.
data Matrix = Matrix {
      rows :: {-# UNPACK #-} !Int -- ^ Rows of matrix.
    , cols :: {-# UNPACK #-} !Int -- ^ Columns of matrix.
    , _vector :: !Vector -- ^ Matrix data.
    } deriving (Eq, Generic, NFData)

-- | Two-dimensional mutable matrix, stored in row-major order.
-- Fields are: row count, column count, flat element buffer.
data MMatrix s = MMatrix
                 {-# UNPACK #-} !Int
                 {-# UNPACK #-} !Int
                 !(MVector s)
45 |
-- The Show instance is useful only for debugging.
instance Show Matrix where
  show = debug

-- | Render a matrix as text: a "(rows,cols)" header followed by the
-- entries, one row per line, printed with up to 4 decimal places and
-- column-aligned on the decimal point.
debug :: Matrix -> String
debug (Matrix r c vs) = unlines $ zipWith (++) (hdr0 : repeat hdr) rrows
  where
    rrows = map (cleanEnd . unwords) . split $ zipWith (++) ldone tdone
    hdr0 = show (r,c) ++ " "
    hdr = replicate (length hdr0) ' '
    -- pad xs to width k with spaces, on the side selected by plus
    pad plus k xs = replicate (k - length xs) ' ' `plus` xs
    ldone = map (pad (++) (longest lstr)) lstr          -- integer parts, right-aligned
    tdone = map (pad (flip (++)) (longest tstr)) tstr   -- fractional parts, left-aligned
    (lstr, tstr) = unzip . map (break (=='.') . render) . U.toList $ vs
    longest = maximum . map length
    -- 4 decimal places, then strip trailing zeros and a bare '.'
    render k = reverse . dropWhile (=='.') . dropWhile (=='0') . reverse .
               showFFloat (Just 4) k $ ""
    split [] = []
    split xs = i : split rest where (i, rest) = splitAt c xs
    cleanEnd = reverse . dropWhile isSpace . reverse
66 |
--------------------------------------------------------------------------------
/dense-linear-algebra/stack.yaml:
--------------------------------------------------------------------------------
1 | resolver: lts-12.14
2 |
3 | packages:
4 | - .
5 | # Dependency packages to be pulled from upstream that are not in the resolver
6 | # using the same syntax as the packages field.
7 | # (e.g., acme-missiles-0.3)
8 | extra-deps:
9 | - torsor-0.1
10 | - chronos-1.0.5
11 | - chronos-bench-0.2.0.2
12 | - primitive-0.6.4.0
13 |
14 | # Override default flag values for local packages and extra-deps
15 | # flags: {}
16 |
17 | # Extra package databases containing global packages
18 | # extra-package-dbs: []
19 |
20 | # Control whether we use the GHC we find on the path
21 | # system-ghc: true
22 | #
23 | # Require a specific version of stack, using version ranges
24 | # require-stack-version: -any # Default
25 | # require-stack-version: ">=1.9"
26 | #
27 | # Override the architecture used by stack, especially useful on Windows
28 | # arch: i386
29 | # arch: x86_64
30 | #
31 | # Extra directories used by stack for building
32 | # extra-include-dirs: [/path/to/dir]
33 | # extra-lib-dirs: [/path/to/dir]
34 | #
35 | # Allow a newer minor version of GHC than the snapshot specifies
36 | # compiler-check: newer-minor
37 |
--------------------------------------------------------------------------------
/dense-linear-algebra/test/AlgorithmsSpec.hs:
--------------------------------------------------------------------------------
1 |
2 | module AlgorithmsSpec where
3 |
4 | import qualified Statistics.Matrix as M
5 | import qualified Statistics.Matrix.Types as T
6 | import qualified Statistics.Matrix.Algorithms as Alg
7 | import qualified Statistics.Matrix.Function as Func
8 | import qualified Statistics.Matrix.Mutable as Mut
9 |
10 | import Utils
11 |
12 | qrDecompositionInvariant :: T.Matrix -> Bool
13 | qrDecompositionInvariant mat =
14 | (M.multiply (fst res) (snd res)) == mat
15 | where
16 | res = Alg.qr mat
17 |
18 |
-- | The Q factor of a QR factorization should be orthogonal.
qrFirstOrthoInvariant :: T.Matrix -> Bool
qrFirstOrthoInvariant mat = Utils.isOrtho q
  where
    (q, _) = Alg.qr mat
23 |
-- | The R factor of a QR factorization should be upper triangular.
qrSecondTriInvariant :: T.Matrix -> Bool
qrSecondTriInvariant mat = Utils.isUpperTri r
  where
    (_, r) = Alg.qr mat
28 |
29 |
30 |
--------------------------------------------------------------------------------
/dense-linear-algebra/test/Fixtures.hs:
--------------------------------------------------------------------------------
1 |
2 | module Fixtures where
3 |
4 | import qualified Statistics.Matrix as M
5 | import qualified Statistics.Matrix.Types as T
6 |
-- | Entrywise generator for 'matA': position (a, b) maps to (a + b + 1) / (b + 1).
f :: Int -> Int -> Double
f a b = (x + y + 1) / (y + 1)
  where
    x = fromIntegral a
    y = fromIntegral b
11 |
-- | Entrywise generator for 'matB': position (a, b) maps to (a + b + 1) / (a*b + 1).
g :: Int -> Int -> Double
g a b = (x + y + 1) / (x * y + 1)
  where
    x = fromIntegral a
    y = fromIntegral b
16 |
-- | 3x3 row-list fixture.
-- NOTE(review): the rows are linearly dependent (row3 - row2 == row2 - row1),
-- so the derived matrix 'matC' is singular.
c :: [[Double]]
c = [[1.0, 2.0, 3.0], [4.0, 5.0, 6.0], [7.0, 8.0, 9.0]]

-- | 3x3 row-list fixture.
d :: [[Double]]
d = [[3.0, 2.5, 1.6], [5.5, 2.3, 6.9], [3.7, 9.1, 10.0]]

-- | 3x3 matrix generated entrywise by 'f'.
matA :: T.Matrix
matA = M.generate 3 3 f

-- | 3x3 matrix generated entrywise by 'g'.
matB :: T.Matrix
matB = M.generate 3 3 g

-- | Matrix built from the rows of 'c'.
matC :: T.Matrix
matC = M.fromRowLists c

-- | Matrix built from the rows of 'd'.
matD :: T.Matrix
matD = M.fromRowLists d

-- | The 3x3 identity matrix.
matId :: T.Matrix
matId = M.ident 3
37 |
38 |
39 |
--------------------------------------------------------------------------------
/dense-linear-algebra/test/LibSpec.hs:
--------------------------------------------------------------------------------
1 | module Main where
2 |
3 | import Test.Hspec
4 | -- import Test.Hspec.QuickCheck
5 |
6 | -- import qualified Statistics.Matrix as M
7 | -- import qualified Statistics.Matrix.Types as T
8 | -- import qualified Statistics.Matrix.Algorithms as Alg
9 |
10 | import qualified Fixtures as F
11 | import AlgorithmsSpec
12 | -- import Utils
13 |
-- | QR decomposition properties, exercised on the fixture matrices.
spec :: Spec
spec = describe "Q-R Decomposition" $ do
  it "Q x R returns the original matrix" $
    qrDecompositionInvariant F.matB `shouldBe` True
  it "Matrix Q is orthogonal for the QR factorization of an invertible matrix" $
    qrFirstOrthoInvariant F.matD `shouldBe` True
  it "Matrix R is triangular" $
    qrSecondTriInvariant F.matB `shouldBe` True


main :: IO ()
main = hspec spec
27 |
28 |
29 |
30 |
31 |
--------------------------------------------------------------------------------
/dense-linear-algebra/test/Spec.hs:
--------------------------------------------------------------------------------
1 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-}
2 |
3 |
4 |
--------------------------------------------------------------------------------
/dense-linear-algebra/test/Utils.hs:
--------------------------------------------------------------------------------
1 |
2 | module Utils where
3 |
4 | import Data.Vector.Unboxed as U
5 | import Prelude as P
6 | import Data.Decimal
7 | import GHC.Word
8 |
9 | import qualified Statistics.Matrix as M
10 | import qualified Statistics.Matrix.Types as T
11 | import qualified Statistics.Matrix.Algorithms as Alg
12 | import qualified Statistics.Matrix.Function as Func
13 | import qualified Statistics.Matrix.Mutable as Mut
14 |
15 | import qualified Fixtures as F
16 |
17 |
18 | -- | Operations on vectors
19 |
20 | -- | Operations on matrices
21 |
22 | -- | Check if two matrices are equal within some precision
-- | Check whether two matrices agree entrywise after rounding every
-- entry to the given number of decimal digits.
isEqual :: Word8 -> T.Matrix -> T.Matrix -> Bool
isEqual prec a b = makeDecimal prec a == makeDecimal prec b
30 |
31 |
32 | ------------------------------ matrix property checks --------------------------------------
33 |
34 |
35 | -- | Check if given matrix is orthogonal
-- | Check whether a square matrix is orthogonal, i.e. M * M^T equals the
-- identity within 8 decimal digits.
--
-- BUG FIX: previously this compared against the fixed 3x3 fixture
-- 'F.matId', so any non-3x3 input was rejected (or mis-shaped). Build an
-- identity of the matrix's own row count instead; behaviour for 3x3
-- inputs is unchanged.
isOrtho :: T.Matrix -> Bool
isOrtho mat = isEqual 8 (M.multiply mat (M.transpose mat)) (M.ident (fst (M.dimension mat)))
38 |
39 | -- | Checks if the n'th row of the matrix have 'n' leading zeros
-- | Check that the first n entries of the row are all zero.
hasLeadingZeros :: Int -> T.Vector -> Bool
hasLeadingZeros n row = U.all (== 0) (U.take n row)
42 |
43 | -- | pass the n'th row of the matrix to hasLeadingZeros
-- | Check that the n'th row of the matrix has at least n leading zeros,
-- i.e. the row is consistent with an upper-triangular shape.
isRowFromTriMatrix :: T.Matrix -> Int -> Bool
isRowFromTriMatrix mat n = hasLeadingZeros n (M.row mat n)
46 |
47 | -- | Helper function to check if the given matrix is upper triangular
-- | Check rows 0 .. n-1 for the leading-zero pattern of an upper-triangular matrix.
upperTriHelper :: T.Matrix -> Int -> Bool
upperTriHelper mat n = P.all (isRowFromTriMatrix mat) [0 .. n - 1]
50 |
51 | -- | to check if the given matrix is upper triangular
-- | Check whether the given matrix is upper triangular, by testing every
-- row (the row count comes from 'M.dimension').
isUpperTri :: T.Matrix -> Bool
isUpperTri mat = upperTriHelper mat (fst (M.dimension mat))
54 |
55 | -- | to check if the given matrix is invertible
-- | Check whether the given matrix is invertible, i.e. its determinant is
-- not numerically zero.
-- NOTE(review): despite its name, 'isFloatZero' returns True when the
-- magnitude EXCEEDS the epsilon, so this predicate is correct as written;
-- the helper's name is misleading — confirm before reusing it elsewhere.
isInvertible :: T.Matrix -> Bool
isInvertible mat = isFloatZero (det mat) 0.0000001
58 |
59 |
60 | ------------------------------- matrix manipulation functions -------------------------
61 |
62 |
63 | -- | make all values of the matrix absolute
-- | Take the absolute value of every entry of the matrix.
makeAbs :: T.Matrix -> T.Matrix
makeAbs = M.map P.abs
66 |
67 | -- | make all values of the (row-list represented matrix)
68 | -- | decimal values with a given precision
-- | Convert a matrix to its row-list representation with every entry
-- rounded to a 'Decimal' of the given precision.
makeDecimal :: Word8 -> T.Matrix -> [[Decimal]]
makeDecimal prec mat = P.map (P.map (realFracToDecimal prec)) (M.toRowLists mat)
76 |
77 |
78 | -- | to get the determinant of a given matrix
79 | -- | taken from here:
80 | -- | http://michaeljgilliland.blogspot.com/2013/04/determinant-in-haskell.html
det :: T.Matrix -> Double
det mat =
  let
    -- Laplace (cofactor) expansion along the first row.
    -- NOTE(review): partial — an empty matrix hits a non-exhaustive
    -- pattern; cost is O(n!), so only use on small test matrices.
    detHelper [[x]] = x
    detHelper mat = P.sum [(-1)^i*x*(detHelper (getRest i mat)) | (i, x) <- P.zip [0..] (P.head mat)]
    listMatrix = M.toRowLists mat
  in
    detHelper listMatrix
89 |
90 | -- | getRest function simply returns the matrix without the head row (topmost) and without the ith column
-- | The minor used by the cofactor expansion in 'det': drop the top row
-- and the given column.
getRest :: Int -> [[Double]] -> [[Double]]
getRest col = P.map (leaveElement col) . P.tail
97 |
98 |
99 | ----------------------------- auxiliary helper functions -----------------------------------
100 |
101 |
102 | -- | function to delete i'th element from list
-- | Remove the i'th (0-based) element from a list; out-of-range (or
-- negative) indices leave the list unchanged.
--
-- IMPROVEMENT: the previous 'Ord a' constraint was never used, so it has
-- been dropped — strictly more general and backward compatible.
leaveElement :: Int -> [a] -> [a]
leaveElement _ [] = []
leaveElement i (x:xs)
  | i == 0 = xs
  | otherwise = x : leaveElement (i-1) xs
108 |
109 | -- | compare a double value to some epsilon
-- | Compare the magnitude of a double against an epsilon.
-- NOTE(review): despite the name, this returns True when |n| EXCEEDS
-- |eps| — i.e. when n is NOT approximately zero. 'isInvertible' relies on
-- exactly this behaviour, so the semantics are deliberately preserved;
-- only the redundant @if c then True else False@ has been removed.
isFloatZero :: Double -> Double -> Bool
isFloatZero n eps = abs n > abs eps
115 |
--------------------------------------------------------------------------------
/dh-core-accelerate/README.md:
--------------------------------------------------------------------------------
1 | # core-accelerate
2 |
3 | [](https://travis-ci.org/DataHaskell/core-accelerate)
4 |
5 | TODO Description.
6 |
--------------------------------------------------------------------------------
/dh-core-accelerate/Setup.hs:
--------------------------------------------------------------------------------
-- Standard Cabal build shim: delegate everything to the Simple build type.
import Distribution.Simple
main = defaultMain
3 |
--------------------------------------------------------------------------------
/dh-core-accelerate/core-accelerate.cabal:
--------------------------------------------------------------------------------
1 | name: core-accelerate
2 | version: 0.1.0.0
3 | -- synopsis:
4 | -- description:
5 | homepage: https://github.com/DataHaskell/core-accelerate
6 | license: BSD3
7 | license-file: LICENSE
8 | author: DataHaskell
9 | maintainer: Data-Haskell@haskell.org
10 | copyright: 2018 DataHaskell
11 | category: Data Science
12 | build-type: Simple
13 | extra-source-files: README.md
14 | cabal-version: >=1.10
15 | tested-with: GHC == 8.4.3
16 |
17 | library
18 | default-language: Haskell2010
19 | ghc-options: -Wall
20 | hs-source-dirs: src
21 | exposed-modules: Lib
22 | build-depends: base >= 4.7 && < 5
23 | , accelerate
24 |
25 | -- executable core-accelerate
26 | -- default-language: Haskell2010
27 | -- ghc-options: -threaded -rtsopts -with-rtsopts=-N
28 | -- hs-source-dirs: app
29 | -- main-is: Main.hs
30 | -- build-depends: base
31 | -- , core-accelerate
32 |
33 | test-suite spec
34 | default-language: Haskell2010
35 | ghc-options: -Wall
36 | type: exitcode-stdio-1.0
37 | hs-source-dirs: test
38 | main-is: Spec.hs
39 | build-depends: base
40 | , core-accelerate
41 | , hspec
42 | , QuickCheck
43 |
44 | source-repository head
45 | type: git
46 | location: https://github.com/DataHaskell/core-accelerate
47 |
--------------------------------------------------------------------------------
/dh-core-accelerate/src/Lib.hs:
--------------------------------------------------------------------------------
1 | module Lib where
2 |
3 |
--------------------------------------------------------------------------------
/dh-core-accelerate/test/LibSpec.hs:
--------------------------------------------------------------------------------
1 | module LibSpec where
2 |
3 | import Test.Hspec
4 | import Test.Hspec.QuickCheck
5 |
6 | import Lib
7 |
main :: IO ()
main = hspec spec

-- | Placeholder spec for the empty 'Lib' module; the commented property
-- is a template for when real functions are added.
spec :: Spec
spec =
  describe "Lib" $ do
    it "works" $ do
      True `shouldBe` True
    -- prop "ourAdd is commutative" $ \x y ->
    --   ourAdd x y `shouldBe` ourAdd y x
18 |
--------------------------------------------------------------------------------
/dh-core-accelerate/test/Spec.hs:
--------------------------------------------------------------------------------
1 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-}
2 |
--------------------------------------------------------------------------------
/dh-core/.gitignore:
--------------------------------------------------------------------------------
1 | ## Haskell
2 | dist
3 | dist-*
4 | cabal-dev
5 | *.o
6 | *.hi
7 | *.chi
8 | *.chs.h
9 | *.dyn_o
10 | *.dyn_hi
11 | .hpc
12 | .hsenv
13 | .cabal-sandbox/
14 | cabal.sandbox.config
15 | *.prof
16 | *.aux
17 | *.hp
18 | *.eventlog
19 | .stack-work/
20 | cabal.project.local
21 | .HTF/
22 |
23 |
--------------------------------------------------------------------------------
/dh-core/CHANGELOG.md:
--------------------------------------------------------------------------------
1 | 0.1.0.0
2 | First version
3 |
--------------------------------------------------------------------------------
/dh-core/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright DataHaskell (c) 2018
2 |
3 | All rights reserved.
4 |
5 | Redistribution and use in source and binary forms, with or without
6 | modification, are permitted provided that the following conditions are met:
7 |
8 | * Redistributions of source code must retain the above copyright
9 | notice, this list of conditions and the following disclaimer.
10 |
11 | * Redistributions in binary form must reproduce the above
12 | copyright notice, this list of conditions and the following
13 | disclaimer in the documentation and/or other materials provided
14 | with the distribution.
15 |
16 | * Neither the name of DataHaskell nor the names of other
17 | contributors may be used to endorse or promote products derived
18 | from this software without specific prior written permission.
19 |
20 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
21 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
22 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
23 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
26 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
30 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 |
--------------------------------------------------------------------------------
/dh-core/README.md:
--------------------------------------------------------------------------------
1 | # Building
2 |
3 | Developers can use this `stack` command, which will trigger a re-build and run the tests every time a file in the project is modified:
4 |
5 | $ stack build --test --ghc-options -Wall --file-watch
6 |
7 |
8 | # GHC and Stackage compatibility
9 |
10 | Tested against:
11 |
12 | - Stackage LTS-12.13 (GHC 8.4.3)
--------------------------------------------------------------------------------
/dh-core/Setup.hs:
--------------------------------------------------------------------------------
-- Standard Cabal build shim: delegate everything to the Simple build type.
import Distribution.Simple
main = defaultMain
3 |
--------------------------------------------------------------------------------
/dh-core/core.cabal:
--------------------------------------------------------------------------------
1 | name: core
2 | version: 0.1.0.0
3 | synopsis: DataHaskell core project
4 | description: DataHaskell core project, please see README
5 | homepage: https://github.com/DataHaskell/dh-core
6 | license: BSD3
7 | license-file: LICENSE
8 | author: DataHaskell
9 | maintainer: Marco Zocca (ocramz fripost org), data-haskell@haskell.org
10 | copyright: 2018 DataHaskell
11 | category: Data Science
12 | build-type: Simple
13 | extra-source-files: README.md
14 | CHANGELOG.md
15 | cabal-version: >=1.10
16 | tested-with: GHC == 8.2.2, GHC == 8.4.3
17 |
18 | library
19 | default-language: Haskell2010
20 | ghc-options: -Wall
21 | hs-source-dirs: src
22 | exposed-modules: Lib
23 | Core.Numeric.BLAS
24 | Core.Numeric.BLAS.Class
25 | Core.Numeric.Statistics.Classification.DecisionTrees
26 | Core.Numeric.Statistics.Classification.Utils
27 | Core.Numeric.Statistics.Classification.Exceptions
28 | Core.Numeric.Statistics.InformationTheory
29 | Core.Data.Dataset
30 | Core.Data.Datum.Vector
31 |
32 | build-depends:
33 | base >=4.10 && <5,
34 | bytestring >=0.10.8.1,
35 | containers >=0.5.7.1,
36 | exceptions >=0.8.3,
37 | mwc-random,
38 | primitive,
39 | text >=1.2.2.2,
40 | vector >=0.12.0.1,
41 | vector-algorithms >=0.7.0.1,
42 | -- DEBUG
43 | hspec >=2.4.8,
44 | QuickCheck >=2.10.1
45 |
46 | -- executable core
47 | -- default-language: Haskell2010
48 | -- ghc-options: -threaded -rtsopts -with-rtsopts=-N
49 | -- hs-source-dirs: app
50 | -- main-is: Main.hs
51 | -- build-depends: base
52 | -- , core
53 |
54 | test-suite spec
55 | default-language: Haskell2010
56 | ghc-options: -Wall
57 | type: exitcode-stdio-1.0
58 | hs-source-dirs: test
59 | main-is: Spec.hs
60 | other-modules: LibSpec
61 | build-depends: base >=4.10 && < 5,
62 | core -any,
63 | hspec >=2.4.8,
64 | QuickCheck >=2.10.1
65 |
66 | source-repository head
67 | type: git
68 | location: https://github.com/DataHaskell/dh-core
69 |
--------------------------------------------------------------------------------
/dh-core/src/Core/Data/Dataset.hs:
--------------------------------------------------------------------------------
1 | {-# language DeriveFunctor, DeriveFoldable, DeriveTraversable #-}
2 | {-# language TypeFamilies #-}
3 | module Core.Data.Dataset where
4 |
5 | import qualified Data.Foldable as F (maximumBy)
6 | import Data.Ord (comparing)
7 |
8 | import qualified Data.Map.Strict as M (Map, empty, fromList, toList, fromListWith, mapWithKey, foldl', foldrWithKey, foldlWithKey', insert)
9 | import qualified Data.Map.Internal.Debug as M (showTree)
10 | -- import qualified Data.IntMap.Strict as IM
11 | -- import qualified Data.Set as S
12 |
13 | import System.Random.MWC
14 | import Control.Monad.Primitive
15 |
16 | import Core.Numeric.Statistics.Classification.Utils (Indexed(..), bootstrapNP)
17 |
18 | -- | Labeled dataset represented as a 'Map'. The map keys are the class labels
newtype Dataset k a = Dataset { unDataset :: M.Map k a } deriving (Eq, Show, Functor, Foldable, Traversable)

-- | Render the internal 'Map' structure as a tree (debugging aid).
showTree :: (Show k, Show a) => Dataset k a -> String
showTree (Dataset mm) = M.showTree mm

-- | A dataset with no classes.
empty :: Dataset k a
empty = Dataset M.empty

-- | Insert (or overwrite) the contents of class 'k'.
insert :: Ord k => k -> a -> Dataset k a -> Dataset k a
insert k ls (Dataset ds) = Dataset $ M.insert k ls ds

-- | Map over class contents, with access to the class label.
mapWithKey :: (k -> a -> b) -> Dataset k a -> Dataset k b
mapWithKey f (Dataset ds) = Dataset $ M.mapWithKey f ds

-- | Right fold with access to the class label.
foldrWithKey :: (k -> a -> b -> b) -> b -> Dataset k a -> b
foldrWithKey f z (Dataset ds) = M.foldrWithKey f z ds

-- | Strict left fold with access to the class label.
foldlWithKey' :: (a -> k -> b -> a) -> a -> Dataset k b -> a
foldlWithKey' f z (Dataset ds) = M.foldlWithKey' f z ds

-- | Build a dataset from (label, contents) pairs; for duplicate labels
-- the last pair wins (per 'M.fromList').
fromList :: Ord k => [(k, a)] -> Dataset k a
fromList ld = Dataset $ M.fromList ld

-- | Build a dataset from pairs, combining duplicate labels with 'f'.
fromListWith :: Ord k => (a -> a -> a) -> [(k, a)] -> Dataset k a
fromListWith f ld = Dataset $ M.fromListWith f ld

-- | All (label, contents) pairs, in ascending label order.
toList :: Dataset k a -> [(k, a)]
toList (Dataset ds) = M.toList ds
47 |
48 | -- lookup :: Ord k => k -> Dataset k a -> Maybe a
49 | -- lookup k (Dataset ds) = M.lookup k ds
50 |
-- | Size of the dataset: total number of items summed over all classes.
size :: Foldable t => Dataset k (t a) -> Int
size (Dataset ds) = M.foldl' (\acc l -> acc + length l) 0 ds

-- | Maximum likelihood estimate of class label: the label of the largest class.
-- NOTE(review): partial — 'F.maximumBy' throws on an empty dataset;
-- callers must guarantee at least one class.
mlClass :: Dataset k [a] -> k
mlClass = fst . F.maximumBy (comparing f) . toList where
  f (_, ll) = length ll


-- | Number of items in each class
sizeClasses :: (Foldable t, Num n) => Dataset k (t a) -> M.Map k n
sizeClasses (Dataset ds) = (fromIntegral . length) <$> ds

-- | Empirical class probabilities i.e. for each k, number of items in class k / total number of items
probClasses :: (Fractional prob, Foldable t) => Dataset k (t a) -> M.Map k prob
probClasses ds = (\n -> n / fromIntegral (size ds)) <$> sizeClasses ds
68 |
69 |
-- * Bootstrap

-- | Nonparametric bootstrap: each class is resampled (i.e. sampled with replacement)
bootstrap :: (Indexed f, PrimMonad m, Ix f ~ Int) =>
             Dataset k (f a)
          -> Int -- ^ Number of samples
          -> Int -- ^ Number of bootstrap resamples
          -> Gen (PrimState m)
          -> m [Dataset k [a]]
bootstrap ds@Dataset{} nsamples nboot gen = do
  -- Resample every class independently, then transpose ('sequenceA') so
  -- that each element of the result is one complete bootstrapped dataset.
  dss <- traverse (bootstrapNP nsamples nboot gen) ds
  pure $ sequenceA dss
82 |
--------------------------------------------------------------------------------
/dh-core/src/Core/Data/Datum/Vector.hs:
--------------------------------------------------------------------------------
1 | {-# language DeriveFunctor, DeriveFoldable, DeriveTraversable, GeneralizedNewtypeDeriving #-}
2 | module Core.Data.Datum.Vector where
3 |
4 | import qualified Data.IntMap as IM
5 | -- import qualified Data.Vector.Unboxed as VU
6 | import qualified Data.Vector as V
7 |
8 | import Control.Monad.Catch (MonadThrow(..))
9 | import Core.Numeric.Statistics.Classification.Exceptions
10 |
-- | Human-readable names for feature indices.
newtype FeatureLabels = FeatureLabels (IM.IntMap String) deriving (Eq, Show)

-- | Build feature labels from a list of names, checking that exactly 'n'
-- names are supplied; throws 'DimMismatchE' otherwise.
featureLabels :: MonadThrow m => Int -> [String] -> m FeatureLabels
featureLabels n ls
  | length ls == n = pure $ FeatureLabels $ IM.fromList $ zip [0..] ls
  | otherwise = throwM $ DimMismatchE "featureLabels" n (length ls)

-- | Label lookup. NOTE(review): partial — 'IM.!' throws if the key is absent.
lookupFeatureLabelUnsafe :: IM.Key -> FeatureLabels -> String
lookupFeatureLabelUnsafe i (FeatureLabels fl) = fl IM.! i
20 |
21 |
22 | -- | A data point i.e. a vector in R^n
newtype V a = V (V.Vector a) deriving (Eq, Show, Functor, Foldable, Traversable, Applicative, Monad)

-- | Build a 'V' from a list.
fromListV :: [a] -> V a
fromListV = V . V.fromList
{-# inline fromListV #-}

-- | Extract the entries of a 'V' as a list.
toListV :: V a -> [a]
toListV (V vv) = V.toList vv

-- | Elementwise pairing; truncates to the shorter argument (per 'V.zip').
zipV :: V a -> V b -> V (a, b)
zipV (V v1) (V v2) = V $ V.zip v1 v2
{-# inline zipV #-}

-- | Inverse of 'zipV'.
unzipV :: V (a, b) -> (V a, V b)
unzipV (V vs) = (V v1, V v2) where
  (v1, v2) = V.unzip vs
{-# inline unzipV #-}

-- | Smart constructor: checks the vector has the stated length 'n',
-- throwing 'DimMismatchE' otherwise.
mkV :: MonadThrow m => Int -> V.Vector a -> m (V a)
mkV n xs | dxs == n = pure $ V xs
         | otherwise = throwM $ DimMismatchE "mkV" n dxs where
  dxs = V.length xs

-- | Unchecked indexing; out-of-range 'j' throws (per 'V.!').
indexUnsafe :: V a -> Int -> a
(V vv) `indexUnsafe` j = vv V.! j
{-# inline indexUnsafe #-}

-- | Bounds-checked indexing: throws 'IndexOobE' instead of crashing.
(!) :: MonadThrow m => V a -> Int -> m a
v ! j | j >= 0 && j < d = pure $ v `indexUnsafe` j
      | otherwise = throwM $ IndexOobE "(!)" j 0 d where
  d = dim v

-- | Number of components.
dim :: V a -> Int
dim (V vv) = V.length vv
{-# inline dim #-}

-- | Right fold with the component index; the infinite index list is
-- truncated by 'zipV'.
foldrWithKey :: (Int -> a -> b -> b) -> b -> V a -> b
foldrWithKey f z vv = foldr ins z $ zipV (fromListV [0..]) vv where
  ins (i, x) acc = f i x acc
{-# inline foldrWithKey #-}


-- | Lift a scalar predicate to a predicate on the j'th component of a datum.
dataSplitDecision :: (a -> Bool) -> Int -> (V a -> Bool)
dataSplitDecision p j dat = p (dat `indexUnsafe` j)
67 |
68 | -- allComponents :: V (a -> Bool) -> V a -> Bool
69 | -- allComponents ps dat = all (== True) $ f <$> vps where
70 | -- vps = zipV ps dat
71 | -- f (p, vi) = p vi
72 | -- -- allComponents ps dat = all (== True) $ ps <*> dat
73 |
74 |
75 |
76 |
77 | -- | Vectors with measurable entries
78 |
79 | -- data Measurable a = BoundedBoth a a deriving (Eq, Show)
80 |
81 | -- newtype Xf f a = Xf (V.Vector (f a))
82 |
83 | -- type XMeas = Xf Measurable
84 |
--------------------------------------------------------------------------------
/dh-core/src/Core/Numeric/BLAS.hs:
--------------------------------------------------------------------------------
1 | module Core.Numeric.BLAS where
2 |
3 |
--------------------------------------------------------------------------------
/dh-core/src/Core/Numeric/Statistics/Classification/DecisionTrees.hs:
--------------------------------------------------------------------------------
1 | {-# language DeriveFunctor, DeriveFoldable, DeriveTraversable #-}
2 | {-# OPTIONS_GHC -Wno-type-defaults #-}
3 | module Core.Numeric.Statistics.Classification.DecisionTrees where
4 |
5 | import qualified Data.Foldable as F
6 | import qualified Data.Set as S
7 | import Data.Ord (comparing)
8 |
9 | import Core.Data.Dataset
10 | import qualified Core.Data.Datum.Vector as XV
11 | -- import Core.Numeric.Statistics.Classification.Utils
12 | import Core.Numeric.Statistics.InformationTheory (entropyR)
13 |
14 | -- | A binary tree.
15 | --
16 | -- Each leaf carries data of type 'a' and we can attach metadata of type 'd' at each branching point.
data Tree d a =
  Node d (Tree d a) (Tree d a)
  | Leaf a
  deriving (Eq, Show, Functor, Foldable, Traversable)

-- | Unfold a tree from a seed: the step function either finishes with a
-- leaf payload ('Left') or yields node metadata plus two child seeds ('Right').
unfoldTree :: (t -> Either a (d, t, t)) -> t -> Tree d a
unfoldTree f x =
  either Leaf (\(d, l, r) -> Node d (unfoldTree f l) (unfoldTree f r) ) (f x)


-- | Tree state : list of candidate dataset cuts (feature #, level)
data TState k a = TState {
    tsFeatCuts :: S.Set (Int, a) -- ^ Remaining candidate (feature index, threshold) cuts
  , tsDataset :: Dataset k [XV.V a] } -- ^ Data reaching this node

-- | Tree state + local tree depth
data TSd k a = TSd { tsDepth :: !Int, tState :: TState k a }

-- | Global options for growing decision trees
data TOptions = TOptions {
    toMaxDepth :: !Int -- ^ Max tree depth
  , toMinLeafSize :: !Int -- ^ Minimum size of the contents of a leaf
  , toOrder :: Order -- ^ Less than | Equal or larger than
  } deriving (Eq, Show)

-- | Tree node metadata
--
-- For decision trees, at each node we store the decision feature and its decision threshold
data TNData a = TNData {
    tJStar :: !Int -- ^ Decision feature index
  , tTStar :: a -- ^ Decision threshold
  } deriving (Eq)

instance Show a => Show (TNData a) where
  show (TNData j t) = unwords ["(j =", show j, ", t =", show t, ")"]
52 |
53 |
54 | -- | Split decision: find feature (value, index) that maximizes the entropy drop (i.e the information gain, or KL divergence between the joint and factored datasets)
55 | --
56 | -- NB generates empty leaves
-- | One unfolding step for building a decision tree with 'unfoldTree':
-- stop with a leaf ('Left') when the depth or minimum-leaf-size limit is
-- reached, or when one side of the best split would be empty; otherwise
-- ('Right') emit the split metadata and the two child states at depth + 1.
treeUnfoldStep :: (Ord a, Ord k) =>
                  TOptions
               -> TSd k a
               -> Either (Dataset k [XV.V a]) (TNData a, TSd k a, TSd k a)
treeUnfoldStep (TOptions maxdepth minls ord) (TSd depth tst)
  | depth >= maxdepth || sizeDs tst <= minls = Left (tsDataset tst)
  | sizeDs tsl == 0 = Left (tsDataset tsr) -- degenerate split: keep the non-empty side
  | sizeDs tsr == 0 = Left (tsDataset tsl)
  | otherwise = Right (mdata, tdsl, tdsr)
  where
    sizeDs = size . tsDataset
    mdata = TNData jstar tstar
    (jstar, tstar, tsl, tsr) = maxInfoGainSplit ordf tst
    ordf = fromOrder ord
    d' = depth + 1
    tdsl = TSd d' tsl
    tdsr = TSd d' tsr
74 |
75 |
76 |
77 | {- | Note (OPTIMIZATIONS maxInfoGainSplit)
78 |
79 | 1. After splitting a dataset, remove the (threshold, feature index) pair corresponding to the succesful split
80 |
81 | 2. " " " " , remove /all/ (threshold, index) pairs that are subsumed by the successful test (e.g in the test ((<), 3.2, 27) , remove all [(t, 27) | t <- [tmin ..], t < 3.2 ] ). This is only a useful optimization for /monotonic/ class boundaries.
82 | -}
83 |
-- | Tabulate the information gain for every candidate (feature, threshold)
-- pair and return the maximizing pair together with the two resulting
-- sub-datasets. The winning pair is removed from both children's
-- candidate sets (see Note OPTIMIZATIONS maxInfoGainSplit above).
-- NOTE(review): partial — 'F.maximumBy' throws if the candidate set is
-- empty; callers must guarantee at least one candidate cut.
maxInfoGainSplit :: (Ord k, Ord a, Eq a) =>
                    (a -> a -> Bool)
                 -> TState k a
                 -> (Int, a, TState k a, TState k a)
maxInfoGainSplit decision (TState tjs ds) = (jstar, tstar, TState tjs' dsl, TState tjs' dsr) where
  tjs' = S.delete (jstar, tstar) tjs -- See Note (OPTIMIZATIONS maxInfoGainSPlit)
  (jstar, tstar, _, dsl, dsr) = F.maximumBy (comparing third5) $ infog `map` S.toList tjs
  infog (j, t) = (j, t, h, dsl', dsr') where
    (h, dsl', dsr') = infoGainR (decision t) j ds
94 |
95 |
-- | Project the third component out of a 5-tuple.
third5 :: (a, b, c, d, e) -> c
third5 t = case t of (_, _, z, _, _) -> z
98 |
-- | Information gain due to a dataset split (regularized, H(0) := 0):
-- the entropy of the parent dataset minus the size-weighted entropies of
-- the two children. Returns the gain together with both children.
infoGainR :: (Ord k, Ord h, Floating h) =>
             (a -> Bool)
          -> Int
          -> Dataset k [XV.V a]
          -> (h, Dataset k [XV.V a], Dataset k [XV.V a])
infoGainR p j ds = (infoGain, dsl, dsr) where
  (dsl, pl, dsr, pr) = splitDatasetAtAttr p j ds
  (h0, hl, hr) = (entropyR ds, entropyR dsl, entropyR dsr)
  infoGain = h0 - (pl * hl + pr * hr)


-- | Helper for 'infoGain' and 'infoGainR': split the dataset on feature
-- 'j' with predicate 'p' and also return each child's fraction of the
-- parent size.
splitDatasetAtAttr :: (Fractional n, Ord k) =>
                      (a -> Bool)
                   -> Int
                   -> Dataset k [XV.V a]
                   -> (Dataset k [XV.V a], n, Dataset k [XV.V a], n)
splitDatasetAtAttr p j ds = (dsl, pl, dsr, pr) where
  sz = fromIntegral . size
  (dsl, dsr) = partition p j ds
  (s0, sl, sr) = (sz ds, sz dsl, sz dsr)
  pl = sl / s0
  pr = sr / s0
123 |
124 |
-- | Partition a Dataset in two, according to a decision predicate applied to a given feature.
--
-- e.g. "is the j'th component of datum X_i larger than threshold t ?"
partition :: (Foldable t, Ord k) =>
             (a -> Bool) -- ^ Decision function (element-level)
          -> Int -- ^ Feature index
          -> Dataset k (t (XV.V a))
          -> (Dataset k [XV.V a], Dataset k [XV.V a])
partition p j ds@Dataset{} = foldrWithKey insf (empty, empty) ds where
  -- Split each class row-wise: every class label appears in both halves,
  -- possibly with an empty list of rows.
  insf k lrow (l, r) = (insert k lp l, insert k rp r) where
    (lp, rp) = partition1 (XV.dataSplitDecision p j) lrow
136 |
137 |
138 |
139 | -- | Partition a Foldable in two lists according to a predicate
-- | Split the elements of any Foldable into those satisfying the
-- predicate (left) and the rest (right), preserving relative order.
partition1 :: Foldable t => (a -> Bool) -> t a -> ([a], [a])
partition1 p = foldr step ([], [])
  where
    step x (yes, no)
      | p x = (x : yes, no)
      | otherwise = (yes, x : no)
144 |
145 |
146 |
147 | -- | A well-defined Ordering, for strict half-plane separation
-- | A well-defined Ordering, for strict half-plane separation
data Order = LessThan | GreaterOrEqual deriving (Eq, Ord, Enum, Bounded)
instance Show Order where
  show LessThan = "<"
  show GreaterOrEqual = ">="
152 |
-- | Interpret an 'Order' as a binary comparison on any 'Ord' type.
--
-- IMPROVEMENT: the wildcard pattern has been replaced with an explicit
-- 'GreaterOrEqual' match, so adding a new 'Order' constructor produces an
-- incomplete-pattern warning here instead of being silently mapped to (>=).
fromOrder :: Ord a => Order -> (a -> a -> Bool)
fromOrder o = case o of
  LessThan -> (<)
  GreaterOrEqual -> (>=)
157 |
--------------------------------------------------------------------------------
/dh-core/src/Core/Numeric/Statistics/Classification/Exceptions.hs:
--------------------------------------------------------------------------------
1 | module Core.Numeric.Statistics.Classification.Exceptions where
2 |
3 | import Control.Exception
4 | import Data.Typeable
5 |
6 |
-- * Exceptions

-- | Raised when a probability-valued quantity is zero where a nonzero
-- value is required; the 'String' names the offending context.
data ValueException = ZeroProbabilityE String deriving (Eq, Show, Typeable)

instance Exception ValueException


-- | Errors in the shape or indexing of data.
data DataException =
  -- MissingFeatureE i
  IndexOobE String Int Int Int -- ^ context, offending index, lower bound, upper bound
  | DimMismatchE String Int Int -- ^ context, expected dimension, actual dimension
  deriving (Eq, Typeable)

instance Show DataException where
  show e = case e of
    IndexOobE errMsg ix blo bhi -> unwords [errMsg, ": index", show ix,"out of bounds", show (blo, bhi)]
    DimMismatchE errMsg d1 d2 -> unwords [errMsg, ": dimension mismatch : expecting", show d1, "but got", show d2]

instance Exception DataException
26 |
--------------------------------------------------------------------------------
/dh-core/src/Core/Numeric/Statistics/Classification/Utils.hs:
--------------------------------------------------------------------------------
1 | {-# language TypeFamilies #-}
2 | module Core.Numeric.Statistics.Classification.Utils where
3 |
4 | -- import qualified Data.Foldable as F (maximumBy, foldl', toList)
5 | import qualified Data.IntSet as SI
6 | -- import qualified Data.Set as S
7 | import qualified Data.IntMap.Strict as IM
8 |
9 | import System.Random.MWC
10 | import System.Random.MWC.Distributions
11 | import Control.Monad.Primitive
12 |
13 | import Control.Monad (foldM, replicateM)
14 | import Data.Maybe (maybeToList)
15 |
16 | -- import Control.Monad.Catch (MonadThrow(..))
17 | -- import Numeric.Classification.Exceptions
18 |
-- * Bootstrap

-- | Non-parametric bootstrap: draw 'nboot' independent resamples, each of
-- size 'nsamples' and drawn with replacement, from the dataset.
bootstrapNP :: (Indexed f, PrimMonad m, Ix f ~ Int) =>
               Int -- ^ Number of samples
            -> Int -- ^ Number of bootstrap resamples
            -> Gen (PrimState m)
            -> f a -- ^ Dataset
            -> m [[a]]
bootstrapNP nsamples nboot gen mm = replicateM nboot (resample nsamples mm gen)

-- *

-- | Sample with replacement
resample :: (Indexed f, PrimMonad m, Ix f ~ Int) =>
            Int -> f b -> Gen (PrimState m) -> m [b]
resample nsamples im gen = lookups (resampleIxs nsamples gen) im

-- | Sample without replacement : return a list of at most M unique random samples from an indexed map of size N : O(N)
sample :: (Indexed f, PrimMonad m, Ix f ~ Int) =>
          Int -> f b -> Gen (PrimState m) -> m [b]
sample nsamples im gen = lookups (sampleIxs nsamples gen) im

-- | Draw indices with the supplied sampling function, then look them all
-- up in the container. If any lookup fails the traversal is 'Nothing' and
-- the whole result collapses to 'mempty'.
lookups :: (Monad m, Monoid (t b), Traversable t, Indexed f) =>
           (Int -> m (t (Ix f))) -- ^ Sampling function
        -> f b
        -> m (t b)
lookups f mm = do
  ixs <- f (length mm)
  pure $ mconcat . maybeToList $ traverse (`ix` mm) ixs
49 |
50 | -- -- | Lookup a random subset and its complement
51 | -- lookups2 :: (Monad m, Monoid (t b), Traversable t, Indexed f) =>
52 | -- (Int -> m (t (Ix f), t (Ix f)))
53 | -- -> f b
54 | -- -> m (t b, t b)
55 | -- lookups2 f mm = do
56 | -- (ixl, ixr) <- f (length mm)
57 | -- let lookupf = mconcat . maybeToList . traverse (`ix` mm)
58 | -- pure (lookupf ixl, lookupf ixr)
59 |
60 |
61 |
62 |
63 | resampleIxs :: PrimMonad m => Int -> Gen (PrimState m) -> Int -> m [Int]
64 | resampleIxs nsamples gen n = replicateM nsamples (uniformR (0, n - 1) gen)
65 |
66 | sampleIxs :: PrimMonad m => Int -> Gen (PrimState m) -> Int -> m [Int]
67 | sampleIxs nsamples gen n = SI.toList <$> sampleUniques nsamples gen n
68 |
69 | -- | Random split based on extracting 'm' unique entries from a set of size 'n > m'
70 | randomSplit :: PrimMonad m =>
71 | Int -- ^ Number of samples
72 | -> Int -- ^ Size of universe set
73 | -> Gen (PrimState m)
74 | -> m (SI.IntSet, SI.IntSet)
75 | randomSplit nsamples n gen = do
76 | srand <- sampleUniques nsamples gen n
77 | let s0 = SI.fromList [0 .. n - 1]
78 | sDiff = s0 `SI.difference` srand
79 | pure (srand, sDiff)
80 |
-- | Stochastic random split of a set of size @n@ via independent Bernoulli
-- trials with parameter @0 <= p <= 1@: each element lands in the left set
-- with probability @p@, otherwise in the right set, so /on average/
-- @m = p * n@ elements end up on the left and @n - m@ on the right.
randomSplitBernoulli :: PrimMonad m =>
                        Double -- ^ Parameter of Bernoulli trial
                     -> Int
                     -> Gen (PrimState m)
                     -> m (SI.IntSet, SI.IntSet)
randomSplitBernoulli p n gen = foldM step (SI.empty, SI.empty) [0 .. n - 1]
  where
    step (accL, accR) i = do
      heads <- bernoulli p gen
      pure $ if heads
               then (SI.insert i accL, accR)
               else (accL, SI.insert i accR)
94 |
95 |
96 |
97 | -- sampleNoReplace iml nsamples gen
98 | -- | nsamples > n = pure $ throwM $ DimMismatchE "sampleIM" n nsamples
99 | -- | otherwise = do
100 |
101 |
-- | Sample without replacement: choose a set S of M unique random samples
-- from the population @[0 .. n - 1]@ of size N, using Floyd's algorithm
-- (O(M) draws, O(M) space).
--
-- NOTE(review): assumes @nsamples <= n@ — callers are expected to enforce
-- this; a larger @nsamples@ makes the candidate range start negative.
sampleUniques :: PrimMonad m =>
                 Int -- ^ # of unique numbers to sample (M)
              -> Gen (PrimState m)
              -> Int -- ^ Population size (N)
              -> m SI.IntSet
sampleUniques nsamples gen n = foldM sample1 SI.empty [p .. n - 1]
  where
    -- Floyd's algorithm, 0-based: j must range over [n - M .. n - 1] so the
    -- loop performs exactly M draws.  The previous start, n - M + 1, ran the
    -- loop only M - 1 times and thus returned one sample too few.
    p = n - nsamples
    sample1 s j = do
      t <- uniformR (0, j) gen
      -- If t was already taken, j itself is guaranteed fresh: j was not a
      -- candidate in any earlier iteration.
      let set' =
            if not (SI.member t s)
              then SI.insert t s
              else SI.insert j s
      return set'
119 |
120 | -- stest n ntot = withSystemRandom . asGenIO $ \g -> do
121 | -- let set = S.fromList [0..ntot - 1]
122 | -- sampleUniques set n g
123 |
124 |
125 |
126 |
-- | Indexable containers: 'Foldable' containers supporting total
-- (Maybe-valued) element lookup via an associated index type.
class Foldable f => Indexed f where
  -- | The type used to index into the container.
  type Ix f :: *
  -- | Look up one element; 'Nothing' when the index is out of range or absent.
  ix :: Ix f -> f a -> Maybe a
131 |
-- | Lists are indexed by 0-based 'Int' position; see 'indexSafe'.
instance Indexed [] where
  type Ix [] = Int
  ix = indexSafe
135 |
-- | 'IM.IntMap' is indexed by its key type ('IM.Key', i.e. 'Int').
instance Indexed IM.IntMap where
  type Ix IM.IntMap = IM.Key
  ix = IM.lookup
139 |
140 |
-- | Total 0-based list indexing.
--
-- Returns 'Nothing' for negative as well as out-of-bounds indices (the
-- previous version only checked @i < length ll@, so a negative index fell
-- through to @(!!)@ and crashed).  Implemented with 'drop' so the list is
-- traversed at most once instead of twice (@length@ then @(!!)@), and so
-- it works lazily on long lists.
indexSafe :: Int -> [a] -> Maybe a
indexSafe i ll
  | i < 0 = Nothing
  | otherwise = case drop i ll of
      (x:_) -> Just x
      _     -> Nothing
144 |
--------------------------------------------------------------------------------
/dh-core/src/Core/Numeric/Statistics/InformationTheory.hs:
--------------------------------------------------------------------------------
1 | module Core.Numeric.Statistics.InformationTheory where
2 |
3 | import Core.Data.Dataset
4 |
-- | Entropy (base 2) of a dataset's class distribution.
--
-- The @p * log p@ summand is regularised to 0 at @p = 0@ (its limit as
-- @p -> 0+@); see 'entropyR_'.
--
-- NOTE(review): assumes 'probClasses' yields per-class probabilities —
-- confirm against "Core.Data.Dataset".
entropyR :: (Foldable t, Ord h, Floating h) => Dataset k (t a) -> h
entropyR ds = entropyR_ (probClasses ds)
8 |
-- | @- sum (p_i * logBase 2 p_i)@ over a collection of probabilities.
--
-- Summands with @p <= 0@ contribute 0, matching the limit of @p * log p@
-- as @p -> 0+@ and sidestepping the singularity of @log@ at 0.
entropyR_ :: (Foldable t, Functor t, Ord c, Floating c) => t c -> c
entropyR_ = negate . sum . fmap term
  where
    term p = if p > 0 then p * logBase 2 p else 0
13 |
14 |
-- | Gini index (expected error rate) of a dataset's class distribution;
-- see 'gini_'.
gini :: (Foldable t, Floating c) => Dataset k (t a) -> c
gini ds = gini_ (probClasses ds)
18 |
-- | Gini impurity of a collection of probabilities: @1 - sum (p_i ^ 2)@.
--
-- Generalised from @Floating c@ to @Num c@ — backward compatible, since
-- every 'Floating' instance is a 'Num' — by squaring with '(^)' instead of
-- '(**)', which also avoids routing an integer power through a
-- floating-point @pow@.
gini_ :: (Foldable t, Functor t, Num c) => t c -> c
gini_ ps = 1 - sum ((^ (2 :: Int)) <$> ps)
21 |
--------------------------------------------------------------------------------
/dh-core/src/Lib.hs:
--------------------------------------------------------------------------------
1 | module Lib where
2 |
3 |
--------------------------------------------------------------------------------
/dh-core/stack.yaml:
--------------------------------------------------------------------------------
1 | # resolver: lts-12.13 # GHC 8.4.3
2 | # resolver: lts-11.22 # GHC 8.2.2
3 | # resolver: lts-13.0 # GHC 8.6.3
4 | resolver: nightly-2019-02-27
5 | # resolver: nightly-2019-05-10 # GHC 8.6.5
6 |
7 | packages:
8 | - .
9 | - ../analyze
10 | - ../dense-linear-algebra
11 | - ../datasets
12 |
13 | # Dependency packages to be pulled from upstream that are not in the resolver
14 | # using the same syntax as the packages field.
15 | # (e.g., acme-missiles-0.3)
16 | extra-deps:
17 | - statistics-0.14.0.2
18 | - chronos-bench-0.2.0.2
19 | - github: mjarosie/stack-hpc-coveralls
20 | commit: 318262fe4c8b5ee2de30be54a9d6d36f1babefbf
21 |
22 | nix:
23 | enable: false
24 | packages:
25 | - curl
26 | - zlib
27 |
28 |
29 | # Override default flag values for local packages and extra-deps
30 | # flags: {}
31 |
32 | # Extra package databases containing global packages
33 | # extra-package-dbs: []
34 |
35 | # Control whether we use the GHC we find on the path
36 | # system-ghc: true
37 | #
38 | # Require a specific version of stack, using version ranges
39 | # require-stack-version: -any # Default
40 | # require-stack-version: ">=1.7"
41 | #
42 | # Override the architecture used by stack, especially useful on Windows
43 | # arch: i386
44 | # arch: x86_64
45 | #
46 | # Extra directories used by stack for building
47 | # extra-include-dirs: [/path/to/dir]
48 | # extra-lib-dirs: [/path/to/dir]
49 | #
50 | # Allow a newer minor version of GHC than the snapshot specifies
51 | # compiler-check: newer-minor
52 |
--------------------------------------------------------------------------------
/dh-core/stack.yaml.lock:
--------------------------------------------------------------------------------
1 | # This file was autogenerated by Stack.
2 | # You should not edit this file by hand.
3 | # For more information, please see the documentation at:
4 | # https://docs.haskellstack.org/en/stable/lock_files
5 |
6 | packages:
7 | - completed:
8 | hackage: statistics-0.14.0.2@sha256:23474780ad5ffa28ff3f46fe041a9e3fd309c98b776ce94e8851183bb107a7af,5169
9 | pantry-tree:
10 | size: 5459
11 | sha256: a82e92ae5586361f8ba78c169e3aa28f32294adc67105026595749fbe6ed1ba2
12 | original:
13 | hackage: statistics-0.14.0.2
14 | - completed:
15 | hackage: chronos-bench-0.2.0.2@sha256:dbffaad733d2817f4ead1392f51417310bebb39122d89975ce236971aadbf524,1961
16 | pantry-tree:
17 | size: 481
18 | sha256: 120e1d349c3d0546280b5c567510024dcbff7937a935d0b81d4c690ae358b042
19 | original:
20 | hackage: chronos-bench-0.2.0.2
21 | snapshots:
22 | - completed:
23 | size: 491909
24 | url: https://raw.githubusercontent.com/commercialhaskell/stackage-snapshots/master/nightly/2019/2/27.yaml
25 | sha256: 024e7b34b021c548cd81acee015d2411c51568f670ef7668832420765fb6c96b
26 | original: nightly-2019-02-27
27 |
--------------------------------------------------------------------------------
/dh-core/test/LibSpec.hs:
--------------------------------------------------------------------------------
1 | module LibSpec where
2 |
3 | import Test.Hspec
4 | -- import Test.Hspec.QuickCheck
5 |
6 | -- import Lib
7 |
-- | Standalone entry point: run this module's 'spec' with Hspec.
main :: IO ()
main = hspec spec
10 |
-- | Placeholder specification for 'Lib'; replace the trivial assertion
-- with real properties once 'Lib' gains functionality.
spec :: Spec
spec =
  describe "Lib" $
    it "works" $
      True `shouldBe` True
    -- prop "ourAdd is commutative" $ \x y ->
    --   ourAdd x y `shouldBe` ourAdd y x
18 |
--------------------------------------------------------------------------------
/dh-core/test/Spec.hs:
--------------------------------------------------------------------------------
1 | {-# OPTIONS_GHC -F -pgmF hspec-discover #-}
2 |
--------------------------------------------------------------------------------