├── .ghci
├── .github
    └── workflows
    │   └── haskell-ci.yml
├── .gitignore
├── CHANGELOG.md
├── LICENSE
├── README.md
├── app
    └── Main.hs
├── benchmark
    └── Main.hs
├── data
    ├── chipotle.tsv
    ├── effects-of-covid-19-on-trade-at-15-december-2021-provisional.csv
    ├── housing.csv
    ├── measurements.txt
    └── starwars.csv
├── dataframe.cabal
├── docs
    ├── California Housing.ipynb
    ├── coming_from_dplyr.md
    ├── coming_from_pandas.md
    ├── coming_from_polars.md
    ├── configuration_notes.md
    ├── exploratory_data_analysis_primer.md
    └── haskell_for_data_analysis.md
├── flake.nix
├── run_compiled_repl.sh
├── run_profiling.sh
├── src
    ├── DataFrame.hs
    └── DataFrame
    │   ├── Display
    │       └── Terminal
    │       │   ├── Colours.hs
    │       │   ├── Plot.hs
    │       │   └── PrettyPrint.hs
    │   ├── Errors.hs
    │   ├── IO
    │       └── CSV.hs
    │   ├── Internal
    │       ├── Column.hs
    │       ├── DataFrame.hs
    │       ├── Function.hs
    │       ├── Parsing.hs
    │       ├── Row.hs
    │       └── Types.hs
    │   └── Operations
    │       ├── Aggregation.hs
    │       ├── Core.hs
    │       ├── Sorting.hs
    │       ├── Statistics.hs
    │       ├── Subset.hs
    │       ├── Transformations.hs
    │       └── Typing.hs
├── static
    └── example.gif
├── test_coverage.md
└── tests
    ├── Assertions.hs
    ├── Main.hs
    └── Operations
        ├── Apply.hs
        ├── Derive.hs
        ├── Filter.hs
        ├── GroupBy.hs
        ├── InsertColumn.hs
        ├── Sort.hs
        └── Take.hs


/.ghci:
--------------------------------------------------------------------------------
1 | :set -XOverloadedStrings
2 | :set -XTypeApplications
3 | -- Data.Text is aliased as Str; the default declaration below refers to Str.Text.
4 | import qualified Data.Text as Str
5 | -- Candidate types for defaulting ambiguous literals: Int, Str.Text, Double.
6 | default (Int, Str.Text, Double)
7 | 
8 | 


--------------------------------------------------------------------------------
/.github/workflows/haskell-ci.yml:
--------------------------------------------------------------------------------
  1 | # This GitHub workflow config has been generated by a script via
  2 | #
  3 | #   haskell-ci 'github' 'dataframe.cabal'
  4 | #
  5 | # To regenerate the script (for example after adjusting tested-with) run
  6 | #
  7 | #   haskell-ci regenerate
  8 | #
  9 | # For more information, see https://github.com/haskell-CI/haskell-ci
 10 | #
 11 | # version: 0.19.20241202
 12 | #
 13 | # REGENDATA ("0.19.20241202",["github","dataframe.cabal"])
 14 | #
 15 | name: Haskell-CI
 16 | on:
 17 |   - push
 18 |   - pull_request
 19 | jobs:
 20 |   linux:
 21 |     name: Haskell-CI - Linux - ${{ matrix.compiler }}
 22 |     runs-on: ubuntu-latest
 23 |     timeout-minutes:
 24 |       60
 25 |     container:
 26 |       image: buildpack-deps:jammy
 27 |     continue-on-error: ${{ matrix.allow-failure }}
 28 |     strategy:
 29 |       matrix:
 30 |         include:
 31 |           - compiler: ghc-9.4.8
 32 |             compilerKind: ghc
 33 |             compilerVersion: 9.4.8
 34 |             setup-method: ghcup
 35 |             allow-failure: false
 36 |           - compiler: ghc-9.8.3
 37 |             compilerKind: ghc
 38 |             compilerVersion: 9.8.3
 39 |             setup-method: ghcup-vanilla
 40 |             allow-failure: false
 41 |           - compiler: ghc-9.10.1
 42 |             compilerKind: ghc
 43 |             compilerVersion: 9.10.1
 44 |             setup-method: ghcup
 45 |             allow-failure: false
 46 |           - compiler: ghc-9.6.6
 47 |             compilerKind: ghc
 48 |             compilerVersion: 9.6.6
 49 |             setup-method: ghcup
 50 |             allow-failure: false
 51 |       fail-fast: false
 52 |     steps:
 53 |       - name: apt-get install
 54 |         run: |
 55 |           apt-get update
 56 |           apt-get install -y --no-install-recommends gnupg ca-certificates dirmngr curl git software-properties-common libtinfo5
 57 |       - name: Install GHCup
 58 |         run: |
 59 |           mkdir -p "$HOME/.ghcup/bin"
 60 |           curl -sL https://downloads.haskell.org/ghcup/0.1.30.0/x86_64-linux-ghcup-0.1.30.0 > "$HOME/.ghcup/bin/ghcup"
 61 |           chmod a+x "$HOME/.ghcup/bin/ghcup"
 62 |       - name: Install cabal-install
 63 |         run: |
 64 |           "$HOME/.ghcup/bin/ghcup" install cabal 3.12.1.0 || (cat "$HOME"/.ghcup/logs/*.* && false)
 65 |           echo "CABAL=$HOME/.ghcup/bin/cabal-3.12.1.0 -vnormal+nowrap" >> "$GITHUB_ENV"
 66 |       - name: Install GHC (GHCup)
 67 |         if: matrix.setup-method == 'ghcup'
 68 |         run: |
 69 |           "$HOME/.ghcup/bin/ghcup" install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false)
 70 |           HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER")
 71 |           HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#')
 72 |           HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#')
 73 |           echo "HC=$HC" >> "$GITHUB_ENV"
 74 |           echo "HCPKG=$HCPKG" >> "$GITHUB_ENV"
 75 |           echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV"
 76 |         env:
 77 |           HCKIND: ${{ matrix.compilerKind }}
 78 |           HCNAME: ${{ matrix.compiler }}
 79 |           HCVER: ${{ matrix.compilerVersion }}
 80 |       - name: Install GHC (GHCup vanilla)
 81 |         if: matrix.setup-method == 'ghcup-vanilla'
 82 |         run: |
 83 |           "$HOME/.ghcup/bin/ghcup" -s https://raw.githubusercontent.com/haskell/ghcup-metadata/master/ghcup-vanilla-0.0.8.yaml install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false)
 84 |           HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER")
 85 |           HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#')
 86 |           HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#')
 87 |           echo "HC=$HC" >> "$GITHUB_ENV"
 88 |           echo "HCPKG=$HCPKG" >> "$GITHUB_ENV"
 89 |           echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV"
 90 |         env:
 91 |           HCKIND: ${{ matrix.compilerKind }}
 92 |           HCNAME: ${{ matrix.compiler }}
 93 |           HCVER: ${{ matrix.compilerVersion }}
 94 |       - name: Set PATH and environment variables
 95 |         run: |
 96 |           echo "$HOME/.cabal/bin" >> $GITHUB_PATH
 97 |           echo "LANG=C.UTF-8" >> "$GITHUB_ENV"
 98 |           echo "CABAL_DIR=$HOME/.cabal" >> "$GITHUB_ENV"
 99 |           echo "CABAL_CONFIG=$HOME/.cabal/config" >> "$GITHUB_ENV"
100 |           HCNUMVER=$(${HC} --numeric-version|perl -ne '/^(\d+)\.(\d+)\.(\d+)(\.(\d+))?$/; print(10000 * $1 + 100 * $2 + ($3 == 0 ? $5 != 1 : $3))')
101 |           echo "HCNUMVER=$HCNUMVER" >> "$GITHUB_ENV"
102 |           echo "ARG_TESTS=--enable-tests" >> "$GITHUB_ENV"
103 |           echo "ARG_BENCH=--enable-benchmarks" >> "$GITHUB_ENV"
104 |           echo "HEADHACKAGE=false" >> "$GITHUB_ENV"
105 |           echo "ARG_COMPILER=--$HCKIND --with-compiler=$HC" >> "$GITHUB_ENV"
106 |         env:
107 |           HCKIND: ${{ matrix.compilerKind }}
108 |           HCNAME: ${{ matrix.compiler }}
109 |           HCVER: ${{ matrix.compilerVersion }}
110 |       - name: env
111 |         run: |
112 |           env
113 |       - name: write cabal config
114 |         run: |
115 |           mkdir -p $CABAL_DIR
116 |           cat >> $CABAL_CONFIG <<EOF
117 |           remote-build-reporting: anonymous
118 |           write-ghc-environment-files: never
119 |           remote-repo-cache: $CABAL_DIR/packages
120 |           logs-dir:          $CABAL_DIR/logs
121 |           world-file:        $CABAL_DIR/world
122 |           extra-prog-path:   $CABAL_DIR/bin
123 |           symlink-bindir:    $CABAL_DIR/bin
124 |           installdir:        $CABAL_DIR/bin
125 |           build-summary:     $CABAL_DIR/logs/build.log
126 |           store-dir:         $CABAL_DIR/store
127 |           install-dirs user
128 |             prefix: $CABAL_DIR
129 |           repository hackage.haskell.org
130 |             url: http://hackage.haskell.org/
131 |           EOF
132 |           cat >> $CABAL_CONFIG <<EOF
133 |           program-default-options
134 |             ghc-options: $GHCJOBS +RTS -M3G -RTS
135 |           EOF
136 |           cat $CABAL_CONFIG
137 |       - name: versions
138 |         run: |
139 |           $HC --version || true
140 |           $HC --print-project-git-commit-id || true
141 |           $CABAL --version || true
142 |       - name: update cabal index
143 |         run: |
144 |           $CABAL v2-update -v
145 |       - name: install cabal-plan
146 |         run: |
147 |           mkdir -p $HOME/.cabal/bin
148 |           curl -sL https://github.com/haskell-hvr/cabal-plan/releases/download/v0.7.3.0/cabal-plan-0.7.3.0-x86_64-linux.xz > cabal-plan.xz
149 |           echo 'f62ccb2971567a5f638f2005ad3173dba14693a45154c1508645c52289714cb2  cabal-plan.xz' | sha256sum -c -
150 |           xz -d < cabal-plan.xz > $HOME/.cabal/bin/cabal-plan
151 |           rm -f cabal-plan.xz
152 |           chmod a+x $HOME/.cabal/bin/cabal-plan
153 |           cabal-plan --version
154 |       - name: checkout
155 |         uses: actions/checkout@v4
156 |         with:
157 |           path: source
158 |       - name: initial cabal.project for sdist
159 |         run: |
160 |           touch cabal.project
161 |           echo "packages: $GITHUB_WORKSPACE/source/." >> cabal.project
162 |           cat cabal.project
163 |       - name: sdist
164 |         run: |
165 |           mkdir -p sdist
166 |           $CABAL sdist all --output-dir $GITHUB_WORKSPACE/sdist
167 |       - name: unpack
168 |         run: |
169 |           mkdir -p unpacked
170 |           find sdist -maxdepth 1 -type f -name '*.tar.gz' -exec tar -C $GITHUB_WORKSPACE/unpacked -xzvf {} \;
171 |       - name: generate cabal.project
172 |         run: |
173 |           PKGDIR_dataframe="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/dataframe-[0-9.]*')"
174 |           echo "PKGDIR_dataframe=${PKGDIR_dataframe}" >> "$GITHUB_ENV"
175 |           rm -f cabal.project cabal.project.local
176 |           touch cabal.project
177 |           touch cabal.project.local
178 |           echo "packages: ${PKGDIR_dataframe}" >> cabal.project
179 |           echo "package dataframe" >> cabal.project
180 |           echo "    ghc-options: -Werror=missing-methods" >> cabal.project
181 |           cat >> cabal.project <<EOF
182 |           EOF
183 |           $HCPKG list --simple-output --names-only | perl -ne 'for (split /\s+/) { print "constraints: any.$_ installed\n" unless /^(dataframe)$/; }' >> cabal.project.local
184 |           cat cabal.project
185 |           cat cabal.project.local
186 |       - name: dump install plan
187 |         run: |
188 |           $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dry-run all
189 |           cabal-plan
190 |       - name: restore cache
191 |         uses: actions/cache/restore@v4
192 |         with:
193 |           key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }}
194 |           path: ~/.cabal/store
195 |           restore-keys: ${{ runner.os }}-${{ matrix.compiler }}-
196 |       - name: install dependencies
197 |         run: |
198 |           $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks --dependencies-only -j2 all
199 |           $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dependencies-only -j2 all
200 |       - name: build w/o tests
201 |         run: |
202 |           $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all
203 |       - name: build
204 |         run: |
205 |           $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --write-ghc-environment-files=always
206 |       - name: tests
207 |         run: |
208 |           $CABAL v2-test $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --test-show-details=direct
209 |       - name: cabal check
210 |         run: |
211 |           cd ${PKGDIR_dataframe} || false
212 |           ${CABAL} -vnormal check
213 |       - name: haddock
214 |         run: |
215 |           $CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all
216 |       - name: unconstrained build
217 |         run: |
218 |           rm -f cabal.project.local
219 |           $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all
220 |       - name: save cache
221 |         if: always()
222 |         uses: actions/cache/save@v4
223 |         with:
224 |           key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }}
225 |           path: ~/.cabal/store
226 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | dist
 2 | dist-*
 3 | cabal-dev
 4 | *.o
 5 | *.hi
 6 | *.hie
 7 | *.chi
 8 | *.chs.h
 9 | *.dyn_o
10 | *.dyn_hi
11 | .hpc
12 | .hsenv
13 | .cabal-sandbox/
14 | cabal.sandbox.config
15 | *.prof
16 | *.aux
17 | *.hp
18 | *.eventlog
19 | .stack-work/
20 | cabal.project.local
21 | cabal.project.local~
22 | .HTF/
23 | .ghc.environment.*
24 | 


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
 1 | # Revision history for dataframe
 2 | 
 3 | ## 0.1.0.0
 4 | 
 5 | * Initial release
 6 | 
 7 | ## 0.1.0.1
 8 | 
 9 | * Fixed parse failure on nested, escaped quotation.
10 | * Fixed column info when field name isn't found. 
11 | 
12 | ## 0.1.0.2
13 | 
14 | * Change namespace from `Data.DataFrame` to `DataFrame`
15 | * Add `toVector` function for converting columns to vectors.
16 | * Add `impute` function for replacing `Nothing` values in optional columns.
17 | * Add `filterAllJust` to filter out all rows with missing data.
18 | * Add `distinct` function that returns a dataframe with distinct rows.
19 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # DataFrame
 2 | 
 3 | An intuitive, dynamically-typed DataFrame library.
 4 | 
 5 | A tool for exploratory data analysis.
 6 | 
 7 | ## Installing
 8 | 
 9 | ### CLI
10 | * Install Haskell (ghc + cabal) via [ghcup](https://www.haskell.org/ghcup/install/) selecting all the default options.
11 | * To install dataframe run `cabal update && cabal install dataframe`
12 | * Open a Haskell repl with dataframe loaded by running `cabal repl --build-depends dataframe`.
13 | * Follow along any one of the tutorials below.
14 | 
15 | ### Jupyter notebook
16 | * Jupyter notebook support is still underway, with some local tests/examples in the works.
17 | * For a preview check out the [California Housing](https://github.com/mchav/dataframe/blob/main/docs/California%20Housing.ipynb) notebook.
18 | 
19 | ## What is exploratory data analysis?
20 | We provide a primer [here](https://github.com/mchav/dataframe/blob/main/docs/exploratory_data_analysis_primer.md) and show how to do some common analyses.
21 | 
22 | ## Coming from other dataframe libraries
23 | Familiar with another dataframe library? Get started:
24 | * [Coming from Pandas](https://github.com/mchav/dataframe/blob/main/docs/coming_from_pandas.md)
25 | * [Coming from Polars](https://github.com/mchav/dataframe/blob/main/docs/coming_from_polars.md)
26 | * [Coming from dplyr](https://github.com/mchav/dataframe/blob/main/docs/coming_from_dplyr.md)
27 | 
28 | ## Example usage
29 | 
30 | ### Code example
31 | ```haskell
32 | import qualified DataFrame as D
33 | import DataFrame ((|>))
34 | 
35 | main :: IO ()
36 | main = do
37 |     df <- D.readTsv "./data/chipotle.tsv"
38 |     print $ df
39 |       |> D.select ["item_name", "quantity"]
40 |       |> D.groupBy ["item_name"]
41 |       |> D.aggregate (zip (repeat "quantity") [D.Maximum, D.Mean, D.Sum])
42 |       |> D.sortBy D.Descending ["Sum_quantity"]
43 | ```
44 | 
45 | Output:
46 | 
47 | ```
48 | ----------------------------------------------------------------------------------------------------
49 | index |               item_name               | Sum_quantity |   Mean_quantity    | Maximum_quantity
50 | ------|---------------------------------------|--------------|--------------------|-----------------
51 |  Int  |                 Text                  |     Int      |       Double       |       Int       
52 | ------|---------------------------------------|--------------|--------------------|-----------------
53 | 0     | Chips and Fresh Tomato Salsa          | 130          | 1.1818181818181819 | 15              
54 | 1     | Izze                                  | 22           | 1.1                | 3               
55 | 2     | Nantucket Nectar                      | 31           | 1.1481481481481481 | 3               
56 | 3     | Chips and Tomatillo-Green Chili Salsa | 35           | 1.1290322580645162 | 3               
57 | 4     | Chicken Bowl                          | 761          | 1.0482093663911847 | 3               
58 | 5     | Side of Chips                         | 110          | 1.0891089108910892 | 8               
59 | 6     | Steak Burrito                         | 386          | 1.048913043478261  | 3               
60 | 7     | Steak Soft Tacos                      | 56           | 1.018181818181818  | 2               
61 | 8     | Chips and Guacamole                   | 506          | 1.0563674321503131 | 4               
62 | 9     | Chicken Crispy Tacos                  | 50           | 1.0638297872340425 | 2
63 | ```
64 | 
65 | Full example in `./app` folder using many of the constructs in the API.
66 | 
67 | ### Visual example
68 | ![Screencast of usage in GHCI](./static/example.gif)
69 | 
70 | ## Future work
71 | * Jupyter/ihaskell support (soon)
72 | * Apache arrow and Parquet compatibility
73 | * Integration with common data formats (currently only supports CSV)
74 | * Support windowed plotting (currently only supports ASCII plots)
75 | * Create a lazy API that builds an execution graph instead of running eagerly (will be used to compute on files larger than RAM)
76 | 
77 | ## Contributing
78 | * Please first submit an issue and we can discuss there.
79 | 


--------------------------------------------------------------------------------
/app/Main.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE ExtendedDefaultRules #-}
  2 | {-# LANGUAGE OverloadedStrings #-}
  3 | {-# LANGUAGE ScopedTypeVariables #-}
  4 | {-# LANGUAGE TypeApplications #-}
  5 | {-# LANGUAGE TupleSections #-}
  6 | 
  7 | module Main where
  8 | 
  9 | import qualified DataFrame as D
 10 | import DataFrame (dimensions, (|>))
 11 | import Data.List (delete)
 12 | import Data.Maybe (fromMaybe, isJust, isNothing)
 13 | import qualified Data.Text as T
 14 | import qualified Data.Vector as V
 15 | import qualified Data.Vector.Generic as VG
 16 | import qualified Data.Vector.Unboxed as VU
 17 | 
 18 | -- Numbers default to int and double, and strings to text
 19 | default (Int, T.Text, Double)
 20 | 
 21 | -- Example usage of DataFrame library
 22 | 
 23 | -- | Runs each example in turn: prints its title, runs it, then prints a
 24 | -- 100-character dashed rule.
 25 | main :: IO ()
 26 | main = mapM_ runExample examples
 27 |   where
 28 |     examples :: [(String, IO ())]
 29 |     examples =
 30 |       [ ("Housing", housing)
 31 |       , ("Chipotle Data", chipotle)
 32 |       , ("One Billion Row Challenge", oneBillingRowChallenge)
 33 |       , ("Covid Data", covid)
 34 |       ]
 35 |     divider = replicate 100 '-'
 36 |     runExample (title, example) = do
 37 |       putStrLn title
 38 |       example
 39 |       putStrLn divider
 40 | 
 41 | -- | Arithmetic mean of a vector: the sum of its elements divided by their count.
 42 | mean :: (Fractional a, VG.Vector v a) => v a -> a
 43 | mean values = VG.sum values / elementCount where elementCount = fromIntegral (VG.length values)
 44 | 
 45 | oneBillingRowChallenge :: IO ()
 46 | oneBillingRowChallenge = do
 47 |   measurements <- D.readSeparated ';' D.defaultOptions "./data/measurements.txt"
 48 |   -- Reduce each "City" group's "Measurement" values to a (minimum, mean, maximum) triple.
 49 |   let perCity = measurements |> D.groupBy ["City"]
 50 |       summary = perCity |> D.reduceBy (\col -> (VG.minimum col, mean @Double col, VG.maximum col)) "Measurement"
 51 |   -- Print the summary in ascending "City" order.
 52 |   print (summary |> D.sortBy D.Ascending ["City"])
 53 | 
 54 | -- | Loads the housing CSV, prints its column info and first five rows,
 55 | -- then plots histograms of the frame.
 56 | housing :: IO ()
 57 | housing = do
 58 |   housingFrame <- D.readCsv "./data/housing.csv"
 59 |   print (D.columnInfo housingFrame)
 60 |   -- Sample.
 61 |   print (D.take 5 housingFrame)
 62 |   -- Histograms using the 'D.PlotAll' and 'D.VerticalHistogram' settings.
 63 |   D.plotHistograms D.PlotAll D.VerticalHistogram housingFrame
 64 | 
 65 | -- | Explores the COVID-19 trade CSV: prints its dimensions and first ten rows,
 66 | -- plots histograms, then prints export values summed per group.
 67 | covid :: IO ()
 68 | covid = do
 69 |   tradeFrame <- D.readCsv "./data/effects-of-covid-19-on-trade-at-15-december-2021-provisional.csv"
 70 |   print (dimensions tradeFrame)
 71 |   print (D.take 10 tradeFrame)
 72 |   D.plotHistograms D.PlotAll D.VerticalHistogram tradeFrame
 73 |   -- Sum "Value" over export rows, grouped by direction, year and country.
 74 |   print $
 75 |     tradeFrame
 76 |       |> D.filter "Direction" (== "Exports")
 77 |       |> D.select ["Direction", "Year", "Country", "Value"]
 78 |       |> D.groupBy ["Direction", "Year", "Country"]
 79 |       |> D.reduceByAgg D.Sum "Value"
 80 | 
 81 | chipotle :: IO () -- Tour of the Chipotle orders TSV: typed transforms, a derived column, filtering and aggregation.
 82 | chipotle = do
 83 |   rawFrame <- D.readTsv "./data/chipotle.tsv"
 84 |   print $ D.dimensions rawFrame
 85 | 
 86 |   -- Sampling the dataframe
 87 |   print $ D.take 5 rawFrame
 88 | 
 89 |   -- Transform the data from a raw string into
 90 |   -- respective types (throws error on failure)
 91 |   let f =
 92 |         rawFrame
 93 |           -- Change a specific order: apply (+ 2) to "quantity" where "order_id" is 1
 94 |           |> D.applyWhere (== 1) "order_id" (+ 2) "quantity"
 95 |           -- Index based change.
 96 |           |> D.applyAtIndex 0 (\n -> n - 2) "quantity"
 97 |           -- Custom parsing: drop dollar sign and parse price as double
 98 |           |> D.apply (D.readValue @Double . T.drop 1) "item_price"
 99 | 
100 |   -- Sample the transformed dataframe.
101 |   print $ D.take 10 f
102 | 
103 |   -- Create a total_price column that is quantity * item_price
104 |   let multiply (a :: Int) (b :: Double) = fromIntegral a * b
105 |   let withTotalPrice = D.deriveFrom (["quantity", "item_price"], D.func multiply) "total_price" f
106 | 
107 |   -- Sample a filtered subset of the dataframe (total_price above 100)
108 |   putStrLn "Sample dataframe"
109 |   print $
110 |     withTotalPrice
111 |       |> D.select ["quantity", "item_name", "item_price", "total_price"]
112 |       |> D.filter "total_price" (100.0 <)
113 |       |> D.take 10
114 | 
115 |   -- Check how many chicken burritos were ordered.
116 |   -- There are two ways of checking how many chicken burritos
117 |   -- were ordered.
118 |   let searchTerm = "Chicken Burrito" :: T.Text
119 | 
120 |   print $
121 |     f
122 |       |> D.select ["item_name", "quantity"]
123 |       -- It's more efficient to filter before grouping.
124 |       |> D.filter "item_name" (searchTerm ==)
125 |       |> D.groupBy ["item_name"]
126 |       -- can also be written as:
127 |       --    D.aggregate (zip (repeat "quantity") [D.Sum, D.Maximum, D.Mean])
128 |       |> D.aggregate (map ("quantity",) [D.Sum, D.Maximum, D.Mean])
129 |       -- Aggregation automatically creates columns named <Agg>_<variable>
130 |       |> D.sortBy D.Descending ["Sum_quantity"]
131 | 
132 |   -- Similarly, we can aggregate quantities for every item (no filter).
133 |   print $
134 |     f
135 |       |> D.select ["item_name", "quantity"]
136 |       |> D.groupBy ["item_name"]
137 |       -- Aggregate written more explicitly.
138 |       -- We have the full expressiveness of Haskell and we needn't fall
139 |       -- back on a DSL.
140 |       |> D.aggregate [("quantity", D.Maximum), ("quantity", D.Mean), ("quantity", D.Sum)]
141 |       |> D.take 10
142 | 
143 |   let firstOrder =
144 |         withTotalPrice
145 |           |> D.filterBy (maybe False (T.isInfixOf "Guacamole")) "choice_description"
146 |           |> D.filterBy (("Chicken Bowl" :: T.Text) ==) "item_name"
147 | 
148 |   print $ D.take 10 firstOrder
149 | 


--------------------------------------------------------------------------------
/benchmark/Main.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE NumericUnderscores #-}
 2 | {-# LANGUAGE OverloadedStrings #-}
 3 | 
 4 | import qualified DataFrame as D
 5 | import qualified Data.Vector.Unboxed as VU
 6 | 
 7 | import Control.Monad (replicateM)
 8 | import Criterion.Main
 9 | import System.Random (randomRIO)
10 | 
11 | -- | Builds a three-column frame of @n@ random Doubles per column, each drawn
12 | -- from the range (-20, 20), and prints a few statistics computed from it.
13 | stats :: Int -> IO ()
14 | stats n = do
15 |   let randomColumn = D.UnboxedColumn <$> VU.replicateM n (randomRIO (-20.0 :: Double, 20.0))
16 |   firstCol <- randomColumn
17 |   secondCol <- randomColumn
18 |   thirdCol <- randomColumn
19 |   let frame = D.fromList [("first", firstCol), ("second", secondCol), ("third", thirdCol)]
20 |   print (D.mean "first" frame)
21 |   print (D.variance "second" frame)
22 |   print (D.correlation "second" "third" frame)
23 |   print (D.select ["first"] frame D.|> D.take 1)
24 | 
25 | -- | Benchmarks 'stats' at three sizes. Each label is the total number of
26 | -- random values generated: 'stats' fills three columns of @n@ values each.
27 | main :: IO ()
28 | main = defaultMain [ bgroup "stats"
29 |   [ bench    "300_000" $ nfIO (stats 100_000)
30 |   , bench  "3_000_000" $ nfIO (stats 1_000_000)
31 |   , bench "30_000_000" $ nfIO (stats 10_000_000) ] ]


--------------------------------------------------------------------------------
/data/starwars.csv:
--------------------------------------------------------------------------------
 1 | name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,films,vehicles,starships
 2 | Luke Skywalker,172,77,blond,fair,blue,19,male,masculine,Tatooine,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi, Revenge of the Sith, The Force Awakens","Snowspeeder, Imperial Speeder Bike","X-wing, Imperial shuttle"
 3 | C-3PO,167,75,NA,gold,yellow,112,none,masculine,Tatooine,Droid,"A New Hope, The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith",,
 4 | R2-D2,96,32,NA,"white, blue",red,33,none,masculine,Naboo,Droid,"A New Hope, The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith, The Force Awakens",,
 5 | Darth Vader,202,136,none,white,yellow,41.9,male,masculine,Tatooine,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi, Revenge of the Sith",,TIE Advanced x1
 6 | Leia Organa,150,49,brown,light,brown,19,female,feminine,Alderaan,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi, Revenge of the Sith, The Force Awakens",Imperial Speeder Bike,
 7 | Owen Lars,178,120,"brown, grey",light,blue,52,male,masculine,Tatooine,Human,"A New Hope, Attack of the Clones, Revenge of the Sith",,
 8 | Beru Whitesun Lars,165,75,brown,light,blue,47,female,feminine,Tatooine,Human,"A New Hope, Attack of the Clones, Revenge of the Sith",,
 9 | R5-D4,97,32,NA,"white, red",red,NA,none,masculine,Tatooine,Droid,A New Hope,,
10 | Biggs Darklighter,183,84,black,light,brown,24,male,masculine,Tatooine,Human,A New Hope,,X-wing
11 | Obi-Wan Kenobi,182,77,"auburn, white",fair,blue-gray,57,male,masculine,Stewjon,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith",Tribubble bongo,"Jedi starfighter, Trade Federation cruiser, Naboo star skiff, Jedi Interceptor, Belbullab-22 starfighter"
12 | Anakin Skywalker,188,84,blond,fair,blue,41.9,male,masculine,Tatooine,Human,"The Phantom Menace, Attack of the Clones, Revenge of the Sith","Zephyr-G swoop bike, XJ-6 airspeeder","Naboo fighter, Trade Federation cruiser, Jedi Interceptor"
13 | Wilhuff Tarkin,180,NA,"auburn, grey",fair,blue,64,male,masculine,Eriadu,Human,"A New Hope, Revenge of the Sith",,
14 | Chewbacca,228,112,brown,unknown,blue,200,male,masculine,Kashyyyk,Wookiee,"A New Hope, The Empire Strikes Back, Return of the Jedi, Revenge of the Sith, The Force Awakens",AT-ST,"Millennium Falcon, Imperial shuttle"
15 | Han Solo,180,80,brown,fair,brown,29,male,masculine,Corellia,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi, The Force Awakens",,"Millennium Falcon, Imperial shuttle"
16 | Greedo,173,74,NA,green,black,44,male,masculine,Rodia,Rodian,A New Hope,,
17 | Jabba Desilijic Tiure,175,1358,NA,"green-tan, brown",orange,600,hermaphroditic,masculine,Nal Hutta,Hutt,"A New Hope, Return of the Jedi, The Phantom Menace",,
18 | Wedge Antilles,170,77,brown,fair,hazel,21,male,masculine,Corellia,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi",Snowspeeder,X-wing
19 | Jek Tono Porkins,180,110,brown,fair,blue,NA,NA,NA,Bestine IV,NA,A New Hope,,X-wing
20 | Yoda,66,17,white,green,brown,896,male,masculine,NA,Yoda's species,"The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith",,
21 | Palpatine,170,75,grey,pale,yellow,82,male,masculine,Naboo,Human,"The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith",,
22 | Boba Fett,183,78.2,black,fair,brown,31.5,male,masculine,Kamino,Human,"The Empire Strikes Back, Return of the Jedi, Attack of the Clones",,Slave 1
23 | IG-88,200,140,none,metal,red,15,none,masculine,NA,Droid,The Empire Strikes Back,,
24 | Bossk,190,113,none,green,red,53,male,masculine,Trandosha,Trandoshan,The Empire Strikes Back,,
25 | Lando Calrissian,177,79,black,dark,brown,31,male,masculine,Socorro,Human,"The Empire Strikes Back, Return of the Jedi",,Millennium Falcon
26 | Lobot,175,79,none,light,blue,37,male,masculine,Bespin,Human,The Empire Strikes Back,,
27 | Ackbar,180,83,none,brown mottle,orange,41,male,masculine,Mon Cala,Mon Calamari,"Return of the Jedi, The Force Awakens",,
28 | Mon Mothma,150,NA,auburn,fair,blue,48,female,feminine,Chandrila,Human,Return of the Jedi,,
29 | Arvel Crynyd,NA,NA,brown,fair,brown,NA,male,masculine,NA,Human,Return of the Jedi,,A-wing
30 | Wicket Systri Warrick,88,20,brown,brown,brown,8,male,masculine,Endor,Ewok,Return of the Jedi,,
31 | Nien Nunb,160,68,none,grey,black,NA,male,masculine,Sullust,Sullustan,Return of the Jedi,,Millennium Falcon
32 | Qui-Gon Jinn,193,89,brown,fair,blue,92,male,masculine,NA,Human,The Phantom Menace,Tribubble bongo,
33 | Nute Gunray,191,90,none,mottled green,red,NA,male,masculine,Cato Neimoidia,Neimodian,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",,
34 | Finis Valorum,170,NA,blond,fair,blue,91,male,masculine,Coruscant,Human,The Phantom Menace,,
35 | Padmé Amidala,185,45,brown,light,brown,46,female,feminine,Naboo,Human,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",,"Naboo fighter, H-type Nubian yacht, Naboo star skiff"
36 | Jar Jar Binks,196,66,none,orange,orange,52,male,masculine,Naboo,Gungan,"The Phantom Menace, Attack of the Clones",,
37 | Roos Tarpals,224,82,none,grey,orange,NA,male,masculine,Naboo,Gungan,The Phantom Menace,,
38 | Rugor Nass,206,NA,none,green,orange,NA,male,masculine,Naboo,Gungan,The Phantom Menace,,
39 | Ric Olié,183,NA,brown,fair,blue,NA,male,masculine,Naboo,Human,The Phantom Menace,,Naboo Royal Starship
40 | Watto,137,NA,black,"blue, grey",yellow,NA,male,masculine,Toydaria,Toydarian,"The Phantom Menace, Attack of the Clones",,
41 | Sebulba,112,40,none,"grey, red",orange,NA,male,masculine,Malastare,Dug,The Phantom Menace,,
42 | Quarsh Panaka,183,NA,black,dark,brown,62,male,masculine,Naboo,Human,The Phantom Menace,,
43 | Shmi Skywalker,163,NA,black,fair,brown,72,female,feminine,Tatooine,Human,"The Phantom Menace, Attack of the Clones",,
44 | Darth Maul,175,80,none,red,yellow,54,male,masculine,Dathomir,Zabrak,The Phantom Menace,Sith speeder,Scimitar
45 | Bib Fortuna,180,NA,none,pale,pink,NA,male,masculine,Ryloth,Twi'lek,Return of the Jedi,,
46 | Ayla Secura,178,55,none,blue,hazel,48,female,feminine,Ryloth,Twi'lek,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",,
47 | Ratts Tyerel,79,15,none,"grey, blue",unknown,NA,male,masculine,Aleen Minor,Aleena,The Phantom Menace,,
48 | Dud Bolt,94,45,none,"blue, grey",yellow,NA,male,masculine,Vulpter,Vulptereen,The Phantom Menace,,
49 | Gasgano,122,NA,none,"white, blue",black,NA,male,masculine,Troiken,Xexto,The Phantom Menace,,
50 | Ben Quadinaros,163,65,none,"grey, green, yellow",orange,NA,male,masculine,Tund,Toong,The Phantom Menace,,
51 | Mace Windu,188,84,none,dark,brown,72,male,masculine,Haruun Kal,Human,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",,
52 | Ki-Adi-Mundi,198,82,white,pale,yellow,92,male,masculine,Cerea,Cerean,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",,
53 | Kit Fisto,196,87,none,green,black,NA,male,masculine,Glee Anselm,Nautolan,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",,
54 | Eeth Koth,171,NA,black,brown,brown,NA,male,masculine,Iridonia,Zabrak,"The Phantom Menace, Revenge of the Sith",,
55 | Adi Gallia,184,50,none,dark,blue,NA,female,feminine,Coruscant,Tholothian,"The Phantom Menace, Revenge of the Sith",,
56 | Saesee Tiin,188,NA,none,pale,orange,NA,male,masculine,Iktotch,Iktotchi,"The Phantom Menace, Revenge of the Sith",,
57 | Yarael Poof,264,NA,none,white,yellow,NA,male,masculine,Quermia,Quermian,The Phantom Menace,,
58 | Plo Koon,188,80,none,orange,black,22,male,masculine,Dorin,Kel Dor,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",,Jedi starfighter
59 | Mas Amedda,196,NA,none,blue,blue,NA,male,masculine,Champala,Chagrian,"The Phantom Menace, Attack of the Clones",,
60 | Gregar Typho,185,85,black,dark,brown,NA,NA,NA,Naboo,NA,Attack of the Clones,,Naboo fighter
61 | Cordé,157,NA,brown,light,brown,NA,NA,NA,Naboo,NA,Attack of the Clones,,
62 | Cliegg Lars,183,NA,brown,fair,blue,82,male,masculine,Tatooine,Human,Attack of the Clones,,
63 | Poggle the Lesser,183,80,none,green,yellow,NA,male,masculine,Geonosis,Geonosian,"Attack of the Clones, Revenge of the Sith",,
64 | Luminara Unduli,170,56.2,black,yellow,blue,58,female,feminine,Mirial,Mirialan,"Attack of the Clones, Revenge of the Sith",,
65 | Barriss Offee,166,50,black,yellow,blue,40,female,feminine,Mirial,Mirialan,Attack of the Clones,,
66 | Dormé,165,NA,brown,light,brown,NA,female,feminine,Naboo,Human,Attack of the Clones,,
67 | Dooku,193,80,white,fair,brown,102,male,masculine,Serenno,Human,"Attack of the Clones, Revenge of the Sith",Flitknot speeder,
68 | Bail Prestor Organa,191,NA,black,tan,brown,67,male,masculine,Alderaan,Human,"Attack of the Clones, Revenge of the Sith",,
69 | Jango Fett,183,79,black,tan,brown,66,male,masculine,Concord Dawn,Human,Attack of the Clones,,
70 | Zam Wesell,168,55,blonde,"fair, green, yellow",yellow,NA,female,feminine,Zolan,Clawdite,Attack of the Clones,Koro-2 Exodrive airspeeder,
71 | Dexter Jettster,198,102,none,brown,yellow,NA,male,masculine,Ojom,Besalisk,Attack of the Clones,,
72 | Lama Su,229,88,none,grey,black,NA,male,masculine,Kamino,Kaminoan,Attack of the Clones,,
73 | Taun We,213,NA,none,grey,black,NA,female,feminine,Kamino,Kaminoan,Attack of the Clones,,
74 | Jocasta Nu,167,NA,white,fair,blue,NA,female,feminine,Coruscant,Human,Attack of the Clones,,
75 | R4-P17,96,NA,none,"silver, red","red, blue",NA,none,feminine,NA,Droid,"Attack of the Clones, Revenge of the Sith",,
76 | Wat Tambor,193,48,none,"green, grey",unknown,NA,male,masculine,Skako,Skakoan,Attack of the Clones,,
77 | San Hill,191,NA,none,grey,gold,NA,male,masculine,Muunilinst,Muun,Attack of the Clones,,
78 | Shaak Ti,178,57,none,"red, blue, white",black,NA,female,feminine,Shili,Togruta,"Attack of the Clones, Revenge of the Sith",,
79 | Grievous,216,159,none,"brown, white","green, yellow",NA,male,masculine,Kalee,Kaleesh,Revenge of the Sith,Tsmeu-6 personal wheel bike,Belbullab-22 starfighter
80 | Tarfful,234,136,brown,brown,blue,NA,male,masculine,Kashyyyk,Wookiee,Revenge of the Sith,,
81 | Raymus Antilles,188,79,brown,light,brown,NA,male,masculine,Alderaan,Human,"A New Hope, Revenge of the Sith",,
82 | Sly Moore,178,48,none,pale,white,NA,NA,NA,Umbara,NA,"Attack of the Clones, Revenge of the Sith",,
83 | Tion Medon,206,80,none,grey,black,NA,male,masculine,Utapau,Pau'an,Revenge of the Sith,,
84 | Finn,NA,NA,black,dark,dark,NA,male,masculine,NA,Human,The Force Awakens,,
85 | Rey,NA,NA,brown,light,hazel,NA,female,feminine,NA,Human,The Force Awakens,,
86 | Poe Dameron,NA,NA,brown,light,brown,NA,male,masculine,NA,Human,The Force Awakens,,X-wing
87 | BB8,NA,NA,none,none,black,NA,none,masculine,NA,Droid,The Force Awakens,,
88 | Captain Phasma,NA,NA,none,none,unknown,NA,female,feminine,NA,Human,The Force Awakens,,
89 | 


--------------------------------------------------------------------------------
/dataframe.cabal:
--------------------------------------------------------------------------------
  1 | cabal-version:      2.4
  2 | name:               dataframe
  3 | version:            0.1.0.3
  4 | 
  5 | synopsis: An intuitive, dynamically-typed DataFrame library.
  6 | 
  7 | description: An intuitive, dynamically-typed DataFrame library for exploratory data analysis.
  8 | 
  9 | bug-reports: https://github.com/mchav/dataframe/issues
 10 | license:            GPL-3.0-or-later
 11 | license-file:       LICENSE
 12 | author:             Michael Chavinda
 13 | maintainer:         mschavinda@gmail.com
 14 | 
 15 | copyright: (c) 2024-2024 Michael Chavinda
 16 | category: Data
 17 | tested-with: GHC ==9.8.3 || ==9.6.6 || == 9.4.8
 18 | extra-doc-files: CHANGELOG.md README.md
 19 | 
 20 | source-repository head
 21 |   type:     git
 22 |   location: https://github.com/mchav/dataframe
 23 | 
 24 | library
 25 |     exposed-modules: DataFrame
 26 |     other-modules: DataFrame.Internal.Types,
 27 |                    DataFrame.Internal.Function,
 28 |                    DataFrame.Internal.Parsing,
 29 |                    DataFrame.Internal.Column,
 30 |                    DataFrame.Display.Terminal.PrettyPrint,
 31 |                    DataFrame.Display.Terminal.Colours,
 32 |                    DataFrame.Internal.DataFrame,
 33 |                    DataFrame.Internal.Row,
 34 |                    DataFrame.Errors,
 35 |                    DataFrame.Operations.Core,
 36 |                    DataFrame.Operations.Subset,
 37 |                    DataFrame.Operations.Sorting,
 38 |                    DataFrame.Operations.Statistics,
 39 |                    DataFrame.Operations.Transformations,
 40 |                    DataFrame.Operations.Typing,
 41 |                    DataFrame.Operations.Aggregation,
 42 |                    DataFrame.Display.Terminal.Plot,
 43 |                    DataFrame.IO.CSV
 44 |     build-depends:    base >= 4.17.2.0 && < 4.21,
 45 |                       array ^>= 0.5,
 46 |                       attoparsec >= 0.12 && <= 0.14.4,
 47 |                       bytestring >= 0.11 && <= 0.12.2.0,
 48 |                       containers >= 0.6.7 && < 0.8,
 49 |                       directory >= 1.3.0.0 && <= 1.3.9.0,
 50 |                       hashable >= 1.2 && <= 1.5.0.0,
 51 |                       statistics >= 0.16.2.1 && <= 0.16.3.0,
 52 |                       text >= 2.0 && <= 2.1.2,
 53 |                       time >= 1.12 && <= 1.14,
 54 |                       vector ^>= 0.13,
 55 |                       vector-algorithms ^>= 0.9
 56 |     hs-source-dirs:   src
 57 |     default-language: Haskell2010
 58 | 
 59 | executable dataframe
 60 |     main-is:       Main.hs
 61 |     other-modules: DataFrame,
 62 |                    DataFrame.Internal.Types,
 63 |                    DataFrame.Internal.Function,
 64 |                    DataFrame.Internal.Parsing,
 65 |                    DataFrame.Internal.Column,
 66 |                    DataFrame.Display.Terminal.PrettyPrint,
 67 |                    DataFrame.Display.Terminal.Colours,
 68 |                    DataFrame.Internal.DataFrame,
 69 |                    DataFrame.Internal.Row,
 70 |                    DataFrame.Errors,
 71 |                    DataFrame.Operations.Core,
 72 |                    DataFrame.Operations.Subset,
 73 |                    DataFrame.Operations.Sorting,
 74 |                    DataFrame.Operations.Statistics,
 75 |                    DataFrame.Operations.Transformations,
 76 |                    DataFrame.Operations.Typing,
 77 |                    DataFrame.Operations.Aggregation,
 78 |                    DataFrame.Display.Terminal.Plot,
 79 |                    DataFrame.IO.CSV
 80 |     build-depends:    base >= 4.17.2.0 && < 4.21,
 81 |                       array ^>= 0.5,
 82 |                       attoparsec >= 0.12 && <= 0.14.4,
 83 |                       bytestring >= 0.11 && <= 0.12.2.0,
 84 |                       containers >= 0.6.7 && < 0.8,
 85 |                       directory >= 1.3.0.0 && <= 1.3.9.0,
 86 |                       hashable >= 1.2 && <= 1.5.0.0,
 87 |                       statistics >= 0.16.2.1 && <= 0.16.3.0,
 88 |                       text >= 2.0 && <= 2.1.2,
 89 |                       time >= 1.12 && <= 1.14,
 90 |                       vector ^>= 0.13,
 91 |                       vector-algorithms ^>= 0.9
 92 |     hs-source-dirs:   app,
 93 |                       src
 94 |     default-language: Haskell2010
 95 | 
 96 | benchmark dataframe-benchmark
 97 |     type:       exitcode-stdio-1.0
 98 |     main-is:    Main.hs
 99 |     hs-source-dirs: benchmark
100 |     build-depends: base >= 4.17.2.0 && < 4.21,
101 |                    criterion >= 1 && <= 1.6.4.0,
102 |                    text >= 2.0 && <= 2.1.2,
103 |                    random >= 1 && <= 1.3.1,
104 |                    vector ^>= 0.13,
105 |                    dataframe
106 |     default-language: Haskell2010
107 | 
108 | test-suite tests
109 |     type: exitcode-stdio-1.0
110 |     main-is: Main.hs
111 |     other-modules: Assertions,
112 |                    Operations.Apply,
113 |                    Operations.Derive,
114 |                    Operations.Filter,
115 |                    Operations.GroupBy,
116 |                    Operations.InsertColumn,
117 |                    Operations.Sort,
118 |                    Operations.Take
119 |     build-depends: base >= 4.17.2.0 && < 4.21,
120 |                    HUnit ^>= 1.6,
121 |                    random >= 1,
122 |                    random-shuffle >= 0.0.4,
123 |                    text >= 2.0,
124 |                    time >= 1.12,
125 |                    vector ^>= 0.13,
126 |                    dataframe
127 |     hs-source-dirs: tests
128 |     default-language: Haskell2010
129 | 


--------------------------------------------------------------------------------
/docs/coming_from_dplyr.md:
--------------------------------------------------------------------------------
  1 | # Coming from dplyr
  2 | 
  3 | This tutorial will walk through the examples in dplyr's [mini tutorial](https://dplyr.tidyverse.org/) showing how concepts in dplyr map to dataframe.
  4 | 
  5 | ## Filtering
  6 | Filtering looks similar in both libraries.
  7 | 
  8 | ```r
  9 | starwars %>% 
 10 |   filter(species == "Droid")
 11 | #> # A tibble: 6 × 14
 12 | #>   name   height  mass hair_color skin_color  eye_color birth_year sex   gender  
 13 | #>   <chr>   <int> <dbl> <chr>      <chr>       <chr>          <dbl> <chr> <chr>   
 14 | #> 1 C-3PO     167    75 <NA>       gold        yellow           112 none  masculi…
 15 | #> 2 R2-D2      96    32 <NA>       white, blue red               33 none  masculi…
 16 | #> 3 R5-D4      97    32 <NA>       white, red  red               NA none  masculi…
 17 | #> 4 IG-88     200   140 none       metal       red               15 none  masculi…
 18 | #> 5 R4-P17     96    NA none       silver, red red, blue         NA none  feminine
 19 | #> # ℹ 1 more row
 20 | #> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
 21 | #> #   vehicles <list>, starships <list>
 22 | ```
 23 | 
 24 | ```haskell
 25 | starwars |> D.filter "species" (("Droid" :: Str.Text) ==)
 26 |          |> D.take 10
 27 | ```
 28 | 
 29 | ```
 30 | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 31 | index |  name  |  height   |   mass    | hair_color | skin_color  | eye_color | birth_year | sex  |  gender   | homeworld | species |                                                                   films                                                                   |  vehicles  | starships 
 32 | ------|--------|-----------|-----------|------------|-------------|-----------|------------|------|-----------|-----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------|------------|-----------
 33 |  Int  |  Text  | Maybe Int | Maybe Int |    Text    |    Text     |   Text    | Maybe Int  | Text |   Text    |   Text    |  Text   |                                                                   Text                                                                    | Maybe Text | Maybe Text
 34 | ------|--------|-----------|-----------|------------|-------------|-----------|------------|------|-----------|-----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------|------------|-----------
 35 | 0     | C-3PO  | Just 167  | Just 75   | NA         | gold        | yellow    | Just 112   | none | masculine | Tatooine  | Droid   | A New Hope, The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith                    | Nothing    | Nothing   
 36 | 1     | R2-D2  | Just 96   | Just 32   | NA         | white, blue | red       | Just 33    | none | masculine | Naboo     | Droid   | A New Hope, The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith, The Force Awakens | Nothing    | Nothing   
 37 | 2     | R5-D4  | Just 97   | Just 32   | NA         | white, red  | red       | Nothing    | none | masculine | Tatooine  | Droid   | A New Hope                                                                                                                                | Nothing    | Nothing   
 38 | 3     | IG-88  | Just 200  | Just 140  | none       | metal       | red       | Just 15    | none | masculine | NA        | Droid   | The Empire Strikes Back                                                                                                                   | Nothing    | Nothing   
 39 | 4     | R4-P17 | Just 96   | Nothing   | none       | silver, red | red, blue | Nothing    | none | feminine  | NA        | Droid   | Attack of the Clones, Revenge of the Sith                                                                                                 | Nothing    | Nothing   
 40 | 5     | BB8    | Nothing   | Nothing   | none       | none        | black     | Nothing    | none | masculine | NA        | Droid   | The Force Awakens                                                                                                                         | Nothing    | Nothing
 41 | ```
 42 | 
 43 | ## Selecting columns
 44 | Select looks similar except in Haskell we take as argument a list of strings instead of a mix of predicates and strings.
 45 | 
 46 | ```r
 47 | starwars %>% 
 48 |   select(name, ends_with("color"))
 49 | #> # A tibble: 87 × 4
 50 | #>   name           hair_color skin_color  eye_color
 51 | #>   <chr>          <chr>      <chr>       <chr>    
 52 | #> 1 Luke Skywalker blond      fair        blue     
 53 | #> 2 C-3PO          <NA>       gold        yellow   
 54 | #> 3 R2-D2          <NA>       white, blue red      
 55 | #> 4 Darth Vader    none       white       yellow   
 56 | #> 5 Leia Organa    brown      light       brown    
 57 | #> # ℹ 82 more rows
 58 | ```
 59 | 
 60 | To get the same predicate-like functionality we use `selectBy`.
 61 | 
 62 | ```haskell
 63 | starwars |> D.selectBy (\cname -> cname == "name" || T.isSuffixOf "color" cname)
 64 |          |> D.take 10
 65 | ```
 66 | 
 67 | 
 68 | ```
 69 | --------------------------------------------------------------------
 70 | index |        name        |  hair_color   | skin_color  | eye_color
 71 | ------|--------------------|---------------|-------------|----------
 72 |  Int  |        Text        |     Text      |    Text     |   Text   
 73 | ------|--------------------|---------------|-------------|----------
 74 | 0     | Luke Skywalker     | blond         | fair        | blue     
 75 | 1     | C-3PO              | NA            | gold        | yellow   
 76 | 2     | R2-D2              | NA            | white, blue | red      
 77 | 3     | Darth Vader        | none          | white       | yellow   
 78 | 4     | Leia Organa        | brown         | light       | brown    
 79 | 5     | Owen Lars          | brown, grey   | light       | blue     
 80 | 6     | Beru Whitesun Lars | brown         | light       | blue     
 81 | 7     | R5-D4              | NA            | white, red  | red      
 82 | 8     | Biggs Darklighter  | black         | light       | brown    
 83 | 9     | Obi-Wan Kenobi     | auburn, white | fair        | blue-gray
 84 | ```
 85 | 
 86 | ## Transforming columns
 87 | 
 88 | R has a general mutate function that takes in a mix of expressions and column names.
 89 | 
 90 | ```r
 91 | starwars %>% 
 92 |   mutate(name, bmi = mass / ((height / 100)  ^ 2)) %>%
 93 |   select(name:mass, bmi)
 94 | #> # A tibble: 87 × 4
 95 | #>   name           height  mass   bmi
 96 | #>   <chr>           <int> <dbl> <dbl>
 97 | #> 1 Luke Skywalker    172    77  26.0
 98 | #> 2 C-3PO             167    75  26.9
 99 | #> 3 R2-D2              96    32  34.7
100 | #> 4 Darth Vader       202   136  33.3
101 | #> 5 Leia Organa       150    49  21.8
102 | #> # ℹ 82 more rows
103 | ```
104 | 
105 | Our logic is more explicit about what's going on. Because both our fields are nullable/optional we have to specify the type.
106 | 
107 | ```haskell
108 | bmi (w :: Int) (h :: Int) = (fromIntegral w) / (fromIntegral h / 100) ** 2 :: Double
109 | 
110 | starwars
111 |   |> D.selectRange ("name", "mass")
112 |   -- mass and height are optionals so we combine them with
113 |   -- Haskell's Applicative operators.
114 |   |> D.deriveFrom (["mass", "height"], D.func (\w h -> bmi <$> w <*> h)) "bmi" 
115 |   |> D.take 10
116 | ```
117 | 
118 | ```
119 | -------------------------------------------------------------------------------
120 | index |         name          |  height   |   mass    |           bmi          
121 | ------|-----------------------|-----------|-----------|------------------------
122 |  Int  |         Text          | Maybe Int | Maybe Int |      Maybe Double      
123 | ------|-----------------------|-----------|-----------|------------------------
124 | 0     | Luke Skywalker        | Just 172  | Just 77   | Just 26.027582477014604
125 | 1     | C-3PO                 | Just 167  | Just 75   | Just 26.89232313815483 
126 | 2     | R2-D2                 | Just 96   | Just 32   | Just 34.72222222222222 
127 | 3     | Darth Vader           | Just 202  | Just 136  | Just 33.33006567983531 
128 | 4     | Leia Organa           | Just 150  | Just 49   | Just 21.77777777777778 
129 | 5     | Owen Lars             | Just 178  | Just 120  | Just 37.87400580734756 
130 | 6     | Beru Whitesun Lars    | Just 165  | Just 75   | Just 27.548209366391188
131 | 7     | R5-D4                 | Just 97   | Just 32   | Just 34.009990434690195
132 | 8     | Biggs Darklighter     | Just 183  | Just 84   | Just 25.082863029651524
133 | 9     | Obi-Wan Kenobi        | Just 182  | Just 77   | Just 23.24598478444632 
134 | ```
135 | 
136 | Haskell's applicative syntax does take some getting used to.
137 | 
138 | `f <$> a` means apply f to the thing inside the "container". In this
139 | case the container (or more infamously the monad) is of type `Maybe`.
140 | So this can also be written as `fmap f a`.
141 | 
142 | But this only works if our `f` takes a single argument. If it takes
143 | two arguments then we use `<*>` to specify the second argument.
144 | 
145 | So, applying a two-argument function such as `bmi` (or `(+)` in the example below) to two optionals can be written as:
146 | 
147 | ```haskell
148 | ghci> fmap (+) (Just 2) <*> Just 2
149 | Just 4
150 | ghci> (+) <$> Just 2 <*> Just 2
151 | Just 4
152 | ```
153 | 
154 | You'll find a wealth of functions for dealing with optionals in the module
155 | `Data.Maybe`.
156 | 
157 | ## Sorting
158 | 
159 | ```r
160 | starwars %>% 
161 |   arrange(desc(mass))
162 | #> # A tibble: 87 × 14
163 | #>   name      height  mass hair_color skin_color eye_color birth_year sex   gender
164 | #>   <chr>      <int> <dbl> <chr>      <chr>      <chr>          <dbl> <chr> <chr> 
165 | #> 1 Jabba De…    175  1358 <NA>       green-tan… orange         600   herm… mascu…
166 | #> 2 Grievous     216   159 none       brown, wh… green, y…       NA   male  mascu…
167 | #> 3 IG-88        200   140 none       metal      red             15   none  mascu…
168 | #> 4 Darth Va…    202   136 none       white      yellow          41.9 male  mascu…
169 | #> 5 Tarfful      234   136 brown      brown      blue            NA   male  mascu…
170 | #> # ℹ 82 more rows
171 | #> # ℹ 5 more variables: homeworld <chr>, species <chr>, films <list>,
172 | #> #   vehicles <list>, starships <list>
173 | ```
174 | 
175 | ```haskell
176 | starwars |> D.sortBy D.Descending ["mass"] |> D.take 5
177 | ```
178 | 
179 | ```
180 | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
181 | index |         name          |  height   |   mass    | hair_color |    skin_color    |   eye_color   | birth_year |      sex       |  gender   | homeworld | species |                                    films                                     |              vehicles              |            starships           
182 | ------|-----------------------|-----------|-----------|------------|------------------|---------------|------------|----------------|-----------|-----------|---------|------------------------------------------------------------------------------|------------------------------------|--------------------------------
183 |  Int  |         Text          | Maybe Int | Maybe Int |    Text    |       Text       |     Text      | Maybe Int  |      Text      |   Text    |   Text    |  Text   |                                     Text                                     |             Maybe Text             |           Maybe Text           
184 | ------|-----------------------|-----------|-----------|------------|------------------|---------------|------------|----------------|-----------|-----------|---------|------------------------------------------------------------------------------|------------------------------------|--------------------------------
185 | 0     | Jabba Desilijic Tiure | Just 175  | Just 1358 | NA         | green-tan, brown | orange        | Just 600   | hermaphroditic | masculine | Nal Hutta | Hutt    | A New Hope, Return of the Jedi, The Phantom Menace                           | Nothing                            | Nothing                        
186 | 1     | Grievous              | Just 216  | Just 159  | none       | brown, white     | green, yellow | Nothing    | male           | masculine | Kalee     | Kaleesh | Revenge of the Sith                                                          | Just "Tsmeu-6 personal wheel bike" | Just "Belbullab-22 starfighter"
187 | 2     | IG-88                 | Just 200  | Just 140  | none       | metal            | red           | Just 15    | none           | masculine | NA        | Droid   | The Empire Strikes Back                                                      | Nothing                            | Nothing                        
188 | 3     | Tarfful               | Just 234  | Just 136  | brown      | brown            | blue          | Nothing    | male           | masculine | Kashyyyk  | Wookiee | Revenge of the Sith                                                          | Nothing                            | Nothing                        
189 | 4     | Darth Vader           | Just 202  | Just 136  | none       | white            | yellow        | Nothing    | male           | masculine | Tatooine  | Human   | A New Hope, The Empire Strikes Back, Return of the Jedi, Revenge of the Sith | Nothing                            | Just "TIE Advanced x1"
190 | ```
191 | 
192 | ## Grouping and aggregating
193 | 
194 | ```r
195 | starwars %>%
196 |   group_by(species) %>%
197 |   summarise(
198 |     n = n(),
199 |     mass = mean(mass, na.rm = TRUE)
200 |   ) %>%
201 |   filter(
202 |     n > 1,
203 |     mass > 50
204 |   )
205 | #> # A tibble: 9 × 3
206 | #>   species      n  mass
207 | #>   <chr>    <int> <dbl>
208 | #> 1 Droid        6  69.8
209 | #> 2 Gungan       3  74  
210 | #> 3 Human       35  81.3
211 | #> 4 Kaminoan     2  88  
212 | #> 5 Mirialan     2  53.1
213 | #> # ℹ 4 more rows
214 | ```
215 | 
216 | ```haskell
217 | starwars |> D.select ["species", "mass"]
218 |          |> D.groupByAgg D.Count ["species"]
219 |          -- This will be saved in a column called "Mean_mass"
220 |          |> D.reduceByAgg D.Mean "mass"
221 |          -- Always better to be explicit about types for
222 |          -- numbers but you can also turn on defaults
223 |          -- to save keystrokes.
224 |          |> D.filterWhere (["Count", "Mean_mass"],
225 |                            D.func (\(n :: Int) (mass :: Double) -> n > 1 && mass > 50))
226 | ```
227 | 
228 | ```
229 | --------------------------------------------
230 | index | species  |     Mean_mass     | Count
231 | ------|----------|-------------------|------
232 |  Int  |   Text   |      Double       |  Int 
233 | ------|----------|-------------------|------
234 | 0     | Human    | 81.47368421052632 | 35   
235 | 1     | Droid    | 69.75             | 6    
236 | 2     | Wookiee  | 124.0             | 2    
237 | 3     | NA       | 81.0              | 4    
238 | 4     | Gungan   | 74.0              | 3    
239 | 5     | Zabrak   | 80.0              | 2    
240 | 6     | Twi'lek  | 55.0              | 2    
241 | 7     | Kaminoan | 88.0              | 2
242 | ```
243 | 


--------------------------------------------------------------------------------
/docs/coming_from_pandas.md:
--------------------------------------------------------------------------------
  1 | # Coming from pandas
  2 | 
  3 | We'll be porting over concepts from [10 minutes to Pandas](https://pandas.pydata.org/docs/user_guide/10min.html).
  4 | 
  5 | ## Basic Data Structures
  6 | 
  7 | A pandas `Series` maps to a `Column`. `Series` are indexable (labelled) arrays. We currently don't support indexing, so `Column`s aren't meant to be manipulated directly and we don't focus on them too much.
  8 | 
  9 | A `DataFrame` maps to a `DataFrame` as expected. Our dataframes are essentially a list of `Vector`s with some metadata for managing state.
 10 | 
 11 | ## Creating our structures
 12 | 
 13 | Creating a series.
 14 | 
 15 | ```python
 16 | python> s = pd.Series([1, 3, 5, np.nan, 6, 8])
 17 | python> s
 18 | 0    1.0
 19 | 1    3.0
 20 | 2    5.0
 21 | 3    NaN
 22 | 4    6.0
 23 | 5    8.0
 24 | dtype: float64
 25 | ```
 26 | 
 27 | ```haskell
 28 | ghci> import qualified DataFrame as D
 29 | ghci> D.toColumn [1, 3, 5, read @Float "NaN", 6, 8]
 30 | [1.0,3.0,5.0,NaN,6.0,8.0]
 31 | ```
 32 | 
 33 | ```python
 34 | python> dates = pd.date_range("20130101", periods=6)
 35 | python> dates
 36 | DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
 37 |                '2013-01-05', '2013-01-06'],
 38 |               dtype='datetime64[ns]', freq='D')
 39 | ```
 40 | 
 41 | ```haskell
 42 | ghci> import Data.Time.Calendar
 43 | ghci> dates = D.toColumn $ Prelude.take 6 $ [fromGregorian 2013 01 01..]
 44 | ghci> dates
 45 | [2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06]
 46 | ```
 47 | 
 48 | Use the series to create a dataframe.
 49 | 
 50 | ```python
 51 | python> df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD"))
 52 | python> df
 53 |                    A         B         C         D
 54 | 2013-01-01  0.469112 -0.282863 -1.509059 -1.135632
 55 | 2013-01-02  1.212112 -0.173215  0.119209 -1.044236
 56 | 2013-01-03 -0.861849 -2.104569 -0.494929  1.071804
 57 | 2013-01-04  0.721555 -0.706771 -1.039575  0.271860
 58 | 2013-01-05 -0.424972  0.567020  0.276232 -1.087401
 59 | 2013-01-06 -0.673690  0.113648 -1.478427  0.524988
 60 | ```
 61 | 
 62 | ```haskell
 63 | ghci> import qualified Data.Vector as V
 64 | ghci> import System.Random (randomRIO)
 65 | ghci> import Control.Monad (replicateM)
 66 | ghci> import Data.List (foldl')
 67 | ghci> :set -XOverloadedStrings
 68 | ghci> initDf = D.fromList [("date", dates)]
 69 | ghci> ns <- replicateM 4 (replicateM 6 (randomRIO (-2.0, 2.0)))
 70 | ghci> df = foldl' (\d (name, col) -> D.insertColumn name (V.fromList col) d) initDf (zip ["A","B","C","D"] ns)
 71 | ghci> df
 72 | ------------------------------------------------------------------------------------------------------------
 73 | index |    date    |          A          |          B           |          C           |          D         
 74 | ------|------------|---------------------|----------------------|----------------------|--------------------
 75 |  Int  |    Day     |       Double        |        Double        |        Double        |       Double       
 76 | ------|------------|---------------------|----------------------|----------------------|--------------------
 77 | 0     | 2013-01-01 | 0.49287792598710745 | 1.2126312556288785   | -1.3553292904555625  | 1.8491213627748553 
 78 | 1     | 2013-01-02 | 0.7936547276080512  | -1.5209756494542028  | -0.5208055385837551  | 0.8895325450813525 
 79 | 2     | 2013-01-03 | 1.8883976214395153  | 1.3453541205495676   | -1.1801018894304223  | 0.20583994035730901
 80 | 3     | 2013-01-04 | -1.3262867911904324 | -0.37375298679005686 | -0.8580515357149543  | 1.4681616115128593 
 81 | 4     | 2013-01-05 | 1.9068894062167745  | 0.792553168600036    | -0.13526265076664545 | -1.6239378251651466
 82 | 5     | 2013-01-06 | -0.5541246187320041 | -1.5791034339829042  | -1.5650415391333796  | -1.7802523632196152
 83 | ```
 84 | 
 85 | As hinted in the previous example we can create a dataframe with `fromList`. This function takes in a list of tuples. We don't broadcast values like Python does, i.e. if you put a single value into a column all other values will be null/nothing. But we'll detail how to get the same functionality.
 86 | 
 87 | ```python
 88 | df2 = pd.DataFrame(
 89 |     {
 90 |         "A": 1.0,
 91 |         "B": pd.Timestamp("20130102"),
 92 |         "C": pd.Series(1, index=list(range(4)), dtype="float32"),
 93 |         "D": np.array([3] * 4, dtype="int32"),
 94 |         "E": pd.Categorical(["test", "train", "test", "train"]),
 95 |         "F": "foo",
 96 |     }
 97 | )
 98 | 
 99 | # Result
100 | # df2
101 | #      A          B    C  D      E    F
102 | # 0  1.0 2013-01-02  1.0  3   test  foo
103 | # 1  1.0 2013-01-02  1.0  3  train  foo
104 | # 2  1.0 2013-01-02  1.0  3   test  foo
105 | # 3  1.0 2013-01-02  1.0  3  train  foo
106 | 
107 | ```
108 | 
109 | ```haskell
110 | -- All our data types must be printable and orderable.
111 | data Transport = Test | Train deriving (Show, Ord, Eq)
112 | ghci> :{
113 | ghci| df = D.fromList [
114 | ghci|        ("A", D.toColumn (replicate 4 1.0)),
115 | ghci|        ("B", D.toColumn (replicate 4 (fromGregorian 2013 01 02))),
116 | ghci|        ("C", D.toColumn (replicate 4 (1.0 :: Float))),
117 | ghci|        ("D", D.toColumn (replicate 4 (3 :: Int))),
118 | ghci|        ("E", D.toColumn (take 4 $ cycle [Test, Train])),
119 | ghci|        ("F", D.toColumn (replicate 4 "foo"))]
120 | ghci|:}
121 | ghci> df
122 | --------------------------------------------------------------
123 | index |   A    |     B      |   C   |  D  |     E     |   F   
124 | ------|--------|------------|-------|-----|-----------|-------
125 |  Int  | Double |    Day     | Float | Int | Transport | [Char]
126 | ------|--------|------------|-------|-----|-----------|-------
127 | 0     | 1.0    | 2013-01-02 | 1.0   | 3   | Test      | foo   
128 | 1     | 1.0    | 2013-01-02 | 1.0   | 3   | Train     | foo   
129 | 2     | 1.0    | 2013-01-02 | 1.0   | 3   | Test      | foo   
130 | 3     | 1.0    | 2013-01-02 | 1.0   | 3   | Train     | foo
131 | ```
132 | 
133 | Rather than label a string value as categorical we create a type that encapsulates the value.
134 | 
135 | ## Viewing data
136 | 
137 | By default we print the whole dataframe. To see the first `n` rows we instead provide a `take` function that takes in as arguments `n` and the dataframe.
138 | 
139 | ```haskell
140 | ghci> D.take 2 df
141 | --------------------------------------------------------------
142 | index |   A    |     B      |   C   |  D  |     E     |   F   
143 | ------|--------|------------|-------|-----|-----------|-------
144 |  Int  | Double |    Day     | Float | Int | Transport | [Char]
145 | ------|--------|------------|-------|-----|-----------|-------
146 | 0     | 1.0    | 2013-01-02 | 1.0   | 3   | Test      | foo   
147 | 1     | 1.0    | 2013-01-02 | 1.0   | 3   | Train     | foo 
148 | ```
149 | 
150 | Our equivalent of describe is `summarize`:
151 | 
152 | ```haskell
153 | ghci> D.summarize df
154 | -----------------------------------------------------
155 | index | Statistic |     D     |     C     |     A    
156 | ------|-----------|-----------|-----------|----------
157 |  Int  |   Text    |  Double   |  Double   |  Double  
158 | ------|-----------|-----------|-----------|----------
159 | 0     | Mean      | 3.0       | 1.0       | 1.0      
160 | 1     | Minimum   | 3.0       | 1.0       | 1.0      
161 | 2     | 25%       | 3.0       | 1.0       | 1.0      
162 | 3     | Median    | 3.0       | 1.0       | 1.0      
163 | 4     | 75%       | 3.0       | 1.0       | 1.0      
164 | 5     | Max       | 3.0       | 1.0       | 1.0      
165 | 6     | StdDev    | 0.0       | 0.0       | 0.0      
166 | 7     | IQR       | 0.0       | 0.0       | 0.0      
167 | 8     | Skewness  | -Infinity | -Infinity | -Infinity
168 | ```
169 | 
170 | #### Sorting
171 | 
172 | Since we don't have indexes we only have one sort function that sorts by a column.
173 | 
174 | ```haskell
175 | ghci> D.sortBy D.Ascending ["E"] df
176 | --------------------------------------------------------------
177 | index |   A    |     B      |   C   |  D  |     E     |   F   
178 | ------|--------|------------|-------|-----|-----------|-------
179 |  Int  | Double |    Day     | Float | Int | Transport | [Char]
180 | ------|--------|------------|-------|-----|-----------|-------
181 | 0     | 1.0    | 2013-01-02 | 1.0   | 3   | Test      | foo   
182 | 1     | 1.0    | 2013-01-02 | 1.0   | 3   | Test      | foo   
183 | 2     | 1.0    | 2013-01-02 | 1.0   | 3   | Train     | foo   
184 | 3     | 1.0    | 2013-01-02 | 1.0   | 3   | Train     | foo
185 | ```
186 | 
187 | ## Selection
188 | Pandas' `[]` operator is a jack-knife that does a number of kinds of aggregation.
189 | As such it doesn't map to one construct and doesn't always have an equivalent in Haskell.
190 | 
191 | ### Selecting columns
192 | 
193 | ```python
194 | python> df.loc[:, ["A", "B"]]
195 |                    A         B
196 | 2013-01-01  0.469112 -0.282863
197 | 2013-01-02  1.212112 -0.173215
198 | 2013-01-03 -0.861849 -2.104569
199 | 2013-01-04  0.721555 -0.706771
200 | 2013-01-05 -0.424972  0.567020
201 | 2013-01-06 -0.673690  0.113648
202 | ```
203 | 
204 | Pandas indexes the dataframe like a 2D array. We get all rows with `:` and then specify which columns after the comma.
205 | 
206 | In DataFrame we mimic SQL's select.
207 | 
208 | ```haskell
209 | ghci> D.select ["A"] df
210 | --------------
211 | index |   A   
212 | ------|-------
213 |  Int  | Double
214 | ------|-------
215 | 0     | 1.0   
216 | 1     | 1.0   
217 | 2     | 1.0   
218 | 3     | 1.0
219 | ```
220 | 
221 | To filter by rows we have to filter by the values we are interested in rather than indexes.
222 | 
223 | ```python
224 | python> df.loc["20130102":"20130104", ["A", "B"]]
225 |                    A         B
226 | 2013-01-02  1.212112 -0.173215
227 | 2013-01-03 -0.861849 -2.104569
228 | 2013-01-04  0.721555 -0.706771
229 | ```
230 | 
231 | ```haskell
232 | ghci> :{
233 | ghci| df' |> D.filter "date" (\d -> d >= (fromGregorian 2013 01 02) && d <= (fromGregorian 2013 01 04))
234 | ghci| |> D.select ["A", "B"]
235 | ghci| :}
236 | ghci> df
237 | ---------------------------
238 | index |   A    |     B     
239 | ------|--------|-----------
240 |  Int  | Double |    Day    
241 | ------|--------|-----------
242 | 0     | 1.0    | 2013-01-02
243 | 1     | 1.0    | 2013-01-02
244 | 2     | 1.0    | 2013-01-02
245 | ```
246 | 
247 | ## Missing values
248 | 
249 | Rows with missing values are represented by a `Maybe a` type. Dealing with missing values means applying the usual `Maybe` functions to the data.
250 | 
251 | ### Filling
252 | 
253 | ```haskell
254 | ghci> df' = D.addColumn "G" (V.fromList [Just 1, Just 2, Nothing, Just 4]) df
255 | ghci> df'
256 | ------------------------------------------------------------------------------
257 | index |   A    |     B      |   C   |  D  |     E     |   F    |       G      
258 | ------|--------|------------|-------|-----|-----------|--------|--------------
259 |  Int  | Double |    Day     | Float | Int | Transport | [Char] | Maybe Integer
260 | ------|--------|------------|-------|-----|-----------|--------|--------------
261 | 0     | 1.0    | 2013-01-02 | 1.0   | 3   | Test      | foo    | Just 1       
262 | 1     | 1.0    | 2013-01-02 | 1.0   | 3   | Train     | foo    | Just 2       
263 | 2     | 1.0    | 2013-01-02 | 1.0   | 3   | Test      | foo    | Nothing      
264 | 3     | 1.0    | 2013-01-02 | 1.0   | 3   | Train     | foo    | Just 4 
265 | ghci> D.apply (fromMaybe 5) "G" df'
266 | ------------------------------------------------------------------------
267 | index |   A    |     B      |   C   |  D  |     E     |   F    |    G   
268 | ------|--------|------------|-------|-----|-----------|--------|--------
269 |  Int  | Double |    Day     | Float | Int | Transport | [Char] | Integer
270 | ------|--------|------------|-------|-----|-----------|--------|--------
271 | 0     | 1.0    | 2013-01-02 | 1.0   | 3   | Test      | foo    | 1      
272 | 1     | 1.0    | 2013-01-02 | 1.0   | 3   | Train     | foo    | 2      
273 | 2     | 1.0    | 2013-01-02 | 1.0   | 3   | Test      | foo    | 5      
274 | 3     | 1.0    | 2013-01-02 | 1.0   | 3   | Train     | foo    | 4
275 | ghci> df' |> D.filter "G" (isJust @Integer)
276 | ------------------------------------------------------------------------------
277 | index |   A    |     B      |   C   |  D  |     E     |   F    |       G      
278 | ------|--------|------------|-------|-----|-----------|--------|--------------
279 |  Int  | Double |    Day     | Float | Int | Transport | [Char] | Maybe Integer
280 | ------|--------|------------|-------|-----|-----------|--------|--------------
281 | 0     | 1.0    | 2013-01-02 | 1.0   | 3   | Test      | foo    | Just 1       
282 | 1     | 1.0    | 2013-01-02 | 1.0   | 3   | Train     | foo    | Just 2       
283 | 2     | 1.0    | 2013-01-02 | 1.0   | 3   | Train     | foo    | Just 4
284 | ```
285 | 


--------------------------------------------------------------------------------
/docs/coming_from_polars.md:
--------------------------------------------------------------------------------
  1 | # Coming from Polars
  2 | 
  3 | This tutorial will walk through the examples in Polars' [getting started guide](https://docs.pola.rs/user-guide/getting-started/) showing how concepts in Polars map to dataframe.
  4 | 
  5 | ## Reading and writing CSV
  6 | 
  7 | ### Round trip test
  8 | 
  9 | To test our CSV IO we'll create a dataframe programmatically, write it to a CSV file, then read the CSV file back again.
 10 | 
 11 | In polars this looks like:
 12 | 
 13 | ```python
 14 | import polars as pl
 15 | import datetime as dt
 16 | 
 17 | df = pl.DataFrame(
 18 |     {
 19 |         "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"],
 20 |         "birthdate": [
 21 |             dt.date(1997, 1, 10),
 22 |             dt.date(1985, 2, 15),
 23 |             dt.date(1983, 3, 22),
 24 |             dt.date(1981, 4, 30),
 25 |         ],
 26 |         "weight": [57.9, 72.5, 53.6, 83.1],  # (kg)
 27 |         "height": [1.56, 1.77, 1.65, 1.75],  # (m)
 28 |     }
 29 | )
 30 | df.write_csv("docs/assets/data/output.csv")
 31 | df_csv = pl.read_csv("docs/assets/data/output.csv", try_parse_dates=True)
 32 | print(df_csv)
 33 | ```
 34 | 
 35 | As a standalone dataframe script this would look like.
 36 | 
 37 | 
 38 | ```haskell
 39 | import qualified DataFrame as D
 40 | import Data.Time.Calendar
 41 | 
 42 | main :: IO
 43 | main = do
 44 |     let df = D.fromList [
 45 |         ("name", D.toColumn [ "Alice Archer"
 46 |                             , "Ben Brown"
 47 |                             , "Chloe Cooper"
 48 |                             , "Daniel Donovan"])
 49 |         , ("birthdate", D.toColumn [ fromGregorian 1997 01 10
 50 |                                    , fromGregorian 1985 02 15
 51 |                                    , fromGregorian 1983 03 22
 52 |                                    , fromGregorian 1981 04 30])
 53 |         , ("weight", D.toColumn [57.9, 72.5, 53.6, 83.1])
 54 |         , ("height", D.toColumn [1.56, 1.77, 1.65, 1.75])]
 55 |     print df
 56 |     D.writeCsv "./data/output.csv" df
 57 |     let df_csv = D.readCsv "./data/output.csv"
 58 |     print df_csv
 59 | ```
 60 | 
 61 | This round trip prints the following tables:
 62 | 
 63 | ```
 64 | -----------------------------------------------------
 65 | index |      name      | birthdate  | weight | height
 66 | ------|----------------|------------|--------|-------
 67 |  Int  |     [Char]     |    Day     | Double | Double
 68 | ------|----------------|------------|--------|-------
 69 | 0     | Alice Archer   | 1997-01-10 | 57.9   | 1.56  
 70 | 1     | Ben Brown      | 1985-02-15 | 72.5   | 1.77  
 71 | 2     | Chloe Cooper   | 1983-03-22 | 53.6   | 1.65  
 72 | 3     | Daniel Donovan | 1981-04-30 | 83.1   | 1.75
 73 | 
 74 | -----------------------------------------------------
 75 | index |      name      | birthdate  | weight | height
 76 | ------|----------------|------------|--------|-------
 77 |  Int  |      Text      |    Day     | Double | Double
 78 | ------|----------------|------------|--------|-------
 79 | 0     | Alice Archer   | 1997-01-10 | 57.9   | 1.56  
 80 | 1     | Ben Brown      | 1985-02-15 | 72.5   | 1.77  
 81 | 2     | Chloe Cooper   | 1983-03-22 | 53.6   | 1.65  
 82 | 3     | Daniel Donovan | 1981-04-30 | 83.1   | 1.75  
 83 | 
 84 | ```
 85 | 
 86 | Notice that the type of the string column changes from `[Char]` (Haskell's default) to `Text` (dataframe's default).
 87 | 
 88 | 
 89 | ## Expressions
 90 | 
 91 | Our equivalent to expressions is a tuple that contains a list of the column names followed by a
 92 | function where the arguments correspond to the order of column names. We use a special function
 93 | wrapper to make our dataframes accept functions with any number of arguments. This is done using
 94 | the `func` function.
 95 | 
 96 | This is a mouthful and is probably easier to see in action/comparison.
 97 | 
 98 | For example:
 99 | 
100 | ```python
101 | result = df.select(
102 |     pl.col("name"),
103 |     pl.col("birthdate").dt.year().alias("birth_year"),
104 |     (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"),
105 | )
106 | print(result)
107 | ```
108 | 
109 | Would be written as:
110 | 
111 | ```haskell
112 | {-# LANGUAGE ScopedTypeVariables #-}
113 | {-# LANGUAGE TypeApplications #-}
114 | import qualified DataFrame as D
115 | import qualified Data.Text as T
116 | 
117 | import DataFrame.Operations ( (|>) )
118 | import Data.Time.Calendar
119 | 
120 | main :: IO ()
121 | main = do
122 |     ...
123 |     let year = (\(YearMonthDay y _ _) -> y)
124 |     print $ df_csv
125 |           |> D.derive "birth_year" year "birthdate"
126 |           |> D.deriveFrom (["weight", "height"], D.func (\(w :: Double) (h :: Double) -> w / h ** 2))
127 |                        "bmi"
128 |           |> D.select ["name", "birth_year", "bmi"]
129 | ```
130 | 
131 | Or, more clearly:
132 | 
133 | ```haskell
134 | {-# LANGUAGE ScopedTypeVariables #-}
135 | {-# LANGUAGE TypeApplications #-}
136 | import qualified DataFrame as D
137 | import qualified Data.Text as T
138 | 
139 | import DataFrame ( (|>) )
140 | import Data.Time.Calendar
141 | 
142 | main :: IO ()
143 | main = do
144 |     ...
145 |     let year = (\(YearMonthDay y _ _) -> y)
146 |     let bmi :: Double -> Double -> Double
147 |         bmi w h = w / h ** 2
148 |     print $ df_csv
149 |           |> D.derive "birth_year" year "birthdate"
150 |           |> D.deriveFrom (["weight", "height"], D.func bmi) "bmi"
151 |           |> D.select ["name", "birth_year", "bmi"]
152 | ```
153 | 
154 | Resulting in:
155 | 
156 | ```
157 | --------------------------------------------------------
158 | index |      name      | birth_year |        bmi        
159 | ------|----------------|------------|-------------------
160 |  Int  |      Text      |  Integer   |       Double      
161 | ------|----------------|------------|-------------------
162 | 0     | Alice Archer   | 1997       | 23.791913214990135
163 | 1     | Ben Brown      | 1985       | 23.14149829231702 
164 | 2     | Chloe Cooper   | 1983       | 19.687786960514234
165 | 3     | Daniel Donovan | 1981       | 27.13469387755102 
166 | ```
167 | 
168 | The dataframe implementation can be read top down. `apply` a function that gets the year to the `birthdate`;
169 | store the result in the `birth_year` column; combine `weight` and `height` into the bmi column using the
170 | formula `w / h ** 2`; then select the `name`, `birth_year` and `bmi` fields.
171 | 
172 | Dataframe focuses on splitting transformations into transformations on the whole dataframe so it's easily usable
173 | in a repl-like environment.
174 | 
175 | In Polars' expression expansion example:
176 | 
177 | ```python
178 | result = df.select(
179 |     pl.col("name"),
180 |     (pl.col("weight", "height") * 0.95).round(2).name.suffix("-5%"),
181 | )
182 | print(result)
183 | ```
184 | 
185 | We instead write this as two `applyWithAlias` calls:
186 | 
187 | ```haskell
188 | df_csv
189 |     |> D.derive "weight-5%" (*0.95) "weight"
190 |     -- Alternatively we can use the `as` function.
191 |     |> D.as "height-5%" D.apply (*0.95) "height"
192 |     |> D.select ["name", "weight-5%", "height-5%"]
193 | ```
194 | 
195 | ```
196 | ----------------------------------------------------------------
197 | index |      name      |     height-5%      |     weight-5%     
198 | ------|----------------|--------------------|-------------------
199 |  Int  |     [Char]     |       Double       |       Double      
200 | ------|----------------|--------------------|-------------------
201 | 0     | Alice Archer   | 1.482              | 55.004999999999995
202 | 1     | Ben Brown      | 1.6815             | 68.875            
203 | 2     | Chloe Cooper   | 1.5675             | 50.92             
204 | 3     | Daniel Donovan | 1.6624999999999999 | 78.945
205 | ```
206 | 
207 | However we can make our program shorter by using regular Haskell and folding over the dataframe.
208 | 
209 | ```haskell
210 | let reduce name = D.derive (name <> "-5%") (*0.95) name
211 | df_csv
212 |     |> D.fold reduce ["weight", "height"]
213 |     |> D.select ["name", "weight-5%", "height-5%"]
214 | ```
215 | 
216 | Or alternatively,
217 | 
218 | ```haskell
219 | addSuffix suffix name = D.rename name (name <> suffix)
220 | df_csv
221 |   |> D.applyMany ["weight", "height"] (*0.95)
222 |   |> D.fold (addSuffix "-5%")
223 |   |> D.select ["name", "weight-5%", "height-5%"]
224 | ```
225 | 
226 | Filtering looks much the same:
227 | 
228 | ```python
229 | result = df.filter(pl.col("birthdate").dt.year() < 1990)
230 | print(result)
231 | ```
232 | 
233 | Versus
234 | 
235 | ```haskell
236 | bornAfter1990 = ( (< 1990)
237 |                 . (\(YearMonthDay y _ _) -> y))
238 | df_csv &
239 |     D.filter "birthdate" bornAfter1990
240 | ```
241 | 
242 | ```
243 | -----------------------------------------------------
244 | index |      name      | birthdate  | weight | height
245 | ------|----------------|------------|--------|-------
246 |  Int  |      Text      |    Day     | Double | Double
247 | ------|----------------|------------|--------|-------
248 | 0     | Ben Brown      | 1985-02-15 | 72.5   | 1.77  
249 | 1     | Chloe Cooper   | 1983-03-22 | 53.6   | 1.65  
250 | 2     | Daniel Donovan | 1981-04-30 | 83.1   | 1.75
251 | ```
252 | 
253 | For multiple filter conditions we again make all the filter statements separate, chaining one `filter` call per condition.
254 | 
255 | ```python
256 | result = df.filter(
257 |     pl.col("birthdate").is_between(dt.date(1982, 12, 31), dt.date(1996, 1, 1)),
258 |     pl.col("height") > 1.7,
259 | )
260 | print(result)
261 | ```
262 | 
263 | ```haskell
264 | year (YearMonthDay y _ _) = y
265 | between a b y = y >= a && y <= b 
266 | df_csv
267 |   |> D.filter "birthdate"
268 |              (between 1982 1996 . year)
269 |   |> D.filter "height" (1.7 <)
270 | ```
271 | 
272 | ```
273 | ------------------------------------------------
274 | index |   name    | birthdate  | weight | height
275 |  Int  |   Text    |    Day     | Double | Double
276 | ------|-----------|------------|--------|-------
277 | 0     | Ben Brown | 1985-02-15 | 72.5   | 1.77
278 | ```
279 | 
280 | ```python
281 | result = df.group_by(
282 |     (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
283 |     maintain_order=True,
284 | ).len()
285 | print(result)
286 | ```
287 | 
288 | Polars's `groupBy` does an implicit select. In dataframe the select is written explicitly.
289 | 
290 | We implicitly create a `Count` variable as the result of grouping by an aggregate. In general, for a `groupByAgg` we create a variable with the same name as the aggregation to store the aggregation in.
291 | 
292 | ```haskell
293 | let decade = (*10) . flip div 10 . year
294 | df_csv
295 |     |> D.derive "decade" decade "birthdate"
296 |     |> D.select ["decade"]
297 |     |> D.groupByAgg D.Count ["decade"]
298 | ```
299 | 
300 | ```
301 | ----------------------
302 | index | decade | Count
303 | ------|--------|------
304 |  Int  |  Int   | Int
305 | ------|--------|------
306 | 0     | 1990   | 1  
307 | 1     | 1980   | 3 
308 | ```
309 | 
310 | TODO: Add notes
311 | 
312 | ```python
313 | result = df.group_by(
314 |     (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
315 |     maintain_order=True,
316 | ).agg(
317 |     pl.len().alias("sample_size"),
318 |     pl.col("weight").mean().round(2).alias("avg_weight"),
319 |     pl.col("height").max().alias("tallest"),
320 | )
321 | print(result)
322 | ```
323 | 
324 | ```haskell
325 | decade = (*10) . flip div 10 . year
326 | df_csv
327 |     |> D.derive "decade" decade "birthdate"
328 |     |> D.groupByAgg D.Count ["decade"]
329 |     |> D.aggregate [("height", D.Maximum), ("weight", D.Mean)]
330 |     |> D.select ["decade", "sampleSize", "Mean_weight", "Maximum_height"]
331 | ```
332 | 
333 | ```
334 | ----------------------------------------------------
335 | index | decade  |    Mean_weight    | Maximum_height
336 | ------|---------|-------------------|---------------
337 |  Int  | Integer |      Double       |     Double    
338 | ------|---------|-------------------|---------------
339 | 0     | 1990    | 57.9              | 1.56          
340 | 1     | 1980    | 69.73333333333333 | 1.77
341 | ```
342 | 
343 | 
344 | ```python
345 | result = (
346 |     df.with_columns(
347 |         (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"),
348 |         pl.col("name").str.split(by=" ").list.first(),
349 |     )
350 |     .select(
351 |         pl.all().exclude("birthdate"),
352 |     )
353 |     .group_by(
354 |         pl.col("decade"),
355 |         maintain_order=True,
356 |     )
357 |     .agg(
358 |         pl.col("name"),
359 |         pl.col("weight", "height").mean().round(2).name.prefix("avg_"),
360 |     )
361 | )
362 | print(result)
363 | ```
364 | 
365 | ```haskell
366 | let firstWord = head . T.split (' ' ==)
367 | df_csv
368 |     |> D.apply firstWord "name"
369 |     |> D.derive "decade" decade "birthdate"
370 |     |> D.exclude ["birthdate"]
371 |     |> D.groupByAgg D.Count ["decade"]
372 |     |> D.aggregate [("weight",  D.Mean), ("height", D.Mean)]
373 | ```
374 | 
375 | ```
376 | -------------------------------------------------------------------------------------------
377 | index | decade  |           name           | Count |    Mean_height     |    Mean_weight   
378 | ------|---------|--------------------------|-------|--------------------|------------------
379 |  Int  | Integer |       Vector Text        |  Int  |       Double       |      Double      
380 | ------|---------|--------------------------|-------|--------------------|------------------
381 | 0     | 1990    | ["Alice"]                | 1     | 1.56               | 57.9             
382 | 1     | 1980    | ["Ben","Daniel","Chloe"] | 3     | 1.7233333333333334 | 69.73333333333333
383 | ```
384 | 


--------------------------------------------------------------------------------
/docs/configuration_notes.md:
--------------------------------------------------------------------------------
 1 | # Configuration notes
 2 | 
 3 | ## Windows
 4 | Powershell doesn't support UTF-8 encoding out the box. You need to run:
 5 | 
 6 | ```
 7 | $OutputEncoding = [console]::InputEncoding = [console]::OutputEncoding = New-Object System.Text.UTF8Encoding
 8 | ```
 9 | 
10 | To show terminal plot output.
11 | 


--------------------------------------------------------------------------------
/docs/exploratory_data_analysis_primer.md:
--------------------------------------------------------------------------------
  1 | # A primer on Exploratory Data Analysis
  2 | 
  3 | Exploratory data analysis (EDA), in brief, is what you do when you first get a dataset. EDA should help us answer questions about the data and help us formulate new ones. It is the step before any modelling or inference where we look at the data so we can:
  4 | 
  5 | * check for completeness/correctness of data.
  6 | * understand the relationships between the explanatory variables.
  7 | * understand the relationship between the explanatory and outcome variables.
  8 | * preliminarily determine what models would be appropriate for our data. 
  9 | 
 10 | It's important for EDA tools to be feature-rich and intuitive so we can answer many different kinds of questions about the data without the tool getting in the way.
 11 | 
 12 | 
 13 | There are four types of exploratory data analysis:
 14 | 
 15 | * univariate non-graphical analysis
 16 | * multivariate non-graphical analysis
 17 | * univariate graphical analysis
 18 | * multivariate graphical analysis
 19 | 
 20 | We will look at each type of EDA and describe how we can use dataframe for each type. We'll be using the [California Housing Dataset](https://www.kaggle.com/datasets/camnugent/california-housing-prices) to demonstrate the concepts as we explain them.
 21 | 
 22 | ## Univariate non-graphical analysis
 23 | 
 24 | Univariate non-graphical analysis should give us a sense of the distribution of our dataset's variables. In the real world our variables are measurable characteristics. How they are distributed (the "sample distribution") may often help us estimate the overall distribution ("population distribution") of the variable. For example, if our variable was finishing times for a race, our analysis should be able to answer questions like what was the slowest time, what time did people tend to run, who was the fastest, were all times recorded etc.
 25 | 
 26 | For categorical data the best univariate non-graphical analysis is a tabulation of the frequency of each category.
 27 | 
 28 | ```haskell
 29 | ghci> import qualified DataFrame as D
 30 | ghci> D.frequencies "ocean_proximity" df
 31 | 
 32 | ------------------------------------------------------------------------------
 33 | index |   Statistic    | <1H OCEAN | INLAND  | ISLAND  | NEAR BAY | NEAR OCEAN
 34 | ------|----------------|-----------|---------|---------|----------|-----------
 35 |  Int  |      Text      |  Integer  | Integer | Integer | Integer  |  Integer  
 36 | ------|----------------|-----------|---------|---------|----------|-----------
 37 | 0     | Count          | 9136      | 6551    | 5       | 2290     | 2658      
 38 | 1     | Percentage (%) | 44        | 31      | 0       | 11       | 12
 39 | ```
 40 | 
 41 | We can also plot similar tables for non-categorical data with a small value set, e.g. shoe sizes.
 42 | 
 43 | For quantitative data our goal is to understand the population distribution through our sample distribution. For a given quantitative variable we typically care about its:
 44 | 
 45 | * presence (how much data is missing from each characteristic/variable)
 46 | * center (what a "typical" value looks like for some definition of typical),
 47 | * spread (how far values are from the "typical" value),
 48 | * modality (what are the most popular ranges of values),
 49 | * shape (is the data normally distributed? does it skew left or right?),
 50 | * and outliers (how common are outliers)
 51 | 
 52 | We can calculate sample statistics from the data such as the sample mean, sample variance etc. Although it's most often useful to use graphs to visualize the data's distribution, univariate non-graphical EDA describes aspects of the data's histogram. 
 53 | 
 54 | ### Missing data
 55 | Arguably the first thing to do when presented with a dataset is check for null values.
 56 | 
 57 | ```haskell
 58 | ghci> D.columnInfo df
 59 | -----------------------------------------------------------------------------
 60 | index |    Column Name     | # Non-null Values | # Null Values |     Type    
 61 | ------|--------------------|-------------------|---------------|-------------
 62 |  Int  |       [Char]       |        Int        |      Int      |    [Char]   
 63 | ------|--------------------|-------------------|---------------|-------------
 64 | 0     | total_bedrooms     | 20433             | 207           | Maybe Double
 65 | 1     | ocean_proximity    | 20640             | 0             | Text        
 66 | 2     | median_house_value | 20640             | 0             | Double      
 67 | 3     | median_income      | 20640             | 0             | Double      
 68 | 4     | households         | 20640             | 0             | Double      
 69 | 5     | population         | 20640             | 0             | Double      
 70 | 6     | total_rooms        | 20640             | 0             | Double      
 71 | 7     | housing_median_age | 20640             | 0             | Double      
 72 | 8     | latitude           | 20640             | 0             | Double      
 73 | 9     | longitude          | 20640             | 0             | Double
 74 | ```
 75 | 
 76 | It seems we have most of the data except some missing total bedrooms. Dealing with nulls is a separate topic that requires intimate knowledge of the data. So for this initial pass we'll leave out the total_bedrooms variable.
 77 | 
 78 | ### Central tendency
 79 | The central tendency of a distribution describes a "typical" value of that distribution. The most common statistical measures of central tendency are arithmetic mean and median. For symmetric distributions the mean and the median are the same. But for a skewed distribution the mean is pulled towards the "heavier" side whereas the median is more robust to these changes.
 80 | 
 81 | For a given column calculating the mean and median is fairly straightforward and shown below.
 82 |             
 83 | ```haskell
 84 | ghci> D.mean "housing_median_age" df
 85 | Just 28.63948643410852
 86 | ghci> D.median "housing_median_age" df
 87 | Just 29.0
 88 | ```
 89 | 
 90 | Note: the values are displayed with a `Just` to denote that they may not be computable or not exist. Trying to get the mean or median of a non-numeric column would return `Nothing`. `Nothing` is similar to `NULL` in SQL.
 91 | 
 92 | ### Spread
 93 | Spread is a measure of how far away from the center we are still likely to find data values. There are four main measures of spread: variance, mean absolute deviation, standard deviation, and interquartile range.
 94 | 
 95 | ### Mean absolute deviation
 96 | We start by looking at mean absolute deviation since it's the simplest measure of spread. The mean absolute deviation measures how far from the average values are on average. We calculate it by taking the absolute value of the difference between each observation and the mean of that variable, then finally taking the average of those.
 97 | 
 98 | In the housing dataset it'll tell how "typical" our typical home price is.
 99 | 
100 | ```haskell
101 | ghci> import Data.Maybe
102 | ghci> m = fromMaybe 0 $ D.mean "median_house_value" df
103 | 206855.81690891474
104 | ghci> df |> D.derive "deviation" (\v -> abs (v - m)) "median_house_value" |> D.select ["median_house_value", "deviation"] |> D.take 10
105 | -----------------------------------------------
106 | index | median_house_value |     deviation     
107 | ------|--------------------|-------------------
108 |  Int  |       Double       |       Double      
109 | ------|--------------------|-------------------
110 | 0     | 452600.0           | 245744.18309108526
111 | 1     | 358500.0           | 151644.18309108526
112 | 2     | 352100.0           | 145244.18309108526
113 | 3     | 341300.0           | 134444.18309108526
114 | 4     | 342200.0           | 135344.18309108526
115 | 5     | 269700.0           | 62844.18309108526 
116 | 6     | 299200.0           | 92344.18309108526 
117 | 7     | 241400.0           | 34544.18309108526 
118 | 8     | 226700.0           | 19844.18309108526 
119 | 9     | 261100.0           | 54244.18309108526
120 | ```
121 | 
122 | Read left to right, we begin by calling `derive` which applies a function to a given column and stores the result in a target column. The order of arguments is `derive <target column> <function> <deriving column> <dataframe>`. We then select only the two columns we want and take the first 10 rows.
123 | 
124 | This gives us a list of the deviations. From the small sample it does seem like there are some wild deviations. The first one is greater than the mean! How typical is this? Well to answer that we take the average of all these values.
125 | 
126 | ```haskell
127 | ghci> withDeviation = df |> D.derive "deviation" (\v -> abs (v - m)) "median_house_value" |> D.select ["median_house_value", "deviation"]
128 | ghci> D.mean "deviation" withDeviation
129 | Just 91170.43994367732
130 | ```
131 | 
132 | So the $200'000 deviation we saw in the sample isn't very typical but it raises a question about outliers.
133 | What if we give more weight to the further deviations?
134 | 
135 | 
136 | ### Standard deviation
137 | That's what standard deviation aims to do. Standard deviation considers the spread of outliers. Instead of calculating the absolute difference of each observation from the mean we calculate the square of the difference. This has the effect of exaggerating further outliers.
138 | 
139 | ```haskell
140 | ghci> sumOfSquareDifferences = fromMaybe 0 $ D.sum "deviation" (df |> D.derive "deviation" (\v -> (v - m) ^ 2) "median_house_value")
141 | ghci> n = fromIntegral $ (fst $ D.dimensions df) - 1
142 | ghci> sqrt (sumOfSquareDifferences / n)
143 | 115395.6158744
144 | ```
145 | The standard deviation being larger than the mean absolute deviation means we do have some outliers. However, since the difference is fairly small we can conclude that there aren't very many outliers in our dataset.
146 | 
147 | We can calculate the standard deviation in one line as follows:
148 | 
149 | ```haskell
150 | ghci> D.standardDeviation "median_house_value" df
151 | Just 115395.6158744
152 | ```
153 | 
154 | ## Interquartile range (IQR)
155 | A quantile is a value of the distribution such that n% of values in the distribution are smaller than that value. Quartiles are the three quantiles that divide the data into four equal parts. So the 1st quartile is a value such that 25% of values are smaller than it. The median is the second quartile. And the third quartile is a value such that 75% of values are smaller than that value. The IQR is the difference between the 3rd and 1st quartiles. It measures how spread out the middle 50% of values are.
156 | 
157 | The IQR is a more robust measure of spread than the variance or standard deviation. Any number of values in the top or bottom quarters of the data can be moved any distance from the median without affecting the IQR at all. More practically, a few extreme outliers have little or no effect on the IQR.
158 | 
159 | For our dataset:
160 | 
161 | ```haskell
162 | ghci> D.interQuartileRange "median_house_value" df
163 | Just 145158.3333333336
164 | ```
165 | 
166 | This is larger than the standard deviation but not by much. This means that outliers don't have a significant influence on the distribution and most values are close to typical.
167 | 
168 | ### Variance
169 | Variance is the square of the standard deviation. It is much more sensitive to outliers. Variance does not have the same units as our original variable (it is in units squared). Therefore, it's much more difficult to interpret.
170 | 
171 | In our example it's a very large number:
172 | 
173 | ``` haskell
174 | ghci> D.variance  "median_house_value" df
175 | Just 1.3315503000818077e10
176 | ```
177 | 
178 | The variance is more useful when comparing different datasets. If the variance of house prices in Minnesota was lower than in California, this would mean there were far fewer really cheap and really expensive houses in Minnesota.
179 | 
180 | ## Shape
181 | Skewness measures how left or right shifted a distribution is from a normal distribution. A positive skewness means the distribution is left shifted, a negative skew means the distribution is right shifted.
182 | 
183 | The formula for skewness is the mean cubic deviation divided by the cube of the standard deviation. It captures the relationship between the mean deviation (asymmetry of the data) and the standard deviation (spread of the data).
184 | 
185 | The intuition behind why a positive skew is left shifted follows from the formula. The numerator is more sensitive to outliers. So the further left a distribution is, the more the right-tail values will be exaggerated by the cube, causing the skewness to be positive.
186 | 
187 | A skewness score between -0.5 and 0.5 means the data has little skew. A score between -0.5 and -1 or 0.5 and 1 means the data has moderate skew. A skewness greater than 1 or less than -1 means the data is heavily skewed.
188 | 
189 | ```haskell
190 | ghci> D.skewness "median_house_value" df
191 | Just 0.9776922140978703
192 | ```
193 | So the median house value is moderately skewed to the left. That is, there are more houses that are cheaper than the mean value and a tail of expensive outliers. Having lived in California, I can confirm that this data reflects reality.
194 | 
195 | 
196 | ## Summarising the data
197 | 
198 | We can get all these statistics with a single command:
199 | 
200 | ```haskell
201 | ghci> D.summarize df
202 | ------------------------------------------------------------------------------------------------------------------------------------------
203 | index | Statistic | median_house_value | median_income | households | population | total_rooms | housing_median_age | latitude | longitude
204 | ------|-----------|--------------------|---------------|------------|------------|-------------|--------------------|----------|----------
205 |  Int  |   Text    |       Double       |    Double     |   Double   |   Double   |   Double    |       Double       |  Double  |  Double  
206 | ------|-----------|--------------------|---------------|------------|------------|-------------|--------------------|----------|----------
207 | 0     | Mean      | 206855.82          | 3.87          | 499.54     | 1425.48    | 2635.76     | 28.64              | 35.63    | -119.57  
208 | 1     | Minimum   | 14999.0            | 0.5           | 1.0        | 3.0        | 2.0         | 1.0                | 32.54    | -124.35  
209 | 2     | 25%       | 119600.0           | 2.56          | 280.0      | 787.0      | 1447.42     | 18.0               | 33.93    | -121.8   
210 | 3     | Median    | 179700.0           | 3.53          | 409.0      | 1166.0     | 2127.0      | 29.0               | 34.26    | -118.49  
211 | 4     | 75%       | 264758.33          | 4.74          | 605.0      | 1725.0     | 3148.0      | 37.0               | 37.71    | -118.01  
212 | 5     | Max       | 500001.0           | 15.0          | 6082.0     | 35682.0    | 39320.0     | 52.0               | 41.95    | -114.31  
213 | 6     | StdDev    | 115395.62          | 1.9           | 382.33     | 1132.46    | 2181.62     | 12.59              | 2.14     | 2.0      
214 | 7     | IQR       | 145158.33          | 2.18          | 325.0      | 938.0      | 1700.58     | 19.0               | 3.78     | 3.79     
215 | 8     | Skewness  | 0.98               | 1.65          | 3.41       | 4.94       | 4.15        | 6.0e-2             | 0.47     | -0.3
216 | ```
217 | 
218 | As a recap we'll go over what this tells us about the data:
219 | * median_house_value: house prices tend to be close to the median but there are some pretty expensive houses.
220 | * median_income: incomes are also generally fairly typical (small standard deviation with median close to mean) but there are some really rich people (high skewness).
221 | * households: household sizes are very similar across the sample and they tend to be smaller.
222 | * population: California is generally very sparsely populated (low skewness) with some REALLY densely populated areas (high max/ low IQR).
223 | * total_rooms: a lot of the blocks have few rooms (Again sparse population) but there are some very dense areas (high max).
224 | * housing_median_age: there are as many new houses as there are old (skewness close to 0) and not many extremes (low max, standard deviation lower than IQR)
225 | * latitude: the south has slightly more people than the north (moderate skew)
226 | * longitude: most houses are on the west coast (moderate right skew)
227 | 
228 | 


--------------------------------------------------------------------------------
/docs/haskell_for_data_analysis.md:
--------------------------------------------------------------------------------
 1 | # Haskell for Data Analysis
 2 | 
 3 | This section ports/mirrors Wes McKinney's book [Python for Data Analysis](https://wesmckinney.com/book/). Examples and organizations are drawn from there. This tutorial assumes an understanding of Haskell.
 4 | 
 5 | ## Data preparation
 6 | Data in the wild doesn't always come in a form that's easy to work with. A data analysis tool should make preparing and cleaning data easy. There are a number of common issues that a data analysis tool must handle. We'll go through a few common ones and show how to deal with them in Haskell.
 7 | 
 8 | ### Handling missing data
 9 | In Haskell, potentially missing values are represented by a "wrapper" type called [`Maybe`](https://en.wikibooks.org/wiki/Haskell/Understanding_monads/Maybe).
10 | 
11 | ```
12 | ghci> import qualified DataFrame as D
13 | ghci> let df = D.fromColumnList [D.toColumn [Just 1, Just 1, Nothing, Nothing], D.toColumn [Just 6.5, Nothing, Nothing, Just 6.5], D.toColumn [Just 3.0, Nothing, Nothing, Just 3.0]]
14 | ghci> df
15 | ---------------------------------------------------
16 | index |       0       |      1       |      2      
17 | ------|---------------|--------------|-------------
18 |  Int  | Maybe Integer | Maybe Double | Maybe Double
19 | ------|---------------|--------------|-------------
20 | 0     | Just 1        | Just 6.5     | Just 3.0    
21 | 1     | Just 1        | Nothing      | Nothing     
22 | 2     | Nothing       | Nothing      | Nothing     
23 | 3     | Nothing       | Just 6.5     | Just 3.0    
24 | 
25 | ```
26 | 
27 | If we'd like to drop all rows with missing values we can use the `filterJust` function.
28 | 
29 | ```haskell
30 | ghci> D.filterJust "0" df
31 | ---------------------------------------------
32 | index |    0    |      1       |      2      
33 | ------|---------|--------------|-------------
34 |  Int  | Integer | Maybe Double | Maybe Double
35 | ------|---------|--------------|-------------
36 | 0     | 1       | Just 6.5     | Just 3.0    
37 | 1     | 1       | Nothing      | Nothing     
38 | ```
39 | 
40 | The function keeps only the rows whose value in that column is not `Nothing` and "unwraps" the `Maybe` type. To drop every row that has a `Nothing` in any column we use the `filterAllJust` function.
41 | 
42 | ```haskell
43 | ghci> D.filterAllJust df
44 | ---------------------------------
45 | index |    0    |   1    |   2   
46 | ------|---------|--------|-------
47 |  Int  | Integer | Double | Double
48 | ------|---------|--------|-------
49 | 0     | 1       | 6.5    | 3.0   
50 | ```
51 | 
52 | To fill in the missing values we use the `impute` function, which replaces all instances of `Nothing` with a given value.
53 | 
54 | ```haskell
55 | ghci> D.impute "0" (0 :: Integer) df
56 | ---------------------------------------------
57 | index |    0    |      1       |      2      
58 | ------|---------|--------------|-------------
59 |  Int  | Integer | Maybe Double | Maybe Double
60 | ------|---------|--------------|-------------
61 | 0     | 1       | Just 6.5     | Just 3.0    
62 | 1     | 1       | Nothing      | Nothing     
63 | 2     | 0       | Nothing      | Nothing     
64 | 3     | 0       | Just 6.5     | Just 3.0    
65 | ```
66 | 
67 | There is no general way to replace ALL nothing values with a default since the default depends on the type. In fact, trying to apply the wrong type to a function throws an error:
68 | 
69 | ```haskell
70 | ghci> D.impute @Double "0" 0 df
71 | *** Exception: 
72 | 
73 | [Error]: Type Mismatch
74 |         While running your code I tried to get a column of type: "Maybe Double" but column was of type: "Maybe Integer"
75 |         This happened when calling function apply on the column 0
76 | 
77 | 
78 | 
79 |         Try adding a type at the end of the function e.g change
80 |                 apply arg1 arg2 to 
81 |                 (apply arg1 arg2 :: <Type>)
82 |         or add {-# LANGUAGE TypeApplications #-} to the top of your file then change the call to 
83 |                 apply @<Type> arg1 arg2
84 | ```
85 | 
86 | In general, Haskell would usually report this kind of mistake as a compile-time error. But because dataframes are usually run in REPL-like environments which offer immediate feedback to users, `dataframe` is fine turning these into runtime exceptions.
87 | 
88 | 


--------------------------------------------------------------------------------
/flake.nix:
--------------------------------------------------------------------------------
 1 | {
 2 |   description = "An intuitive, dynamically-typed DataFrame library";
 3 | 
 4 |   inputs = {
 5 |     nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable";  # package set comes from the nixos-unstable branch
 6 |     flake-utils.url = "github:numtide/flake-utils";  # supplies eachDefaultSystem, used below
 7 |   };
 8 | 
 9 |   outputs = { self, nixpkgs, flake-utils }:
10 |     flake-utils.lib.eachDefaultSystem (system:  # the outputs below are produced once per system
11 |       let
12 |         pkgs = nixpkgs.legacyPackages.${system};
13 | 
14 |         hsPkgs = pkgs.haskellPackages.extend (self: super: {  # Haskell package set extended with this project
15 |           dataframe = self.callCabal2nix "dataframe" ./. { };  # derivation generated from the cabal file in this directory
16 |         });
17 |       in
18 |       {
19 |         packages = {
20 |           default = hsPkgs.dataframe;  # the flake's default package is the library itself
21 |         };
22 | 
23 |         devShells.default = pkgs.mkShell {  # default development shell
24 |           buildInputs = with pkgs; [  # tools come from plain pkgs, not from the extended hsPkgs
25 |             ghc
26 |             cabal-install
27 |             haskell-language-server
28 |           ];
29 |         };
30 |       });
31 | }
32 | 


--------------------------------------------------------------------------------
/run_compiled_repl.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | # Starts a cabal REPL for the dataframe target, handing -fobject-code to the REPL and passing -O2 to cabal.
3 | cabal repl dataframe --repl-options=-fobject-code -O2
4 | 


--------------------------------------------------------------------------------
/run_profiling.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | PROF_TYPE=""
 4 | 
 5 | if [ -z "$1" ]; then
 6 |   # Default to eventlog profile if no argument given
 7 |   PROF_TYPE="eventlog"
 8 | else
 9 |   case "$1" in
10 |     "eventlog")
11 |       PROF_TYPE="eventlog"
12 |       ;;
13 |     "pprof")
14 |       PROF_TYPE="pprof"
15 |       ;;
16 |     *)
17 |       echo "invalid profile type $1, should be one of 'eventlog' or 'pprof'"
18 |       exit 1
19 |       ;;
20 |   esac
21 | fi
22 | 
23 | case "$PROF_TYPE" in
24 |   "eventlog")
25 |     cabal v2-run --enable-profiling dataframe -- +RTS -hy -l-agu
26 |     ;;
27 |   "pprof")
28 |     cabal v2-run --enable-profiling dataframe -- +RTS -pj -RTS
29 |     ;;
30 | esac
31 | 


--------------------------------------------------------------------------------
/src/DataFrame.hs:
--------------------------------------------------------------------------------
 1 | module DataFrame
 2 |   ( module D,
 3 |     (|>)
 4 |   )
 5 | where
 6 | 
 7 | import DataFrame.Internal.Types as D
 8 | import DataFrame.Internal.Function as D
 9 | import DataFrame.Internal.Parsing as D
10 | import DataFrame.Internal.Column as D
11 | import DataFrame.Internal.DataFrame as D hiding (columnIndices, columns)
12 | import DataFrame.Internal.Row as D hiding (mkRowRep)
13 | import DataFrame.Errors as D
14 | import DataFrame.Operations.Core as D
15 | import DataFrame.Operations.Subset as D
16 | import DataFrame.Operations.Sorting as D
17 | import DataFrame.Operations.Statistics as D
18 | import DataFrame.Operations.Transformations as D
19 | import DataFrame.Operations.Typing as D
20 | import DataFrame.Operations.Aggregation as D
21 | import DataFrame.Display.Terminal.Plot as D
22 | import DataFrame.IO.CSV as D
23 | 
24 | import Data.Function
25 | 
26 | -- | Left-to-right application: @x |> f@ is @f x@, so a pipeline reads in the
27 | -- order its steps run. No fixity is declared, so the default (infixl 9) applies.
28 | (|>) :: a -> (a -> b) -> b
29 | (|>) = (&)


--------------------------------------------------------------------------------
/src/DataFrame/Display/Terminal/Colours.hs:
--------------------------------------------------------------------------------
 1 | module DataFrame.Display.Terminal.Colours where
 2 | 
 3 | -- ANSI colour helpers: each wraps its argument in a colour escape and a trailing reset (ESC[0m).
 4 | red :: String -> String
 5 | red text = concat ["\ESC[31m", text, "\ESC[0m"]
 6 | 
 7 | green :: String -> String
 8 | green text = concat ["\ESC[32m", text, "\ESC[0m"]
 9 | 
10 | brightGreen :: String -> String
11 | brightGreen text = concat ["\ESC[92m", text, "\ESC[0m"]
12 | 
13 | brightBlue :: String -> String
14 | brightBlue text = concat ["\ESC[94m", text, "\ESC[0m"]
15 | 


--------------------------------------------------------------------------------
/src/DataFrame/Display/Terminal/PrettyPrint.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE OverloadedStrings #-}
 2 | module DataFrame.Display.Terminal.PrettyPrint where
 3 | 
 4 | import qualified Data.Text as T
 5 | 
 6 | import Data.List (transpose)
 7 | 
 8 | -- Utility functions to show a DataFrame as a Markdown-ish table.
 9 | 
10 | -- Adapted from: https://stackoverflow.com/questions/5929377/format-list-output-in-haskell
11 | -- | A padding function: given a target width and a text, returns the text padded out to that width.
12 | type Filler = Int -> T.Text -> T.Text
13 | 
14 | -- | Describes how one table column is rendered. The type parameter @t@ is not used by any field.
15 | data ColDesc t = ColDesc
16 |   { colTitleFill :: Filler,  -- padding applied to the title cell (showTable also uses it for the type row)
17 |     colTitle :: T.Text,  -- the column title
18 |     colValueFill :: Filler  -- padding applied to the data cells
19 |   }
20 | 
21 | -- Pad a string (s) out to a width (n) with a pad character (c). fillLeft keeps
22 | -- the text at the left edge; text already at least n long is returned unchanged.
23 | fillLeft :: Char -> Int -> T.Text -> T.Text
24 | fillLeft c n s = T.justifyLeft n c s
25 | 
26 | fillRight :: Char -> Int -> T.Text -> T.Text
27 | fillRight c n s = T.justifyRight n c s  -- padding goes in front, so the text sits at the right edge
28 | 
29 | fillCenter :: Char -> Int -> T.Text -> T.Text
30 | fillCenter c n s = T.concat [pad before, s, pad after]  -- an odd amount of slack puts the extra pad char on the right
31 |   where
32 |     pad k = T.replicate k (T.singleton c)
33 |     before = (n - T.length s) `div` 2
34 |     after = n - T.length s - before
35 | 
36 | -- Space-padding versions of fillLeft, fillRight and fillCenter.
37 | left :: Int -> T.Text -> T.Text
38 | left width = fillLeft ' ' width
39 | 
40 | right :: Int -> T.Text -> T.Text
41 | right width = fillRight ' ' width
42 | 
43 | center :: Int -> T.Text -> T.Text
44 | center width = fillCenter ' ' width
45 | 
46 | showTable :: [T.Text] -> [T.Text] -> [[T.Text]] -> T.Text
47 | showTable header types rows = T.unlines (border : titles header : rule : titles types : rule : map cells rows)
48 |   where  -- a column is as wide as the widest of its header, its type name and its cells
49 |     widths = map (maximum . map T.length) (transpose (header : types : rows))
50 |     dashes = map (\w -> T.replicate w "-") widths
51 |     (border, rule) = (T.intercalate "---" dashes, T.intercalate "-|-" dashes)
52 |     titles = T.intercalate " | " . zipWith3 (const center) header widths  -- centred; never more cells than header entries
53 |     cells = T.intercalate " | " . zipWith3 (const left) header widths  -- data rows are left-aligned
54 | 
55 | showTableProperMarkdown :: [T.Text] -> [T.Text] -> [[T.Text]] -> T.Text
56 | showTableProperMarkdown header types rows = T.unlines (border : titles : rule : map cells rows)
57 |   where  -- each title carries its type after an HTML <br>, so there is no separate type row
58 |     headerWithTypes = zipWith (\h t -> h <> "<br>" <> t) header types
59 |     widths = map (maximum . map T.length) (transpose (headerWithTypes : rows))
60 |     dashes = map (\w -> T.replicate w "-") widths
61 |     (border, rule) = (T.intercalate "---" dashes, T.intercalate "-|-" dashes)
62 |     titles = T.intercalate " | " (zipWith3 (const center) headerWithTypes widths headerWithTypes)
63 |     cells = T.intercalate " | " . zipWith3 (const left) headerWithTypes widths
64 | 


--------------------------------------------------------------------------------
/src/DataFrame/Errors.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE DeriveAnyClass #-}
  2 | {-# LANGUAGE OverloadedStrings #-}
  3 | {-# LANGUAGE RankNTypes #-}
  4 | {-# LANGUAGE GADTs #-}
  5 | {-# LANGUAGE InstanceSigs #-}
  6 | 
  7 | module DataFrame.Errors where
  8 | 
  9 | import qualified Data.Text as T
 10 | 
 11 | import Control.Exception
 12 | import Data.Array
 13 | import DataFrame.Display.Terminal.Colours
 14 | import Data.Typeable (Typeable)
 15 | import Type.Reflection (TypeRep)
 16 | 
 17 | data DataFrameException where  -- errors raised by dataframe operations; the Show instance renders the user-facing text
 18 |     TypeMismatchException :: forall a b. (Typeable a, Typeable b)
 19 |                           => TypeRep a -- ^ given type
 20 |                           -> TypeRep b -- ^ expected type
 21 |                           -> T.Text    -- ^ column name
 22 |                           -> T.Text    -- ^ call point
 23 |                           -> DataFrameException
 24 |     TypeMismatchException' :: forall a . (Typeable a)
 25 |                            => TypeRep a -- ^ expected type (NOTE(review): 'show' prints this in the slot where TypeMismatchException prints its "given type" — confirm the labels)
 26 |                            -> String    -- ^ given type
 27 |                            -> T.Text    -- ^ column name
 28 |                            -> T.Text    -- ^ call point
 29 |                            -> DataFrameException
 30 |     ColumnNotFoundException :: T.Text -> T.Text -> [T.Text] -> DataFrameException  -- column name, call point, available columns
 31 |     deriving (Exception)
 32 | 
 33 | instance Show DataFrameException where
 34 |     show :: DataFrameException -> String  -- both type-mismatch variants get the call-point footer
 35 |     show (TypeMismatchException given expected col cp) = addCallPointInfo col (Just cp) $ typeMismatchError given expected
 36 |     show (TypeMismatchException' ty tyName col cp) = addCallPointInfo col (Just cp) $ typeMismatchError' (show ty) tyName
 37 |     show (ColumnNotFoundException col cp available) = columnNotFound col cp available
 38 | 
 39 | -- | Error text for an unknown column; the "Did you mean" hint is left out when there are no columns to suggest.
 40 | columnNotFound :: T.Text -> T.Text -> [T.Text] -> String
 41 | columnNotFound name callPoint columns =
 42 |   red "\n\n[ERROR] "
 43 |     ++ "Column not found: " ++ T.unpack name
 44 |     ++ " for operation " ++ T.unpack callPoint
 45 |     ++ hint ++ "\n\n"
 46 |   where
 47 |     hint | null columns = ""  -- guessColumnName returns "" here, which used to print "Did you mean ?"
 48 |          | otherwise = "\n\tDid you mean " ++ T.unpack (guessColumnName name columns) ++ "?"
 49 | 
 50 | -- | Type-mismatch message for two runtime type representations; each is turned
 51 | -- into its textual name with 'show' before being formatted.
 52 | typeMismatchError :: Type.Reflection.TypeRep a -> Type.Reflection.TypeRep b -> String
 53 | typeMismatchError given expected =
 54 |   typeMismatchError' (show given) (show expected)
 55 | 
 56 | -- | Builds the type-mismatch text; both names go through 'show', so they are printed inside double quotes.
 57 | typeMismatchError' :: String -> String -> String
 58 | typeMismatchError' givenType expectedType =
 59 |   red . concat $
 60 |     [ red "\n\n[Error]: Type Mismatch",
 61 |       "\n\tWhile running your code I tried to ", "get a column of type: ",
 62 |       red (show givenType),
 63 |       " but column was of type: ",
 64 |       green (show expectedType) ]
 65 | 
 66 | -- | Appends where the failure happened and the type-annotation advice to an
 67 | -- error message. Without a call point the column is named uncoloured and the
 68 | -- advice uses the placeholder "<function>".
 69 | addCallPointInfo :: T.Text -> Maybe T.Text -> String -> String
 70 | addCallPointInfo name callPoint err =
 71 |   case callPoint of
 72 |     Just cp ->
 73 |       concat
 74 |         [ err,
 75 |           "\n\tThis happened when calling function ",
 76 |           brightGreen (T.unpack cp),
 77 |           " on the column ",
 78 |           brightGreen (T.unpack name),
 79 |           "\n\n",
 80 |           typeAnnotationSuggestion (T.unpack cp)
 81 |         ]
 82 |     Nothing -> concat [err, "\n\tOn the column ", T.unpack name, "\n\n", typeAnnotationSuggestion "<function>"]
 83 | 
 84 | -- | Advice appended to type errors: annotate the call named @cp@ or use a type application.
 85 | typeAnnotationSuggestion :: String -> String
 86 | typeAnnotationSuggestion cp =
 87 |   concat
 88 |     [ "\n\n\tTry adding a type at the end of the function e.g ", "change\n\t\t",
 89 |       red (cp ++ " arg1 arg2"), " to \n\t\t",
 90 |       green ("(" ++ cp ++ " arg1 arg2 :: <Type>)"), "\n\tor add ",
 91 |       "{-# LANGUAGE TypeApplications #-} to the top of your ",
 92 |       "file then change the call to \n\t\t",
 93 |       brightGreen (cp ++ " @<Type> arg1 arg2")
 94 |     ]
 95 | 
 96 | guessColumnName :: T.Text -> [T.Text] -> T.Text
 97 | guessColumnName _ [] = ""
 98 | guessColumnName userInput columns =  -- nearest name by edit distance; equal distances resolve to the smaller name
 99 |   snd (minimum [(editDistance userInput k, k) | k <- columns])
100 | 
101 | editDistance :: T.Text -> T.Text -> Int  -- Levenshtein distance: insert, delete and substitute each cost 1
102 | editDistance xs ys = table ! (m, n)
103 |   where
104 |     (m, n) = (T.length xs, T.length ys)
105 |     x = array (1, m) (zip [1 ..] (T.unpack xs))  -- characters of xs, indexed from 1
106 |     y = array (1, n) (zip [1 ..] (T.unpack ys))  -- characters of ys, indexed from 1
107 |     -- table ! (i, j) is the distance between the first i chars of xs and the first j chars of ys.
108 |     table :: Array (Int, Int) Int
109 |     table = array bnds [(ij, dist ij) | ij <- range bnds]  -- each entry is defined in terms of other table entries
110 |     bnds = ((0, 0), (m, n))
111 | 
112 |     dist (0, j) = j  -- empty prefix of xs: j insertions
113 |     dist (i, 0) = i  -- empty prefix of ys: i deletions
114 |     dist (i, j) =
115 |       minimum
116 |         [ table ! (i - 1, j) + 1,
117 |           table ! (i, j - 1) + 1,
118 |           if x ! i == y ! j then table ! (i - 1, j - 1) else 1 + table ! (i - 1, j - 1)  -- matching characters cost nothing
119 |         ]
120 | 


--------------------------------------------------------------------------------
/src/DataFrame/IO/CSV.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns #-}
  2 | {-# LANGUAGE ExplicitNamespaces #-}
  3 | {-# LANGUAGE LambdaCase #-}
  4 | {-# LANGUAGE OverloadedStrings #-}
  5 | {-# LANGUAGE ScopedTypeVariables #-}
  6 | {-# LANGUAGE TypeApplications #-}
  7 | {-# LANGUAGE GADTs #-}
  8 | {-# LANGUAGE RankNTypes #-}
  9 | {-# LANGUAGE Strict #-}
 10 | module DataFrame.IO.CSV where
 11 | 
 12 | import qualified Data.ByteString.Char8 as C
 13 | import qualified Data.List as L
 14 | import qualified Data.Map as M
 15 | import qualified Data.Set as S
 16 | import qualified Data.Text as T
 17 | import qualified Data.Text.Lazy as TL
 18 | import qualified Data.Text.Lazy.IO as TLIO
 19 | import qualified Data.Text.IO as TIO
 20 | import qualified Data.Vector as V
 21 | import qualified Data.Vector.Unboxed as VU
 22 | import qualified Data.Vector.Mutable as VM
 23 | import qualified Data.Vector.Unboxed.Mutable as VUM
 24 | 
 25 | import Control.Applicative ((<$>), (<|>), (<*>), (<*), (*>), many)
 26 | import Control.Monad (forM_, zipWithM_, unless, void)
 27 | import Data.Attoparsec.Text
 28 | import Data.Char
 29 | import DataFrame.Internal.Column (Column(..), freezeColumn', writeColumn, columnLength)
 30 | import DataFrame.Internal.DataFrame (DataFrame(..))
 31 | import DataFrame.Internal.Parsing
 32 | import DataFrame.Operations.Typing
 33 | import Data.Foldable (fold)
 34 | import Data.Function (on)
 35 | import Data.IORef
 36 | import Data.Maybe
 37 | import Data.Text.Encoding (decodeUtf8Lenient)
 38 | import Data.Type.Equality
 39 |   ( TestEquality (testEquality),
 40 |     type (:~:) (Refl)
 41 |   )
 42 | import GHC.IO.Handle (Handle)
 43 | import Prelude hiding (concat, takeWhile)
 44 | import System.IO
 45 | import Type.Reflection
 46 | 
 47 | -- | Record for CSV read options.
 48 | data ReadOptions = ReadOptions {
 49 |     hasHeader :: Bool,  -- whether the first line holds column names rather than data
 50 |     inferTypes :: Bool,  -- NOTE(review): not read by readSeparated as written — confirm where it takes effect
 51 |     safeRead :: Bool  -- NOTE(review): reaches freezeColumn via opts, which does not inspect it — confirm
 52 | }
 53 | 
 54 | -- | Default read options: the file has a header row, types are inferred on read,
 55 | -- and rows with nullish objects are converted into Maybe (safeRead).
 56 | defaultOptions :: ReadOptions
 57 | defaultOptions = ReadOptions True True True  -- hasHeader, inferTypes, safeRead, in declaration order
 58 | 
 59 | -- | Reads a comma separated file from the given path using 'defaultOptions'.
 60 | -- NOTE(review): earlier documentation said intermediate temporary files are
 61 | -- written; 'readSeparated' fills in-memory mutable vectors — confirm whether that still applies.
 62 | readCsv :: String -> IO DataFrame
 63 | readCsv path = readSeparated ',' defaultOptions path
 64 | 
 65 | -- | Reads a tab separated file from the given path using 'defaultOptions'.
 66 | -- NOTE(review): earlier documentation said intermediate temporary files are
 67 | -- written; 'readSeparated' fills in-memory mutable vectors — confirm whether that still applies.
 68 | readTsv :: String -> IO DataFrame
 69 | readTsv path = readSeparated '\t' defaultOptions path
 70 | 
 71 | -- | Reads a file whose fields are separated by @c@ into a dataframe, filling one mutable vector per column.
 72 | readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
 73 | readSeparated c opts path = do
 74 |     totalRows <- countRows c path
 75 |     withFile path ReadMode $ \handle -> do
 76 |         firstRow <- map T.strip . parseSep c <$> TIO.hGetLine handle
 77 |         let columnNames = if hasHeader opts
 78 |                         then map (T.filter (/= '\"')) firstRow
 79 |                         else map (T.pack . show) [0..(length firstRow - 1)]  -- decimal position; intToDigit errors past 15
 80 |         -- If there was no header rewind the file cursor so the first line is read again as data.
 81 |         unless (hasHeader opts) $ hSeek handle AbsoluteSeek 0
 82 | 
 83 |         -- Initialize mutable vectors for each column; a header line does not count as a data row.
 84 |         let numColumns = length columnNames
 85 |         let numRows = if hasHeader opts then totalRows - 1 else totalRows
 86 |         -- Use this row to infer the types of the rest of the column.
 87 |         -- TODO: this isn't robust but in so far as this is a guess anyway
 88 |         -- it's probably fine. But we should probably sample n rows and pick
 89 |         -- the most likely type from the sample.
 90 |         dataRow <- map T.strip . parseSep c <$> TIO.hGetLine handle
 91 | 
 92 |         -- This array will track the indices of all null values for each column.
 93 |         -- If any exist then the column will be an optional type.
 94 |         nullIndices <- VM.unsafeNew numColumns
 95 |         VM.set nullIndices []
 96 |         mutableCols <- VM.unsafeNew numColumns
 97 |         getInitialDataVectors numRows mutableCols dataRow
 98 | 
 99 |         -- Read the remaining rows into the mutable vectors (row 0 came from dataRow above).
100 |         fillColumns numRows c mutableCols nullIndices handle
101 | 
102 |         -- Freeze the mutable vectors into immutable ones
103 |         nulls' <- V.unsafeFreeze nullIndices
104 |         cols <- V.mapM (freezeColumn mutableCols nulls' opts) (V.generate numColumns id)
105 |         return $ DataFrame {
106 |                 columns = cols,
107 |                 freeIndices = [],
108 |                 columnIndices = M.fromList (zip columnNames [0..]),
109 |                 dataframeDimensions = (maybe 0 columnLength (cols V.! 0), V.length cols)
110 |             }
111 | {-# INLINE readSeparated #-}
112 | 
113 | getInitialDataVectors :: Int -> VM.IOVector Column -> [T.Text] -> IO ()  -- n: capacity of each new column; xs: the sample data row
114 | getInitialDataVectors n mCol xs = do
115 |     forM_ (zip [0..] xs) $ \(i, x) -> do
116 |         col <- case inferValueType x of  -- the sample value picks the column's storage; it is also written as row 0
117 |                 "Int" -> MutableUnboxedColumn <$>  ((VUM.unsafeNew n :: IO (VUM.IOVector Int)) >>= \c -> VUM.unsafeWrite c 0 (fromMaybe 0 $ readInt x) >> return c)
118 |                 "Double" -> MutableUnboxedColumn <$> ((VUM.unsafeNew n :: IO (VUM.IOVector Double)) >>= \c -> VUM.unsafeWrite c 0 (fromMaybe 0 $ readDouble x) >> return c)
119 |                 _ -> MutableBoxedColumn <$> ((VM.unsafeNew n :: IO (VM.IOVector T.Text)) >>= \c -> VM.unsafeWrite c 0 x >> return c)  -- anything else is kept as text
120 |         VM.unsafeWrite mCol i col  -- slot i of mCol now holds the column for field i
121 | {-# INLINE getInitialDataVectors #-}
122 | 
123 | -- | Classifies a raw field by trial parsing. 'readInt' is tried first, so text
124 | -- that parses as an integer is reported as "Int" and never as "Double";
125 | -- anything neither parser accepts is "Other".
126 | inferValueType :: T.Text -> T.Text
127 | inferValueType s
128 |     | isJust (readInt s) = "Int"
129 |     | isJust (readDouble s) = "Double"
130 |     | otherwise = "Other"
131 | {-# INLINE inferValueType #-}
132 | 
133 | -- | Parses up to @n@ rows from the handle and writes each field into its mutable column.
134 | fillColumns :: Int -> Char -> VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> Handle -> IO ()
135 | fillColumns n c mutableCols nullIndices handle = do
136 |     input <- newIORef (mempty :: T.Text)  -- text read from the handle but not yet consumed by the parser
137 |     forM_ [1..n] $ \i -> do  -- i is the row index written to; it starts at 1
138 |         isEOF <- hIsEOF handle
139 |         input' <- readIORef input
140 |         unless (isEOF && input' == mempty) $ do  -- nothing left in the file or the buffer: skip this iteration
141 |               parseWith (TIO.hGetChunk handle) (parseRow c) input' >>= \case  -- the parser pulls further chunks on demand
142 |                 Fail unconsumed ctx er -> do
143 |                   erpos <- hTell handle  -- current handle position, reported as "around" in the message
144 |                   fail $ "Failed to parse CSV file around " <> show erpos <> " byte; due: "
145 |                     <> show er <> "; context: " <> show ctx
146 |                 Partial c -> do
147 |                   fail "Partial handler is called"
148 |                 Done (unconsumed :: T.Text) (row :: [T.Text]) -> do
149 |                   writeIORef input unconsumed  -- leftover text seeds the next row's parse
150 |                   zipWithM_ (writeValue mutableCols nullIndices i) [0..] row  -- field k goes to column k
151 | {-# INLINE fillColumns #-}
152 | 
153 | -- | Writes one field into its column at row @count@; when 'writeColumn' answers 'Left', that text is recorded with the row index in @nullIndices@.
154 | writeValue :: VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> Int -> Int -> T.Text -> IO ()
155 | writeValue mutableCols nullIndices count colIndex value = do
156 |     col <- VM.unsafeRead mutableCols colIndex
157 |     writeColumn count value col >>= \case
158 |         Left raw -> VM.unsafeModify nullIndices ((count, raw) :) colIndex
159 |         Right _ -> return ()
160 | {-# INLINE writeValue #-}
161 | 
162 | -- | Reads column @colIndex@ and freezes it with 'freezeColumn'', passing that column's null entries.
163 | freezeColumn :: VM.IOVector Column -> V.Vector [(Int, T.Text)] -> ReadOptions -> Int -> IO (Maybe Column)
164 | freezeColumn mutableCols nulls _opts colIndex =
165 |     -- The read options are not consulted here.
166 |     fmap Just (VM.unsafeRead mutableCols colIndex >>= freezeColumn' (nulls V.! colIndex))
167 | {-# INLINE freezeColumn #-}
168 | 
169 | parseSep :: Char -> T.Text -> [T.Text]  -- one record split on @c@; a parse failure calls 'error'
170 | parseSep c s = either error id $ parseOnly (record c) s
171 | {-# INLINE parseSep #-}
172 | 
173 | -- | A record: one or more fields separated by @c@. Failures are
174 | -- labelled "record".
175 | record :: Char -> Parser [T.Text]
176 | record c = sepBy1 (field c) (char c) <?> "record"
177 | {-# INLINE record #-}
178 | 
179 | parseRow :: Char -> Parser [T.Text]  -- a record followed by a line end
180 | parseRow c = record c <* lineEnd <?> "record-new-line"
181 | 
182 | -- | A single field: the quoted form is tried first, then the
183 | -- unquoted form. Failures are labelled "field".
184 | field :: Char -> Parser T.Text
185 | field c = (quotedField <|> unquotedField c) <?> "field"
186 | {-# INLINE field #-}
187 | 
188 | unquotedTerminators :: Char -> S.Set Char  -- characters that end an unquoted field
189 | unquotedTerminators sep = S.insert sep (S.fromList ['\n', '\r', '"'])
190 | 
191 | -- | Consumes characters up to the next terminator (see 'unquotedTerminators').
192 | unquotedField :: Char -> Parser T.Text
193 | unquotedField sep = takeWhile (`S.notMember` stops) <?> "unquoted field"
194 |    where stops = unquotedTerminators sep
195 | {-# INLINE unquotedField #-}
196 | 
197 | quotedField :: Parser T.Text  -- the text between two double quotes, with backslash escapes resolved
198 | quotedField = char '"' *> contents <* char '"' <?> "quoted field"
199 |     where
200 |         contents = fold <$> many (unquote <|> unescape)  -- concatenate the parsed chunks
201 |             where
202 |                 unquote = takeWhile1 (notInClass "\"\\")  -- a run containing neither a quote nor a backslash
203 |                 unescape = char '\\' *> do  -- a backslash followed by a backslash or a double quote
204 |                     T.singleton <$> do
205 |                         char '\\' <|> char '"'
206 | {-# INLINE quotedField #-}
207 | 
208 | -- | Succeeds at a line terminator or at the end of the input.
209 | -- Failures are labelled "end of line".
210 | lineEnd :: Parser ()
211 | lineEnd = (endOfLine <|> endOfInput) <?> "end of line"
212 | {-# INLINE lineEnd #-}
213 | 
214 | -- | First pass over the file at @path@: counts its rows (separator @c@) by parsing them one by one, for exact allocation.
215 | countRows :: Char -> FilePath -> IO Int
216 | countRows c path = withFile path ReadMode $! go 0 ""
217 |    where
218 |       go !n !input h = do  -- @n@ rows counted so far; @input@ is text left over from the previous parse
219 |          isEOF <- hIsEOF h
220 |          if isEOF && input == mempty  -- done once the handle and the leftover text are both exhausted
221 |             then pure n
222 |             else
223 |                parseWith (TIO.hGetChunk h) (parseRow c) input >>= \case  -- the parser pulls further chunks on demand
224 |                   Fail unconsumed ctx er -> do
225 |                     erpos <- hTell h  -- current handle position, used only for the error message
226 |                     fail $ "Failed to parse CSV file around " <> show erpos <> " byte; due: "
227 |                       <> show er <> "; context: " <> show ctx <> " " <> show unconsumed
228 |                   Partial c -> do
229 |                     fail $ "Partial handler is called; n = " <> show n
230 |                   Done (unconsumed :: T.Text) _ ->
231 |                     go (n + 1) unconsumed h  -- the row contents are discarded; only the count advances
232 | {-# INLINE countRows #-}
233 | 
234 | writeCsv :: String -> DataFrame -> IO ()  -- 'writeSeparated' with a comma as the separator
235 | writeCsv path = writeSeparated ',' path
236 | 
237 | -- | Writes the dataframe as delimited text: a header line, then one line per row.
238 | writeSeparated :: Char      -- ^ Separator
239 |                -> String    -- ^ Path to write to
240 |                -> DataFrame
241 |                -> IO ()
242 | writeSeparated c filepath df = withFile filepath WriteMode $ \handle -> do
243 |     -- Join fields with the requested separator (it used to be ignored in favour
244 |     -- of a hard-coded ", "). NOTE: fields are written as-is, without quoting.
245 |     let putLine = TIO.hPutStrLn handle . T.intercalate (T.singleton c)
246 |     putLine (map fst (L.sortBy (compare `on` snd) (M.toList (columnIndices df))))
247 |     forM_ [0..(fst (dataframeDimensions df) - 1)] $ \i -> putLine (getRowAsText df i)
248 | 
249 | -- | Renders row @i@ as one text cell per occupied column slot, in slot order.
250 | -- Calls 'error' when a column has no element at index @i@.
251 | -- Missing values are written as "null" for boxed @Maybe@ columns and as
252 | -- "Nothing" for optional columns.
253 | -- NOTE(review): grouped columns have no case here and fail the pattern match.
254 | getRowAsText :: DataFrame -> Int -> [T.Text]
255 | getRowAsText df i = V.ifoldr go [] (columns df)
256 |   where
257 |     indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
258 |     -- Shared failure for a column that is too short for row @i@.
259 |     tooShort k =
260 |         error $
261 |             "Column "
262 |             ++ T.unpack (indexMap M.! k)
263 |             ++ " has less items than "
264 |             ++ "the other columns at index "
265 |             ++ show i
266 |     go k Nothing acc = acc
267 |     go k (Just (BoxedColumn (c :: V.Vector a))) acc = case c V.!? i of
268 |         Just e -> textRep : acc
269 |             where textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
270 |                     Just Refl -> e
271 |                     Nothing   -> case typeRep @a of
272 |                         App t1 t2 -> case eqTypeRep t1 (typeRep @Maybe) of
273 |                             Just HRefl -> case testEquality t2 (typeRep @T.Text) of
274 |                                 Just Refl -> fromMaybe "null" e
275 |                                 Nothing -> (fromOptional . (T.pack . show)) e
276 |                                             where fromOptional s
277 |                                                     | T.isPrefixOf "Just " s = T.drop (T.length "Just ") s
278 |                                                     | otherwise = "null"
279 |                             Nothing -> (T.pack . show) e
280 |                         _ -> (T.pack . show) e
281 |         Nothing -> tooShort k
282 |     -- Unboxed values are rendered with 'show'.
283 |     go k (Just (UnboxedColumn c)) acc = case c VU.!? i of
284 |         Just e -> T.pack (show e) : acc
285 |         Nothing -> tooShort k
286 |     go k (Just (OptionalColumn (c :: V.Vector (Maybe a)))) acc = case c V.!? i of
287 |         Just e -> textRep : acc
288 |             where textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
289 |                     Just Refl -> fromMaybe "Nothing" e
290 |                     -- Render the payload itself rather than showing the
291 |                     -- 'Maybe', which wrote cells such as "Just 5"; the boxed
292 |                     -- Maybe branch above strips the wrapper the same way.
293 |                     Nothing   -> maybe "Nothing" (T.pack . show) e
294 |         Nothing -> tooShort k
296 | 


--------------------------------------------------------------------------------
/src/DataFrame/Internal/DataFrame.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE ExplicitNamespaces #-}
 2 | {-# LANGUAGE InstanceSigs #-}
 3 | {-# LANGUAGE OverloadedStrings #-}
 4 | {-# LANGUAGE ScopedTypeVariables #-}
 5 | {-# LANGUAGE TypeApplications #-}
 6 | {-# LANGUAGE GADTs #-}
 7 | {-# LANGUAGE StrictData #-}
 8 | module DataFrame.Internal.DataFrame where
 9 | 
10 | import qualified Data.Map as M
11 | import qualified Data.Text as T
12 | import qualified Data.Vector as V
13 | import qualified Data.Vector.Unboxed as VU
14 | 
15 | import Control.Monad (join)
16 | import DataFrame.Display.Terminal.PrettyPrint
17 | import DataFrame.Internal.Column
18 | import Data.Function (on)
19 | import Data.List (sortBy, transpose)
20 | import Data.Maybe (isJust)
21 | import Data.Type.Equality (type (:~:)(Refl), TestEquality (testEquality))
22 | import Type.Reflection (typeRep)
23 | 
24 | data DataFrame = DataFrame
25 |   { -- | The columns themselves, stored as a vector of optional slots;
26 |     -- a 'Nothing' slot holds no column (see 'empty').
27 |     columns :: V.Vector (Maybe Column),
28 |     -- | Maps each column name to the index of its slot in 'columns'.
29 |     columnIndices :: M.Map T.Text Int,
30 |     -- | Indices of slots in 'columns' that are still free.
31 |     freeIndices :: [Int],
32 |     dataframeDimensions :: (Int, Int)  -- first component is the row count; second is presumably the column count (confirm)
33 |   }
34 | 
35 | instance Eq DataFrame where
36 |   (==) :: DataFrame -> DataFrame -> Bool
37 |   a == b = M.keys (columnIndices a) == M.keys (columnIndices b) && all sameColumn (M.toList (columnIndices a))
38 |     where sameColumn (name, index) = columns a V.!? index == columns b V.!? (columnIndices b M.! name)
39 | 
40 | instance Show DataFrame where
41 |   show :: DataFrame -> String  -- the 'asText' rendering with properMarkdown = False
42 |   show = T.unpack . (`asText` False)
43 | 
44 | asText :: DataFrame -> Bool -> T.Text  -- renders the frame as a text table; the flag selects the proper-markdown layout
45 | asText d properMarkdown =
46 |   let header = "index" : map fst (sortBy (compare `on` snd) $ M.toList (columnIndices d))  -- "index" plus the column names ordered by slot index
47 |       types = V.toList $ V.filter (/= "") $ V.map getType (columns d)  -- type names of the occupied slots, in slot order
48 |       getType Nothing = ""
49 |       getType (Just (BoxedColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
50 |       getType (Just (UnboxedColumn (column :: VU.Vector a))) = T.pack $ show (typeRep @a)
51 |       getType (Just (OptionalColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
52 |       getType (Just (GroupedBoxedColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
53 |       getType (Just (GroupedUnboxedColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
54 |       -- Text and String columns are handled separately so that they are not
55 |       -- round-tripped through 'show'; every other column is rendered with 'show'.
56 |       get (Just (BoxedColumn (column :: V.Vector a))) = case testEquality (typeRep @a) (typeRep @T.Text) of
57 |               Just Refl -> column
58 |               Nothing -> case testEquality (typeRep @a) (typeRep @String) of
59 |                 Just Refl -> V.map T.pack column
60 |                 Nothing -> V.map (T.pack . show) column
61 |       get (Just (UnboxedColumn column)) = V.map (T.pack . show) (V.convert column)
62 |       get (Just (OptionalColumn column)) = V.map (T.pack . show) column
63 |       get (Just (GroupedBoxedColumn column)) = V.map (T.pack . show) column
64 |       get (Just (GroupedUnboxedColumn column)) = V.map (T.pack . show) column
65 |       getTextColumnFromFrame df (i, name) = if i == 0  -- position 0 is the synthetic "index" column of row numbers
66 |                                             then V.fromList (map (T.pack . show) [0..(fst (dataframeDimensions df) - 1)])
67 |                                             else get $ (V.!) (columns d) ((M.!) (columnIndices d) name)
68 |       rows =
69 |         transpose $  -- columns of text are turned into rows of text
70 |           zipWith (curry (V.toList . getTextColumnFromFrame d)) [0..] header
71 |    in (if properMarkdown then showTableProperMarkdown else showTable) header ("Int":types) rows  -- "Int" is the type shown for the index column
73 | -- | A dataframe with no columns: 'initialColumnSize' unoccupied slots,
74 | -- all of them listed as free, and dimensions (0, 0).
75 | empty :: DataFrame
76 | empty = DataFrame slots M.empty [0 .. initialColumnSize - 1] (0, 0)
77 |   where
78 |     slots = V.replicate initialColumnSize Nothing
79 | 
80 | initialColumnSize :: Int  -- number of column slots allocated by 'empty'
81 | initialColumnSize = 8
82 | 
83 | getColumn :: T.Text -> DataFrame -> Maybe Column  -- 'Nothing' when the name or its slot is absent
84 | getColumn name df =
85 |   -- Resolve the name to a slot index, then flatten the slot's own 'Maybe'.
86 |   M.lookup name (columnIndices df) >>= \i -> join (columns df V.!? i)
87 | 
88 | null :: DataFrame -> Bool  -- True exactly when the dimensions are (0, 0)
89 | null = (== (0, 0)) . dataframeDimensions
90 | 
91 | metadata :: DataFrame -> String  -- four newline-separated lines describing the frame's internal fields
92 | metadata df = foldr1 (\line rest -> line ++ "\n" ++ rest)
93 |   [ show (columnIndices df)
94 |   , show (V.map (fmap columnVersionString) (columns df))
95 |   , show (freeIndices df), show (dataframeDimensions df) ]
96 | 


--------------------------------------------------------------------------------
/src/DataFrame/Internal/Function.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE ExplicitNamespaces #-}
 2 | {-# LANGUAGE RankNTypes #-}
 3 | {-# LANGUAGE ScopedTypeVariables #-}
 4 | {-# LANGUAGE TypeApplications #-}
 5 | {-# LANGUAGE ConstraintKinds #-}
 6 | {-# LANGUAGE GADTs #-}
 7 | {-# LANGUAGE FlexibleInstances #-}
 8 | {-# LANGUAGE InstanceSigs #-}
 9 | {-# LANGUAGE ViewPatterns #-}
10 | {-# LANGUAGE PatternSynonyms #-}
11 | 
12 | module DataFrame.Internal.Function where
13 | 
14 | import qualified Data.Text as T
15 | import qualified Data.Vector as V
16 | 
17 | import DataFrame.Internal.Types
18 | import Data.Typeable ( Typeable, type (:~:)(Refl) )
19 | import Data.Type.Equality (TestEquality(testEquality))
20 | import Type.Reflection (typeRep, typeOf)
21 | 
22 | -- A GADT to wrap functions so we can have heterogeneous lists of functions.
23 | data Function where
24 |     F1 :: forall a b . (Columnable a, Columnable b) => (a -> b) -> Function
25 |     F2 :: forall a b c . (Columnable a, Columnable b, Columnable c) => (a -> b -> c) -> Function
26 |     F3 :: forall a b c d . (Columnable a, Columnable b, Columnable c, Columnable d) => (a -> b -> c -> d) -> Function
27 |     F4 :: forall a b c d e . (Columnable a, Columnable b, Columnable c, Columnable d, Columnable e) => (a -> b -> c -> d -> e) -> Function
28 |     Cond :: forall a . (Columnable a) => (a -> Bool) -> Function  -- predicate on a single value
29 |     ICond :: forall a . (Columnable a) => (Int -> a -> Bool) -> Function  -- predicate that also receives an Int
30 | 
31 | -- Helper class to do the actual wrapping: one instance per supported arity.
32 | class WrapFunction a where
33 |     wrapFunction :: a -> Function
34 | 
35 | -- Instance for 1-argument functions
36 | instance (Columnable a, Columnable b) => WrapFunction (a -> b) where
37 |     wrapFunction :: (Columnable a, Columnable b) => (a -> b) -> Function
38 |     wrapFunction = F1
39 | 
40 | -- Instance for 2-argument functions
41 | instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c) => WrapFunction (a -> b -> c) where
42 |     wrapFunction :: (Columnable a, Columnable b, Columnable c) => (a -> b -> c) -> Function
43 |     wrapFunction = F2
44 | 
45 | -- Instance for 3-argument functions
46 | instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c, Columnable d) => WrapFunction (a -> b -> c -> d) where
47 |     wrapFunction :: (Columnable a, Columnable b, Columnable c, Columnable d) => (a -> b -> c -> d) -> Function
48 |     wrapFunction = F3
49 | -- Instance for 4-argument functions
50 | instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c, Columnable d, Columnable e) => WrapFunction (a -> b -> c -> d -> e) where
51 |     wrapFunction :: (Columnable a, Columnable b, Columnable c, Columnable d, Columnable e) => (a -> b -> c -> d -> e) -> Function
52 |     wrapFunction = F4
53 | 
54 | -- The main entry point for wrapping: the 'WrapFunction' instance of @fn@ picks the 'Function' constructor.
55 | func :: forall fn . WrapFunction fn => fn -> Function
56 | func = wrapFunction
57 | 
58 | pattern Empty :: V.Vector a  -- matches a vector for which 'V.null' holds; as an expression it is 'V.empty'
59 | pattern Empty <- (V.null -> True) where Empty = V.empty 
60 | 
61 | uncons :: V.Vector a -> Maybe (a, V.Vector a)  -- head and tail, or 'Nothing' for an empty vector
62 | uncons v | V.null v  = Nothing
63 |          | otherwise = Just (V.unsafeHead v, V.unsafeTail v)
64 | 
65 | pattern (:<|)  :: a -> V.Vector a -> V.Vector a  -- match-only head/tail view of a vector, built on 'uncons'
66 | pattern x :<| xs <- (uncons -> Just (x, xs))
67 | 
68 | funcApply :: forall c . (Columnable c) => V.Vector RowValue -> Function ->  c  -- applies a wrapped function to the row values one argument at a time; type mismatches call 'error'
69 | funcApply Empty _ = error "Empty args"
70 | funcApply (Value (x :: a') :<| Empty) (F1 (f :: (a -> b))) = case testEquality (typeRep @a') (typeRep @a) of
71 |         Just Refl -> case testEquality (typeOf (f x)) (typeRep @c) of
72 |             Just Refl -> f x
73 |             Nothing -> error "Result type mismatch"
74 |         Nothing -> error "Arg type mismatch"
75 | funcApply (Value (x :: a') :<| xs) (F2 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
76 |         Just Refl -> funcApply xs (F1 (f x))  -- peel off one argument and recurse at the next lower arity
77 |         Nothing -> error "Arg type mismatch"
78 | funcApply (Value (x :: a') :<| xs) (F3 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
79 |         Just Refl -> funcApply xs (F2 (f x))
80 |         Nothing -> error "Arg type mismatch"
81 | funcApply (Value (x :: a') :<| xs) (F4 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
82 |         Just Refl -> funcApply xs (F3 (f x))
83 |         Nothing -> error "Arg type mismatch"
84 | funcApply _ _ = error "Function cannot be applied to the supplied arguments"  -- too many arguments, or a Cond/ICond wrapper; previously an anonymous pattern-match failure


--------------------------------------------------------------------------------
/src/DataFrame/Internal/Parsing.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE OverloadedStrings #-}
 2 | {-# LANGUAGE Strict #-}
 3 | module DataFrame.Internal.Parsing where
 4 | 
 5 | import qualified Data.ByteString.Char8 as C
 6 | import qualified Data.Set as S
 7 | import qualified Data.Text as T
 8 | 
 9 | import Data.Text.Read
10 | import Data.Maybe (fromMaybe)
11 | import GHC.Stack (HasCallStack)
12 | import Text.Read (readMaybe)
13 | 
14 | isNullish :: T.Text -> Bool  -- True for the spellings treated as missing values
15 | isNullish s = s `elem` ["Nothing", "NULL", "", " ", "nan"]
16 | 
17 | readValue :: (HasCallStack, Read a) => T.Text -> a  -- calls 'error' when the text cannot be read
18 | readValue s = fromMaybe
19 |   (error $ "Could not read value: " ++ T.unpack s)
20 |   (readMaybe (T.unpack s))
21 | 
22 | readInteger :: (HasCallStack) => T.Text -> Maybe Integer  -- the whole stripped text must be a signed decimal
23 | readInteger s
24 |   | Right (value, rest) <- signed decimal (T.strip s), T.null rest = Just value
25 |   -- Parse failures and trailing characters both give Nothing.
26 |   | otherwise = Nothing
27 | 
28 | -- | Parses an 'Int' from the stripped text; 'Nothing' on failure or leftovers.
29 | readInt :: (HasCallStack) => T.Text -> Maybe Int
30 | readInt s
31 |   | Right (value, rest) <- signed decimal (T.strip s), T.null rest = Just value
32 |   | otherwise = Nothing
33 | {-# INLINE readInt #-}
34 | 
35 | -- | Parses an 'Int' from the stripped bytes; 'Nothing' on failure or leftovers.
36 | readByteStringInt :: (HasCallStack) => C.ByteString -> Maybe Int
37 | readByteStringInt s
38 |   | Just (value, rest) <- C.readInt (C.strip s), C.null rest = Just value
39 |   | otherwise = Nothing
40 | {-# INLINE readByteStringInt #-}
41 | 
42 | -- | Parses a 'Double' from the whole stripped text. 'double' already accepts one
43 | -- optional sign; wrapping it in 'signed' also accepted doubled signs like "--1".
44 | readDouble :: (HasCallStack) => T.Text -> Maybe Double
45 | readDouble s
46 |   | Right (value, rest) <- double (T.strip s), T.null rest = Just value
47 |   | otherwise = Nothing
48 | {-# INLINE readDouble #-}
49 | 
50 | -- | 'Right' with the parsed 'Integer', or 'Left' with the original text.
51 | readIntegerEither :: (HasCallStack) => T.Text -> Either T.Text Integer
52 | readIntegerEither s
53 |   | Right (value, rest) <- signed decimal (T.strip s), T.null rest = Right value
54 |   | otherwise = Left s
55 | {-# INLINE readIntegerEither #-}
56 | 
57 | -- | 'Right' with the parsed 'Int', or 'Left' with the original text.
58 | readIntEither :: (HasCallStack) => T.Text -> Either T.Text Int
59 | readIntEither s
60 |   | Right (value, rest) <- signed decimal (T.strip s), T.null rest = Right value
61 |   | otherwise = Left s
62 | {-# INLINE readIntEither #-}
63 | 
64 | -- | 'Right' when the stripped text is entirely one number with at most a single
65 | -- sign ('double' reads the sign itself); otherwise 'Left' with the original text.
66 | readDoubleEither :: (HasCallStack) => T.Text -> Either T.Text Double
67 | readDoubleEither s
68 |   | Right (value, rest) <- double (T.strip s), T.null rest = Right value
69 |   | otherwise = Left s
70 | {-# INLINE readDoubleEither #-}
71 | 
72 | safeReadValue :: (Read a) => T.Text -> Maybe a  -- 'Nothing' when the text cannot be read
73 | safeReadValue s = readMaybe $ T.unpack s
74 | 
75 | readWithDefault :: (HasCallStack, Read a) => a -> T.Text -> a  -- falls back to @v@ when the read fails
76 | readWithDefault v s = maybe v id (readMaybe (T.unpack s))
77 | 


--------------------------------------------------------------------------------
/src/DataFrame/Internal/Row.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE OverloadedStrings #-}
 2 | module DataFrame.Internal.Row where
 3 | 
 4 | import qualified Data.List as L
 5 | import qualified Data.Map as M
 6 | import qualified Data.Set as S
 7 | import qualified Data.Text as T
 8 | import qualified Data.Vector as V
 9 | import qualified Data.Vector.Generic as VG
10 | import qualified Data.Vector.Unboxed as VU
11 | import qualified Data.Vector.Algorithms.Merge as VA
12 | 
13 | import Control.Exception (throw)
14 | import Control.Monad.ST (runST)
15 | import DataFrame.Errors (DataFrameException(..))
16 | import DataFrame.Internal.Column
17 | import DataFrame.Internal.DataFrame
18 | import DataFrame.Internal.Types
19 | import Data.Function (on)
20 | 
21 | type Row = V.Vector RowValue  -- one row: its cell values wrapped as 'RowValue's
22 | 
23 | toRowList :: [T.Text] -> DataFrame -> [Row]  -- every row of the frame, restricted to the named columns
24 | toRowList names df = [mkRowRep df wanted i | i <- [0 .. rowCount - 1]]
25 |   where wanted   = S.fromList names
26 |         rowCount = fst (dataframeDimensions df)
27 | 
28 | toRowVector :: [T.Text] -> DataFrame -> V.Vector Row  -- every row, restricted to the named columns, as a vector
29 | toRowVector names df = V.generate rowCount (mkRowRep df wanted)
30 |   where wanted   = S.fromList names
31 |         rowCount = fst (dataframeDimensions df)
32 | 
33 | mkRowFromArgs :: [T.Text] -> DataFrame -> Int -> Row  -- row @i@ of the named columns, in the order the names are given
34 | mkRowFromArgs names df i = V.fromList (map cell names)
35 |   where
36 |     cell name = case getColumn name df of
37 |       Just (BoxedColumn column) -> toRowValue (column V.! i)
38 |       Just (UnboxedColumn column) -> toRowValue (column VU.! i)
39 |       Just (OptionalColumn column) -> toRowValue (column V.! i)
40 |       Nothing -> throw $ ColumnNotFoundException name "[INTERNAL] mkRowFromArgs" (M.keys (columnIndices df))
41 | 
42 | -- | Row @i@ of the columns in @names@, ordered by column index; a requested name that is absent throws 'ColumnNotFoundException' (it used to surface as an out-of-bounds vector index).
43 | mkRowRep :: DataFrame -> S.Set T.Text -> Int -> Row
44 | mkRowRep df names i = case filter (`M.notMember` columnIndices df) (S.toList names) of
45 |     (missing : _) -> throw $ ColumnNotFoundException missing "[INTERNAL] mkRowRep" (M.keys (columnIndices df))
46 |     [] -> V.map get names'
47 |   where
48 |     inOrderIndexes = map fst $ L.sortBy (compare `on` snd) $ M.toList (columnIndices df)
49 |     names' = V.fromList [n | n <- inOrderIndexes, S.member n names]
50 |     throwError name = error $ "Column " ++ T.unpack name ++ " has less items than "
51 |                 ++ "the other columns at index " ++ show i
52 |     get name = case getColumn name df of
53 |       Just (BoxedColumn c) -> case c V.!? i of
54 |         Just e -> toRowValue e
55 |         Nothing -> throwError name
56 |       Just (OptionalColumn c) -> case c V.!? i of
57 |         Just e -> toRowValue e
58 |         Nothing -> throwError name
59 |       Just (UnboxedColumn c) -> case c VU.!? i of
60 |         Just e -> toRowValue e
61 |         Nothing -> throwError name
62 |       Just (GroupedBoxedColumn c) -> case c V.!? i of
63 |         Just e -> toRowValue e
64 |         Nothing -> throwError name
65 |       Just (GroupedUnboxedColumn c) -> case c V.!? i of
66 |         Just e -> toRowValue e
67 |         Nothing -> throwError name
68 | 
69 | sortedIndexes' :: Bool -> V.Vector Row -> VU.Vector Int  -- row positions in sorted order: ascending when @asc@, else descending
70 | sortedIndexes' asc rows = runST $ do
71 |   tagged <- VG.thaw (V.indexed rows)
72 |   VA.sortBy (\a b -> if asc then compare (snd a) (snd b) else compare (snd b) (snd a)) tagged
73 |   ordered <- VG.unsafeFreeze tagged
74 |   return $ VU.generate (VG.length rows) (fst . (ordered VG.!))
75 | 


--------------------------------------------------------------------------------
/src/DataFrame/Internal/Types.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE ConstraintKinds #-}
 2 | {-# LANGUAGE DataKinds #-}
 3 | {-# LANGUAGE ExistentialQuantification #-}
 4 | {-# LANGUAGE GADTs #-}
 5 | {-# LANGUAGE InstanceSigs #-}
 6 | {-# LANGUAGE KindSignatures #-}
 7 | {-# LANGUAGE RankNTypes #-}
 8 | {-# LANGUAGE ScopedTypeVariables #-}
 9 | {-# LANGUAGE TypeApplications #-}
10 | {-# LANGUAGE TypeOperators #-}
11 | {-# LANGUAGE Strict #-}
12 | module DataFrame.Internal.Types where
13 | 
14 | import Data.Int ( Int8, Int16, Int32, Int64 )
15 | import Data.Kind (Type)
16 | import Data.Maybe (fromMaybe)
17 | import Data.Typeable (Typeable, type (:~:) (..))
18 | import Data.Word ( Word8, Word16, Word32, Word64 )
19 | import Type.Reflection (TypeRep, typeOf, typeRep)
20 | import Data.Type.Equality (TestEquality(..))
21 | 
22 | -- We need an "Object" type ('RowValue') as an intermediate representation for rows, useful for
23 | -- things like sorting and function application. 'Columnable' lists what such a value must support.
24 | type Columnable a = (Typeable a, Show a, Ord a, Eq a)
25 | 
26 | data RowValue where  -- existential wrapper around any 'Columnable' value
27 |     Value :: (Columnable a) => a -> RowValue
28 | 
29 | instance Eq RowValue where
30 |     (==) :: RowValue -> RowValue -> Bool
31 |     (Value a) == (Value b) = case testEquality (typeOf a) (typeOf b) of
32 |         Just Refl -> a == b    -- same runtime type: defer to that type's Eq
33 |         Nothing   -> False     -- values of different runtime types are never equal
34 | 
35 | instance Ord RowValue where
36 |     (<=) :: RowValue -> RowValue -> Bool
37 |     (Value a) <= (Value b) = case testEquality (typeOf a) (typeOf b) of
38 |         Just Refl -> a <= b    -- same runtime type: defer to that type's Ord
39 |         Nothing   -> False     -- different runtime types: neither is <= the other
40 | 
41 | instance Show RowValue where
42 |     show :: RowValue -> String  -- shows the wrapped value itself, without the 'Value' constructor
43 |     show (Value a) = show a
44 | 
45 | toRowValue :: forall a . (Columnable a) => a -> RowValue  -- wraps a value as a 'RowValue'
46 | toRowValue =  Value
47 | 
48 | -- | Essentially a "functor" instance of our type-erased Column: maps a @b -> c@ function over @a@; the 'Maybe' result presumably signals a type mismatch (confirm against the instances).
49 | class Transformable a where
50 |   transform :: forall b c . (Columnable b, Columnable c) => (b -> c) -> a -> Maybe a
51 | 
52 | -- Convenience values for types: the list of types checked by 'testUnboxable', one 'TypeRep' per entry.
53 | unboxableTypes :: TypeRepList '[Int, Int8, Int16, Int32, Int64,
54 |                                 Word, Word8, Word16, Word32, Word64,
55 |                                 Char, Double, Float, Bool]
56 | unboxableTypes = Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep Nil)))))))))))))
57 | 
58 | numericTypes :: TypeRepList '[Int, Int8, Int16, Int32, Int64, Double, Float]  -- the types checked by 'testNumeric'
59 | numericTypes = Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep Nil))))))
60 | 
61 | data TypeRepList (xs :: [Type]) where  -- a list of 'TypeRep's indexed by the type-level list of the types it holds
62 |   Nil  :: TypeRepList '[]
63 |   Cons :: Typeable x => TypeRep x -> TypeRepList xs -> TypeRepList (x ': xs)
64 | 
65 | -- | True when @rep@ equals any 'TypeRep' stored in the list.
66 | matchesAnyType :: forall a xs. (Typeable a) => TypeRepList xs -> TypeRep a -> Bool
67 | matchesAnyType Nil _ = False
68 | matchesAnyType (Cons ty tys) rep
69 |   | Just Refl <- testEquality ty rep = True
70 |   | otherwise = matchesAnyType tys rep
71 | 
72 | testUnboxable :: forall a . Typeable a => TypeRep a -> Bool  -- is @a@ one of 'unboxableTypes'?
73 | testUnboxable rep = matchesAnyType unboxableTypes rep
74 | 
75 | testNumeric :: forall a . Typeable a => TypeRep a -> Bool  -- is @a@ one of 'numericTypes'?
76 | testNumeric rep = matchesAnyType numericTypes rep
77 | 


--------------------------------------------------------------------------------
/src/DataFrame/Operations/Aggregation.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE ExplicitNamespaces #-}
  2 | {-# LANGUAGE GADTs #-}
  3 | {-# LANGUAGE OverloadedStrings #-}
  4 | {-# LANGUAGE RankNTypes #-}
  5 | {-# LANGUAGE ScopedTypeVariables #-}
  6 | {-# LANGUAGE TypeApplications #-}
  7 | module DataFrame.Operations.Aggregation where
  8 | 
  9 | import qualified Data.Set as S
 10 | 
 11 | import qualified Data.List as L
 12 | import qualified Data.Map as M
 13 | import qualified Data.Map.Strict as MS
 14 | import qualified Data.Text as T
 15 | import qualified Data.Vector.Generic as VG
 16 | import qualified Data.Vector as V
 17 | import qualified Data.Vector.Mutable as VM
 18 | import qualified Data.Vector.Unboxed as VU
 19 | import qualified Statistics.Quantile as SS
 20 | import qualified Statistics.Sample as SS
 21 | 
 22 | import Control.Exception (throw)
 23 | import Control.Monad (foldM_)
 24 | import Control.Monad.ST (runST)
 25 | import DataFrame.Internal.Column (Column(..), toColumn', getIndicesUnboxed, getIndices)
 26 | import DataFrame.Internal.DataFrame (DataFrame(..), empty, getColumn)
 27 | import DataFrame.Internal.Parsing
 28 | import DataFrame.Internal.Types
 29 | import DataFrame.Errors
 30 | import DataFrame.Operations.Core
 31 | import DataFrame.Operations.Subset
 32 | import Data.Function ((&))
 33 | import Data.Hashable
 34 | import Data.Maybe
 35 | import Data.Type.Equality (type (:~:)(Refl), TestEquality(..))
 36 | import Type.Reflection (typeRep, typeOf)
 37 | 
 38 | -- | O(k * n) groups the dataframe by the given columns, collecting the values of the
 39 | -- remaining columns into vectors that should be reduced later.
 40 | groupBy ::
 41 |   [T.Text] ->
 42 |   DataFrame ->
 43 |   DataFrame
 44 | groupBy names df
 45 |   | any (`notElem` columnNames df) names = throw $ ColumnNotFoundException (T.pack $ show $ names L.\\ columnNames df) "groupBy" (columnNames df)
 46 |   | otherwise = L.foldl' insertColumns initDf groupingColumns
 47 |   where
 48 |     insertOrAdjust k v m = if MS.notMember k m then MS.insert k [v] m else MS.adjust (appendWithFrontMin v) k m
 49 |     -- Create a hash-based representation of each row (the 'Int' returned by 'mkRowRep').
 50 |     values = V.generate (fst (dimensions df)) (mkRowRep df (S.fromList names))
 51 |     -- Create a mapping from the row representation to the list of indices that
 52 |     -- have that row representation. This will allow us to combine the indexes
 53 |     -- where the rows are the same. NOTE(review): rows whose hashes collide share a key.
 54 |     valueIndices = V.ifoldl' (\m index rowRep -> insertOrAdjust rowRep index m) M.empty values
 55 |     -- Since the min is at the head this allows us to get the min in constant time and sort by it
 56 |     -- That way we can recover the original order of the rows.
 57 |     -- valueIndicesInitOrder = L.sortBy (compare `on` snd) $! MS.toList $ MS.map VU.head valueIndices
 58 |     valueIndicesInitOrder = runST $ do
 59 |       v <- VM.new (MS.size valueIndices)
 60 |       foldM_ (\i idxs -> VM.write v i (VU.fromList idxs) >> return (i + 1)) 0 valueIndices
 61 |       V.unsafeFreeze v
 62 | 
 63 |     -- These are the indexes of the grouping/key rows i.e the minimum elements
 64 |     -- of the list.
 65 |     keyIndices = VU.generate (VG.length valueIndicesInitOrder) (\i -> VG.head $ valueIndicesInitOrder VG.! i)
 66 |     -- this will be our main worker function in the fold that takes all
 67 |     -- indices and replaces each value in a column with a list of
 68 |     -- the elements with the indices where the grouped row
 69 |     -- values are the same.
 70 |     insertColumns = groupColumns valueIndicesInitOrder df
 71 |     -- Our initial DF will just be all the grouped rows added to an
 72 |     -- empty dataframe. The entries are deduped. NOTE(review): groups are emitted in
 73 |     -- the map's key order; confirm whether the initial row order is really recovered.
 74 |     initDf = L.foldl' (mkGroupedColumns keyIndices df) empty names
 75 |     -- The remaining columns, i.e. the ones we are not grouping by.
 76 |     groupingColumns = columnNames df L.\\ names
 77 | 
 78 | -- | Hashes the values that row @i@ holds in the columns listed in @names@.
 79 | -- Calls 'error' when one of those columns has no element at index @i@.
 80 | mkRowRep :: DataFrame -> S.Set T.Text -> Int -> Int
 81 | mkRowRep df names i = hash $ V.ifoldl' go [] (columns df)
 82 |   where
 83 |     indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
 84 |     -- Slot @k@ is skipped unless its column name was requested.
 85 |     skipped k = S.notMember (indexMap M.! k) names
 86 |     -- Shared failure for a requested column that is too short for row @i@.
 87 |     tooShort k =
 88 |       error $
 89 |         "Column "
 90 |           ++ T.unpack (indexMap M.! k)
 91 |           ++ " has less items than "
 92 |           ++ "the other columns at index "
 93 |           ++ show i
 94 |     go acc k Nothing = acc
 95 |     go acc k (Just (BoxedColumn (c :: V.Vector a)))
 96 |       | skipped k = acc
 97 |       | Just e <- c V.!? i = hash' @a e : acc
 98 |       | otherwise = tooShort k
 99 |     go acc k (Just (OptionalColumn (c :: V.Vector (Maybe a))))
100 |       | skipped k = acc
101 |       | Just e <- c V.!? i = hash' @(Maybe a) e : acc
102 |       | otherwise = tooShort k
103 |     go acc k (Just (UnboxedColumn (c :: VU.Vector a)))
104 |       | skipped k = acc
105 |       | Just e <- c VU.!? i = hash' @a e : acc
106 |       | otherwise = tooShort k
119 | 
120 | -- | Maps a value to a 'Double' key: a 'Double' is returned unchanged and an
121 | -- 'Int' is converted; 'T.Text' is hashed directly and any other type is
122 | -- hashed through its 'show' text.
123 | hash' :: Columnable a => a -> Double
124 | hash' value
125 |   | Just Refl <- testEquality (typeOf value) (typeRep @Double) = value
126 |   | Just Refl <- testEquality (typeOf value) (typeRep @Int) = fromIntegral value
127 |   | Just Refl <- testEquality (typeOf value) (typeRep @T.Text) = fromIntegral $ hash value
128 |   -- Everything else goes through the textual form.
129 |   | otherwise = fromIntegral $ hash (show value)
130 | 
131 | -- | Inserts key column @name@ into @acc@, keeping only the elements of
132 | -- @df@'s column that sit at the given @indices@.
133 | mkGroupedColumns :: VU.Vector Int -> DataFrame -> DataFrame -> T.Text -> DataFrame
134 | mkGroupedColumns indices df acc name =
135 |   case columns df V.! (columnIndices df M.! name) of
136 |     -- A name that maps to an unoccupied slot is treated as an internal error.
137 |     Nothing -> error "Unexpected"
138 |     -- Boxed and optional columns share the boxed selection and insertion path.
139 |     Just (BoxedColumn column) -> insertColumn name (getIndices indices column) acc
140 |     Just (OptionalColumn column) -> insertColumn name (getIndices indices column) acc
141 |     -- Unboxed columns use the unboxed selector and insertion function.
142 |     Just (UnboxedColumn column) ->
143 |       insertUnboxedColumn name (getIndicesUnboxed indices column) acc
144 | 
145 | -- | Inserts into @acc@ a grouped version of column @name@ of @df@, built by
146 | -- applying 'getIndices' / 'getIndicesUnboxed' once per entry of @indices@.
147 | groupColumns :: V.Vector (VU.Vector Int) -> DataFrame -> DataFrame -> T.Text -> DataFrame
148 | groupColumns indices df acc name =
149 |   case (V.!) (columns df) (columnIndices df M.! name) of
150 |     -- Return the accumulator, not @df@, so columns grouped so far are kept.
151 |     Nothing -> acc
152 |     (Just (BoxedColumn column)) ->
153 |       insertColumn' name (Just $ GroupedBoxedColumn (V.map (`getIndices` column) indices)) acc
154 |     (Just (OptionalColumn column)) ->
155 |       insertColumn' name (Just $ GroupedBoxedColumn (V.map (`getIndices` column) indices)) acc
156 |     (Just (UnboxedColumn column)) ->
157 |       insertColumn' name (Just $ GroupedUnboxedColumn (V.map (`getIndicesUnboxed` column) indices)) acc
158 | 
159 | data Aggregation = Count    -- ^ Number of values in each group.
160 |                  | Mean     -- ^ Mean of each group (numeric element types only).
161 |                  | Minimum  -- ^ Smallest value in each group.
162 |                  | Median   -- ^ Not handled by 'reduceByAgg' or 'groupByAgg' yet.
163 |                  | Maximum  -- ^ Largest value in each group.
164 |                  | Sum deriving (Show, Eq) -- Sum of each group; the 'Show' output is used to build column names.
165 | 
166 | -- | Groups @df@ by the given columns and aggregates each group. Only 'Count'
167 | -- is supported; every other aggregation calls 'error'.
168 | groupByAgg :: Aggregation -> [T.Text] -> DataFrame -> DataFrame
169 | groupByAgg Count groupingColumns df =
170 |   -- Add a helper column of ones, group, then replace each group by its length.
171 |   reduceBy @Int VG.length "Count" (groupBy groupingColumns (insertColumnWithDefault @Int 1 (T.pack (show Count)) V.empty df))
172 | groupByAgg _ _ _ = error "UNIMPLEMENTED"
173 | 
174 | -- | O (k * n) Reduces a vector-valued (grouped) column with a given function, replacing it under the same name.
175 | reduceBy ::
176 |   forall a b . (Columnable a, Columnable b) =>
177 |   (forall v . (VG.Vector v a) => v a -> b) ->
178 |   T.Text ->
179 |   DataFrame ->
180 |   DataFrame
181 | reduceBy f name df = case getColumn name df of
182 |     Just ((GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) -> case testEquality (typeRep @a) (typeRep @a') of
183 |       Just Refl -> insertColumn' name (Just $ toColumn' (VG.map f column)) df
184 |       Nothing -> error "Type error" -- requested element type differs from the column's element type
185 |     Just ((GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) -> case testEquality (typeRep @a) (typeRep @a') of
186 |       Just Refl -> insertColumn' name (Just $ toColumn' (VG.map f column)) df
187 |       Nothing -> error "Type error" -- requested element type differs from the column's element type
188 |     _ -> error "Column is ungrouped" -- also reached when getColumn finds no column
189 | 
190 | -- | Replaces the grouped column @name@ with one aggregated value per group.
191 | --
192 | -- 'Count', 'Minimum' and 'Maximum' work on any grouped column, 'Mean' needs
193 | -- Int, Double or Float elements and 'Sum' needs Int or Double elements;
194 | -- 'Median' is not implemented yet. Any aggregation applied to a column that
195 | -- is missing or not grouped calls 'error' with a message naming the column
196 | -- rather than failing with an incomplete-pattern exception.
197 | reduceByAgg :: Aggregation -> T.Text -> DataFrame -> DataFrame
198 | reduceByAgg agg name df = case (agg, getColumn name df) of
199 |   (Count, Just (GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) -> insertColumn' name (Just $ toColumn' (VG.map VG.length column)) df
200 |   (Count, Just (GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) -> insertColumn' name (Just $ toColumn' (VG.map VG.length column)) df
201 |   (Count, _) -> error $ "Cannot count ungrouped Column: " ++ T.unpack name
202 |   (Mean, Just (GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) -> case testEquality (typeRep @a') (typeRep @Int) of
203 |     Just Refl -> insertColumn' name (Just $ toColumn' (VG.map (SS.mean . VG.map fromIntegral) column)) df
204 |     Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
205 |       Just Refl -> insertColumn' name (Just $ toColumn' (VG.map SS.mean column)) df
206 |       Nothing -> case testEquality (typeRep @a') (typeRep @Float) of
207 |         Just Refl -> insertColumn' name (Just $ toColumn' (VG.map (SS.mean . VG.map realToFrac) column)) df
208 |         Nothing -> error $ "Cannot get mean of non-numeric column: " ++ T.unpack name
209 |   (Mean, Just (GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) -> case testEquality (typeRep @a') (typeRep @Int) of
210 |     Just Refl -> insertColumn' name (Just $ toColumn' (VG.map (SS.mean . VG.map fromIntegral) column)) df
211 |     Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
212 |       Just Refl -> insertColumn' name (Just $ toColumn' (VG.map SS.mean column)) df
213 |       Nothing -> case testEquality (typeRep @a') (typeRep @Float) of
214 |         Just Refl -> insertColumn' name (Just $ toColumn' (VG.map (SS.mean . VG.map realToFrac) column)) df
215 |         Nothing -> error $ "Cannot get mean of non-numeric column: " ++ T.unpack name
216 |   (Minimum, Just (GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) -> insertColumn' name (Just $ toColumn' (VG.map VG.minimum column)) df
217 |   (Minimum, Just (GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) -> insertColumn' name (Just $ toColumn' (VG.map VG.minimum column)) df
218 |   (Maximum, Just (GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) -> insertColumn' name (Just $ toColumn' (VG.map VG.maximum column)) df
219 |   (Maximum, Just (GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) -> insertColumn' name (Just $ toColumn' (VG.map VG.maximum column)) df
220 |   (Sum, Just (GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) -> case testEquality (typeRep @a') (typeRep @Int) of
221 |     Just Refl -> insertColumn' name (Just $ toColumn' (VG.map VG.sum column)) df
222 |     Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
223 |       Just Refl -> insertColumn' name (Just $ toColumn' (VG.map VG.sum column)) df
224 |       Nothing -> error $ "Cannot get sum of non-numeric column: " ++ T.unpack name
225 |   (Sum, Just (GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) -> case testEquality (typeRep @a') (typeRep @Int) of
226 |     Just Refl -> insertColumn' name (Just $ toColumn' (VG.map VG.sum column)) df
227 |     Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
228 |       Just Refl -> insertColumn' name (Just $ toColumn' (VG.map VG.sum column)) df
229 |       Nothing -> error $ "Cannot get sum of non-numeric column: " ++ T.unpack name
230 |   (Median, _) -> error "UNIMPLEMENTED"
231 |   _ -> error $ "Cannot aggregate ungrouped column: " ++ T.unpack name
232 | 
233 | -- | For every (column, aggregation) pair, clones the column as @<Aggregation>_<column>@ and reduces the clone; the source column names are then passed to 'exclude'.
234 | aggregate :: [(T.Text, Aggregation)] -> DataFrame -> DataFrame
235 | aggregate aggs df = exclude (map fst aggs) (fold step aggs df)
236 |   where
237 |     step (name, agg) = let alias = T.pack (show agg) <> "_" <> name in reduceByAgg agg alias . cloneColumn name alias
238 | 
239 | 
240 | -- | Adds @x@ to a list: in front if it is smaller than the current head, directly behind the head otherwise.
241 | appendWithFrontMin :: (Ord a) => a -> [a] -> [a]
242 | appendWithFrontMin x items = case items of
243 |   [] -> [x]
244 |   (front : others) -> if x < front then x : items else front : x : others
245 | {-# INLINE appendWithFrontMin #-}
246 | 
247 | distinct :: DataFrame -> DataFrame -- groups the DataFrame by every one of its columns
248 | distinct df = df & groupBy (columnNames df)
249 | 


--------------------------------------------------------------------------------
/src/DataFrame/Operations/Core.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE ExplicitNamespaces #-}
  2 | {-# LANGUAGE FlexibleContexts #-}
  3 | {-# LANGUAGE GADTs #-}
  4 | {-# LANGUAGE OverloadedStrings #-}
  5 | {-# LANGUAGE RankNTypes #-}
  6 | {-# LANGUAGE ScopedTypeVariables #-}
  7 | {-# LANGUAGE TypeApplications #-}
  8 | {-# LANGUAGE BangPatterns #-}
  9 | module DataFrame.Operations.Core where
 10 | 
 11 | import qualified Data.List as L
 12 | import qualified Data.Map as M
 13 | import qualified Data.Map.Strict as MS
 14 | import qualified Data.Set as S
 15 | import qualified Data.Text as T
 16 | import qualified Data.Vector.Generic as VG
 17 | import qualified Data.Vector as V
 18 | import qualified Data.Vector.Unboxed as VU
 19 | 
 20 | import Control.Exception ( throw )
 21 | import DataFrame.Errors
 22 | import DataFrame.Internal.Column ( Column(..), toColumn', toColumn, columnLength, columnTypeString, expandColumn )
 23 | import DataFrame.Internal.DataFrame (DataFrame(..), getColumn, null, empty)
 24 | import DataFrame.Internal.Parsing (isNullish)
 25 | import DataFrame.Internal.Types (Columnable)
 26 | import Data.Either
 27 | import Data.Function (on, (&))
 28 | import Data.Maybe
 29 | import Data.Type.Equality (type (:~:)(Refl), TestEquality(..))
 30 | import Type.Reflection
 31 | import Prelude hiding (null)
 32 | 
 33 | -- | O(1) Get DataFrame dimensions i.e. (rows, columns), read from the 'dataframeDimensions' field.
 34 | dimensions :: DataFrame -> (Int, Int)
 35 | dimensions = dataframeDimensions
 36 | {-# INLINE dimensions #-}
 37 | 
 38 | -- | O(k log k) Get the column names of the DataFrame, ordered by their column index.
 39 | columnNames :: DataFrame -> [T.Text]
 40 | columnNames = map fst . L.sortOn snd . M.toList . columnIndices
 41 | {-# INLINE columnNames #-}
 42 | 
 43 | -- | /O(n)/ Adds a boxed vector to the dataframe as a column, converting it with 'toColumn''.
 44 | insertColumn ::
 45 |   forall a.
 46 |   (Columnable a) =>
 47 |   -- | Column Name
 48 |   T.Text ->
 49 |   -- | Values of the new column
 50 |   V.Vector a ->
 51 |   -- | DataFrame to add column to
 52 |   DataFrame ->
 53 |   DataFrame
 54 | insertColumn name xs = insertColumn' name (Just (toColumn' xs))
 55 | {-# INLINE insertColumn #-}
 56 | 
 57 | -- | Inserts the column @original@ again under the name @new@; throws 'ColumnNotFoundException' if @original@ is missing.
 58 | cloneColumn :: T.Text -> T.Text -> DataFrame -> DataFrame
 59 | cloneColumn original new df = maybe missing (\column -> insertColumn' new (Just column) df) (getColumn original df)
 60 |   where missing = throw $ ColumnNotFoundException original "cloneColumn" (M.keys (columnIndices df))
 61 | 
 62 | -- | /O(n)/ Adds an unboxed vector to the dataframe as an 'UnboxedColumn'.
 63 | insertUnboxedColumn ::
 64 |   forall a.
 65 |   (Columnable a, VU.Unbox a) =>
 66 |   -- | Name of the new column
 67 |   T.Text ->
 68 |   -- | Unboxed values of the new column
 69 |   VU.Vector a ->
 70 |   -- | DataFrame to add the column to
 71 |   DataFrame ->
 72 |   DataFrame
 73 | insertUnboxedColumn name = insertColumn' name . Just . UnboxedColumn
 74 | 
 75 | -- | /O(n)/ Add a column to the dataframe. Not meant for external use.
 76 | insertColumn' ::
 77 |   -- | Column Name
 78 |   T.Text ->
 79 |   -- | Column to add
 80 |   Maybe Column ->
 81 |   -- | DataFrame to add to column
 82 |   DataFrame ->
 83 |   DataFrame
 84 | insertColumn' _ Nothing d = d
 85 | insertColumn' name optCol@(Just column) d
 86 |     | M.member name (columnIndices d) = let -- existing name: overwrite its slot, dimensions are left as they are
 87 |         i = (M.!) (columnIndices d) name
 88 |       in d { columns = columns d V.// [(i, optCol)] }
 89 |     | otherwise = insertNewColumn
 90 |       where
 91 |         l = columnLength column
 92 |         (r, c) = dataframeDimensions d
 93 |         diff = abs (l - r)
 94 |         insertNewColumn
 95 |           -- If we have a non-empty dataframe and the new column has more rows than the existing columns,
 96 |           -- expand every existing column by the difference and rebuild the dataframe with the new column.
 97 |           | r > 0 && l > r = let
 98 |               indexes = (map snd . L.sortBy (compare `on` snd). M.toList . columnIndices) d
 99 |               nonEmptyColumns = map (\i -> maybe (error "Unexpected") (expandColumn diff) (columns d V.! i)) indexes
100 |             in fromList (zip (columnNames d ++ [name]) (nonEmptyColumns ++ [column]))
101 |           | otherwise = let
102 |                 (n:rest) = case freeIndices d of
103 |                   [] -> [VG.length (columns d)..(VG.length (columns d) * 2 - 1)]
104 |                   lst -> lst
105 |                 columns' = if L.null (freeIndices d)
106 |                           then columns d V.++ V.replicate (VG.length (columns d)) Nothing
107 |                           else columns d
108 |                 xs'
109 |                   | diff <= 0 || null d = optCol
110 |                   | otherwise = expandColumn diff <$> optCol
111 |             in d
112 |                   { columns = columns' V.// [(n, xs')],
113 |                     columnIndices = M.insert name n (columnIndices d),
114 |                     freeIndices = rest,
115 |                     dataframeDimensions = (max l r, c + 1)
116 |                   }
117 | 
118 | -- | /O(k)/ Add a column to the dataframe, padding it with a default value.
119 | -- The given vector is extended with copies of the default until it has as
120 | -- many entries as the dataframe has rows, converted with 'toColumn'' and
121 | -- then inserted; a vector that is already long enough is inserted as given.
122 | insertColumnWithDefault ::
123 |   forall a.
124 |   (Columnable a) =>
125 |   -- | Value used for the missing entries
126 |   a ->
127 |   -- | Name of the new column
128 |   T.Text ->
129 |   -- | Leading values of the new column
130 |   V.Vector a ->
131 |   -- | DataFrame to add the column to
132 |   DataFrame ->
133 |   DataFrame
134 | insertColumnWithDefault defaultValue name xs d = insertColumn' name (Just $ toColumn' padded) d
135 |   where
136 |     missing = fst (dataframeDimensions d) - V.length xs
137 |     padded = xs V.++ V.replicate missing defaultValue
138 | 
139 | -- | Renames column @orig@ to @new@ by re-keying its index in 'columnIndices'.
140 | -- Throws 'ColumnNotFoundException' if @orig@ does not exist.
141 | -- TODO: Handle @new@ already naming another column; its index entry is currently overwritten.
142 | rename :: T.Text -> T.Text -> DataFrame -> DataFrame
143 | rename orig new df = case M.lookup orig (columnIndices df) of
144 |   Nothing -> throw $ ColumnNotFoundException orig "rename" (M.keys (columnIndices df))
145 |   Just columnIndex -> df { columnIndices = M.insert new columnIndex (M.delete orig (columnIndices df)) }
146 | 
147 | -- | Get the number of elements in a given column; Nothing when 'getColumn' finds no such column.
148 | columnSize :: T.Text -> DataFrame -> Maybe Int
149 | columnSize name df = fmap columnLength (getColumn name df)
150 | 
151 | data ColumnInfo = ColumnInfo { -- One row of the table produced by 'columnInfo'.
152 |     nameOfColumn :: !T.Text, -- ^ Name of the described column.
153 |     nonNullValues :: !Int, -- ^ Column length minus the counted nulls.
154 |     nullValues :: !Int, -- ^ Number of null values counted for the column.
155 |     partiallyParsedValues :: !Int, -- ^ Number of values counted by 'partiallyParsed'.
156 |     uniqueValues :: !Int, -- ^ Number of distinct values in the column.
157 |     typeOfColumn :: !T.Text -- ^ Textual name of the column's element type.
158 |   }
159 | 
160 | -- | O(n) Builds a DataFrame describing every column: its name, its non-null, null and partially
161 | -- parsed value counts, its number of unique values and its type. Rows are sorted by non-null count.
162 | columnInfo :: DataFrame -> DataFrame
163 | columnInfo df = empty & insertColumn' "Column Name" (Just $! toColumn (map nameOfColumn infos))
164 |                       & insertColumn' "# Non-null Values" (Just $! toColumn (map nonNullValues infos))
165 |                       & insertColumn' "# Null Values" (Just $! toColumn (map nullValues infos))
166 |                       & insertColumn' "# Partially parsed" (Just $! toColumn (map partiallyParsedValues infos))
167 |                       & insertColumn' "# Unique Values" (Just $! toColumn (map uniqueValues infos))
168 |                       & insertColumn' "Type" (Just $! toColumn (map typeOfColumn infos))
169 |   where
170 |     infos = L.sortBy (compare `on` nonNullValues) (V.ifoldl' go [] (columns df)) :: [ColumnInfo]
171 |     indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
172 |     columnName i = M.lookup i indexMap
173 |     go acc i Nothing = acc
174 |     go acc i (Just col@(OptionalColumn (c :: V.Vector a))) = let
175 |         cname = columnName i
176 |         countNulls = nulls col
177 |         countPartial = partiallyParsed col
178 |         columnType = T.pack $ show $ typeRep @a
179 |         unique = S.size $ VG.foldr S.insert S.empty c
180 |       in if isNothing cname then acc else ColumnInfo (fromMaybe "" cname) (columnLength col - countNulls) countNulls countPartial unique columnType : acc
181 |     go acc i (Just col@(BoxedColumn (c :: V.Vector a))) = let
182 |         cname = columnName i
183 |         countPartial = partiallyParsed col
184 |         columnType = T.pack $ show $ typeRep @a
185 |         unique = S.size $ VG.foldr S.insert S.empty c
186 |       in if isNothing cname then acc else ColumnInfo (fromMaybe "" cname) (columnLength col) 0 countPartial unique columnType : acc
187 |     go acc i (Just col@(UnboxedColumn c)) = let
188 |         cname = columnName i
189 |         columnType = T.pack $ columnTypeString col
190 |         unique = S.size $ VG.foldr S.insert S.empty c
191 |         -- Unboxed columns cannot have nulls since Maybe
192 |         -- is not an instance of Unbox a
193 |       in if isNothing cname then acc else ColumnInfo (fromMaybe "" cname) (columnLength col) 0 0 unique columnType : acc
194 | 
195 | nulls :: Column -> Int -- number of values in the column that count as null
196 | nulls (OptionalColumn xs) = VG.length $ VG.filter isNothing xs -- optional data: count the Nothing entries
197 | nulls (BoxedColumn (xs :: V.Vector a)) = case testEquality (typeRep @a) (typeRep @T.Text) of
198 |   Just Refl -> VG.length $ VG.filter isNullish xs -- Text: entries accepted by 'isNullish'
199 |   Nothing -> case testEquality (typeRep @a) (typeRep @String) of
200 |     Just Refl -> VG.length $ VG.filter (isNullish . T.pack) xs -- String: same check after packing to Text
201 |     Nothing -> case typeRep @a of
202 |       App t1 t2 -> case eqTypeRep t1 (typeRep @Maybe) of
203 |           Just HRefl -> VG.length $ VG.filter isNothing xs -- boxed Maybe values: count the Nothing entries
204 |           Nothing -> 0
205 |       _ -> 0
206 | nulls _ = 0 -- every other column kind is reported as having no nulls
207 | 
208 | partiallyParsed :: Column -> Int -- number of 'Left' values in a boxed column of 'Either'; 0 for anything else
209 | partiallyParsed (BoxedColumn (xs :: V.Vector a)) =
210 |   case typeRep @a of
211 |     App (App tycon t1) t2 -> case eqTypeRep tycon (typeRep @Either) of -- element type is a two-argument type constructor application
212 |       Just HRefl -> VG.length $ VG.filter isLeft xs
213 |       Nothing -> 0
214 |     _ -> 0
215 | partiallyParsed _ = 0
216 | 
217 | fromList :: [(T.Text, Column)] -> DataFrame -- inserts the named columns, in list order, into 'empty'
218 | fromList pairs = L.foldl' (\acc (!colName, !col) -> insertColumn' colName (Just $! col) acc) empty pairs
219 | 
220 | fromColumnList :: [Column] -> DataFrame -- names the columns "0", "1", ... by list position
221 | fromColumnList cols = fromList (zip [T.pack (show i) | i <- [0..]] cols)
222 | 
223 | -- | O (k * n) Counts the occurrences of each value in a given column, in ascending order of value.
224 | -- Throws 'ColumnNotFoundException' / 'TypeMismatchException' for a missing column or wrong element type.
225 | valueCounts :: forall a. (Columnable a) => T.Text -> DataFrame -> [(a, Int)]
226 | valueCounts columnName df = case getColumn columnName df of
227 |       Nothing -> throw $ ColumnNotFoundException columnName "valueCounts" (map fst $ M.toList $ columnIndices df)
228 |       Just (BoxedColumn (column' :: V.Vector c)) -> let
229 |           column = V.foldl' (\m v -> MS.insertWith (+) v (1 :: Int) m) M.empty column'
230 |         in case (typeRep @a) `testEquality` (typeRep @c) of
231 |               Nothing -> throw $ TypeMismatchException (typeRep @a) (typeRep @c) columnName "valueCounts"
232 |               Just Refl -> M.toAscList column
233 |       Just (OptionalColumn (column' :: V.Vector c)) -> let
234 |           column = V.foldl' (\m v -> MS.insertWith (+) v (1 :: Int) m) M.empty column'
235 |         in case (typeRep @a) `testEquality` (typeRep @c) of
236 |               Nothing -> throw $ TypeMismatchException (typeRep @a) (typeRep @c) columnName "valueCounts"
237 |               Just Refl -> M.toAscList column
238 |       Just (UnboxedColumn (column' :: VU.Vector c)) -> let -- fold the unboxed vector directly, no boxed copy
239 |           column = VG.foldl' (\m v -> MS.insertWith (+) v (1 :: Int) m) M.empty column'
240 |         in case (typeRep @a) `testEquality` (typeRep @c) of
241 |           Nothing -> throw $ TypeMismatchException (typeRep @a) (typeRep @c) columnName "valueCounts"
242 |           Just Refl -> M.toAscList column
243 |       Just _ -> error $ "Cannot count values of grouped column: " ++ T.unpack columnName
244 | 
245 | fold :: (a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame -- strict left fold of f over xs, starting from acc
246 | fold f xs acc = L.foldl' (\df x -> f x df) acc xs
247 | 


--------------------------------------------------------------------------------
/src/DataFrame/Operations/Sorting.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE OverloadedStrings #-}
 2 | module DataFrame.Operations.Sorting where
 3 | 
 4 | import qualified Data.List as L
 5 | import qualified Data.Text as T
 6 | import qualified Data.Vector as V
 7 | 
 8 | import Control.Exception (throw)
 9 | import DataFrame.Errors (DataFrameException(..))
10 | import DataFrame.Internal.Column
11 | import DataFrame.Internal.DataFrame (DataFrame(..), getColumn)
12 | import DataFrame.Internal.Row
13 | import DataFrame.Operations.Core
14 | 
15 | -- | Sort order taken as a parameter by the 'sortBy' function.
16 | data SortOrder = Ascending | Descending deriving (Eq)
17 | 
18 | -- | O(k log n) Sorts the rows of the dataframe by the values in the given columns.
19 | -- Throws 'ColumnNotFoundException' if any of the names is not a column.
20 | --
21 | -- > sortBy Ascending ["Age"] df
22 | sortBy ::
23 |   SortOrder ->
24 |   [T.Text] ->
25 |   DataFrame ->
26 |   DataFrame
27 | sortBy order names df
28 |   | any (`notElem` available) names = throw $ ColumnNotFoundException (T.pack $ show $ names L.\\ available) "sortBy" available
29 |   | otherwise = df {columns = V.map (fmap (atIndicesStable indexes)) (columns df)}
30 |   where
31 |     available = columnNames df
32 |     -- TODO: Share the SortOrder definition with the internal module so this Bool mapping is not needed.
33 |     indexes = sortedIndexes' (order == Ascending) (toRowVector names df)
34 | 


--------------------------------------------------------------------------------
/src/DataFrame/Operations/Statistics.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE RankNTypes #-}
  2 | {-# LANGUAGE ScopedTypeVariables #-}
  3 | {-# LANGUAGE TypeApplications #-}
  4 | {-# LANGUAGE ExplicitNamespaces #-}
  5 | {-# LANGUAGE GADTs #-}
  6 | {-# LANGUAGE OverloadedStrings #-}
  7 | {-# LANGUAGE StrictData #-}
  8 | module DataFrame.Operations.Statistics where
  9 | 
 10 | import qualified Data.List as L
 11 | import qualified Data.Text as T
 12 | import qualified Data.Vector.Generic as VG
 13 | import qualified Data.Vector as V
 14 | import qualified Data.Vector.Unboxed as VU
 15 | import qualified Statistics.Quantile as SS
 16 | import qualified Statistics.Sample as SS
 17 | 
 18 | import Prelude as P
 19 | 
 20 | import Control.Exception (throw)
 21 | import DataFrame.Errors (DataFrameException(..))
 22 | import DataFrame.Internal.Column
 23 | import DataFrame.Internal.DataFrame (DataFrame(..), getColumn, empty)
 24 | import DataFrame.Internal.Types (Columnable, transform)
 25 | import DataFrame.Operations.Core
 26 | import Data.Foldable (asum)
 27 | import Data.Maybe (isJust, fromMaybe)
 28 | import Data.Function ((&))
 29 | import Data.Type.Equality (type (:~:)(Refl), TestEquality (testEquality))
 30 | import Type.Reflection (typeRep)
 31 | 
 32 | 
 33 | -- | Builds a frequency table for the named column: one output column per
 34 | -- distinct value, holding its count and its share of all values as a
 35 | -- whole-number percentage (integer division), labelled by a "Statistic" column.
 36 | --
 37 | -- Throws 'ColumnNotFoundException' when the column does not exist and calls
 38 | -- 'error' for column kinds 'valueCounts' cannot handle (e.g. grouped columns),
 39 | -- instead of failing with an incomplete-pattern exception.
 40 | --
 41 | -- > frequencies "Age" df
 42 | frequencies :: T.Text -> DataFrame -> DataFrame
 43 | frequencies name df = case getColumn name df of
 44 |   -- Only the constructor knows the element type that 'valueCounts' needs.
 45 |   Just (BoxedColumn (_ :: V.Vector a)) -> fromCounts (valueCounts @a name df)
 46 |   Just (OptionalColumn (_ :: V.Vector a)) -> fromCounts (valueCounts @a name df)
 47 |   Just (UnboxedColumn (_ :: VU.Vector a)) -> fromCounts (valueCounts @a name df)
 48 |   Just _ -> error $ "Cannot compute frequencies of column: " ++ T.unpack name
 49 |   Nothing -> throw $ ColumnNotFoundException name "frequencies" (columnNames df)
 50 |   where
 51 |     -- Shared by every column kind: folds (value, count) pairs into the table.
 52 |     fromCounts :: forall b . (Columnable b) => [(b, Int)] -> DataFrame
 53 |     fromCounts counts = L.foldl' addValue initDf counts
 54 |       where
 55 |         total = P.sum $ map snd counts
 56 |         addValue acc (value, k) = insertColumn (vText value) (V.fromList [k, k * 100 `div` total]) acc
 57 | 
 58 |     initDf = empty & insertColumn "Statistic" (V.fromList ["Count" :: T.Text,  "Percentage (%)"])
 59 | 
 60 |     -- Text and String values are used verbatim as column names; any other
 61 |     -- type is rendered with 'show'.
 62 |     vText :: forall b . (Columnable b) => b -> T.Text
 63 |     vText c' = case testEquality (typeRep @b) (typeRep @T.Text) of
 64 |       Just Refl -> c'
 65 |       Nothing -> case testEquality (typeRep @b) (typeRep @String) of
 66 |         Just Refl -> T.pack c'
 67 |         Nothing -> (T.pack . show) c'
 68 | 
 69 | mean :: T.Text -> DataFrame -> Maybe Double -- 'SS.mean' of the named column
 70 | mean name df = applyStatistic SS.mean name df
 71 | 
 72 | median :: T.Text -> DataFrame -> Maybe Double -- 'SS.median' with the 'SS.medianUnbiased' parameters
 73 | median name df = applyStatistic (SS.median SS.medianUnbiased) name df
 74 | 
 75 | standardDeviation :: T.Text -> DataFrame -> Maybe Double -- 'SS.fastStdDev' of the named column
 76 | standardDeviation name df = applyStatistic SS.fastStdDev name df
 77 | 
 78 | skewness :: T.Text -> DataFrame -> Maybe Double -- 'SS.skewness' of the named column
 79 | skewness name df = applyStatistic SS.skewness name df
 80 | 
 81 | variance :: T.Text -> DataFrame -> Maybe Double -- 'SS.variance' of the named column
 82 | variance name df = applyStatistic SS.variance name df
 83 | 
 84 | interQuartileRange :: T.Text -> DataFrame -> Maybe Double -- 'SS.midspread' over 4 quantiles
 85 | interQuartileRange name df = applyStatistic (SS.midspread SS.medianUnbiased 4) name df
 86 | 
 87 | -- | 'SS.correlation' of two columns; Nothing unless both can be read by '_getColumnAsDouble'.
 88 | correlation :: T.Text -> T.Text -> DataFrame -> Maybe Double
 89 | correlation first second df = pairwise <$> _getColumnAsDouble first df <*> _getColumnAsDouble second df
 90 |   where
 91 |     pairwise xs ys = SS.correlation (VG.zip xs ys)
 92 | 
 93 | _getColumnAsDouble :: T.Text -> DataFrame -> Maybe (VU.Vector Double) -- Nothing unless the column is unboxed Double or Int
 94 | _getColumnAsDouble name df = case getColumn name df of
 95 |   Just (UnboxedColumn (f :: VU.Vector a)) -> case testEquality (typeRep @a) (typeRep @Double) of
 96 |     Just Refl -> Just f -- already Double: returned as-is
 97 |     Nothing -> case testEquality (typeRep @a) (typeRep @Int) of
 98 |       Just Refl -> Just $ VU.map fromIntegral f -- Int: converted element by element
 99 |       Nothing -> Nothing
100 |   _ -> Nothing -- missing column or any column that is not unboxed
101 | 
102 | sum :: T.Text -> DataFrame -> Maybe Double -- sum of an unboxed Int or Double column; Nothing for anything else
103 | sum name df = case getColumn name df of
104 |   Just ((UnboxedColumn (column :: VU.Vector a'))) -> case testEquality (typeRep @a') (typeRep @Int) of
105 |     Just Refl -> Just $ VG.sum (VU.map fromIntegral column)
106 |     Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
107 |       Just Refl -> Just $ VG.sum column
108 |       Nothing -> Nothing
109 |   _ -> Nothing -- missing column or a column that is not unboxed (these used to hit an incomplete pattern)
110 | 
111 | applyStatistic :: (VU.Vector Double -> Double) -> T.Text -> DataFrame -> Maybe Double -- applies f to the named column; Nothing if the column is missing
112 | applyStatistic f name df = do
113 |       column <- getColumn name df
114 |       if columnTypeString column == "Double" -- Double columns are reduced directly
115 |       then safeReduceColumn f column
116 |       else do
117 |         matching <- asum [transform (fromIntegral :: Int -> Double) column, -- first conversion that succeeds wins
118 |                           transform (fromIntegral :: Integer -> Double) column,
119 |                           transform (realToFrac :: Float -> Double) column,
120 |                           Just column ] -- fallback: the unconverted column; presumably rejected by safeReduceColumn - TODO confirm
121 |         safeReduceColumn f matching
122 | 
123 | applyStatistics :: (VU.Vector Double -> VU.Vector Double) -> T.Text -> DataFrame -> Maybe (VU.Vector Double) -- vector-valued statistic of an unboxed Int, Double or Float column
124 | applyStatistics f name df = case getColumn name df of
125 |   Just ((UnboxedColumn (column :: VU.Vector a'))) -> case testEquality (typeRep @a') (typeRep @Int) of
126 |     Just Refl -> Just $! f (VU.map fromIntegral column) -- Int: converted to Double first
127 |     Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
128 |       Just Refl -> Just $! f column
129 |       Nothing -> case testEquality (typeRep @a') (typeRep @Float) of
130 |         Just Refl -> Just $! f (VG.map realToFrac column) -- Float: widened to Double first
131 |         Nothing -> Nothing
132 |   _ -> Nothing -- missing column or any column that is not unboxed
133 | 
134 | -- | Builds a table of summary statistics with one column for every input column
135 | -- whose statistics can all be computed; other columns are left out.
136 | summarize :: DataFrame -> DataFrame
137 | summarize df = fold columnStats (columnNames df) (fromList [("Statistic", toColumn ["Mean" :: T.Text, "Minimum", "25%" ,"Median", "75%", "Max", "StdDev", "IQR", "Skewness"])])
138 |   where
139 |     -- 'sequence' yields Nothing as soon as one statistic is missing, so the
140 |     -- statistics are computed once instead of once to check and once to insert.
141 |     columnStats name d = case sequence (stats name) of
142 |       Just values -> insertUnboxedColumn name (VU.fromList (map (roundTo 2) values)) d
143 |       Nothing -> d
144 |     stats name = let
145 |         quantiles = applyStatistics (SS.quantilesVec SS.medianUnbiased (VU.fromList [0,1,2,3,4]) 4) name df
146 |         quantile i = flip (VG.!) i <$> quantiles
147 |         -- The order below must match the labels in the "Statistic" column above.
148 |       in [mean name df,
149 |           quantile 0, quantile 1, quantile 2, quantile 3, quantile 4,
150 |           standardDeviation name df,
151 |           (-) <$> quantile 3 <*> quantile 1,
152 |           skewness name df]
153 |     -- Rounds x to n decimal places.
154 |     roundTo :: Int -> Double -> Double
155 |     roundTo n x = fromInteger (round $ x * (10^n)) / (10.0^^n)
156 | 


--------------------------------------------------------------------------------
/src/DataFrame/Operations/Subset.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE BangPatterns #-}
  2 | {-# LANGUAGE OverloadedStrings #-}
  3 | {-# LANGUAGE RankNTypes #-}
  4 | {-# LANGUAGE ScopedTypeVariables #-}
  5 | {-# LANGUAGE TypeApplications #-}
  6 | {-# LANGUAGE GADTs #-}
  7 | module DataFrame.Operations.Subset where
  8 | 
  9 | import qualified Data.List as L
 10 | import qualified Data.Map as M
 11 | import qualified Data.Set as S
 12 | import qualified Data.Text as T
 13 | import qualified Data.Vector as V
 14 | import qualified Data.Vector.Unboxed as VU
 15 | import qualified Data.Vector.Generic as VG
 16 | import qualified Prelude
 17 | 
 18 | import Control.Exception (throw)
 19 | import DataFrame.Errors (DataFrameException(..))
 20 | import DataFrame.Internal.Column
 21 | import DataFrame.Internal.DataFrame (DataFrame(..), getColumn, empty)
 22 | import DataFrame.Internal.Function
 23 | import DataFrame.Internal.Row (mkRowFromArgs)
 24 | import DataFrame.Internal.Types (Columnable, RowValue, toRowValue)
 25 | import DataFrame.Operations.Core
 26 | import DataFrame.Operations.Transformations (apply)
 27 | import Data.Function ((&))
 28 | import Data.Maybe (isJust, fromJust, fromMaybe)
 29 | import Prelude hiding (filter, take)
 30 | import Type.Reflection
 31 | 
 32 | -- | O(k * n) Take the first n rows of a DataFrame; n is clamped to [0, number of rows].
 33 | take :: Int -> DataFrame -> DataFrame
 34 | take n d = d {columns = V.map (fmap (takeColumn kept)) (columns d), dataframeDimensions = (kept, width)}
 35 |   where
 36 |     (height, width) = dataframeDimensions d
 37 |     kept = clip n 0 height
 38 | 
 39 | -- | Applies 'takeLastColumn' to every column; n is clamped to [0, number of rows].
 40 | takeLast :: Int -> DataFrame -> DataFrame
 41 | takeLast n d = d {columns = V.map (fmap (takeLastColumn kept)) (columns d), dataframeDimensions = (kept, width)}
 42 |   where (height, width) = dataframeDimensions d
 43 |         kept = clip n 0 height
 44 | 
 45 | drop :: Int -> DataFrame -> DataFrame -- slices away the first n rows; n is clamped to [0, number of rows]
 46 | drop n d = d {columns = V.map (fmap (sliceColumn skipped remaining)) (columns d), dataframeDimensions = (remaining, width)}
 47 |   where (height, width) = dataframeDimensions d
 48 |         skipped = clip n 0 height
 49 |         remaining = max (height - skipped) 0
 50 | 
 51 | -- | Keeps all but the last n rows by slicing every column from row 0.
 52 | dropLast :: Int -> DataFrame -> DataFrame
 53 | dropLast n d = d {columns = V.map (fmap (sliceColumn 0 kept)) (columns d), dataframeDimensions = (kept, width)}
 54 |   where (height, width) = dataframeDimensions d
 55 |         kept = clip (height - n) 0 height
 56 | 
 57 | -- | O(k * n) Take the rows in the half-open range [start, end); both bounds are clamped to the DataFrame so the slice never runs past the last row.
 58 | range :: (Int, Int) -> DataFrame -> DataFrame
 59 | range (start, end) d = d {columns = V.map (sliceColumn start' n' <$>) (columns d), dataframeDimensions = (n', c)}
 60 |   where (r, c) = dataframeDimensions d
 61 |         start' = clip start 0 r
 62 |         n' = clip end start' r - start' -- length measured from the clamped start, so start' + n' <= r
 63 | 
 64 | clip :: Int -> Int -> Int -> Int
 65 | clip n left right = min right $ max n left
 66 | 
 67 | -- | O(n * k) Keep only the rows whose value in the named column satisfies the condition.
 68 | --
 69 | -- > filter "x" even df
 70 | filter ::
 71 |   forall a.
 72 |   (Columnable a) =>
 73 |   -- | Column to filter by
 74 |   T.Text ->
 75 |   -- | Filter condition
 76 |   (a -> Bool) ->
 77 |   -- | Dataframe to filter
 78 |   DataFrame ->
 79 |   DataFrame
 80 | filter filterColumnName condition df = case getColumn filterColumnName df of
 81 |   Nothing -> throw $ ColumnNotFoundException filterColumnName "filter" (map fst $ M.toList $ columnIndices df)
 82 |   Just column -> case ifoldlColumn keepMatching S.empty column of
 83 |     Nothing -> throw $ TypeMismatchException' (typeRep @a) (columnTypeString column) filterColumnName "filter"
 84 |     Just kept -> df {columns = V.map (fmap (atIndices kept)) (columns df), dataframeDimensions = (S.size kept, snd (dataframeDimensions df))}
 85 |   where
 86 |     -- Collect the positions of the rows that pass the condition.
 87 |     keepMatching acc i v = if condition v then S.insert i acc else acc
 88 | 
 89 | -- | O(n * k) a version of 'filter' that takes the predicate before the column name.
 90 | --
 91 | -- > filterBy even "x" df
 92 | filterBy :: (Columnable a) => (a -> Bool) -> T.Text -> DataFrame -> DataFrame
 93 | filterBy condition filterColumnName = filter filterColumnName condition
 94 | 
 95 | -- | Filters the dataframe with a row predicate. The arguments of the function
 96 | --   must appear in the same order as the column names in the list.
 97 | --
 98 | -- > filterWhere (["x", "y"], func (\x y -> x + y > 5)) df
 99 | filterWhere :: ([T.Text], Function) -> DataFrame -> DataFrame
100 | filterWhere (args, f) df = df {columns = V.map (fmap (atIndices kept)) (columns df), dataframeDimensions = (S.size kept, snd (dataframeDimensions df))}
101 |   where
102 |     rows = V.generate (fst (dimensions df)) (mkRowFromArgs args df)
103 |     -- Positions of the rows for which the predicate evaluates to True.
104 |     kept = VG.ifoldl' (\acc i row -> if funcApply @Bool row f then S.insert i acc else acc) S.empty rows
105 | 
106 | 
107 | -- | Removes all rows with `Nothing` in a given column and unwraps the remaining values; non-optional columns are left as is.
108 | --
109 | -- > filterJust "x" df
110 | filterJust :: T.Text -> DataFrame -> DataFrame
111 | filterJust name df = case getColumn name df of
112 |   Nothing -> throw $ ColumnNotFoundException name "filterJust" (map fst $ M.toList $ columnIndices df)
113 |   Just (OptionalColumn (_ :: V.Vector (Maybe a))) -> filter @(Maybe a) name isJust df & apply @(Maybe a) fromJust name
114 |   Just _ -> df
115 | 
116 | -- | O(n * k) removes all rows with `Nothing` in any column by applying 'filterJust' to every column name.
117 | --
118 | -- > filterAllJust df
119 | filterAllJust :: DataFrame -> DataFrame
120 | filterAllJust df = foldr filterJust df (columnNames df)
121 | 
122 | -- | O(k) cuts the dataframe down to at most @rows@ rows and the first
123 | --   @cols@ columns, given as the pair @(rows, cols)@.
124 | --
125 | -- > cube (10, 5) df
126 | cube :: (Int, Int) -> DataFrame -> DataFrame
127 | cube (rows, cols) df = take rows (selectIntRange (0, cols - 1) df)
128 | 
129 | -- | O(n) Selects a number of columns in a given dataframe; throws if any requested name is missing.
130 | --
131 | -- > select ["name", "age"] df
132 | select ::
133 |   [T.Text] ->
134 |   DataFrame ->
135 |   DataFrame
136 | select cs df
137 |   | L.null cs = empty
138 |   | any (`notElem` columnNames df) cs = throw $ ColumnNotFoundException (T.pack $ show $ cs L.\\ columnNames df) "select" (columnNames df)
139 |   | otherwise = selected
140 |   where
141 |     cIndexAssoc = M.toList $ columnIndices df
142 |     remaining = L.filter (\(name, _) -> name `elem` cs) cIndexAssoc
143 |     removed = cIndexAssoc L.\\ remaining
144 |     indexes = map snd remaining
145 |     -- Built once: folding over @cs@ rebuilt this same record for every requested name.
146 |     selected =
147 |       empty
148 |         { columns = V.imap (\i v -> if i `notElem` indexes then Nothing else v) (columns df),
149 |           columnIndices = M.fromList remaining,
150 |           freeIndices = map snd removed ++ freeIndices df,
151 |           dataframeDimensions = (fst (dataframeDimensions df), L.length remaining)
152 |         }
153 | 
154 | -- | O(n) select the columns whose positions in 'columnNames' lie in the inclusive range @(from, to)@.
155 | selectIntRange :: (Int, Int) -> DataFrame -> DataFrame
156 | selectIntRange (from, to) df = select wanted df where wanted = Prelude.take (to - from + 1) (Prelude.drop from (columnNames df))
157 | 
158 | -- | O(n) select the columns from the one named @from@ through the one named @to@ (inclusive), in 'columnNames' order.
159 | selectRange :: (T.Text, T.Text) -> DataFrame -> DataFrame
160 | selectRange (from, to) df = select (trimAfterTo (dropWhile (from /=) (columnNames df))) df where trimAfterTo = reverse . Prelude.dropWhile (to /=) . reverse
161 | 
162 | -- | O(n) select the columns whose name satisfies the given predicate.
163 | selectBy :: (T.Text -> Bool) -> DataFrame -> DataFrame
164 | selectBy f df = select [name | name <- columnNames df, f name] df
165 | 
166 | -- | O(n) inverse of select: keeps the columns that are not named in the list.
167 | --
168 | -- > exclude ["Name"] df
169 | exclude ::
170 |   [T.Text] ->
171 |   DataFrame ->
172 |   DataFrame
173 | exclude cs df = select keysToKeep df
174 |   where
175 |     keysToKeep = columnNames df L.\\ cs
176 | 


--------------------------------------------------------------------------------
/src/DataFrame/Operations/Transformations.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE OverloadedStrings #-}
  2 | {-# LANGUAGE RankNTypes #-}
  3 | {-# LANGUAGE ScopedTypeVariables #-}
  4 | {-# LANGUAGE TypeApplications #-}
  5 | module DataFrame.Operations.Transformations where
  6 | 
  7 | import qualified Data.List as L
  8 | import qualified Data.Text as T
  9 | import qualified Data.Map as M
 10 | import qualified Data.Vector.Generic as VG
 11 | import qualified Data.Vector as V
 12 | import qualified Data.Vector.Unboxed as VU
 13 | 
 14 | import Control.Exception (throw)
 15 | import DataFrame.Errors (DataFrameException(..))
 16 | import DataFrame.Internal.Column (Column(..), columnTypeString, itransform, ifoldrColumn)
 17 | import DataFrame.Internal.DataFrame (DataFrame(..), getColumn)
 18 | import DataFrame.Internal.Function (Function(..), funcApply)
 19 | import DataFrame.Internal.Row (mkRowFromArgs)
 20 | import DataFrame.Internal.Types (Columnable, RowValue, toRowValue, transform)
 21 | import DataFrame.Operations.Core
 22 | import Data.Maybe
 23 | import Type.Reflection (typeRep, typeOf)
 24 | 
 25 | -- | O(k) Map a function over the named column and store the result under the same name.
 26 | apply ::
 27 |   forall b c.
 28 |   (Columnable b, Columnable c) =>
 29 |   -- | function to apply
 30 |   (b -> c) ->
 31 |   -- | Column name
 32 |   T.Text ->
 33 |   -- | DataFrame to apply operation to
 34 |   DataFrame ->
 35 |   DataFrame
 36 | apply f columnName d = maybe missingColumn transformed (getColumn columnName d)
 37 |   where
 38 |     missingColumn = throw $ ColumnNotFoundException columnName "apply" (map fst $ M.toList $ columnIndices d)
 39 |     mismatch column = throw $ TypeMismatchException' (typeRep @b) (columnTypeString column) columnName "apply"
 40 |     transformed column = maybe (mismatch column) (\column' -> insertColumn' columnName (Just column') d) (transform f column)
 41 | 
 42 | -- | O(k) Apply a function to a combination of columns in a dataframe and
 43 | -- store the result in the column called @name@.
 44 | deriveFrom :: ([T.Text], Function) -> T.Text -> DataFrame -> DataFrame
 45 | deriveFrom (args, f) name df = case f of
 46 |   (F4 (_ :: a -> b -> c -> d -> e)) ->
 47 |     insertColumn name (VG.map (\row -> funcApply @e row f) rows) df
 48 |   (F3 (_ :: a -> b -> c -> d)) ->
 49 |     insertColumn name (VG.map (\row -> funcApply @d row f) rows) df
 50 |   (F2 (_ :: a -> b -> c)) ->
 51 |     insertColumn name (VG.map (\row -> funcApply @c row f) rows) df
 52 |   (F1 (_ :: a -> b)) ->
 53 |     insertColumn name (VG.map (\row -> funcApply @b row f) rows) df
 54 |   where
 55 |     -- One entry per dataframe row, built by 'mkRowFromArgs' from @args@; each
 56 |     -- branch applies @f@ at the result type exposed by its constructor.
 57 |     rows = V.generate (fst (dimensions df)) (mkRowFromArgs args df)
 58 | 
 59 | -- | O(k) Apply a function to a given column in a dataframe and
 60 | -- add the result into alias column.
 61 | -- Throws if the column is missing or 'transform' rejects the function for it.
 62 | derive ::
 63 |   forall b c.
 64 |   (Columnable b, Columnable c) =>
 65 |   -- | New name
 66 |   T.Text ->
 67 |   -- | function to apply
 68 |   (b -> c) ->
 69 |   -- | Derivative column name
 70 |   T.Text ->
 71 |   -- | DataFrame to apply operation to
 72 |   DataFrame ->
 73 |   DataFrame
 74 | derive alias f columnName d = case getColumn columnName d of
 75 |   Nothing -> throw $ ColumnNotFoundException columnName "derive" (map fst $ M.toList $ columnIndices d)
 76 |   Just column -> case transform f column of
 77 |     Nothing  -> throw $ TypeMismatchException' (typeRep @b) (columnTypeString column) columnName "derive" -- report the column's element type, not the Column wrapper
 78 |     Just res -> insertColumn' alias (Just res) d
 79 | 
 80 | -- | O(k * n) Run 'apply' with the same function over each listed column, left to right.
 81 | applyMany ::
 82 |   (Columnable b, Columnable c) =>
 83 |   (b -> c) ->
 84 |   [T.Text] ->
 85 |   DataFrame ->
 86 |   DataFrame
 87 | applyMany f names df = L.foldl' (\acc name -> apply f name acc) df names
 88 | 
 89 | -- | O(k) 'apply' specialised to functions that take an 'Int'.
 90 | applyInt ::
 91 |   (Columnable b) =>
 92 |   -- | function to apply
 93 |   (Int -> b) ->
 94 |   -- | Column name
 95 |   T.Text ->
 96 |   -- | DataFrame to apply operation to
 97 |   DataFrame ->
 98 |   DataFrame
 99 | applyInt f columnName = apply f columnName
100 | 
101 | -- | O(k) 'apply' specialised to functions that take a 'Double'.
102 | applyDouble ::
103 |   (Columnable b) =>
104 |   -- | function to apply
105 |   (Double -> b) ->
106 |   -- | Column name
107 |   T.Text ->
108 |   -- | DataFrame to apply operation to
109 |   DataFrame ->
110 |   DataFrame
111 | applyDouble f columnName = apply f columnName
112 | 
113 | -- | O(k * n) Apply a function to a column only in the rows where another
114 | -- column's value matches the given criterion.
115 | --
116 | -- > applyWhere (<20) "Age" (const "Gen-Z") "Generation" df
117 | applyWhere ::
118 |   forall a b .
119 |   (Columnable a, Columnable b) =>
120 |   (a -> Bool) -> -- Filter condition
121 |   T.Text -> -- Criterion Column
122 |   (b -> b) -> -- function to apply
123 |   T.Text -> -- Column name
124 |   DataFrame -> -- DataFrame to apply operation to
125 |   DataFrame
126 | applyWhere condition filterColumnName f columnName df = case getColumn filterColumnName df of
127 |   Nothing -> throw $ ColumnNotFoundException filterColumnName "applyWhere" (map fst $ M.toList $ columnIndices df)
128 |   Just column -> case ifoldrColumn (\i val acc -> if condition val then V.cons i acc else acc) V.empty column of
129 |       Nothing -> throw $ TypeMismatchException' (typeRep @a) (columnTypeString column) filterColumnName "applyWhere"
130 |       -- Rewrite the target column once per matching row. Folding over an empty
131 |       -- index vector returns @df@ unchanged, so no separate empty check is needed.
132 |       Just indexes -> L.foldl' (\d i -> applyAtIndex i f columnName d) df indexes
133 | 
134 | -- | O(k) Apply a function to the single value at row @i@ of the named column.
135 | applyAtIndex ::
136 |   forall a.
137 |   (Columnable a) =>
138 |   -- | Index
139 |   Int ->
140 |   -- | function to apply
141 |   (a -> a) ->
142 |   -- | Column name
143 |   T.Text ->
144 |   -- | DataFrame to apply operation to
145 |   DataFrame ->
146 |   DataFrame
147 | applyAtIndex i f columnName df = maybe missingColumn updated (getColumn columnName df)
148 |   where
149 |     missingColumn = throw $ ColumnNotFoundException columnName "applyAtIndex" (map fst $ M.toList $ columnIndices df)
150 |     mismatch column = throw $ TypeMismatchException' (typeRep @a) (columnTypeString column) columnName "applyAtIndex"
151 |     updated column = maybe (mismatch column) (\column' -> insertColumn' columnName (Just column') df) (itransform (\index value -> if index == i then f value else value) column)
152 | 
153 | impute ::
154 |   forall b .
155 |   (Columnable b) =>
156 |   T.Text    -> -- Column name
157 |   b         -> -- Value substituted for every Nothing
158 |   DataFrame -> -- DataFrame to apply operation to
159 |   DataFrame
160 | impute columnName value df = case getColumn columnName df of
161 |   Just (OptionalColumn _) -> apply (fromMaybe value) columnName df
162 |   Just _ -> error "Cannot impute to a non-Empty column"
163 |   Nothing -> throw $ ColumnNotFoundException columnName "impute" (map fst $ M.toList $ columnIndices df)
164 | 


--------------------------------------------------------------------------------
/src/DataFrame/Operations/Typing.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE ExplicitNamespaces #-}
 2 | {-# LANGUAGE GADTs #-}
 3 | {-# LANGUAGE OverloadedStrings #-}
 4 | {-# LANGUAGE ScopedTypeVariables #-}
 5 | {-# LANGUAGE TypeApplications #-}
 6 | module DataFrame.Operations.Typing where
 7 | 
 8 | import qualified Data.Set as S
 9 | import qualified Data.Text as T
10 | import qualified Data.Vector as V
11 | import qualified Data.Vector.Unboxed as VU
12 | 
13 | import DataFrame.Internal.Column (Column(..))
14 | import DataFrame.Internal.DataFrame (DataFrame(..))
15 | import DataFrame.Internal.Parsing
16 | import Data.Either
17 | import Data.Maybe
18 | import Data.Time
19 | import Data.Type.Equality (type (:~:)(Refl), TestEquality(..))
20 | import Type.Reflection (typeRep)
21 | 
22 | parseDefaults :: Bool -> DataFrame -> DataFrame -- runs 'parseDefault' on every column slot
23 | parseDefaults safeRead df = df {columns = parseDefault safeRead <$> columns df}
24 | 
25 | parseDefault :: Bool -> Maybe Column -> Maybe Column -- re-types a boxed column, picking the target type from its first value
26 | parseDefault _ Nothing = Nothing
27 | parseDefault safeRead (Just (BoxedColumn (c :: V.Vector a))) | not (V.null c) = let
28 |     parseTimeOpt s = parseTimeM {- Accept leading/trailing whitespace -} True defaultTimeLocale "%Y-%m-%d" (T.unpack s) :: Maybe Day
29 |     unsafeParseTime s = parseTimeOrError {- Accept leading/trailing whitespace -} True defaultTimeLocale "%Y-%m-%d" (T.unpack s) :: Day
30 |   in case (typeRep @a) `testEquality` (typeRep @T.Text) of
31 |         Nothing -> case (typeRep @a) `testEquality` (typeRep @String) of
32 |             Just Refl -> let
33 |                 emptyToNothing v = if isNullish (T.pack v) then Nothing else Just v
34 |                 safeVector = V.map emptyToNothing c
35 |                 hasNulls = V.any isNothing safeVector
36 |               in Just $ if safeRead && hasNulls then BoxedColumn safeVector else BoxedColumn c
37 |             Nothing -> Just $ BoxedColumn c
38 |         Just Refl ->
39 |           let example = T.strip (V.head c) -- safe: the guard above rules out an empty vector
40 |               emptyToNothing v = if isNullish v then Nothing else Just v
41 |            in case readInt example of
42 |                 Just _ ->
43 |                   let safeVector = V.map ((=<<) readInt . emptyToNothing) c
44 |                       hasNulls = V.elem Nothing safeVector
45 |                    in Just $ if safeRead && hasNulls then BoxedColumn safeVector else UnboxedColumn (VU.generate (V.length c) (fromMaybe 0  . (safeVector V.!)))
46 |                 Nothing -> case readDouble example of
47 |                   Just _ ->
48 |                     let safeVector = V.map ((=<<) readDouble . emptyToNothing) c
49 |                         hasNulls = V.elem Nothing safeVector
50 |                      in Just $ if safeRead && hasNulls then BoxedColumn safeVector else UnboxedColumn (VU.generate (V.length c) (fromMaybe 0 . (safeVector V.!)))
51 |                   Nothing -> case parseTimeOpt example of
52 |                     Just _ -> let
53 |                         -- failed parse should be Either, nullish should be Maybe
54 |                         emptyToNothing' v = if isNullish v then Left v else Right v
55 |                         parseTimeEither v = case parseTimeOpt v of
56 |                           Just v' -> Right v'
57 |                           Nothing -> Left v
58 |                         safeVector = V.map ((=<<) parseTimeEither . emptyToNothing') c
59 |                         toMaybe (Left _) = Nothing
60 |                         toMaybe (Right value) = Just value
61 |                         lefts = V.filter isLeft safeVector
62 |                         onlyNulls = (not (V.null lefts) && V.all (isNullish . fromLeft "non-null") lefts)
63 |                       in Just $ if safeRead
64 |                         then if onlyNulls
65 |                              then BoxedColumn (V.map toMaybe safeVector)
66 |                              else if V.any isLeft safeVector
67 |                               then BoxedColumn safeVector
68 |                               else BoxedColumn (V.map unsafeParseTime c)
69 |                         else BoxedColumn (V.map unsafeParseTime c)
70 |                     Nothing -> let
71 |                         safeVector = V.map emptyToNothing c
72 |                         hasNulls = V.any isNullish c
73 |                       in Just $ if safeRead && hasNulls then BoxedColumn safeVector else BoxedColumn c
74 | parseDefault _ column = column -- non-boxed columns and empty boxed columns pass through unchanged
75 | 


--------------------------------------------------------------------------------
/static/example.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mchav/dataframe/3f09cd47ff4d235360a0832635b140270bc20b96/static/example.gif


--------------------------------------------------------------------------------
/test_coverage.md:
--------------------------------------------------------------------------------
  1 | # Test Coverage
  2 | 
  3 | ## Properties
  4 | * Empty dataframe
  5 |   - Has dimensions (0, 0)
  6 |   - Has 8 empty vectors
  7 |   - No column indices
  8 | 
  9 | ## Operations
 10 | 
 11 | * addColumn
 12 |   - Adding a boxed vector to an empty dataframe creates a new column boxed containing the vector elements. DONE
 13 |   - Adding a boxed vector with a boxed type (Int/Double) to an empty dataframe creates a new column unboxed containing the vector elements. DONE
 14 |   - Adding columns > initial vector size gracefully adds a column that we can retrieve. DONE
 15 |   - Adding columns > initial vector size gracefully adds a column and updates dimensions. DONE
 16 |   - Adding a column with the same name as an existing column overwrites the contents. DONE
 17 |   - Adding a column with more values than the current DF dimensions throws an exception. DONE
 18 |   - Adding a column with less values than the current DF dimensions adds column with optionals. DONE
 19 | 
 20 | * addColumnWithDefault
 21 |   - Adding a column with less values than the current DF dimensions adds column with optionals. DONE
 22 |   - Adding a column with as many values is a no-op. DONE
 23 | 
 24 | * apply
 25 |   - Applying to an existing column maps function to all values. DONE
 26 |   - Applying to non-existent column throws column not found exception. DONE
 27 |   - Applying function of wrong type throws exception. DONE
 28 | 
 29 | * applyMany
 30 |   - Applying many does same transformation to all columns. DONE
 31 |   - Applying many doesn't change unrelated fields. DONE
 32 |   - Applying many fails if any of the columns are not found. DONE
 33 |   - Applying many throws exception when the function type doesn't equal. DONE
 34 | 
 35 | * applyWhere
 36 |   - Applies function when target column criteria is met. DONE
 37 |   - When criterion column doesn't exist throw an error. DONE
 38 |   - When target column doesn't exist throw an error. DONE
 39 |   - When the type of the criterion column doesn't exist throw an error. DONE
 40 |   - When the type of the target column doesn't exist throw an error. DONE
 41 |   - When the criterion function has the wrong type throw an error. DONE
 42 |   - When the target function has the wrong type throw an error. DONE
 43 | 
 44 | * derive
 45 |   - Applies function to given column and adds it to alias. DONE
 46 |   - When column doesn't exist throw an error. DONE
 47 | 
 48 | * applyAtIndex
 49 |   - Applies function to row at index.
 50 |   - Does nothing if index is out of range.
 51 |   - Throws an error if the column doesn't exist.
 52 | 
 53 | * take
 54 |   - Takes correct number of elements. DONE
 55 |   - If # elements is less than n then don't change the column. DONE
 56 |   - If arg is negative then don't change the dimensions of the frame. DONE
 57 | 
 58 | * filter
 59 |   - Filters column as expected. DONE
 60 |   - Filter on non existent values returns dataframe with (0,0) dimensions. DONE
 61 |   - Filter on non-existent type throws exception. DONE
 62 | 
 63 | * valueCounts
 64 |   - Counts values as expected.
 65 |   - Throws error when column doesn't exist.
 66 | 
 67 | * select
 68 |   - Selects a subset of the columns on select
 69 |   - Check that dimensions update after select
 70 |   - Add new column to result of selected column
 71 |   - Updates free indices on select
 72 | 
 73 | * exclude
 74 |   - Drops a subset of the columns on exclude
 75 |   - Check that dimensions update after exclude
 76 |   - Add new column to result of exclude column
 77 |   - Updates free indices on exclude
 78 | 
 79 | * groupBy
 80 |   - Groups by a column if it exists and other columns are vectors of vectors DONE
 81 |   - Groups by a number of columns if they exist and other columns are vectors of vectors DONE
 82 |   - If any column doesn't exist throw an error. DONE
 83 | 
 84 | * reduceBy
 85 |   - Reduces by a vector column
 86 |   - Throws an exception when the column doesn't exist.
 87 |   - Throws an error when the wrong type is passed into the function
 88 |   - Throws an error when the vector is of the wrong type.
 89 | 
 90 | * parseDefault
 91 |   - unsigned integer defaults to int
 92 |   - decimal point number defaults to double.
 93 |   - Fallback to text.
 94 | 
 95 | * sortBy
 96 |   - Sorts by a given column in ascending order. DONE
 97 |   - Sorts by a given column in descending order. DONE
 98 |   - Sorts by multiple columns in ascending order.
 99 |   - Sorts by multiple columns in descending order.
100 |   - Throws an error if it doesn't exist. DONE
101 | 
102 | * columnInfo
103 |   - Return correct types and lengths.
104 | 
105 | ## Plotting
106 | <TODO>
107 | 
108 | ## CSV I/O
109 | <TODO>
110 | 


--------------------------------------------------------------------------------
/tests/Assertions.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE ScopedTypeVariables #-}
 2 | module Assertions where
 3 | 
 4 | import qualified Data.List as L
 5 | 
 6 | import Control.Exception
 7 | import Test.HUnit
 8 | 
 9 | -- Adapted from: https://github.com/BartMassey/chunk/blob/1ee4bd6545e0db6b8b5f4935d97e7606708eacc9/hunit.hs#L29
10 | -- Passes only when @action@ throws an exception whose 'show' output contains
11 | -- @expected@; any failure message is prefixed with @preface@.
12 | assertExpectException :: String -> String ->
13 |                          IO a -> Assertion
14 | assertExpectException preface expected action = do
15 |   outcome <- try action
16 |   let verdict = case outcome of
17 |         Left (e :: SomeException) -> checkForExpectedException e
18 |         Right _ -> Just "no exception thrown"
19 |   maybe (return ()) (\msg -> assertFailure $ preface ++ ": " ++ msg) verdict
20 |   where
21 |     checkForExpectedException :: SomeException -> Maybe String
22 |     checkForExpectedException e
23 |         | expected `L.isInfixOf` show e = Nothing
24 |         | otherwise =
25 |             Just $ "wrong exception detail, expected " ++
26 |                    expected ++ ", got: " ++ show e


--------------------------------------------------------------------------------
/tests/Main.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE OverloadedStrings #-}
 2 | {-# LANGUAGE ScopedTypeVariables #-}
 3 | module Main where
 4 | 
 5 | import qualified DataFrame as D
 6 | import qualified DataFrame as DI
 7 | import qualified Data.List as L
 8 | import qualified Data.Text as T
 9 | import qualified Data.Vector as V
10 | import qualified Data.Vector.Unboxed as VU
11 | import qualified System.Exit as Exit
12 | 
13 | import Control.Exception
14 | import Data.Time
15 | import Test.HUnit
16 | 
17 | import Assertions
18 | 
19 | import qualified Operations.Apply
20 | import qualified Operations.Derive
21 | import qualified Operations.Filter
22 | import qualified Operations.GroupBy
23 | import qualified Operations.InsertColumn
24 | import qualified Operations.Sort
25 | import qualified Operations.Take
26 | 
27 | testData :: D.DataFrame
28 | testData = D.fromList [ ("test1", DI.toColumn ([1..26] :: [Int]))
29 |                       , ("test2", DI.toColumn ['a'..'z'])
30 |                       ]
31 | 
32 | -- Dimensions
33 | correctDimensions :: Test
34 | correctDimensions = TestCase (assertEqual "should be (26, 2)" (26, 2) (D.dimensions testData))
35 | 
36 | emptyDataframeDimensions :: Test
37 | emptyDataframeDimensions = TestCase (assertEqual "should be (0, 0)" (0, 0) (D.dimensions D.empty))
38 | 
39 | dimensionsTest :: [Test]
40 | dimensionsTest = [ TestLabel "dimensions_correctDimensions" correctDimensions
41 |                  , TestLabel "dimensions_emptyDataframeDimensions" emptyDataframeDimensions
42 |                  ]
43 | 
44 | -- parsing.
45 | parseDate :: Test
46 | parseDate = let
47 |     expected = Just $ DI.BoxedColumn (V.fromList [fromGregorian 2020 02 14, fromGregorian 2021 02 14, fromGregorian 2022 02 14])
48 |     actual = D.parseDefault True $ Just $ DI.toColumn' (V.fromList ["2020-02-14" :: T.Text, "2021-02-14", "2022-02-14"])
49 |   in TestCase (assertEqual "Correctly parses gregorian date" expected actual)
50 | 
51 | incompleteDataParseEither :: Test
52 | incompleteDataParseEither = let
53 |     expected = Just $ DI.BoxedColumn (V.fromList [Right $ fromGregorian 2020 02 14, Left ("2021-02-" :: T.Text), Right $ fromGregorian 2022 02 14])
54 |     actual = D.parseDefault True $ Just $ DI.toColumn' (V.fromList ["2020-02-14" :: T.Text, "2021-02-", "2022-02-14"])
55 |   in TestCase (assertEqual "Parses Either for gregorian date" expected actual)
56 | 
57 | incompleteDataParseMaybe :: Test
58 | incompleteDataParseMaybe = let
59 |     expected = Just $ DI.BoxedColumn (V.fromList [Just $ fromGregorian 2020 02 14, Nothing, Just $ fromGregorian 2022 02 14])
60 |     actual = D.parseDefault True $ Just $ DI.toColumn' (V.fromList ["2020-02-14" :: T.Text, "", "2022-02-14"])
61 |   in TestCase (assertEqual "Parses Maybe for gregorian date with null/empty" expected actual)
62 | 
63 | parseTests :: [Test]
64 | parseTests = [
65 |              TestLabel "parseDate" parseDate,
66 |              TestLabel "incompleteDataParseMaybe" incompleteDataParseMaybe,
67 |              TestLabel "incompleteDataParseEither" incompleteDataParseEither
68 |            ]
69 | 
70 | tests :: Test
71 | tests = TestList $ dimensionsTest
72 |                 ++ Operations.Apply.tests
73 |                 ++ Operations.Derive.tests
74 |                 ++ Operations.Filter.tests
75 |                 ++ Operations.GroupBy.tests
76 |                 ++ Operations.InsertColumn.tests
77 |                 ++ Operations.Sort.tests
78 |                 ++ Operations.Take.tests
79 |                 ++ parseTests
80 | 
81 | main :: IO () -- runs the whole suite; exits non-zero on any failure or error
82 | main =
83 |     runTestTT tests >>= \result ->
84 |         if failures result > 0 || errors result > 0 then Exit.exitFailure else Exit.exitSuccess
85 | 


--------------------------------------------------------------------------------
/tests/Operations/Apply.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE TypeApplications #-}
  2 | {-# LANGUAGE OverloadedStrings #-}
  3 | {-# LANGUAGE TupleSections #-}
  4 | module Operations.Apply where
  5 | 
  6 | import qualified DataFrame as D
  7 | import qualified DataFrame as DI
  8 | import qualified DataFrame as DE
  9 | import qualified Data.Text as T
 10 | import qualified Data.Vector as V
 11 | import qualified Data.Vector.Unboxed as VU
 12 | 
 13 | import Assertions
 14 | import Test.HUnit
 15 | import Type.Reflection (typeRep)
 16 | 
 17 | values :: [(T.Text, DI.Column)]
 18 | values = [ ("test1", DI.toColumn ([1..26] :: [Int]))
 19 |          , ("test2", DI.toColumn (map show ['a'..'z']))
 20 |          , ("test3", DI.toColumn ([1..26] :: [Int]))
 21 |          , ("test4", DI.toColumn ['a'..'z'])
 22 |          , ("test5", DI.toColumn ([1..26] :: [Int]))
 23 |          , ("test6", DI.toColumn ['a'..'z'])
 24 |          , ("test7", DI.toColumn ([1..26] :: [Int]))
 25 |          , ("test8", DI.toColumn ['a'..'z'])
 26 |          ]
 27 | 
 28 | testData :: D.DataFrame
 29 | testData = D.fromList values
 30 | 
 31 | applyBoxedToUnboxed :: Test
 32 | applyBoxedToUnboxed = TestCase (assertEqual "Boxed apply unboxed when result is unboxed"
 33 |                                 (Just $ DI.UnboxedColumn (VU.fromList (replicate 26 (1 :: Int))))
 34 |                                 (DI.getColumn "test2" $ D.apply @String (const (1::Int)) "test2" testData))
 35 | 
 36 | applyBoxedToBoxed :: Test
 37 | applyBoxedToBoxed = TestCase (assertEqual "Boxed apply remains in boxed vector"
 38 |                                 (Just $ DI.BoxedColumn (V.fromList (replicate 26 (1 :: Integer))))
 39 |                                 (DI.getColumn "test2" $ D.apply @String (const (1::Integer)) "test2" testData))
 40 | 
 41 | applyWrongType :: Test
 42 | applyWrongType = TestCase (assertExpectException "[Error Case]"
 43 |                                 (DE.typeMismatchError (typeRep @Char) (typeRep @[Char]))
 44 |                                 (print $ DI.getColumn "test2" $ D.apply @Char (const (1::Int)) "test2" testData))
 45 | 
 46 | applyUnknownColumn :: Test
 47 | applyUnknownColumn = TestCase (assertExpectException "[Error Case]"
 48 |                                 (DE.columnNotFound "test9" "apply" (D.columnNames testData))
 49 |                                 (print $ D.apply @[Char] (const (1::Int)) "test9" testData))
 50 | 
 51 | applyManyOnlyGivenFields :: Test
 52 | applyManyOnlyGivenFields = TestCase (assertEqual "Applies function to many fields"
 53 |                                 (D.fromList (map (, D.toColumn $ replicate 26 (1 :: Integer)) ["test4", "test6"] ++
 54 |                                             -- All other fields should have their original values.
 55 |                                             filter (\(name, col) -> name /= "test4" && name /= "test6") values))
 56 |                                 (D.applyMany @Char (const (1::Integer))
 57 |                                     ["test4", "test6"] testData))
 58 | 
 59 | applyManyBoxedToBoxed :: Test
 60 | applyManyBoxedToBoxed = TestCase (assertEqual "Applies function to many fields"
 61 |                                 (D.fromList (map (, D.toColumn $ replicate 26 (1 :: Integer)) ["test4", "test6", "test8"]))
 62 |                                 (D.select ["test4", "test6", "test8"] $ D.applyMany @Char (const (1::Integer))
 63 |                                     ["test4", "test6", "test8"] testData))
 64 | 
 65 | applyManyBoxedToUnboxed :: Test
 66 | applyManyBoxedToUnboxed = TestCase (assertEqual "Unboxes fields when necessary"
 67 |                                 (D.fromList (map (, D.toColumn $ replicate 26 (1 :: Int)) ["test4", "test6", "test8"]))
 68 |                                 (D.select ["test4", "test6", "test8"] $ D.applyMany @Char (const (1::Int))
 69 |                                     ["test4", "test6", "test8"] testData))
 70 | 
 71 | applyManyColumnNotFound :: Test
 72 | applyManyColumnNotFound = TestCase (assertExpectException "[Error Case]"
 73 |                                 (DE.columnNotFound "test0" "apply" (D.columnNames testData))
 74 |                                 (print $ D.applyMany @Char (const (1::Integer))
 75 |                                     ["test0", "test6", "test8"] testData))
 76 | 
 77 | applyManyWrongType :: Test
 78 | applyManyWrongType = TestCase (assertExpectException "[Error Case]"
 79 |                                 (DE.typeMismatchError (typeRep @Char) (typeRep @[Char]))
 80 |                                 (print $ DI.getColumn "test2" $ D.applyMany @Char (const (1::Int)) ["test2"] testData))
 81 | 
 82 | -- | Expects an Integer vs Int type mismatch when the condition on "test1" is typed at Integer.
 83 | applyWhereWrongConditionType :: Test
 84 | applyWhereWrongConditionType = TestCase $ assertExpectException "[Error Case]" (DE.typeMismatchError (typeRep @Integer) (typeRep @Int)) $
 85 |     print (D.applyWhere (even @Integer) "test1" ((+ 1) :: Int -> Int) "test5" testData)
 86 | 
 87 | -- | Expects a Float vs Int type mismatch when the function applied to "test5" is typed at Float.
 88 | applyWhereWrongTargetType :: Test
 89 | applyWhereWrongTargetType = TestCase $ assertExpectException "[Error Case]" (DE.typeMismatchError (typeRep @Float) (typeRep @Int)) $
 90 |     print (D.applyWhere (even @Int) "test1" ((+ 1) :: Float -> Float) "test5" testData)
 91 | 
 92 | -- | Expects a column-not-found error (reported by "applyWhere") when the condition column is "test0".
 93 | applyWhereConditionColumnNotFound :: Test
 94 | applyWhereConditionColumnNotFound = TestCase $ assertExpectException "[Error Case]" (DE.columnNotFound "test0" "applyWhere" (D.columnNames testData)) $
 95 |     print (D.applyWhere (even @Int) "test0" ((+ 1) :: Int -> Int) "test5" testData)
 96 | 
 97 | -- | Expects a column-not-found error (reported by "applyAtIndex") when the target column is "test0".
 98 | applyWhereTargetColumnNotFound :: Test
 99 | applyWhereTargetColumnNotFound = TestCase $ assertExpectException "[Error Case]" (DE.columnNotFound "test0" "applyAtIndex" (D.columnNames testData)) $
100 |     print (D.applyWhere (even @Int) "test1" ((+ 1) :: Int -> Int) "test0" testData)
101 | 
102 | -- | Expects "test5" to come back as 1..26 with every even value incremented by one.
103 | applyWhereWAI :: Test
104 | applyWhereWAI = TestCase $ assertEqual "applyWhere works as intended" (Just (DI.UnboxedColumn (VU.fromList [if even n then n + 1 else n | n <- [(1 :: Int) .. 26]]))) $
105 |     D.getColumn "test5" (D.applyWhere (even @Int) "test1" ((+ 1) :: Int -> Int) "test5" testData)
106 | 
107 | -- | Every apply, applyMany and applyWhere case, labelled with its own name.
108 | tests :: [Test]
109 | tests = map (uncurry TestLabel)
110 |     [ ("applyBoxedToUnboxed", applyBoxedToUnboxed)
111 |     , ("applyWrongType", applyWrongType)
112 |     , ("applyUnknownColumn", applyUnknownColumn)
113 |     , ("applyBoxedToBoxed", applyBoxedToBoxed)
114 |     , ("applyManyBoxedToBoxed", applyManyBoxedToBoxed)
115 |     , ("applyManyOnlyGivenFields", applyManyOnlyGivenFields)
116 |     , ("applyManyBoxedToUnboxed", applyManyBoxedToUnboxed)
117 |     , ("applyManyColumnNotFound", applyManyColumnNotFound)
118 |     , ("applyManyWrongType", applyManyWrongType)
119 |     , ("applyWhereWrongConditionType", applyWhereWrongConditionType)
120 |     , ("applyWhereWrongTargetType", applyWhereWrongTargetType)
121 |     , ("applyWhereConditionColumnNotFound", applyWhereConditionColumnNotFound)
122 |     , ("applyWhereTargetColumnNotFound", applyWhereTargetColumnNotFound)
123 |     , ("applyWhereWAI", applyWhereWAI) ]


--------------------------------------------------------------------------------
/tests/Operations/Derive.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE TypeApplications #-}
 2 | {-# LANGUAGE OverloadedStrings #-}
 3 | {-# LANGUAGE ScopedTypeVariables #-}
 4 | module Operations.Derive where
 5 | 
 6 | import qualified DataFrame as D
 7 | import qualified DataFrame as DI
 8 | import qualified DataFrame as DE
 9 | import qualified Data.Text as T
10 | import qualified Data.Vector as V
11 | import qualified Data.Vector.Unboxed as VU
12 | 
13 | import Assertions
14 | import Test.HUnit
15 | import Type.Reflection (typeRep)
16 | 
17 | -- | Fixture columns: Ints 1..26, the shown letters 'a'..'z', and the letters themselves.
18 | values :: [(T.Text, DI.Column)]
19 | values = [ ("test1", DI.toColumn [(1 :: Int) .. 26])
20 |          , ("test2", DI.toColumn [show c | c <- ['a' .. 'z']])
21 |          , ("test3", DI.toColumn ['a' .. 'z']) ]
22 | 
23 | testData :: D.DataFrame -- 'values' assembled into the frame under test
24 | testData = D.fromList values
25 | 
26 | -- | Expects 'D.deriveFrom' to build "test4" by combining "test1" (Int) and "test3" (Char) row by row.
27 | deriveFromWAI :: Test
28 | deriveFromWAI = TestCase $ assertEqual "deriveFrom works when function args align"
29 |     (Just (DI.BoxedColumn (V.fromList [show n ++ [c] | (n, c) <- zip [1 .. 26] ['a' .. 'z']])))
30 |     (DI.getColumn "test4" (D.deriveFrom (["test1", "test3"], D.func combine) "test4" testData))
31 |   where combine (n :: Int) (c :: Char) = show n ++ [c]
32 | 
33 | -- | Every Derive test case, labelled with its own name.
34 | tests :: [Test]
35 | tests = [TestLabel "deriveFromWAI" deriveFromWAI]


--------------------------------------------------------------------------------
/tests/Operations/Filter.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE TypeApplications #-}
 2 | {-# LANGUAGE OverloadedStrings #-}
 3 | module Operations.Filter where
 4 | 
 5 | import qualified DataFrame as D
 6 | import qualified DataFrame as DI
 7 | import qualified DataFrame as DE
 8 | import qualified Data.Text as T
 9 | import qualified Data.Vector as V
10 | import qualified Data.Vector.Unboxed as VU
11 | 
12 | import Assertions
13 | import Test.HUnit
14 | import Type.Reflection (typeRep)
15 | 
16 | -- | Eight 26-row fixture columns: Int counters 1..26, letters 'a'..'z', and (in "test2") shown letters.
17 | values :: [(T.Text, DI.Column)]
18 | values = [ ("test1", ints), ("test2", DI.toColumn (map show letters))
19 |          , ("test3", ints), ("test4", chars)
20 |          , ("test5", ints), ("test6", chars)
21 |          , ("test7", ints), ("test8", chars) ]
22 |   where letters = ['a' .. 'z']
23 |         ints    = DI.toColumn [(1 :: Int) .. 26]
24 |         chars   = DI.toColumn letters
25 | 
26 | -- | 'values' assembled into the frame every filter test runs against.
27 | testData :: D.DataFrame
28 | testData = D.fromList values
29 | 
30 | -- | Expects 'D.filter' on the absent column "test0" to raise a column-not-found error.
31 | filterColumnDoesNotExist :: Test
32 | filterColumnDoesNotExist = TestCase $ assertExpectException "[Error Case]" (DE.columnNotFound "test0" "filter" (D.columnNames testData)) $
33 |     print (D.filter @Int "test0" even testData)
34 | 
35 | -- | Expects an Integer vs Int type mismatch when the Int column "test1" is filtered at Integer.
36 | filterColumnWrongType :: Test
37 | filterColumnWrongType = TestCase $ assertExpectException "[Error Case]" (DE.typeMismatchError (typeRep @Integer) (typeRep @Int)) $
38 |     print (D.filter @Integer "test1" even testData)
39 | 
40 | -- | Expects 'D.filterBy' on the absent column "test0" to raise the error reported by "filter".
41 | filterByColumnDoesNotExist :: Test
42 | filterByColumnDoesNotExist = TestCase $ assertExpectException "[Error Case]" (DE.columnNotFound "test0" "filter" (D.columnNames testData)) $
43 |     print (D.filterBy @Int even "test0" testData)
44 | 
45 | -- | Expects an Integer vs Int type mismatch when 'D.filterBy' reads the Int column "test1" at Integer.
46 | filterByColumnWrongType :: Test
47 | filterByColumnWrongType = TestCase $ assertExpectException "[Error Case]" (DE.typeMismatchError (typeRep @Integer) (typeRep @Int)) $
48 |     print (D.filterBy @Integer even "test1" testData)
49 | 
50 | -- | A predicate that no row satisfies leaves zero rows but all eight columns.
51 | filterColumnInexistentValues :: Test
52 | filterColumnInexistentValues = TestCase $ assertEqual "Non existent filter value returns no rows" (0, 8) $
53 |     D.dimensions (D.filter @Int "test1" (< 0) testData)
54 | 
55 | -- | A predicate that every row satisfies keeps all 26 rows and all eight columns.
56 | filterColumnAllValues :: Test
57 | filterColumnAllValues = TestCase $ assertEqual "Filters all columns" (26, 8) $
58 |     D.dimensions (D.filter @Int "test1" (const True) testData)
59 | 
60 | -- | Expects 'D.filterJust' to drop the five Nothing rows and unwrap the five remaining Just values.
61 | filterJustWAI :: Test
62 | filterJustWAI = TestCase $ assertEqual "Filters out Nothing and unwraps Maybe" (D.fromList [("test", D.toColumn (replicate 5 (1 :: Int)))]) $
63 |     D.filterJust "test" (D.fromList [("test", D.toColumn (take 10 (cycle [Just (1 :: Int), Nothing])))])
64 | 
65 | tests :: [Test] -- every filter, filterBy and filterJust case, labelled with its own name
66 | tests = map (uncurry TestLabel)
67 |     [ ("filterColumnDoesNotExist", filterColumnDoesNotExist)
68 |     , ("filterColumnWrongType", filterColumnWrongType)
69 |     , ("filterByColumnDoesNotExist", filterByColumnDoesNotExist)
70 |     , ("filterByColumnWrongType", filterByColumnWrongType)
71 |     , ("filterColumnInexistentValues", filterColumnInexistentValues)
72 |     , ("filterColumnAllValues", filterColumnAllValues)
73 |     , ("filterJustWAI", filterJustWAI) ]


--------------------------------------------------------------------------------
/tests/Operations/GroupBy.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE OverloadedStrings #-}
 2 | module Operations.GroupBy where
 3 | 
 4 | import qualified DataFrame as D
 5 | import qualified DataFrame as DI
 6 | import qualified DataFrame as DE
 7 | import qualified Data.Text as T
 8 | import qualified Data.Vector as V
 9 | import qualified Data.Vector.Unboxed as VU
10 | 
11 | import Assertions
12 | import Test.HUnit
13 | 
14 | -- | 40-row fixture: ten rows per key 1..4, an alternating 1/2 column, 1..40 ascending and 40..1 descending.
15 | values :: [(T.Text, DI.Column)]
16 | values = [ ("test1", DI.toColumn (concat [replicate 10 key | key <- [1 :: Int, 2, 3, 4]]))
17 |          , ("test2", DI.toColumn (take 40 (cycle [1 :: Int, 2])))
18 |          , ("test3", DI.toColumn [(1 :: Int) .. 40])
19 |          , ("test4", DI.toColumn [(40 :: Int), 39 .. 1]) ]
20 | 
21 | testData :: D.DataFrame -- 'values' assembled into the frame under test
22 | testData = D.fromList values
23 | 
24 | -- | Expects one row per "test1" key, with every other column holding that key's ten values.
25 | groupBySingleRowWAI :: Test
26 | groupBySingleRowWAI = TestCase $ assertEqual "Groups by single column"
27 |     (D.fromList [ ("test1", DI.toColumn [(1 :: Int) .. 4])
28 |                   -- Every group holds the same alternating run 1, 2, ..., 1, 2 of length ten.
29 |                 , ("test2", DI.GroupedUnboxedColumn (V.replicate 4 (VU.fromList (take 10 (cycle [1 :: Int, 2])))))
30 |                 , ("test3", DI.GroupedUnboxedColumn (V.generate 4 (\g -> VU.fromList [g * 10 + 1 .. (g + 1) * 10])))
31 |                 , ("test4", DI.GroupedUnboxedColumn (V.generate 4 (\g -> VU.fromList [(4 - g) * 10, (4 - g) * 10 - 1 .. (3 - g) * 10 + 1]))) ])
32 |     (D.groupBy ["test1"] testData D.|> D.sortBy D.Ascending ["test1"])
33 | 
34 | -- | Expects one row per ("test1", "test2") key pair, with "test3" and "test4" grouped per pair.
35 | groupByMultipleRowsWAI :: Test
36 | groupByMultipleRowsWAI = TestCase (assertEqual "Groups by multiple columns"
37 |                 (D.fromList [("test1", DI.toColumn $ concatMap (replicate 2) [(1::Int)..4]),
38 |                              ("test2", DI.toColumn (take 8 $ cycle [1 :: Int, 2])),
39 |                              ("test3", DI.GroupedUnboxedColumn (V.fromList [
40 |                                         VU.fromList [1 :: Int,3..9],
41 |                                         VU.fromList [2,4..10],
42 |                                         VU.fromList [11,13..19],
43 |                                         VU.fromList [12,14..20],
44 |                                         VU.fromList [21,23..29],
45 |                                         VU.fromList [22,24..30],
46 |                                         VU.fromList [31,33..39],
47 |                                         VU.fromList [32,34..40]
48 |                                 ])),
49 |                              ("test4", DI.GroupedUnboxedColumn (V.fromList $ reverse [
50 |                                         VU.fromList [1 :: Int,3..9],
51 |                                         VU.fromList [2,4..10],
52 |                                         VU.fromList [11,13..19],
53 |                                         VU.fromList [12,14..20],
54 |                                         VU.fromList [21,23..29],
55 |                                         VU.fromList [22,24..30],
56 |                                         VU.fromList [31,33..39],
57 |                                         VU.fromList [32,34..40]]))
58 |                             ])
59 |                 (D.groupBy ["test1", "test2"] testData D.|> D.sortBy D.Ascending ["test1", "test2"]))
60 | 
61 | -- | Expects 'D.groupBy' on the absent column "test0" to raise a column-not-found error naming the shown list.
62 | groupByColumnDoesNotExist :: Test
63 | groupByColumnDoesNotExist = TestCase $ assertExpectException "[Error Case]" (DE.columnNotFound "[\"test0\"]" "groupBy" (D.columnNames testData)) $
64 |     print (D.groupBy ["test0"] testData)
65 | 
66 | tests :: [Test] -- every groupBy case, labelled with its own name
67 | tests = map (uncurry TestLabel)
68 |     [ ("groupBySingleRowWAI", groupBySingleRowWAI)
69 |     , ("groupByMultipleRowsWAI", groupByMultipleRowsWAI)
70 |     , ("groupByColumnDoesNotExist", groupByColumnDoesNotExist) ]
71 | 
72 | 


--------------------------------------------------------------------------------
/tests/Operations/InsertColumn.hs:
--------------------------------------------------------------------------------
  1 | {-# LANGUAGE TypeApplications #-}
  2 | {-# LANGUAGE OverloadedStrings #-}
  3 | module Operations.InsertColumn where
  4 | 
  5 | import qualified DataFrame as D
  6 | import qualified DataFrame as DI
  7 | import qualified Data.Text as T
  8 | import qualified Data.Vector as V
  9 | import qualified Data.Vector.Unboxed as VU
 10 | 
 11 | import Assertions
 12 | import Test.HUnit
 13 | 
 14 | -- | Eight 26-row columns alternating Int counters 1..26 and letters 'a'..'z'.
 15 | testData :: D.DataFrame
 16 | testData = D.fromList
 17 |     [ ("test1", ints), ("test2", chars)
 18 |     , ("test3", ints), ("test4", chars)
 19 |     , ("test5", ints), ("test6", chars)
 20 |     , ("test7", ints), ("test8", chars) ]
 21 |   where
 22 |     ints  = DI.toColumn [(1 :: Int) .. 26]
 23 |     chars = DI.toColumn ['a' .. 'z']
 24 | 
 25 | -- | Expects a boxed Text vector inserted into an empty frame to come back as a 'DI.BoxedColumn' with the same elements.
 26 | addBoxedColumn :: Test
 27 | addBoxedColumn = TestCase $ assertEqual "Two columns should be equal" (Just (DI.BoxedColumn names)) $
 28 |     DI.getColumn "new" (D.insertColumn "new" names D.empty)
 29 |   where names = V.fromList ["Thuba" :: T.Text, "Zodwa", "Themba"]
 30 | 
 31 | -- | Expects a ready-made Text column passed to 'D.insertColumn'' to be retrievable unchanged.
 32 | addBoxedColumn' :: Test
 33 | addBoxedColumn' = TestCase $ assertEqual "Two columns should be equal" names (DI.getColumn "new" (D.insertColumn' "new" names D.empty))
 34 |   where names = Just (DI.toColumn ["Thuba" :: T.Text, "Zodwa", "Themba"])
 35 | 
 36 | -- | Expects a boxed vector of an unboxable type (Int) inserted into an empty frame to come back as a 'DI.UnboxedColumn'.
 37 | addUnboxedColumn :: Test
 38 | addUnboxedColumn = TestCase (assertEqual "Value should be unboxed"
 39 |                             (Just $ DI.UnboxedColumn (VU.fromList [1 :: Int, 2, 3]))
 40 |                             (DI.getColumn "new" $ D.insertColumn "new" (V.fromList [1 :: Int, 2, 3]) D.empty))
 41 | 
 42 | -- | Expects a ready-made Int column passed to 'D.insertColumn'' to be retrievable unchanged.
 43 | addUnboxedColumn' :: Test
 44 | addUnboxedColumn' = TestCase (assertEqual "Two columns should be equal" (Just $ DI.toColumn [1 :: Int, 2, 3])
 45 |                             (DI.getColumn "new" $ D.insertColumn' "new" (Just $ DI.toColumn [1 :: Int, 2, 3]) D.empty))
 46 | 
 47 | -- | Expects a Text column shorter than the frame to be stored as a 'DI.OptionalColumn' padded with Nothing.
 48 | addSmallerColumnBoxed :: Test
 49 | addSmallerColumnBoxed = TestCase $ assertEqual "Missing values should be replaced with Nothing"
 50 |     (Just (DI.OptionalColumn (V.fromList [Just "a" :: Maybe T.Text, Just "b", Just "c", Nothing, Nothing])))
 51 |     (DI.getColumn "newer" (D.insertColumn "newer" (V.fromList ["a" :: T.Text, "b", "c"]) fiveRows))
 52 |   where
 53 |     fiveRows = D.insertColumn "new" (V.fromList ["a" :: T.Text, "b", "c", "d", "e"]) D.empty
 54 | 
 55 | -- | Expects an Int column shorter than the frame to be stored as a 'DI.OptionalColumn' padded with Nothing.
 56 | addSmallerColumnUnboxed :: Test
 57 | addSmallerColumnUnboxed = TestCase $ assertEqual "Missing values should be replaced with Nothing"
 58 |     (Just (DI.OptionalColumn (V.fromList [Just 1 :: Maybe Int, Just 2, Just 3, Nothing, Nothing])))
 59 |     (DI.getColumn "newer" (D.insertColumn "newer" (V.fromList [1 :: Int, 2, 3]) fiveRows))
 60 |   where fiveRows = D.insertColumn "new" (V.fromList [1 :: Int, 2, 3, 4, 5]) D.empty
 61 | 
 62 | -- | Expects 'D.insertColumnWithDefault' to pad a shorter column with the given default (0) rather than Nothing.
 63 | insertColumnWithDefaultFillsWithDefault :: Test
 64 | insertColumnWithDefaultFillsWithDefault = TestCase (
 65 |     assertEqual "Missing values should be replaced with the default value"
 66 |     (Just $ DI.UnboxedColumn (VU.fromList [1 :: Int, 2,  3, 0, 0]))
 67 |     (DI.getColumn "newer" $ D.insertColumnWithDefault 0 "newer" (V.fromList [1 :: Int, 2, 3]) $ D.insertColumn "new" (V.fromList [1 :: Int, 2, 3, 4, 5]) D.empty))
 68 | 
 69 | -- | Expects a column that already matches the frame's five rows to be stored as-is, with no default inserted.
 70 | insertColumnWithDefaultFillsLargerNoop :: Test
 71 | insertColumnWithDefaultFillsLargerNoop = TestCase $ assertEqual "Lists should be the same size"
 72 |     (Just (DI.UnboxedColumn (VU.fromList [(6 :: Int) .. 10])))
 73 |     (DI.getColumn "newer" (D.insertColumnWithDefault 0 "newer" (V.fromList [(6 :: Int) .. 10]) fiveRows))
 74 |   where fiveRows = D.insertColumn "new" (V.fromList [1 :: Int, 2, 3, 4, 5]) D.empty
 75 | 
 76 | -- | Expects a longer Text column to turn the existing shorter column into optionals padded with Nothing.
 77 | addLargerColumnBoxed :: Test
 78 | addLargerColumnBoxed = TestCase $ assertEqual "Smaller lists should grow and contain optionals"
 79 |     (D.fromList [ ("new", D.toColumn [Just "a" :: Maybe T.Text, Just "b", Just "c", Nothing, Nothing])
 80 |                 , ("newer", D.toColumn ["a" :: T.Text, "b", "c", "d", "e"]) ])
 81 |     (D.insertColumn "newer" (V.fromList ["a" :: T.Text, "b", "c", "d", "e"]) threeRows)
 82 |   where threeRows = D.insertColumn "new" (V.fromList ["a" :: T.Text, "b", "c"]) D.empty
 83 | -- | Expects each longer Int column to pad every previously inserted shorter column with Nothing.
 84 | addLargerColumnUnboxed :: Test
 85 | addLargerColumnUnboxed = TestCase $ assertEqual "Smaller lists should grow and contain optionals"
 86 |     (D.fromList [ ("old", D.toColumn [Just 1 :: Maybe Int, Just 2, Nothing, Nothing, Nothing])
 87 |                 , ("new", D.toColumn [Just 1 :: Maybe Int, Just 2, Just 3, Nothing, Nothing])
 88 |                 , ("newer", D.toColumn [1 :: Int, 2, 3, 4, 5]) ])
 89 |     (D.insertColumn "newer" (V.fromList [1 :: Int, 2, 3, 4, 5]) threeRows)
 90 |   where twoRows   = D.insertColumn "old" (V.fromList [1 :: Int, 2]) D.empty
 91 |         threeRows = D.insertColumn "new" (V.fromList [1 :: Int, 2, 3]) twoRows
 92 | 
 93 | -- | Expects a ninth column added to the eight-column fixture to give dimensions (26, 9).
 94 | dimensionsChangeAfterAdd :: Test
 95 | dimensionsChangeAfterAdd = TestCase (assertEqual "should be (26, 9)" (26, 9)
 96 |                                      (D.dimensions $ D.insertColumn @Int "new" (V.fromList [1..26]) testData))
 97 | 
 98 | -- | Expects a second insert under the existing name "new" not to add another column.
 99 | dimensionsNotChangedAfterDuplicate :: Test
100 | dimensionsNotChangedAfterDuplicate = TestCase (assertEqual "should be (26, 9)" (26, 9)
101 |                                      (D.dimensions $ D.insertColumn @Int "new" (V.fromList [1..26])
102 |                                                    $ D.insertColumn @Int "new" (V.fromList [1..26]) testData))
103 | 
104 | -- | Every InsertColumn test case in this module, each registered exactly once under its own label.
105 | tests :: [Test]
106 | tests = [ TestLabel "dimensionsChangeAfterAdd" dimensionsChangeAfterAdd
107 |         , TestLabel "dimensionsNotChangedAfterDuplicate" dimensionsNotChangedAfterDuplicate
108 |         , TestLabel "addBoxedColumnToEmpty" addBoxedColumn
109 |         , TestLabel "addBoxedColumn'" addBoxedColumn'
110 |         , TestLabel "addBoxedColumnAutoUnboxes" addUnboxedColumn
111 |         , TestLabel "addUnboxedColumn'" addUnboxedColumn'
112 |         , TestLabel "addSmallerColumnBoxed" addSmallerColumnBoxed
113 |         , TestLabel "addSmallerColumnUnboxed" addSmallerColumnUnboxed
114 |         , TestLabel "addLargerColumnBoxed" addLargerColumnBoxed
115 |         , TestLabel "addLargerColumnUnboxed" addLargerColumnUnboxed
116 |         , TestLabel "insertColumnWithDefaultFillsWithDefault" insertColumnWithDefaultFillsWithDefault
117 |         , TestLabel "insertColumnWithDefaultFillsLargerNoop" insertColumnWithDefaultFillsLargerNoop
118 |         ]


--------------------------------------------------------------------------------
/tests/Operations/Sort.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE OverloadedStrings #-}
 2 | module Operations.Sort where
 3 | 
 4 | import qualified DataFrame as D
 5 | import qualified DataFrame as DI
 6 | import qualified DataFrame as DE
 7 | import qualified Data.Text as T
 8 | import qualified Data.Vector as V
 9 | import qualified Data.Vector.Unboxed as VU
10 | 
11 | import Assertions
12 | import Control.Monad
13 | import Data.Char
14 | import System.Random
15 | import System.Random.Shuffle (shuffle')
16 | import Test.HUnit
17 | 
18 | -- | A fixed-seed shuffle of 1..26 plus the matching lowercase letters (1 -> 'a', ..., 26 -> 'z').
19 | values :: [(T.Text, DI.Column)]
20 | values = [ ("test1", DI.toColumn shuffled)
21 |          , ("test2", DI.toColumn [chr (n + 96) | n <- shuffled]) ]
22 |   where
23 |     shuffled = shuffle' [(1 :: Int) .. 26] 26 (mkStdGen 252)
24 | 
25 | testData :: D.DataFrame -- 'values' assembled into the frame under test
26 | testData = D.fromList values
27 | 
28 | -- | Expects an ascending sort on "test1" to yield 1..26, with "test2" carried along as 'a'..'z'.
29 | sortByAscendingWAI :: Test
30 | sortByAscendingWAI = TestCase $ assertEqual "Sorting rows by ascending works as intended"
31 |     (D.fromList [("test1", DI.toColumn [(1 :: Int) .. 26]), ("test2", DI.toColumn ['a' .. 'z'])])
32 |     (D.sortBy D.Ascending ["test1"] testData)
33 | 
34 | -- | Expects a descending sort on "test1" to yield 26..1, with "test2" carried along as 'z'..'a'.
35 | sortByDescendingWAI :: Test
36 | sortByDescendingWAI = TestCase $ assertEqual "Sorting rows by descending works as intended"
37 |     (D.fromList [("test1", DI.toColumn [(26 :: Int), 25 .. 1]), ("test2", DI.toColumn ['z', 'y' .. 'a'])])
38 |     (D.sortBy D.Descending ["test1"] testData)
39 | 
40 | -- | Expects 'D.sortBy' on the absent column "test0" to raise a column-not-found error naming the shown list.
41 | sortByColumnDoesNotExist :: Test
42 | sortByColumnDoesNotExist = TestCase $ assertExpectException "[Error Case]" (DE.columnNotFound "[\"test0\"]" "sortBy" (D.columnNames testData)) $
43 |     print (D.sortBy D.Ascending ["test0"] testData)
44 | 
45 | tests :: [Test] -- every sortBy case, labelled with its own name
46 | tests = map (uncurry TestLabel)
47 |     [ ("sortByAscendingWAI", sortByAscendingWAI)
48 |     , ("sortByDescendingWAI", sortByDescendingWAI)
49 |     , ("sortByColumnDoesNotExist", sortByColumnDoesNotExist) ]
50 | 
51 | 


--------------------------------------------------------------------------------
/tests/Operations/Take.hs:
--------------------------------------------------------------------------------
 1 | {-# LANGUAGE OverloadedStrings #-}
 2 | module Operations.Take where
 3 | 
 4 | import qualified DataFrame as D
 5 | import qualified DataFrame as DI
 6 | 
 7 | import Test.HUnit
 8 | 
 9 | -- | Two 26-row columns: Ints 1..26 and letters 'a'..'z'.
10 | testData :: D.DataFrame
11 | testData = D.fromList [ ("test1", DI.toColumn [(1 :: Int) .. 26])
12 |                       , ("test2", DI.toColumn ['a' .. 'z']) ]
13 | 
14 | 
15 | takeWAI :: Test -- take 10 keeps the first ten values of "test1"
16 | takeWAI = TestCase $ assertEqual "Gets first 10 numbers" (Just (D.toColumn [(1 :: Int) .. 10])) $ D.getColumn "test1" (D.take 10 testData)
17 | 
18 | takeLastWAI :: Test -- takeLast 10 keeps the last ten values (17..26) of "test1"
19 | takeLastWAI = TestCase (assertEqual "Gets last 10 numbers" (Just $ D.toColumn [(17 :: Int)..26]) (D.getColumn "test1" $ D.takeLast 10 testData))
20 | 
21 | lengthEqualsTakeParam :: Test -- taking 5 rows keeps both columns
22 | lengthEqualsTakeParam = TestCase $ assertEqual "should be (5, 2)" (5, 2) $ D.dimensions (D.take 5 testData)
23 | 
24 | lengthGreaterThanTakeParam :: Test -- asking for 30 rows of a 26-row frame returns all 26
25 | lengthGreaterThanTakeParam = TestCase $ assertEqual "should be (26, 2)" (26, 2) $ D.dimensions (D.take 30 testData)
26 | 
27 | emptyIsZero :: Test -- take on the empty frame leaves dimensions (0, 0)
28 | emptyIsZero = TestCase $ assertEqual "should be (0, 0)" (0, 0) $ D.dimensions (D.take 5 D.empty)
29 | 
30 | negativeIsZero :: Test -- a negative count yields zero rows but keeps both columns
31 | negativeIsZero = TestCase $ assertEqual "should be (0, 2)" (0, 2) $ D.dimensions (D.take (-1) testData)
32 | 
33 | lengthEqualsTakeLastParam :: Test -- taking the last 5 rows keeps both columns
34 | lengthEqualsTakeLastParam = TestCase $ assertEqual "should be (5, 2)" (5, 2) $ D.dimensions (D.takeLast 5 testData)
35 | 
36 | lengthGreaterThanTakeLastParam :: Test -- asking for the last 30 rows of a 26-row frame returns all 26
37 | lengthGreaterThanTakeLastParam = TestCase $ assertEqual "should be (26, 2)" (26, 2) $ D.dimensions (D.takeLast 30 testData)
38 | 
39 | emptyIsZeroTakeLast :: Test -- takeLast on the empty frame leaves dimensions (0, 0)
40 | emptyIsZeroTakeLast = TestCase $ assertEqual "should be (0, 0)" (0, 0) $ D.dimensions (D.takeLast 5 D.empty)
41 | 
42 | negativeIsZeroTakeLast :: Test -- a negative count yields zero rows but keeps both columns
43 | negativeIsZeroTakeLast = TestCase $ assertEqual "should be (0, 2)" (0, 2) $ D.dimensions (D.takeLast (-1) testData)
44 | 
45 | tests :: [Test] -- every take and takeLast case, labelled with its own name
46 | tests = map (uncurry TestLabel)
47 |     [ ("takeWAI", takeWAI)
48 |     , ("takeLastWAI", takeLastWAI)
49 |     , ("lengthEqualsTakeParam", lengthEqualsTakeParam)
50 |     , ("lengthGreaterThanTakeParam", lengthGreaterThanTakeParam)
51 |     , ("emptyIsZero", emptyIsZero)
52 |     , ("negativeIsZero", negativeIsZero)
53 |     , ("lengthEqualsTakeLastParam", lengthEqualsTakeLastParam)
54 |     , ("lengthGreaterThanTakeLastParam", lengthGreaterThanTakeLastParam)
55 |     , ("emptyIsZeroTakeLast", emptyIsZeroTakeLast)
56 |     , ("negativeIsZeroTakeLast", negativeIsZeroTakeLast) ]
57 | 


--------------------------------------------------------------------------------