├── .ghci ├── .github └── workflows │ └── haskell-ci.yml ├── .gitignore ├── CHANGELOG.md ├── LICENSE ├── README.md ├── app └── Main.hs ├── benchmark └── Main.hs ├── data ├── chipotle.tsv ├── effects-of-covid-19-on-trade-at-15-december-2021-provisional.csv ├── housing.csv ├── measurements.txt └── starwars.csv ├── dataframe.cabal ├── docs ├── California Housing.ipynb ├── coming_from_dplyr.md ├── coming_from_pandas.md ├── coming_from_polars.md ├── configuration_notes.md ├── exploratory_data_analysis_primer.md └── haskell_for_data_analysis.md ├── flake.nix ├── run_compiled_repl.sh ├── run_profiling.sh ├── src ├── DataFrame.hs └── DataFrame │ ├── Display │ └── Terminal │ │ ├── Colours.hs │ │ ├── Plot.hs │ │ └── PrettyPrint.hs │ ├── Errors.hs │ ├── IO │ └── CSV.hs │ ├── Internal │ ├── Column.hs │ ├── DataFrame.hs │ ├── Function.hs │ ├── Parsing.hs │ ├── Row.hs │ └── Types.hs │ └── Operations │ ├── Aggregation.hs │ ├── Core.hs │ ├── Sorting.hs │ ├── Statistics.hs │ ├── Subset.hs │ ├── Transformations.hs │ └── Typing.hs ├── static └── example.gif ├── test_coverage.md └── tests ├── Assertions.hs ├── Main.hs └── Operations ├── Apply.hs ├── Derive.hs ├── Filter.hs ├── GroupBy.hs ├── InsertColumn.hs ├── Sort.hs └── Take.hs /.ghci: -------------------------------------------------------------------------------- 1 | :set -XOverloadedStrings 2 | :set -XTypeApplications 3 | 4 | import qualified Data.Text as Str 5 | 6 | default (Int, Str.Text, Double) 7 | 8 | -------------------------------------------------------------------------------- /.github/workflows/haskell-ci.yml: -------------------------------------------------------------------------------- 1 | # This GitHub workflow config has been generated by a script via 2 | # 3 | # haskell-ci 'github' 'dataframe.cabal' 4 | # 5 | # To regenerate the script (for example after adjusting tested-with) run 6 | # 7 | # haskell-ci regenerate 8 | # 9 | # For more information, see https://github.com/haskell-CI/haskell-ci 10 | # 11 | # 
version: 0.19.20241202 12 | # 13 | # REGENDATA ("0.19.20241202",["github","dataframe.cabal"]) 14 | # 15 | name: Haskell-CI 16 | on: 17 | - push 18 | - pull_request 19 | jobs: 20 | linux: 21 | name: Haskell-CI - Linux - ${{ matrix.compiler }} 22 | runs-on: ubuntu-latest 23 | timeout-minutes: 24 | 60 25 | container: 26 | image: buildpack-deps:jammy 27 | continue-on-error: ${{ matrix.allow-failure }} 28 | strategy: 29 | matrix: 30 | include: 31 | - compiler: ghc-9.4.8 32 | compilerKind: ghc 33 | compilerVersion: 9.4.8 34 | setup-method: ghcup 35 | allow-failure: false 36 | - compiler: ghc-9.8.3 37 | compilerKind: ghc 38 | compilerVersion: 9.8.3 39 | setup-method: ghcup-vanilla 40 | allow-failure: false 41 | - compiler: ghc-9.10.1 42 | compilerKind: ghc 43 | compilerVersion: 9.10.1 44 | setup-method: ghcup 45 | allow-failure: false 46 | - compiler: ghc-9.6.6 47 | compilerKind: ghc 48 | compilerVersion: 9.6.6 49 | setup-method: ghcup 50 | allow-failure: false 51 | fail-fast: false 52 | steps: 53 | - name: apt-get install 54 | run: | 55 | apt-get update 56 | apt-get install -y --no-install-recommends gnupg ca-certificates dirmngr curl git software-properties-common libtinfo5 57 | - name: Install GHCup 58 | run: | 59 | mkdir -p "$HOME/.ghcup/bin" 60 | curl -sL https://downloads.haskell.org/ghcup/0.1.30.0/x86_64-linux-ghcup-0.1.30.0 > "$HOME/.ghcup/bin/ghcup" 61 | chmod a+x "$HOME/.ghcup/bin/ghcup" 62 | - name: Install cabal-install 63 | run: | 64 | "$HOME/.ghcup/bin/ghcup" install cabal 3.12.1.0 || (cat "$HOME"/.ghcup/logs/*.* && false) 65 | echo "CABAL=$HOME/.ghcup/bin/cabal-3.12.1.0 -vnormal+nowrap" >> "$GITHUB_ENV" 66 | - name: Install GHC (GHCup) 67 | if: matrix.setup-method == 'ghcup' 68 | run: | 69 | "$HOME/.ghcup/bin/ghcup" install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false) 70 | HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER") 71 | HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#') 72 | HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#') 73 | echo "HC=$HC" >> 
"$GITHUB_ENV" 74 | echo "HCPKG=$HCPKG" >> "$GITHUB_ENV" 75 | echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV" 76 | env: 77 | HCKIND: ${{ matrix.compilerKind }} 78 | HCNAME: ${{ matrix.compiler }} 79 | HCVER: ${{ matrix.compilerVersion }} 80 | - name: Install GHC (GHCup vanilla) 81 | if: matrix.setup-method == 'ghcup-vanilla' 82 | run: | 83 | "$HOME/.ghcup/bin/ghcup" -s https://raw.githubusercontent.com/haskell/ghcup-metadata/master/ghcup-vanilla-0.0.8.yaml install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false) 84 | HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER") 85 | HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#') 86 | HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#') 87 | echo "HC=$HC" >> "$GITHUB_ENV" 88 | echo "HCPKG=$HCPKG" >> "$GITHUB_ENV" 89 | echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV" 90 | env: 91 | HCKIND: ${{ matrix.compilerKind }} 92 | HCNAME: ${{ matrix.compiler }} 93 | HCVER: ${{ matrix.compilerVersion }} 94 | - name: Set PATH and environment variables 95 | run: | 96 | echo "$HOME/.cabal/bin" >> $GITHUB_PATH 97 | echo "LANG=C.UTF-8" >> "$GITHUB_ENV" 98 | echo "CABAL_DIR=$HOME/.cabal" >> "$GITHUB_ENV" 99 | echo "CABAL_CONFIG=$HOME/.cabal/config" >> "$GITHUB_ENV" 100 | HCNUMVER=$(${HC} --numeric-version|perl -ne '/^(\d+)\.(\d+)\.(\d+)(\.(\d+))?$/; print(10000 * $1 + 100 * $2 + ($3 == 0 ? 
$5 != 1 : $3))') 101 | echo "HCNUMVER=$HCNUMVER" >> "$GITHUB_ENV" 102 | echo "ARG_TESTS=--enable-tests" >> "$GITHUB_ENV" 103 | echo "ARG_BENCH=--enable-benchmarks" >> "$GITHUB_ENV" 104 | echo "HEADHACKAGE=false" >> "$GITHUB_ENV" 105 | echo "ARG_COMPILER=--$HCKIND --with-compiler=$HC" >> "$GITHUB_ENV" 106 | env: 107 | HCKIND: ${{ matrix.compilerKind }} 108 | HCNAME: ${{ matrix.compiler }} 109 | HCVER: ${{ matrix.compilerVersion }} 110 | - name: env 111 | run: | 112 | env 113 | - name: write cabal config 114 | run: | 115 | mkdir -p $CABAL_DIR 116 | cat >> $CABAL_CONFIG <> $CABAL_CONFIG < cabal-plan.xz 149 | echo 'f62ccb2971567a5f638f2005ad3173dba14693a45154c1508645c52289714cb2 cabal-plan.xz' | sha256sum -c - 150 | xz -d < cabal-plan.xz > $HOME/.cabal/bin/cabal-plan 151 | rm -f cabal-plan.xz 152 | chmod a+x $HOME/.cabal/bin/cabal-plan 153 | cabal-plan --version 154 | - name: checkout 155 | uses: actions/checkout@v4 156 | with: 157 | path: source 158 | - name: initial cabal.project for sdist 159 | run: | 160 | touch cabal.project 161 | echo "packages: $GITHUB_WORKSPACE/source/." 
>> cabal.project 162 | cat cabal.project 163 | - name: sdist 164 | run: | 165 | mkdir -p sdist 166 | $CABAL sdist all --output-dir $GITHUB_WORKSPACE/sdist 167 | - name: unpack 168 | run: | 169 | mkdir -p unpacked 170 | find sdist -maxdepth 1 -type f -name '*.tar.gz' -exec tar -C $GITHUB_WORKSPACE/unpacked -xzvf {} \; 171 | - name: generate cabal.project 172 | run: | 173 | PKGDIR_dataframe="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/dataframe-[0-9.]*')" 174 | echo "PKGDIR_dataframe=${PKGDIR_dataframe}" >> "$GITHUB_ENV" 175 | rm -f cabal.project cabal.project.local 176 | touch cabal.project 177 | touch cabal.project.local 178 | echo "packages: ${PKGDIR_dataframe}" >> cabal.project 179 | echo "package dataframe" >> cabal.project 180 | echo " ghc-options: -Werror=missing-methods" >> cabal.project 181 | cat >> cabal.project <> cabal.project.local 184 | cat cabal.project 185 | cat cabal.project.local 186 | - name: dump install plan 187 | run: | 188 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dry-run all 189 | cabal-plan 190 | - name: restore cache 191 | uses: actions/cache/restore@v4 192 | with: 193 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 194 | path: ~/.cabal/store 195 | restore-keys: ${{ runner.os }}-${{ matrix.compiler }}- 196 | - name: install dependencies 197 | run: | 198 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks --dependencies-only -j2 all 199 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dependencies-only -j2 all 200 | - name: build w/o tests 201 | run: | 202 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 203 | - name: build 204 | run: | 205 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --write-ghc-environment-files=always 206 | - name: tests 207 | run: | 208 | $CABAL v2-test $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --test-show-details=direct 209 | - name: cabal check 210 | run: | 211 | cd ${PKGDIR_dataframe} || false 212 | ${CABAL} 
-vnormal check 213 | - name: haddock 214 | run: | 215 | $CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all 216 | - name: unconstrained build 217 | run: | 218 | rm -f cabal.project.local 219 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 220 | - name: save cache 221 | if: always() 222 | uses: actions/cache/save@v4 223 | with: 224 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 225 | path: ~/.cabal/store 226 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | dist 2 | dist-* 3 | cabal-dev 4 | *.o 5 | *.hi 6 | *.hie 7 | *.chi 8 | *.chs.h 9 | *.dyn_o 10 | *.dyn_hi 11 | .hpc 12 | .hsenv 13 | .cabal-sandbox/ 14 | cabal.sandbox.config 15 | *.prof 16 | *.aux 17 | *.hp 18 | *.eventlog 19 | .stack-work/ 20 | cabal.project.local 21 | cabal.project.local~ 22 | .HTF/ 23 | .ghc.environment.* 24 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Revision history for dataframe 2 | 3 | ## 0.1.0.0 4 | 5 | * Initial release 6 | 7 | ## 0.1.0.1 8 | 9 | * Fixed parse failure on nested, escaped quotation. 10 | * Fixed column info when field name isn't found. 11 | 12 | ## 0.1.0.2 13 | 14 | * Change namespace from `Data.DataFrame` to `DataFrame` 15 | * Add `toVector` function for converting columns to vectors. 16 | * Add `impute` function for replacing `Nothing` values in optional columns. 17 | * Add `filterAllJust` to filter out all rows with missing data. 18 | * Add `distinct` function that returns a dataframe with distinct rows. 
19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # DataFrame 2 | 3 | An intuitive, dynamically-typed DataFrame library. 4 | 5 | A tool for exploratory data analysis. 6 | 7 | ## Installing 8 | 9 | ### CLI 10 | * Install Haskell (ghc + cabal) via [ghcup](https://www.haskell.org/ghcup/install/) selecting all the default options. 11 | * To install dataframe run `cabal update && cabal install dataframe` 12 | * Open a Haskell repl with dataframe loaded by running `cabal repl --build-depends dataframe`. 13 | * Follow along any one of the tutorials below. 14 | 15 | ### Jupyter notebook 16 | * Jupyter notebook is still underway with some local tests/examples in the works. 17 | * For a preview check out the [California Housing](https://github.com/mchav/dataframe/blob/main/docs/California%20Housing.ipynb) notebook. 18 | 19 | ## What is exploratory data analysis? 20 | We provide a primer [here](https://github.com/mchav/dataframe/blob/main/docs/exploratory_data_analysis_primer.md) and show how to do some common analyses. 21 | 22 | ## Coming from other dataframe libraries 23 | Familiar with another dataframe library? 
Get started: 24 | * [Coming from Pandas](https://github.com/mchav/dataframe/blob/main/docs/coming_from_pandas.md) 25 | * [Coming from Polars](https://github.com/mchav/dataframe/blob/main/docs/coming_from_polars.md) 26 | * [Coming from dplyr](https://github.com/mchav/dataframe/blob/main/docs/coming_from_dplyr.md) 27 | 28 | ## Example usage 29 | 30 | ### Code example 31 | ```haskell 32 | import qualified DataFrame as D 33 | 34 | import DataFrame ((|>)) 35 | 36 | main :: IO () 37 | df <- D.readTsv "./data/chipotle.tsv" 38 | print $ df 39 | |> D.select ["item_name", "quantity"] 40 | |> D.groupBy ["item_name"] 41 | |> D.aggregate (zip (repeat "quantity") [D.Maximum, D.Mean, D.Sum]) 42 | |> D.sortBy D.Descending ["Sum_quantity"] 43 | ``` 44 | 45 | Output: 46 | 47 | ``` 48 | ---------------------------------------------------------------------------------------------------- 49 | index | item_name | Sum_quantity | Mean_quantity | Maximum_quantity 50 | ------|---------------------------------------|--------------|--------------------|----------------- 51 | Int | Text | Int | Double | Int 52 | ------|---------------------------------------|--------------|--------------------|----------------- 53 | 0 | Chips and Fresh Tomato Salsa | 130 | 1.1818181818181819 | 15 54 | 1 | Izze | 22 | 1.1 | 3 55 | 2 | Nantucket Nectar | 31 | 1.1481481481481481 | 3 56 | 3 | Chips and Tomatillo-Green Chili Salsa | 35 | 1.1290322580645162 | 3 57 | 4 | Chicken Bowl | 761 | 1.0482093663911847 | 3 58 | 5 | Side of Chips | 110 | 1.0891089108910892 | 8 59 | 6 | Steak Burrito | 386 | 1.048913043478261 | 3 60 | 7 | Steak Soft Tacos | 56 | 1.018181818181818 | 2 61 | 8 | Chips and Guacamole | 506 | 1.0563674321503131 | 4 62 | 9 | Chicken Crispy Tacos | 50 | 1.0638297872340425 | 2 63 | ``` 64 | 65 | Full example in `./app` folder using many of the constructs in the API. 
66 | 67 | ### Visual example 68 | ![Screencast of usage in GHCI](./static/example.gif) 69 | 70 | ## Future work 71 | * Jupyter/ihaskell support (soon) 72 | * Apache Arrow and Parquet compatibility 73 | * Integration with common data formats (currently only supports CSV) 74 | * Support windowed plotting (currently only supports ASCII plots) 75 | * Create a lazy API that builds an execution graph instead of running eagerly (will be used to compute on files larger than RAM) 76 | 77 | ## Contributing 78 | * Please first submit an issue and we can discuss there. 79 | -------------------------------------------------------------------------------- /app/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ExtendedDefaultRules #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | {-# LANGUAGE TypeApplications #-} 5 | {-# LANGUAGE TupleSections #-} 6 | 7 | module Main where 8 | 9 | import qualified DataFrame as D 10 | import DataFrame (dimensions, (|>)) 11 | import Data.List (delete) 12 | import Data.Maybe (fromMaybe, isJust, isNothing) 13 | import qualified Data.Text as T 14 | import qualified Data.Vector as V 15 | import qualified Data.Vector.Generic as VG 16 | import qualified Data.Vector.Unboxed as VU 17 | 18 | -- Numbers default to int and double, and strings to text 19 | default (Int, T.Text, Double) 20 | 21 | -- Example usage of DataFrame library 22 | 23 | main :: IO () 24 | main = do 25 | putStrLn "Housing" 26 | housing 27 | putStrLn $ replicate 100 '-' 28 | 29 | putStrLn "Chipotle Data" 30 | chipotle 31 | putStrLn $ replicate 100 '-' 32 | 33 | putStrLn "One Billion Row Challenge" 34 | oneBillingRowChallenge 35 | putStrLn $ replicate 100 '-' 36 | 37 | putStrLn "Covid Data" 38 | covid 39 | putStrLn $ replicate 100 '-' 40 | 41 | 42 | mean :: (Fractional a, VG.Vector v a) => v a -> a 43 | mean xs = VG.sum xs / fromIntegral (VG.length xs) 44 | 45 | oneBillingRowChallenge :: IO () 46 
| oneBillingRowChallenge = do 47 | parsed <- D.readSeparated ';' D.defaultOptions "./data/measurements.txt" 48 | print $ 49 | parsed 50 | |> D.groupBy ["City"] 51 | |> D.reduceBy (\v -> (VG.minimum v, mean @Double v, VG.maximum v)) "Measurement" 52 | |> D.sortBy D.Ascending ["City"] 53 | 54 | housing :: IO () 55 | housing = do 56 | parsed <- D.readCsv "./data/housing.csv" 57 | 58 | print $ D.columnInfo parsed 59 | 60 | -- Sample the first five rows. 61 | print $ D.take 5 parsed 62 | 63 | D.plotHistograms D.PlotAll D.VerticalHistogram parsed 64 | 65 | covid :: IO () 66 | covid = do 67 | rawFrame <- D.readCsv "./data/effects-of-covid-19-on-trade-at-15-december-2021-provisional.csv" 68 | print $ dimensions rawFrame 69 | print $ D.take 10 rawFrame 70 | 71 | D.plotHistograms D.PlotAll D.VerticalHistogram rawFrame 72 | 73 | -- Sum the value of all exports, grouped by direction, year and country 74 | print $ 75 | rawFrame 76 | |> D.filter "Direction" (== "Exports") 77 | |> D.select ["Direction", "Year", "Country", "Value"] 78 | |> D.groupBy ["Direction", "Year", "Country"] 79 | |> D.reduceByAgg D.Sum "Value" 80 | 81 | chipotle :: IO () 82 | chipotle = do 83 | rawFrame <- D.readTsv "./data/chipotle.tsv" 84 | print $ D.dimensions rawFrame 85 | 86 | -- Sampling the dataframe 87 | print $ D.take 5 rawFrame 88 | 89 | -- Transform the data from a raw string into 90 | -- respective types (throws error on failure) 91 | let f = 92 | rawFrame 93 | -- Change a specific order ID 94 | |> D.applyWhere (== 1) "order_id" (+ 2) "quantity" 95 | -- Index based change. 96 | |> D.applyAtIndex 0 (\n -> n - 2) "quantity" 97 | -- Custom parsing: drop dollar sign and parse price as double 98 | |> D.apply (D.readValue @Double . T.drop 1) "item_price" 99 | 100 | -- sample the dataframe. 
101 | print $ D.take 10 f 102 | 103 | -- Create a total_price column that is quantity * item_price 104 | let multiply (a :: Int) (b :: Double) = fromIntegral a * b 105 | let withTotalPrice = D.deriveFrom (["quantity", "item_price"], D.func multiply) "total_price" f 106 | 107 | -- sample a filtered subset of the dataframe 108 | putStrLn "Sample dataframe" 109 | print $ 110 | withTotalPrice 111 | |> D.select ["quantity", "item_name", "item_price", "total_price"] 112 | |> D.filter "total_price" (100.0 <) 113 | |> D.take 10 114 | 115 | -- Check how many chicken burritos were ordered. 116 | -- There are two ways of checking how many chicken burritos 117 | -- were ordered. 118 | let searchTerm = "Chicken Burrito" :: T.Text 119 | 120 | print $ 121 | f 122 | |> D.select ["item_name", "quantity"] 123 | -- It's more efficient to filter before grouping. 124 | |> D.filter "item_name" (searchTerm ==) 125 | |> D.groupBy ["item_name"] 126 | -- can also be written as: 127 | -- D.aggregate (zip (repeat "quantity") [D.Sum, D.Maximum, D.Mean]) 128 | |> D.aggregate (map ("quantity",) [D.Sum, D.Maximum, D.Mean]) 129 | -- Aggregated columns appear to be named Aggregation_column, e.g. Sum_quantity 130 | |> D.sortBy D.Descending ["Sum_quantity"] 131 | 132 | -- Similarly, we can aggregate quantities by all rows. 133 | print $ 134 | f 135 | |> D.select ["item_name", "quantity"] 136 | |> D.groupBy ["item_name"] 137 | -- Aggregate written more explicitly. 138 | -- We have the full expressiveness of Haskell and we needn't fall 139 | -- back on a DSL. 
140 | |> D.aggregate [("quantity", D.Maximum), ("quantity", D.Mean), ("quantity", D.Sum)] 141 | |> D.take 10 142 | 143 | let firstOrder = 144 | withTotalPrice 145 | |> D.filterBy (maybe False (T.isInfixOf "Guacamole")) "choice_description" 146 | |> D.filterBy (("Chicken Bowl" :: T.Text) ==) "item_name" 147 | 148 | print $ D.take 10 firstOrder 149 | -------------------------------------------------------------------------------- /benchmark/Main.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE NumericUnderscores #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | 4 | import qualified DataFrame as D 5 | import qualified Data.Vector.Unboxed as VU 6 | 7 | import Control.Monad (replicateM) 8 | import Criterion.Main 9 | import System.Random (randomRIO) 10 | 11 | stats :: Int -> IO () 12 | stats n = do 13 | ns <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0)) 14 | xs <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0)) 15 | ys <- VU.replicateM n (randomRIO (-20.0 :: Double, 20.0)) 16 | let df = D.fromList [("first", D.UnboxedColumn ns), 17 | ("second", D.UnboxedColumn xs), 18 | ("third", D.UnboxedColumn ys)] 19 | 20 | print $ D.mean "first" df 21 | print $ D.variance "second" df 22 | print $ D.correlation "second" "third" df 23 | print $ D.select ["first"] df D.|> D.take 1 24 | -- Each label is the total element count: `stats n` fills three columns of n values. 25 | main = defaultMain [ 26 | bgroup "stats" [ bench "300_000" $ nfIO (stats 100_000) 27 | , bench "3_000_000" $ nfIO (stats 1_000_000) 28 | , bench "30_000_000" $ nfIO (stats 10_000_000) 29 | ] 30 | ] 31 | -------------------------------------------------------------------------------- /data/starwars.csv: -------------------------------------------------------------------------------- 1 | name,height,mass,hair_color,skin_color,eye_color,birth_year,sex,gender,homeworld,species,films,vehicles,starships 2 | Luke Skywalker,172,77,blond,fair,blue,19,male,masculine,Tatooine,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi, Revenge of 
the Sith, The Force Awakens","Snowspeeder, Imperial Speeder Bike","X-wing, Imperial shuttle" 3 | C-3PO,167,75,NA,gold,yellow,112,none,masculine,Tatooine,Droid,"A New Hope, The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith",, 4 | R2-D2,96,32,NA,"white, blue",red,33,none,masculine,Naboo,Droid,"A New Hope, The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith, The Force Awakens",, 5 | Darth Vader,202,136,none,white,yellow,41.9,male,masculine,Tatooine,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi, Revenge of the Sith",,TIE Advanced x1 6 | Leia Organa,150,49,brown,light,brown,19,female,feminine,Alderaan,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi, Revenge of the Sith, The Force Awakens",Imperial Speeder Bike, 7 | Owen Lars,178,120,"brown, grey",light,blue,52,male,masculine,Tatooine,Human,"A New Hope, Attack of the Clones, Revenge of the Sith",, 8 | Beru Whitesun Lars,165,75,brown,light,blue,47,female,feminine,Tatooine,Human,"A New Hope, Attack of the Clones, Revenge of the Sith",, 9 | R5-D4,97,32,NA,"white, red",red,NA,none,masculine,Tatooine,Droid,A New Hope,, 10 | Biggs Darklighter,183,84,black,light,brown,24,male,masculine,Tatooine,Human,A New Hope,,X-wing 11 | Obi-Wan Kenobi,182,77,"auburn, white",fair,blue-gray,57,male,masculine,Stewjon,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith",Tribubble bongo,"Jedi starfighter, Trade Federation cruiser, Naboo star skiff, Jedi Interceptor, Belbullab-22 starfighter" 12 | Anakin Skywalker,188,84,blond,fair,blue,41.9,male,masculine,Tatooine,Human,"The Phantom Menace, Attack of the Clones, Revenge of the Sith","Zephyr-G swoop bike, XJ-6 airspeeder","Naboo fighter, Trade Federation cruiser, Jedi Interceptor" 13 | Wilhuff Tarkin,180,NA,"auburn, grey",fair,blue,64,male,masculine,Eriadu,Human,"A New Hope, Revenge 
of the Sith",, 14 | Chewbacca,228,112,brown,unknown,blue,200,male,masculine,Kashyyyk,Wookiee,"A New Hope, The Empire Strikes Back, Return of the Jedi, Revenge of the Sith, The Force Awakens",AT-ST,"Millennium Falcon, Imperial shuttle" 15 | Han Solo,180,80,brown,fair,brown,29,male,masculine,Corellia,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi, The Force Awakens",,"Millennium Falcon, Imperial shuttle" 16 | Greedo,173,74,NA,green,black,44,male,masculine,Rodia,Rodian,A New Hope,, 17 | Jabba Desilijic Tiure,175,1358,NA,"green-tan, brown",orange,600,hermaphroditic,masculine,Nal Hutta,Hutt,"A New Hope, Return of the Jedi, The Phantom Menace",, 18 | Wedge Antilles,170,77,brown,fair,hazel,21,male,masculine,Corellia,Human,"A New Hope, The Empire Strikes Back, Return of the Jedi",Snowspeeder,X-wing 19 | Jek Tono Porkins,180,110,brown,fair,blue,NA,NA,NA,Bestine IV,NA,A New Hope,,X-wing 20 | Yoda,66,17,white,green,brown,896,male,masculine,NA,Yoda's species,"The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith",, 21 | Palpatine,170,75,grey,pale,yellow,82,male,masculine,Naboo,Human,"The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith",, 22 | Boba Fett,183,78.2,black,fair,brown,31.5,male,masculine,Kamino,Human,"The Empire Strikes Back, Return of the Jedi, Attack of the Clones",,Slave 1 23 | IG-88,200,140,none,metal,red,15,none,masculine,NA,Droid,The Empire Strikes Back,, 24 | Bossk,190,113,none,green,red,53,male,masculine,Trandosha,Trandoshan,The Empire Strikes Back,, 25 | Lando Calrissian,177,79,black,dark,brown,31,male,masculine,Socorro,Human,"The Empire Strikes Back, Return of the Jedi",,Millennium Falcon 26 | Lobot,175,79,none,light,blue,37,male,masculine,Bespin,Human,The Empire Strikes Back,, 27 | Ackbar,180,83,none,brown mottle,orange,41,male,masculine,Mon Cala,Mon Calamari,"Return of the Jedi, The Force Awakens",, 28 | Mon 
Mothma,150,NA,auburn,fair,blue,48,female,feminine,Chandrila,Human,Return of the Jedi,, 29 | Arvel Crynyd,NA,NA,brown,fair,brown,NA,male,masculine,NA,Human,Return of the Jedi,,A-wing 30 | Wicket Systri Warrick,88,20,brown,brown,brown,8,male,masculine,Endor,Ewok,Return of the Jedi,, 31 | Nien Nunb,160,68,none,grey,black,NA,male,masculine,Sullust,Sullustan,Return of the Jedi,,Millennium Falcon 32 | Qui-Gon Jinn,193,89,brown,fair,blue,92,male,masculine,NA,Human,The Phantom Menace,Tribubble bongo, 33 | Nute Gunray,191,90,none,mottled green,red,NA,male,masculine,Cato Neimoidia,Neimodian,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",, 34 | Finis Valorum,170,NA,blond,fair,blue,91,male,masculine,Coruscant,Human,The Phantom Menace,, 35 | Padmé Amidala,185,45,brown,light,brown,46,female,feminine,Naboo,Human,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",,"Naboo fighter, H-type Nubian yacht, Naboo star skiff" 36 | Jar Jar Binks,196,66,none,orange,orange,52,male,masculine,Naboo,Gungan,"The Phantom Menace, Attack of the Clones",, 37 | Roos Tarpals,224,82,none,grey,orange,NA,male,masculine,Naboo,Gungan,The Phantom Menace,, 38 | Rugor Nass,206,NA,none,green,orange,NA,male,masculine,Naboo,Gungan,The Phantom Menace,, 39 | Ric Olié,183,NA,brown,fair,blue,NA,male,masculine,Naboo,Human,The Phantom Menace,,Naboo Royal Starship 40 | Watto,137,NA,black,"blue, grey",yellow,NA,male,masculine,Toydaria,Toydarian,"The Phantom Menace, Attack of the Clones",, 41 | Sebulba,112,40,none,"grey, red",orange,NA,male,masculine,Malastare,Dug,The Phantom Menace,, 42 | Quarsh Panaka,183,NA,black,dark,brown,62,male,masculine,Naboo,Human,The Phantom Menace,, 43 | Shmi Skywalker,163,NA,black,fair,brown,72,female,feminine,Tatooine,Human,"The Phantom Menace, Attack of the Clones",, 44 | Darth Maul,175,80,none,red,yellow,54,male,masculine,Dathomir,Zabrak,The Phantom Menace,Sith speeder,Scimitar 45 | Bib Fortuna,180,NA,none,pale,pink,NA,male,masculine,Ryloth,Twi'lek,Return of 
the Jedi,, 46 | Ayla Secura,178,55,none,blue,hazel,48,female,feminine,Ryloth,Twi'lek,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",, 47 | Ratts Tyerel,79,15,none,"grey, blue",unknown,NA,male,masculine,Aleen Minor,Aleena,The Phantom Menace,, 48 | Dud Bolt,94,45,none,"blue, grey",yellow,NA,male,masculine,Vulpter,Vulptereen,The Phantom Menace,, 49 | Gasgano,122,NA,none,"white, blue",black,NA,male,masculine,Troiken,Xexto,The Phantom Menace,, 50 | Ben Quadinaros,163,65,none,"grey, green, yellow",orange,NA,male,masculine,Tund,Toong,The Phantom Menace,, 51 | Mace Windu,188,84,none,dark,brown,72,male,masculine,Haruun Kal,Human,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",, 52 | Ki-Adi-Mundi,198,82,white,pale,yellow,92,male,masculine,Cerea,Cerean,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",, 53 | Kit Fisto,196,87,none,green,black,NA,male,masculine,Glee Anselm,Nautolan,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",, 54 | Eeth Koth,171,NA,black,brown,brown,NA,male,masculine,Iridonia,Zabrak,"The Phantom Menace, Revenge of the Sith",, 55 | Adi Gallia,184,50,none,dark,blue,NA,female,feminine,Coruscant,Tholothian,"The Phantom Menace, Revenge of the Sith",, 56 | Saesee Tiin,188,NA,none,pale,orange,NA,male,masculine,Iktotch,Iktotchi,"The Phantom Menace, Revenge of the Sith",, 57 | Yarael Poof,264,NA,none,white,yellow,NA,male,masculine,Quermia,Quermian,The Phantom Menace,, 58 | Plo Koon,188,80,none,orange,black,22,male,masculine,Dorin,Kel Dor,"The Phantom Menace, Attack of the Clones, Revenge of the Sith",,Jedi starfighter 59 | Mas Amedda,196,NA,none,blue,blue,NA,male,masculine,Champala,Chagrian,"The Phantom Menace, Attack of the Clones",, 60 | Gregar Typho,185,85,black,dark,brown,NA,NA,NA,Naboo,NA,Attack of the Clones,,Naboo fighter 61 | Cordé,157,NA,brown,light,brown,NA,NA,NA,Naboo,NA,Attack of the Clones,, 62 | Cliegg Lars,183,NA,brown,fair,blue,82,male,masculine,Tatooine,Human,Attack of the Clones,, 63 | 
Poggle the Lesser,183,80,none,green,yellow,NA,male,masculine,Geonosis,Geonosian,"Attack of the Clones, Revenge of the Sith",, 64 | Luminara Unduli,170,56.2,black,yellow,blue,58,female,feminine,Mirial,Mirialan,"Attack of the Clones, Revenge of the Sith",, 65 | Barriss Offee,166,50,black,yellow,blue,40,female,feminine,Mirial,Mirialan,Attack of the Clones,, 66 | Dormé,165,NA,brown,light,brown,NA,female,feminine,Naboo,Human,Attack of the Clones,, 67 | Dooku,193,80,white,fair,brown,102,male,masculine,Serenno,Human,"Attack of the Clones, Revenge of the Sith",Flitknot speeder, 68 | Bail Prestor Organa,191,NA,black,tan,brown,67,male,masculine,Alderaan,Human,"Attack of the Clones, Revenge of the Sith",, 69 | Jango Fett,183,79,black,tan,brown,66,male,masculine,Concord Dawn,Human,Attack of the Clones,, 70 | Zam Wesell,168,55,blonde,"fair, green, yellow",yellow,NA,female,feminine,Zolan,Clawdite,Attack of the Clones,Koro-2 Exodrive airspeeder, 71 | Dexter Jettster,198,102,none,brown,yellow,NA,male,masculine,Ojom,Besalisk,Attack of the Clones,, 72 | Lama Su,229,88,none,grey,black,NA,male,masculine,Kamino,Kaminoan,Attack of the Clones,, 73 | Taun We,213,NA,none,grey,black,NA,female,feminine,Kamino,Kaminoan,Attack of the Clones,, 74 | Jocasta Nu,167,NA,white,fair,blue,NA,female,feminine,Coruscant,Human,Attack of the Clones,, 75 | R4-P17,96,NA,none,"silver, red","red, blue",NA,none,feminine,NA,Droid,"Attack of the Clones, Revenge of the Sith",, 76 | Wat Tambor,193,48,none,"green, grey",unknown,NA,male,masculine,Skako,Skakoan,Attack of the Clones,, 77 | San Hill,191,NA,none,grey,gold,NA,male,masculine,Muunilinst,Muun,Attack of the Clones,, 78 | Shaak Ti,178,57,none,"red, blue, white",black,NA,female,feminine,Shili,Togruta,"Attack of the Clones, Revenge of the Sith",, 79 | Grievous,216,159,none,"brown, white","green, yellow",NA,male,masculine,Kalee,Kaleesh,Revenge of the Sith,Tsmeu-6 personal wheel bike,Belbullab-22 starfighter 80 | 
Tarfful,234,136,brown,brown,blue,NA,male,masculine,Kashyyyk,Wookiee,Revenge of the Sith,, 81 | Raymus Antilles,188,79,brown,light,brown,NA,male,masculine,Alderaan,Human,"A New Hope, Revenge of the Sith",, 82 | Sly Moore,178,48,none,pale,white,NA,NA,NA,Umbara,NA,"Attack of the Clones, Revenge of the Sith",, 83 | Tion Medon,206,80,none,grey,black,NA,male,masculine,Utapau,Pau'an,Revenge of the Sith,, 84 | Finn,NA,NA,black,dark,dark,NA,male,masculine,NA,Human,The Force Awakens,, 85 | Rey,NA,NA,brown,light,hazel,NA,female,feminine,NA,Human,The Force Awakens,, 86 | Poe Dameron,NA,NA,brown,light,brown,NA,male,masculine,NA,Human,The Force Awakens,,X-wing 87 | BB8,NA,NA,none,none,black,NA,none,masculine,NA,Droid,The Force Awakens,, 88 | Captain Phasma,NA,NA,none,none,unknown,NA,female,feminine,NA,Human,The Force Awakens,, 89 | -------------------------------------------------------------------------------- /dataframe.cabal: -------------------------------------------------------------------------------- 1 | cabal-version: 2.4 2 | name: dataframe 3 | version: 0.1.0.3 4 | 5 | synopsis: An intuitive, dynamically-typed DataFrame library. 6 | 7 | description: An intuitive, dynamically-typed DataFrame library for exploratory data analysis. 
8 | 9 | bug-reports: https://github.com/mchav/dataframe/issues 10 | license: GPL-3.0-or-later 11 | license-file: LICENSE 12 | author: Michael Chavinda 13 | maintainer: mschavinda@gmail.com 14 | 15 | copyright: (c) 2024-2024 Michael Chavinda 16 | category: Data 17 | tested-with: GHC ==9.8.3 || ==9.6.6 || == 9.4.8 18 | extra-doc-files: CHANGELOG.md README.md 19 | 20 | source-repository head 21 | type: git 22 | location: https://github.com/mchav/dataframe 23 | 24 | library 25 | exposed-modules: DataFrame 26 | other-modules: DataFrame.Internal.Types, 27 | DataFrame.Internal.Function, 28 | DataFrame.Internal.Parsing, 29 | DataFrame.Internal.Column, 30 | DataFrame.Display.Terminal.PrettyPrint, 31 | DataFrame.Display.Terminal.Colours, 32 | DataFrame.Internal.DataFrame, 33 | DataFrame.Internal.Row, 34 | DataFrame.Errors, 35 | DataFrame.Operations.Core, 36 | DataFrame.Operations.Subset, 37 | DataFrame.Operations.Sorting, 38 | DataFrame.Operations.Statistics, 39 | DataFrame.Operations.Transformations, 40 | DataFrame.Operations.Typing, 41 | DataFrame.Operations.Aggregation, 42 | DataFrame.Display.Terminal.Plot, 43 | DataFrame.IO.CSV 44 | build-depends: base >= 4.17.2.0 && < 4.21, 45 | array ^>= 0.5, 46 | attoparsec >= 0.12 && <= 0.14.4, 47 | bytestring >= 0.11 && <= 0.12.2.0, 48 | containers >= 0.6.7 && < 0.8, 49 | directory >= 1.3.0.0 && <= 1.3.9.0, 50 | hashable >= 1.2 && <= 1.5.0.0, 51 | statistics >= 0.16.2.1 && <= 0.16.3.0, 52 | text >= 2.0 && <= 2.1.2, 53 | time >= 1.12 && <= 1.14, 54 | vector ^>= 0.13, 55 | vector-algorithms ^>= 0.9 56 | hs-source-dirs: src 57 | default-language: Haskell2010 58 | 59 | executable dataframe 60 | main-is: Main.hs 61 | other-modules: DataFrame, 62 | DataFrame.Internal.Types, 63 | DataFrame.Internal.Function, 64 | DataFrame.Internal.Parsing, 65 | DataFrame.Internal.Column, 66 | DataFrame.Display.Terminal.PrettyPrint, 67 | DataFrame.Display.Terminal.Colours, 68 | DataFrame.Internal.DataFrame, 69 | DataFrame.Internal.Row, 70 | 
DataFrame.Errors, 71 | DataFrame.Operations.Core, 72 | DataFrame.Operations.Subset, 73 | DataFrame.Operations.Sorting, 74 | DataFrame.Operations.Statistics, 75 | DataFrame.Operations.Transformations, 76 | DataFrame.Operations.Typing, 77 | DataFrame.Operations.Aggregation, 78 | DataFrame.Display.Terminal.Plot, 79 | DataFrame.IO.CSV 80 | build-depends: base >= 4.17.2.0 && < 4.21, 81 | array ^>= 0.5, 82 | attoparsec >= 0.12 && <= 0.14.4, 83 | bytestring >= 0.11 && <= 0.12.2.0, 84 | containers >= 0.6.7 && < 0.8, 85 | directory >= 1.3.0.0 && <= 1.3.9.0, 86 | hashable >= 1.2 && <= 1.5.0.0, 87 | statistics >= 0.16.2.1 && <= 0.16.3.0, 88 | text >= 2.0 && <= 2.1.2, 89 | time >= 1.12 && <= 1.14, 90 | vector ^>= 0.13, 91 | vector-algorithms ^>= 0.9 92 | hs-source-dirs: app, 93 | src 94 | default-language: Haskell2010 95 | 96 | benchmark dataframe-benchmark 97 | type: exitcode-stdio-1.0 98 | main-is: Main.hs 99 | hs-source-dirs: benchmark 100 | build-depends: base >= 4.17.2.0 && < 4.21, 101 | criterion >= 1 && <= 1.6.4.0, 102 | text >= 2.0 && <= 2.1.2, 103 | random >= 1 && <= 1.3.1, 104 | vector ^>= 0.13, 105 | dataframe 106 | default-language: Haskell2010 107 | 108 | test-suite tests 109 | type: exitcode-stdio-1.0 110 | main-is: Main.hs 111 | other-modules: Assertions, 112 | Operations.Apply, 113 | Operations.Derive, 114 | Operations.Filter, 115 | Operations.GroupBy, 116 | Operations.InsertColumn, 117 | Operations.Sort, 118 | Operations.Take 119 | build-depends: base >= 4.17.2.0 && < 4.21, 120 | HUnit ^>= 1.6, 121 | random >= 1, 122 | random-shuffle >= 0.0.4, 123 | text >= 2.0, 124 | time >= 1.12, 125 | vector ^>= 0.13, 126 | dataframe 127 | hs-source-dirs: tests 128 | default-language: Haskell2010 129 | -------------------------------------------------------------------------------- /docs/coming_from_dplyr.md: -------------------------------------------------------------------------------- 1 | # Coming from dplyr 2 | 3 | This tutorial will walk through the examples in 
dplyr's [mini tutorial](https://dplyr.tidyverse.org/) showing how concepts in dplyr map to dataframe. 4 | 5 | ## Filtering 6 | Filtering looks similar in both libraries. 7 | 8 | ```r 9 | starwars %>% 10 | filter(species == "Droid") 11 | #> # A tibble: 6 × 14 12 | #> name height mass hair_color skin_color eye_color birth_year sex gender 13 | #> 14 | #> 1 C-3PO 167 75 gold yellow 112 none masculi… 15 | #> 2 R2-D2 96 32 white, blue red 33 none masculi… 16 | #> 3 R5-D4 97 32 white, red red NA none masculi… 17 | #> 4 IG-88 200 140 none metal red 15 none masculi… 18 | #> 5 R4-P17 96 NA none silver, red red, blue NA none feminine 19 | #> # ℹ 1 more row 20 | #> # ℹ 5 more variables: homeworld , species , films , 21 | #> # vehicles , starships 22 | ``` 23 | 24 | ```haskell 25 | starwars |> D.filter "species" (("Droid" :: Str.Text) ==) 26 | |> D.take 10 27 | ``` 28 | 29 | ``` 30 | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 31 | index | name | height | mass | hair_color | skin_color | eye_color | birth_year | sex | gender | homeworld | species | films | vehicles | starships 32 | ------|--------|-----------|-----------|------------|-------------|-----------|------------|------|-----------|-----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------|------------|----------- 33 | Int | Text | Maybe Int | Maybe Int | Text | Text | Text | Maybe Int | Text | Text | Text | Text | Text | Maybe Text | Maybe Text 34 | 
------|--------|-----------|-----------|------------|-------------|-----------|------------|------|-----------|-----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------|------------|----------- 35 | 0 | C-3PO | Just 167 | Just 75 | NA | gold | yellow | Just 112 | none | masculine | Tatooine | Droid | A New Hope, The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith | Nothing | Nothing 36 | 1 | R2-D2 | Just 96 | Just 32 | NA | white, blue | red | Just 33 | none | masculine | Naboo | Droid | A New Hope, The Empire Strikes Back, Return of the Jedi, The Phantom Menace, Attack of the Clones, Revenge of the Sith, The Force Awakens | Nothing | Nothing 37 | 2 | R5-D4 | Just 97 | Just 32 | NA | white, red | red | Nothing | none | masculine | Tatooine | Droid | A New Hope | Nothing | Nothing 38 | 3 | IG-88 | Just 200 | Just 140 | none | metal | red | Just 15 | none | masculine | NA | Droid | The Empire Strikes Back | Nothing | Nothing 39 | 4 | R4-P17 | Just 96 | Nothing | none | silver, red | red, blue | Nothing | none | feminine | NA | Droid | Attack of the Clones, Revenge of the Sith | Nothing | Nothing 40 | 5 | BB8 | Nothing | Nothing | none | none | black | Nothing | none | masculine | NA | Droid | The Force Awakens | Nothing | Nothing 41 | ``` 42 | 43 | ## Selecting columns 44 | Select looks similar except in Haskell we take as argument a list of strings instead of a mix of predicates and strings. 
45 | 46 | ```r 47 | starwars %>% 48 | select(name, ends_with("color")) 49 | #> # A tibble: 87 × 4 50 | #> name hair_color skin_color eye_color 51 | #> 52 | #> 1 Luke Skywalker blond fair blue 53 | #> 2 C-3PO gold yellow 54 | #> 3 R2-D2 white, blue red 55 | #> 4 Darth Vader none white yellow 56 | #> 5 Leia Organa brown light brown 57 | #> # ℹ 82 more rows 58 | ``` 59 | 60 | To get the same predicate-like functionality we use `selectBy`. 61 | 62 | ```haskell 63 | starwars |> D.selectBy (\cname -> cname == "name" || T.isSuffixOf "color" cname) 64 | |> D.take 10 65 | ``` 66 | 67 | 68 | ``` 69 | -------------------------------------------------------------------- 70 | index | name | hair_color | skin_color | eye_color 71 | ------|--------------------|---------------|-------------|---------- 72 | Int | Text | Text | Text | Text 73 | ------|--------------------|---------------|-------------|---------- 74 | 0 | Luke Skywalker | blond | fair | blue 75 | 1 | C-3PO | NA | gold | yellow 76 | 2 | R2-D2 | NA | white, blue | red 77 | 3 | Darth Vader | none | white | yellow 78 | 4 | Leia Organa | brown | light | brown 79 | 5 | Owen Lars | brown, grey | light | blue 80 | 6 | Beru Whitesun Lars | brown | light | blue 81 | 7 | R5-D4 | NA | white, red | red 82 | 8 | Biggs Darklighter | black | light | brown 83 | 9 | Obi-Wan Kenobi | auburn, white | fair | blue-gray 84 | ``` 85 | 86 | ## Transforming columns 87 | 88 | R has a general mutate function that takes in a mix of expressions and column names. 89 | 90 | ```r 91 | starwars %>% 92 | mutate(name, bmi = mass / ((height / 100) ^ 2)) %>% 93 | select(name:mass, bmi) 94 | #> # A tibble: 87 × 4 95 | #> name height mass bmi 96 | #> 97 | #> 1 Luke Skywalker 172 77 26.0 98 | #> 2 C-3PO 167 75 26.9 99 | #> 3 R2-D2 96 32 34.7 100 | #> 4 Darth Vader 202 136 33.3 101 | #> 5 Leia Organa 150 49 21.8 102 | #> # ℹ 82 more rows 103 | ``` 104 | 105 | Our logic is more explicit about what's going on. 
Because both our fields are nullable/optional we have to specify the type. 106 | 107 | ```haskell 108 | bmi (w :: Int) (h :: Int) = (fromIntegral w) / (fromIntegral h / 100) ** 2 :: Double 109 | 110 | starwars 111 | |> D.selectRange ("name", "mass") 112 | -- mass and height are optionals so we combine them with 113 | -- Haskell's Applicative operators. 114 | |> D.deriveFrom (["mass", "height"], D.func (\w h -> bmi <$> w <*> h)) "bmi" 115 | |> D.take 10 116 | ``` 117 | 118 | ``` 119 | ------------------------------------------------------------------------------- 120 | index | name | height | mass | bmi 121 | ------|-----------------------|-----------|-----------|------------------------ 122 | Int | Text | Maybe Int | Maybe Int | Maybe Double 123 | ------|-----------------------|-----------|-----------|------------------------ 124 | 0 | Luke Skywalker | Just 172 | Just 77 | Just 26.027582477014604 125 | 1 | C-3PO | Just 167 | Just 75 | Just 26.89232313815483 126 | 2 | R2-D2 | Just 96 | Just 32 | Just 34.72222222222222 127 | 3 | Darth Vader | Just 202 | Just 136 | Just 33.33006567983531 128 | 4 | Leia Organa | Just 150 | Just 49 | Just 21.77777777777778 129 | 5 | Owen Lars | Just 178 | Just 120 | Just 37.87400580734756 130 | 6 | Beru Whitesun Lars | Just 165 | Just 75 | Just 27.548209366391188 131 | 7 | R5-D4 | Just 97 | Just 32 | Just 34.009990434690195 132 | 8 | Biggs Darklighter | Just 183 | Just 84 | Just 25.082863029651524 133 | 9 | Obi-Wan Kenobi | Just 182 | Just 77 | Just 23.24598478444632 134 | ``` 135 | 136 | Haskell's applicative syntax does take some getting used to. 137 | 138 | `f <$> a` means apply f to the thing inside the "container". In this 139 | case the container (or more infamously the monad) is of type `Maybe`. 140 | So this can also be written as `fmap f a`. 141 | 142 | But this only works if our `f` takes a single argument. If it takes 143 | two arguments then we use `<*>` to specify the second argument. 
144 | 145 | So, applying bmi to two optionals can be written as: 146 | 147 | ```haskell 148 | ghci> fmap (+) (Just 2) <*> Just 2 149 | Just 4 150 | ghci> (+) <$> Just 2 <*> Just 2 151 | Just 4 152 | ``` 153 | 154 | You'll find a wealth of functions for dealing with optionals in the package 155 | `Data.Maybe`. 156 | 157 | ## Sorting 158 | 159 | ```r 160 | starwars %>% 161 | arrange(desc(mass)) 162 | #> # A tibble: 87 × 14 163 | #> name height mass hair_color skin_color eye_color birth_year sex gender 164 | #> 165 | #> 1 Jabba De… 175 1358 green-tan… orange 600 herm… mascu… 166 | #> 2 Grievous 216 159 none brown, wh… green, y… NA male mascu… 167 | #> 3 IG-88 200 140 none metal red 15 none mascu… 168 | #> 4 Darth Va… 202 136 none white yellow 41.9 male mascu… 169 | #> 5 Tarfful 234 136 brown brown blue NA male mascu… 170 | #> # ℹ 82 more rows 171 | #> # ℹ 5 more variables: homeworld , species , films , 172 | #> # vehicles , starships 173 | ``` 174 | 175 | ```haskell 176 | starwars |> D.sortBy D.Descending ["mass"] |> D.take 5 177 | ``` 178 | 179 | ``` 180 | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- 181 | index | name | height | mass | hair_color | skin_color | eye_color | birth_year | sex | gender | homeworld | species | films | vehicles | starships 182 | ------|-----------------------|-----------|-----------|------------|------------------|---------------|------------|----------------|-----------|-----------|---------|------------------------------------------------------------------------------|------------------------------------|-------------------------------- 183 | Int | Text | Maybe Int | Maybe Int | Text | Text | Text | Maybe Int | Text | Text | Text | Text | Text | Maybe Text | Maybe Text 
184 | ------|-----------------------|-----------|-----------|------------|------------------|---------------|------------|----------------|-----------|-----------|---------|------------------------------------------------------------------------------|------------------------------------|-------------------------------- 185 | 0 | Jabba Desilijic Tiure | Just 175 | Just 1358 | NA | green-tan, brown | orange | Just 600 | hermaphroditic | masculine | Nal Hutta | Hutt | A New Hope, Return of the Jedi, The Phantom Menace | Nothing | Nothing 186 | 1 | Grievous | Just 216 | Just 159 | none | brown, white | green, yellow | Nothing | male | masculine | Kalee | Kaleesh | Revenge of the Sith | Just "Tsmeu-6 personal wheel bike" | Just "Belbullab-22 starfighter" 187 | 2 | IG-88 | Just 200 | Just 140 | none | metal | red | Just 15 | none | masculine | NA | Droid | The Empire Strikes Back | Nothing | Nothing 188 | 3 | Tarfful | Just 234 | Just 136 | brown | brown | blue | Nothing | male | masculine | Kashyyyk | Wookiee | Revenge of the Sith | Nothing | Nothing 189 | 4 | Darth Vader | Just 202 | Just 136 | none | white | yellow | Nothing | male | masculine | Tatooine | Human | A New Hope, The Empire Strikes Back, Return of the Jedi, Revenge of the Sith | Nothing | Just "TIE Advanced x1" 190 | ``` 191 | 192 | ## Grouping and aggregating 193 | 194 | ```r 195 | starwars %>% 196 | group_by(species) %>% 197 | summarise( 198 | n = n(), 199 | mass = mean(mass, na.rm = TRUE) 200 | ) %>% 201 | filter( 202 | n > 1, 203 | mass > 50 204 | ) 205 | #> # A tibble: 9 × 3 206 | #> species n mass 207 | #> 208 | #> 1 Droid 6 69.8 209 | #> 2 Gungan 3 74 210 | #> 3 Human 35 81.3 211 | #> 4 Kaminoan 2 88 212 | #> 5 Mirialan 2 53.1 213 | #> # ℹ 4 more rows 214 | ``` 215 | 216 | ```haskell 217 | starwars |> D.select ["species", "mass"] 218 | |> D.groupByAgg D.Count ["species"] 219 | -- This will be saved in a variable called "Mean_mass" 220 | |> D.reduceByAgg D.Mean "mass" 221 | -- Always better to be 
explicit about types for 222 | -- numbers but you can also turn on defaults 223 | -- to save keystrokes. 224 | |> D.filterWhere (["Count", "Mean_mass"], 225 | D.func (\(n :: Int) (mass :: Double) -> n > 1 && mass > 50)) 226 | ``` 227 | 228 | ``` 229 | -------------------------------------------- 230 | index | species | Mean_mass | Count 231 | ------|----------|-------------------|------ 232 | Int | Text | Double | Int 233 | ------|----------|-------------------|------ 234 | 0 | Human | 81.47368421052632 | 35 235 | 1 | Droid | 69.75 | 6 236 | 2 | Wookiee | 124.0 | 2 237 | 3 | NA | 81.0 | 4 238 | 4 | Gungan | 74.0 | 3 239 | 5 | Zabrak | 80.0 | 2 240 | 6 | Twi'lek | 55.0 | 2 241 | 7 | Kaminoan | 88.0 | 2 242 | ``` 243 | -------------------------------------------------------------------------------- /docs/coming_from_pandas.md: -------------------------------------------------------------------------------- 1 | # Coming from pandas 2 | 3 | We'll be porting over concepts from [10 minutes to Pandas](https://pandas.pydata.org/docs/user_guide/10min.html). 4 | 5 | ## Basic Data Structures 6 | 7 | A pandas `Series` maps to a `Column`. `Series` are indexable (labelled) arrays. We currently don't support indexing so `Column`s aren't meant to be manipulated directly so we don't focus on them too much. 8 | 9 | A `DataFrame` maps to a `DataFrame` as expected. Our dataframes are essentially a list of `Vector`s with some metadata for managing state. 10 | 11 | ## Creating our structures 12 | 13 | Creating a series. 
14 | 15 | ```python 16 | python> s = pd.Series([1, 3, 5, np.nan, 6, 8]) 17 | python> s 18 | 0 1.0 19 | 1 3.0 20 | 2 5.0 21 | 3 NaN 22 | 4 6.0 23 | 5 8.0 24 | dtype: float64 25 | ``` 26 | 27 | ```haskell 28 | ghci> import qualified DataFrame as D 29 | ghci> D.toColumn [1, 3, 5, read @Float "NaN", 6, 8] 30 | [1.0,3.0,5.0,NaN,6.0,8.0] 31 | ``` 32 | 33 | ```python 34 | python> dates = pd.date_range("20130101", periods=6) 35 | python> dates 36 | DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04', 37 | '2013-01-05', '2013-01-06'], 38 | dtype='datetime64[ns]', freq='D') 39 | ``` 40 | 41 | ```haskell 42 | ghci> import Data.Time.Calendar 43 | ghci> dates = D.toColumn $ Prelude.take 6 $ [fromGregorian 2013 01 01..] 44 | ghci> dates 45 | [2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06] 46 | ``` 47 | 48 | Use the series to create a dataframe. 49 | 50 | ```python 51 | python> df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list("ABCD")) 52 | python> df 53 | A B C D 54 | 2013-01-01 0.469112 -0.282863 -1.509059 -1.135632 55 | 2013-01-02 1.212112 -0.173215 0.119209 -1.044236 56 | 2013-01-03 -0.861849 -2.104569 -0.494929 1.071804 57 | 2013-01-04 0.721555 -0.706771 -1.039575 0.271860 58 | 2013-01-05 -0.424972 0.567020 0.276232 -1.087401 59 | 2013-01-06 -0.673690 0.113648 -1.478427 0.524988 60 | ``` 61 | 62 | ```haskell 63 | ghci> import qualified Data.Vector as V 64 | ghci> import System.Random (randomRIO) 65 | ghci> import Control.Monad (replicateM) 66 | ghci> import Data.List (foldl') 67 | ghci> :set -XOverloadedStrings 68 | ghci> initDf = D.fromList [("date", dates)] 69 | ghci> ns <- replicateM 4 (replicateM 6 (randomRIO (-2.0, 2.0))) 70 | ghci> df = foldl' (\d (name, col) -> D.insertColumn name (V.fromList col) d) initDf (zip ["A","B","C","D"] ns) 71 | ghci> df 72 | ------------------------------------------------------------------------------------------------------------ 73 | index | date | A | B | C | D 74 | 
------|------------|---------------------|----------------------|----------------------|-------------------- 75 | Int | Day | Double | Double | Double | Double 76 | ------|------------|---------------------|----------------------|----------------------|-------------------- 77 | 0 | 2013-01-01 | 0.49287792598710745 | 1.2126312556288785 | -1.3553292904555625 | 1.8491213627748553 78 | 1 | 2013-01-02 | 0.7936547276080512 | -1.5209756494542028 | -0.5208055385837551 | 0.8895325450813525 79 | 2 | 2013-01-03 | 1.8883976214395153 | 1.3453541205495676 | -1.1801018894304223 | 0.20583994035730901 80 | 3 | 2013-01-04 | -1.3262867911904324 | -0.37375298679005686 | -0.8580515357149543 | 1.4681616115128593 81 | 4 | 2013-01-05 | 1.9068894062167745 | 0.792553168600036 | -0.13526265076664545 | -1.6239378251651466 82 | 5 | 2013-01-06 | -0.5541246187320041 | -1.5791034339829042 | -1.5650415391333796 | -1.7802523632196152 83 | ``` 84 | 85 | As hinted in the previous example we can create a dataframe with `fromList`. This function takes in a list of tuples. We don't broadcast values like python does i.e. if you put in a single value into a column all other values will be null/nothing. But we'll detail how to get the same functionality. 86 | 87 | ```python 88 | df2 = pd.DataFrame( 89 | { 90 | "A": 1.0, 91 | "B": pd.Timestamp("20130102"), 92 | "C": pd.Series(1, index=list(range(4)), dtype="float32"), 93 | "D": np.array([3] * 4, dtype="int32"), 94 | "E": pd.Categorical(["test", "train", "test", "train"]), 95 | "F": "foo", 96 | } 97 | ) 98 | 99 | # Result 100 | # df2 101 | # A B C D E F 102 | # 0 1.0 2013-01-02 1.0 3 test foo 103 | # 1 1.0 2013-01-02 1.0 3 train foo 104 | # 2 1.0 2013-01-02 1.0 3 test foo 105 | # 3 1.0 2013-01-02 1.0 3 train foo 106 | 107 | ``` 108 | 109 | ```haskell 110 | -- All our data types must be printable and orderable. 
111 | data Transport = Test | Train deriving (Show, Ord, Eq) 112 | ghci> :{ 113 | ghci| df = D.fromList [ 114 | ghci| ("A", D.toColumn (replicate 4 1.0)), 115 | ghci| ("B", D.toColumn (replicate 4 (fromGregorian 2013 01 02))), 116 | ghci| ("C", D.toColumn (replicate 4 (1.0 :: Float))), 117 | ghci| ("D", D.toColumn (replicate 4 (3 :: Int))), 118 | ghci| ("E", D.toColumn (take 4 $ cycle [Test, Train])), 119 | ghci| ("F", D.toColumn (replicate 4 "foo"))] 120 | ghci|:} 121 | ghci> df 122 | -------------------------------------------------------------- 123 | index | A | B | C | D | E | F 124 | ------|--------|------------|-------|-----|-----------|------- 125 | Int | Double | Day | Float | Int | Transport | [Char] 126 | ------|--------|------------|-------|-----|-----------|------- 127 | 0 | 1.0 | 2013-01-02 | 1.0 | 3 | Test | foo 128 | 1 | 1.0 | 2013-01-02 | 1.0 | 3 | Train | foo 129 | 2 | 1.0 | 2013-01-02 | 1.0 | 3 | Test | foo 130 | 3 | 1.0 | 2013-01-02 | 1.0 | 3 | Train | foo 131 | ``` 132 | 133 | Rather than label a string value as categorical we create a type that encapsulates the value. 134 | 135 | ## Viewing data 136 | 137 | By default we print the whole dataframe. To see the first `n` rows we instead provide a `take` function that takes in as arguments `n` and the dataframe. 
138 | 139 | ```haskell 140 | ghci> D.take 2 df 141 | -------------------------------------------------------------- 142 | index | A | B | C | D | E | F 143 | ------|--------|------------|-------|-----|-----------|------- 144 | Int | Double | Day | Float | Int | Transport | [Char] 145 | ------|--------|------------|-------|-----|-----------|------- 146 | 0 | 1.0 | 2013-01-02 | 1.0 | 3 | Test | foo 147 | 1 | 1.0 | 2013-01-02 | 1.0 | 3 | Train | foo 148 | ``` 149 | 150 | Our equivalent of describe is `summarize`: 151 | 152 | ```haskell 153 | ghci> D.summarize df 154 | ----------------------------------------------------- 155 | index | Statistic | D | C | A 156 | ------|-----------|-----------|-----------|---------- 157 | Int | Text | Double | Double | Double 158 | ------|-----------|-----------|-----------|---------- 159 | 0 | Mean | 3.0 | 1.0 | 1.0 160 | 1 | Minimum | 3.0 | 1.0 | 1.0 161 | 2 | 25% | 3.0 | 1.0 | 1.0 162 | 3 | Median | 3.0 | 1.0 | 1.0 163 | 4 | 75% | 3.0 | 1.0 | 1.0 164 | 5 | Max | 3.0 | 1.0 | 1.0 165 | 6 | StdDev | 0.0 | 0.0 | 0.0 166 | 7 | IQR | 0.0 | 0.0 | 0.0 167 | 8 | Skewness | -Infinity | -Infinity | -Infinity 168 | ``` 169 | 170 | #### Sorting 171 | 172 | Since we don't have indexes we only have one sort function that sorts by a column. 173 | 174 | ```haskell 175 | ghci> D.sortBy D.Ascending ["E"] df 176 | -------------------------------------------------------------- 177 | index | A | B | C | D | E | F 178 | ------|--------|------------|-------|-----|-----------|------- 179 | Int | Double | Day | Float | Int | Transport | [Char] 180 | ------|--------|------------|-------|-----|-----------|------- 181 | 0 | 1.0 | 2013-01-02 | 1.0 | 3 | Test | foo 182 | 1 | 1.0 | 2013-01-02 | 1.0 | 3 | Test | foo 183 | 2 | 1.0 | 2013-01-02 | 1.0 | 3 | Train | foo 184 | 3 | 1.0 | 2013-01-02 | 1.0 | 3 | Train | foo 185 | ``` 186 | 187 | ## Selection 188 | Panda's `[]` operator is a jack-knife that does a number of kinds of aggregation. 
189 | As such it doesn't map to one construct and doesn't always have an equivalent in Haskell. 190 | 191 | ### Selecting columns 192 | 193 | ```python 194 | python> df.loc[:, ["A", "B"]] 195 | A B 196 | 2013-01-01 0.469112 -0.282863 197 | 2013-01-02 1.212112 -0.173215 198 | 2013-01-03 -0.861849 -2.104569 199 | 2013-01-04 0.721555 -0.706771 200 | 2013-01-05 -0.424972 0.567020 201 | 2013-01-06 -0.673690 0.113648 202 | ``` 203 | 204 | Pandas indexes the dataframe like a 2D array. We get all rows with `:` and then specify which columns after the comma. 205 | 206 | In DataFrame we mimic SQL's select. 207 | 208 | ```haskell 209 | ghci> D.select ["A"] df 210 | -------------- 211 | index | A 212 | ------|------- 213 | Int | Double 214 | ------|------- 215 | 0 | 1.0 216 | 1 | 1.0 217 | 2 | 1.0 218 | 3 | 1.0 219 | ``` 220 | 221 | To filter by rows we have to filter by the values we are interested in rather than indexes. 222 | 223 | ```python 224 | python> df.loc["20130102":"20130104", ["A", "B"]] 225 | A B 226 | 2013-01-02 1.212112 -0.173215 227 | 2013-01-03 -0.861849 -2.104569 228 | 2013-01-04 0.721555 -0.706771 229 | ``` 230 | 231 | ```haskell 232 | ghci> :{ 233 | ghci| df' |> D.filter "date" (\d -> d >= (fromGregorian 2013 01 02) && d <= (fromGregorian 2013 01 04)) 234 | ghci| |> D.select ["A", "B"] 235 | ghci| :} 236 | ghci> df 237 | --------------------------- 238 | index | A | B 239 | ------|--------|----------- 240 | Int | Double | Day 241 | ------|--------|----------- 242 | 0 | 1.0 | 2013-01-02 243 | 1 | 1.0 | 2013-01-02 244 | 2 | 1.0 | 2013-01-02 245 | ``` 246 | 247 | ## Missing values 248 | 249 | Rows with missing values are represented by a `Maybe a` type. Dealing with missing values means applying the usual `Maybe` functions to the data. 
250 | 251 | ### Filling 252 | 253 | ```haskell 254 | ghci> df' = D.addColumn "G" (V.fromList [Just 1, Just 2, Nothing, Just 4]) df 255 | ghci> df' 256 | ------------------------------------------------------------------------------ 257 | index | A | B | C | D | E | F | G 258 | ------|--------|------------|-------|-----|-----------|--------|-------------- 259 | Int | Double | Day | Float | Int | Transport | [Char] | Maybe Integer 260 | ------|--------|------------|-------|-----|-----------|--------|-------------- 261 | 0 | 1.0 | 2013-01-02 | 1.0 | 3 | Test | foo | Just 1 262 | 1 | 1.0 | 2013-01-02 | 1.0 | 3 | Train | foo | Just 2 263 | 2 | 1.0 | 2013-01-02 | 1.0 | 3 | Test | foo | Nothing 264 | 3 | 1.0 | 2013-01-02 | 1.0 | 3 | Train | foo | Just 4 265 | ghci> D.apply (fromMaybe 5) "G" df' 266 | ------------------------------------------------------------------------ 267 | index | A | B | C | D | E | F | G 268 | ------|--------|------------|-------|-----|-----------|--------|-------- 269 | Int | Double | Day | Float | Int | Transport | [Char] | Integer 270 | ------|--------|------------|-------|-----|-----------|--------|-------- 271 | 0 | 1.0 | 2013-01-02 | 1.0 | 3 | Test | foo | 1 272 | 1 | 1.0 | 2013-01-02 | 1.0 | 3 | Train | foo | 2 273 | 2 | 1.0 | 2013-01-02 | 1.0 | 3 | Test | foo | 5 274 | 3 | 1.0 | 2013-01-02 | 1.0 | 3 | Train | foo | 4 275 | ghci> df' |> D.filter "G" (isJust @Integer) 276 | ------------------------------------------------------------------------------ 277 | index | A | B | C | D | E | F | G 278 | ------|--------|------------|-------|-----|-----------|--------|-------------- 279 | Int | Double | Day | Float | Int | Transport | [Char] | Maybe Integer 280 | ------|--------|------------|-------|-----|-----------|--------|-------------- 281 | 0 | 1.0 | 2013-01-02 | 1.0 | 3 | Test | foo | Just 1 282 | 1 | 1.0 | 2013-01-02 | 1.0 | 3 | Train | foo | Just 2 283 | 2 | 1.0 | 2013-01-02 | 1.0 | 3 | Train | foo | Just 4 284 | ``` 285 | 
-------------------------------------------------------------------------------- /docs/coming_from_polars.md: -------------------------------------------------------------------------------- 1 | # Coming from Polars 2 | 3 | This tutorial will walk through the examples in Polars' [getting started guide](https://docs.pola.rs/user-guide/getting-started/) showing how concepts in Polars map to dataframe. 4 | 5 | ## Reading and writing CSV 6 | 7 | ### Round trip test 8 | 9 | To test our CSV IO we'll create a dataframe programmatically, write it to a CSV file, then read the CSV file back again. 10 | 11 | In polars this looks like: 12 | 13 | ```python 14 | import polars as pl 15 | import datetime as dt 16 | 17 | df = pl.DataFrame( 18 | { 19 | "name": ["Alice Archer", "Ben Brown", "Chloe Cooper", "Daniel Donovan"], 20 | "birthdate": [ 21 | dt.date(1997, 1, 10), 22 | dt.date(1985, 2, 15), 23 | dt.date(1983, 3, 22), 24 | dt.date(1981, 4, 30), 25 | ], 26 | "weight": [57.9, 72.5, 53.6, 83.1], # (kg) 27 | "height": [1.56, 1.77, 1.65, 1.75], # (m) 28 | } 29 | ) 30 | df.write_csv("docs/assets/data/output.csv") 31 | df_csv = pl.read_csv("docs/assets/data/output.csv", try_parse_dates=True) 32 | print(df_csv) 33 | ``` 34 | 35 | As a standalone dataframe script this would look like: 
36 | 37 | 38 | ```haskell 39 | import qualified DataFrame as D 40 | import Data.Time.Calendar 41 | 42 | main :: IO 43 | main = do 44 | let df = D.fromList [ 45 | ("name", D.toColumn [ "Alice Archer" 46 | , "Ben Brown" 47 | , "Chloe Cooper" 48 | , "Daniel Donovan"]) 49 | , ("birthdate", D.toColumn [ fromGregorian 1997 01 10 50 | , fromGregorian 1985 02 15 51 | , fromGregorian 1983 03 22 52 | , fromGregorian 1981 04 30]) 53 | , ("weight", D.toColumn [57.9, 72.5, 53.6, 83.1]) 54 | , ("height", D.toColumn [1.56, 1.77, 1.65, 1.75])] 55 | print df 56 | D.writeCsv "./data/output.csv" df 57 | let df_csv = D.readCsv "./data/output.csv" 58 | print df_csv 59 | ``` 60 | 61 | This round trip prints the following tables: 62 | 63 | ``` 64 | ----------------------------------------------------- 65 | index | name | birthdate | weight | height 66 | ------|----------------|------------|--------|------- 67 | Int | [Char] | Day | Double | Double 68 | ------|----------------|------------|--------|------- 69 | 0 | Alice Archer | 1997-01-10 | 57.9 | 1.56 70 | 1 | Ben Brown | 1985-02-15 | 72.5 | 1.77 71 | 2 | Chloe Cooper | 1983-03-22 | 53.6 | 1.65 72 | 3 | Daniel Donovan | 1981-04-30 | 83.1 | 1.75 73 | 74 | ----------------------------------------------------- 75 | index | name | birthdate | weight | height 76 | ------|----------------|------------|--------|------- 77 | Int | Text | Day | Double | Double 78 | ------|----------------|------------|--------|------- 79 | 0 | Alice Archer | 1997-01-10 | 57.9 | 1.56 80 | 1 | Ben Brown | 1985-02-15 | 72.5 | 1.77 81 | 2 | Chloe Cooper | 1983-03-22 | 53.6 | 1.65 82 | 3 | Daniel Donovan | 1981-04-30 | 83.1 | 1.75 83 | 84 | ``` 85 | 86 | Notice that the type of the string column changes from `[Char]` (Haskell's default) to `Text` (dataframe's default). 
87 | 88 | 89 | ## Expressions 90 | 91 | Our equivalent to expressions is a tuple that contains a list of the column names followed by a 92 | function where the arguments correspond to the order of column names. We use a special function 93 | wrapper to make our dataframes accept functions with any number of arguments. This is done using 94 | the `func` function. 95 | 96 | This is a mouthful and is probably easier to see in action/comparison. 97 | 98 | For example: 99 | 100 | ```python 101 | result = df.select( 102 | pl.col("name"), 103 | pl.col("birthdate").dt.year().alias("birth_year"), 104 | (pl.col("weight") / (pl.col("height") ** 2)).alias("bmi"), 105 | ) 106 | print(result) 107 | ``` 108 | 109 | Would be written as: 110 | 111 | ```haskell 112 | {-# LANGUAGE ScopedTypeVariables #-} 113 | {-# LANGUAGE TypeApplications #-} 114 | import qualified DataFrame as D 115 | import qualified Data.Text as T 116 | 117 | import DataFrame.Operations ( (|>) ) 118 | import Data.Time.Calendar 119 | 120 | main :: IO () 121 | main = do 122 | ... 123 | let year = (\(YearMonthDay y _ _) -> y) 124 | print $ df_csv 125 | |> D.derive "birth_year" year "birthdate" 126 | |> D.deriveFrom (["weight", "height"], D.func (\(w :: Double) (h :: Double) -> w / h ** 2)) 127 | "bmi" 128 | |> D.select ["name", "birth_year", "bmi"] 129 | ``` 130 | 131 | Or, more clearly: 132 | 133 | ```haskell 134 | {-# LANGUAGE ScopedTypeVariables #-} 135 | {-# LANGUAGE TypeApplications #-} 136 | import qualified DataFrame as D 137 | import qualified Data.Text as T 138 | 139 | import DataFrame ( (|>) ) 140 | import Data.Time.Calendar 141 | 142 | main :: IO () 143 | main = do 144 | ... 
145 | let year = (\(YearMonthDay y _ _) -> y) 146 | let bmi :: Double -> Double -> Double 147 | bmi w h = w / h ** 2 148 | print $ df_csv 149 | |> D.derive "birth_year" year "birthdate" 150 | |> D.deriveFrom (["weight", "height"], D.func bmi) "bmi" 151 | |> D.select ["name", "birth_year", "bmi"] 152 | ``` 153 | 154 | Resulting in: 155 | 156 | ``` 157 | -------------------------------------------------------- 158 | index | name | birth_year | bmi 159 | ------|----------------|------------|------------------- 160 | Int | Text | Integer | Double 161 | ------|----------------|------------|------------------- 162 | 0 | Alice Archer | 1997 | 23.791913214990135 163 | 1 | Ben Brown | 1985 | 23.14149829231702 164 | 2 | Chloe Cooper | 1983 | 19.687786960514234 165 | 3 | Daniel Donovan | 1981 | 27.13469387755102 166 | ``` 167 | 168 | The dataframe implementation can be read top down. `apply` a function that gets the year to the `birthdate`; 169 | store the result in the `birth_year` column; combine `weight` and `height` into the bmi column using the 170 | formula `w / h ** 2`; then select the `name`, `birth_year` and `bmi` fields. 171 | 172 | Dataframe focuses on splitting transformations into transformations on the whole dataframe so it's easily usable 173 | in a repl-like environment. 174 | 175 | In the Polars expression expansion example: 176 | 177 | ```python 178 | result = df.select( 179 | pl.col("name"), 180 | (pl.col("weight", "height") * 0.95).round(2).name.suffix("-5%"), 181 | ) 182 | print(result) 183 | ``` 184 | 185 | We instead write this as two `applyWithAlias` calls: 186 | 187 | ```haskell 188 | df_csv 189 | |> D.derive "weight-5%" (*0.95) "weight" 190 | -- Alternatively we can use the `as` function. 
191 | |> D.as "height-5%" D.apply (*0.95) "height" 192 | |> D.select ["name", "weight-5%", "height-5%"] 193 | ``` 194 | 195 | ``` 196 | ---------------------------------------------------------------- 197 | index | name | height-5% | weight-5% 198 | ------|----------------|--------------------|------------------- 199 | Int | [Char] | Double | Double 200 | ------|----------------|--------------------|------------------- 201 | 0 | Alice Archer | 1.482 | 55.004999999999995 202 | 1 | Ben Brown | 1.6815 | 68.875 203 | 2 | Chloe Cooper | 1.5675 | 50.92 204 | 3 | Daniel Donovan | 1.6624999999999999 | 78.945 205 | ``` 206 | 207 | However we can make our program shorter by using regular Haskell and folding over the dataframe. 208 | 209 | ```haskell 210 | let reduce name = D.derive (name <> "-5%") (*0.95) name 211 | df_csv 212 | |> D.fold reduce ["weight", "height"] 213 | |> D.select ["name", "weight-5%", "height-5%"] 214 | ``` 215 | 216 | Or alternatively, 217 | 218 | ```haskell 219 | addSuffix suffix name = D.rename name (name <> suffix) 220 | df_csv 221 | |> D.applyMany ["weight", "height"] (*0.95) 222 | |> D.fold (addSuffix "-5%") 223 | |> D.select ["name", "weight-5%", "height-5%"] 224 | ``` 225 | 226 | Filtering looks much the same: 227 | 228 | ```python 229 | result = df.filter(pl.col("birthdate").dt.year() < 1990) 230 | print(result) 231 | ``` 232 | 233 | Versus 234 | 235 | ```haskell 236 | bornAfter1990 = ( (< 1990) 237 | . 
(\(YearMonthDay y _ _) -> y)) 238 | df_csv & 239 | D.filter "birthdate" bornAfter1990 240 | ``` 241 | 242 | ``` 243 | ----------------------------------------------------- 244 | index | name | birthdate | weight | height 245 | ------|----------------|------------|--------|------- 246 | Int | Text | Day | Double | Double 247 | ------|----------------|------------|--------|------- 248 | 0 | Ben Brown | 1985-02-15 | 72.5 | 1.77 249 | 1 | Chloe Cooper | 1983-03-22 | 53.6 | 1.65 250 | 2 | Daniel Donovan | 1981-04-30 | 83.1 | 1.75 251 | ``` 252 | 253 | For multiple filter conditions we again make all the filter statements separate: 254 | 255 | ```python 256 | result = df.filter( 257 | pl.col("birthdate").is_between(dt.date(1982, 12, 31), dt.date(1996, 1, 1)), 258 | pl.col("height") > 1.7, 259 | ) 260 | print(result) 261 | ``` 262 | 263 | ```haskell 264 | year (YearMonthDay y _ _) = y 265 | between a b y = y >= a && y <= b 266 | df_csv 267 | |> D.filter "birthdate" 268 | (between 1982 1996 . year) 269 | |> D.filter "height" (1.7 <) 270 | ``` 271 | 272 | ``` 273 | ------------------------------------------------ 274 | index | name | birthdate | weight | height 275 | Int | Text | Day | Double | Double 276 | ------|-----------|------------|--------|------- 277 | 0 | Ben Brown | 1985-02-15 | 72.5 | 1.77 278 | ``` 279 | 280 | ```python 281 | result = df.group_by( 282 | (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"), 283 | maintain_order=True, 284 | ).len() 285 | print(result) 286 | ``` 287 | 288 | Polars's `group_by` does an implicit select. In dataframe the select is written explicitly. 289 | 290 | We implicitly create a `Count` variable as the result of grouping by an aggregate. In general, for a `groupByAgg` we create a variable with the same name as the aggregation to store the result of the aggregation in. 291 | 292 | ```haskell 293 | let decade = (*10) . flip div 10 .
year 294 | df_csv 295 | |> D.derive "decade" decade "birthdate" 296 | |> D.select ["decade"] 297 | |> D.groupByAgg D.Count ["decade"] 298 | ``` 299 | 300 | ``` 301 | ---------------------- 302 | index | decade | Count 303 | ------|--------|------ 304 | Int | Int | Int 305 | ------|--------|------ 306 | 0 | 1990 | 1 307 | 1 | 1980 | 3 308 | ``` 309 | 310 | TODO: Add notes 311 | 312 | ```python 313 | result = df.group_by( 314 | (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"), 315 | maintain_order=True, 316 | ).agg( 317 | pl.len().alias("sample_size"), 318 | pl.col("weight").mean().round(2).alias("avg_weight"), 319 | pl.col("height").max().alias("tallest"), 320 | ) 321 | print(result) 322 | ``` 323 | 324 | ```haskell 325 | decade = (*10) . flip div 10 . year 326 | df_csv 327 | |> D.derive "decade" decade "birthdate" 328 | |> D.groupByAgg D.Count ["decade"] 329 | |> D.aggregate [("height", D.Maximum), ("weight", D.Mean)] 330 | |> D.select ["decade", "sampleSize", "Mean_weight", "Maximum_height"] 331 | ``` 332 | 333 | ``` 334 | ---------------------------------------------------- 335 | index | decade | Mean_weight | Maximum_height 336 | ------|---------|-------------------|--------------- 337 | Int | Integer | Double | Double 338 | ------|---------|-------------------|--------------- 339 | 0 | 1990 | 57.9 | 1.56 340 | 1 | 1980 | 69.73333333333333 | 1.77 341 | ``` 342 | 343 | 344 | ```python 345 | result = ( 346 | df.with_columns( 347 | (pl.col("birthdate").dt.year() // 10 * 10).alias("decade"), 348 | pl.col("name").str.split(by=" ").list.first(), 349 | ) 350 | .select( 351 | pl.all().exclude("birthdate"), 352 | ) 353 | .group_by( 354 | pl.col("decade"), 355 | maintain_order=True, 356 | ) 357 | .agg( 358 | pl.col("name"), 359 | pl.col("weight", "height").mean().round(2).name.prefix("avg_"), 360 | ) 361 | ) 362 | print(result) 363 | ``` 364 | 365 | ```haskell 366 | let firstWord = head . 
T.split (' ' ==) 367 | df_csv 368 | |> D.apply firstWord "name" 369 | |> D.derive "decade" decade "birthdate" 370 | |> D.exclude ["birthdate"] 371 | |> D.groupByAgg D.Count ["decade"] 372 | |> D.aggregate [("weight", D.Mean), ("height", D.Mean)] 373 | ``` 374 | 375 | ``` 376 | ------------------------------------------------------------------------------------------- 377 | index | decade | name | Count | Mean_height | Mean_weight 378 | ------|---------|--------------------------|-------|--------------------|------------------ 379 | Int | Integer | Vector Text | Int | Double | Double 380 | ------|---------|--------------------------|-------|--------------------|------------------ 381 | 0 | 1990 | ["Alice"] | 1 | 1.56 | 57.9 382 | 1 | 1980 | ["Ben","Daniel","Chloe"] | 3 | 1.7233333333333334 | 69.73333333333333 383 | ``` 384 | -------------------------------------------------------------------------------- /docs/configuration_notes.md: -------------------------------------------------------------------------------- 1 | # Configuration notes 2 | 3 | ## Windows 4 | Powershell doesn't support UTF-8 encoding out the box. You need to run: 5 | 6 | ``` 7 | $OutputEncoding = [console]::InputEncoding = [console]::OutputEncoding = New-Object System.Text.UTF8Encoding 8 | ``` 9 | 10 | To show terminal plot output. 11 | -------------------------------------------------------------------------------- /docs/exploratory_data_analysis_primer.md: -------------------------------------------------------------------------------- 1 | # A primer on Exploratory Data Analysis 2 | 3 | Exploratory data analysis (EDA), in brief, is what you do when you first get a dataset. EDA should help us answer questions about the data and help us formulate new ones. It is the step before any modelling or inference where we look at the data so we can: 4 | 5 | * check for completeness/correctness of data. 6 | * understand the relationships between the explanatory variables. 
7 | * understand the relationship between the explanatory and outcome variables. 8 | * preliminarily determine what models would be appropriate for our data. 9 | 10 | It's important for EDA tools to be feature-rich and intuitive so we can answer many different kinds of questions about the data without the tool getting in the way. 11 | 12 | 13 | There are four types of exploratory data analysis: 14 | 15 | * univariate non-graphical analysis 16 | * multivariate non-graphical analysis 17 | * univariate graphical analysis 18 | * multivariate graphical analysis 19 | 20 | We will look at each type of EDA and describe how we can use dataframe for each type. We'll be using the [California Housing Dataset](https://www.kaggle.com/datasets/camnugent/california-housing-prices) to demonstrate the concepts as we explain them. 21 | 22 | ## Univariate non-graphical analysis 23 | 24 | Univariate non-graphical analysis should give us a sense of the distribution of our dataset's variables. In the real world our variables are measurable characteristics. How they are distributed in our sample (the "sample distribution") may often help us estimate the overall distribution ("population distribution") of the variable. For example, if our variable was finishing times for a race, our analysis should be able to answer questions like what was the slowest time, what time did people tend to run, who was the fastest, were all times recorded etc. 25 | 26 | For categorical data the best univariate non-graphical analysis is a tabulation of the frequency of each category.
27 | 28 | ```haskell 29 | ghci> import qualified DataFrame as D 30 | ghci> D.frequencies "ocean_proximity" df 31 | 32 | ------------------------------------------------------------------------------ 33 | index | Statistic | <1H OCEAN | INLAND | ISLAND | NEAR BAY | NEAR OCEAN 34 | ------|----------------|-----------|---------|---------|----------|----------- 35 | Int | Text | Integer | Integer | Integer | Integer | Integer 36 | ------|----------------|-----------|---------|---------|----------|----------- 37 | 0 | Count | 9136 | 6551 | 5 | 2290 | 2658 38 | 1 | Percentage (%) | 44 | 31 | 0 | 11 | 12 39 | ``` 40 | 41 | We can also plot similar tables for non-categorical data with a small value set e.g. shoe sizes. 42 | 43 | For quantitative data our goal is to understand the population distribution through our sample distribution. For a given quantitative variable we typically care about its: 44 | 45 | * presence (how much data is missing from each characteristic/variable), 46 | * center (what a "typical" value looks like for some definition of typical), 47 | * spread (how far values are from the "typical" value), 48 | * modality (what are the most popular ranges of values), 49 | * shape (is the data normally distributed? does it skew left or right?), 50 | * and outliers (how common are outliers) 51 | 52 | We can calculate sample statistics from the data such as the sample mean, sample variance etc. Although it's most often useful to use graphs to visualize the data's distribution, univariate non-graphical EDA describes aspects of the data's histogram. 53 | 54 | ### Missing data 55 | Arguably the first thing to do when presented with a dataset is to check for null values.
56 | 57 | ```haskell 58 | ghci> D.columnInfo df 59 | ----------------------------------------------------------------------------- 60 | index | Column Name | # Non-null Values | # Null Values | Type 61 | ------|--------------------|-------------------|---------------|------------- 62 | Int | [Char] | Int | Int | [Char] 63 | ------|--------------------|-------------------|---------------|------------- 64 | 0 | total_bedrooms | 20433 | 207 | Maybe Double 65 | 1 | ocean_proximity | 20640 | 0 | Text 66 | 2 | median_house_value | 20640 | 0 | Double 67 | 3 | median_income | 20640 | 0 | Double 68 | 4 | households | 20640 | 0 | Double 69 | 5 | population | 20640 | 0 | Double 70 | 6 | total_rooms | 20640 | 0 | Double 71 | 7 | housing_median_age | 20640 | 0 | Double 72 | 8 | latitude | 20640 | 0 | Double 73 | 9 | longitude | 20640 | 0 | Double 74 | ``` 75 | 76 | It seems we have most of the data except some missing total bedrooms. Dealing with nulls is a separate topic that requires intimate knowledge of the data. So for this initial pass we'll leave out the total_bedrooms variable. 77 | 78 | ### Central tendency 79 | The central tendency of a distribution describes a "typical" value of that distribution. The most common statistical measures of central tendency are arithmetic mean and median. For symmetric distributions the mean and the median are the same. But for a skewed distribution the mean is pulled towards the "heavier" side whereas the median is more robust to these changes. 80 | 81 | For a given column calculating the mean and median is fairly straightforward and shown below. 82 | 83 | ```haskell 84 | ghci> D.mean "housing_median_age" df 85 | Just 28.63948643410852 86 | ghci> D.median "housing_median_age" df 87 | Just 29.0 88 | ``` 89 | 90 | Note: the values are displayed with a `Just` to denote that they may not be computable or not exist. Trying to get the mean or median of a non-numeric column would return `Nothing`. `Nothing` is similar to `NULL` in SQL.
91 | 92 | ### Spread 93 | Spread is a measure of how far away from the center we are still likely to find data values. There are four main measures of spread: variance, mean absolute deviation, standard deviation, and interquartile range. 94 | 95 | ### Mean absolute deviation 96 | We start by looking at mean absolute deviation since it's the simplest measure of spread. The mean absolute deviation measures how far from the average values are on average. We calculate it by taking the absolute value of the difference between each observation and the mean of that variable, then finally taking the average of those. 97 | 98 | In the housing dataset it'll tell us how "typical" our typical home price is. 99 | 100 | ```haskell 101 | ghci> import Data.Maybe 102 | ghci> m = fromMaybe 0 $ D.mean "median_house_value" df 103 | 206855.81690891474 104 | ghci> df |> D.derive "deviation" (\v -> abs (v - m)) "median_house_value" |> D.select ["median_house_value", "deviation"] |> D.take 10 105 | ----------------------------------------------- 106 | index | median_house_value | deviation 107 | ------|--------------------|------------------- 108 | Int | Double | Double 109 | ------|--------------------|------------------- 110 | 0 | 452600.0 | 245744.18309108526 111 | 1 | 358500.0 | 151644.18309108526 112 | 2 | 352100.0 | 145244.18309108526 113 | 3 | 341300.0 | 134444.18309108526 114 | 4 | 342200.0 | 135344.18309108526 115 | 5 | 269700.0 | 62844.18309108526 116 | 6 | 299200.0 | 92344.18309108526 117 | 7 | 241400.0 | 34544.18309108526 118 | 8 | 226700.0 | 19844.18309108526 119 | 9 | 261100.0 | 54244.18309108526 120 | ``` 121 | 122 | Read left to right, we begin by calling `derive` which applies a function to a given column and stores the result in a target column. The order of arguments is `derive targetColumn function sourceColumn`. We then select only the two columns we want and take the first 10 rows. 123 | 124 | This gives us a list of the deviations.
From the small sample it does seem like there are some wild deviations. The first one is greater than the mean! How typical is this? Well, to answer that we take the average of all these values. 125 | 126 | ```haskell 127 | ghci> withDeviation = df |> D.derive "deviation" (\v -> abs (v - m)) "median_house_value" |> D.select ["median_house_value", "deviation"] 128 | ghci> D.mean "deviation" withDeviation 129 | Just 91170.43994367732 130 | ``` 131 | 132 | So the $200'000 deviation we saw in the sample isn't very typical but it raises a question about outliers. 133 | What if we give more weight to the further deviations? 134 | 135 | 136 | ### Standard deviation 137 | That's what standard deviation aims to do. Standard deviation considers the spread of outliers. Instead of calculating the absolute difference of each observation from the mean we calculate the square of the difference. This has the effect of exaggerating further outliers. 138 | 139 | ```haskell 140 | ghci> sumOfSquareDifferences = fromMaybe 0 $ D.sum "deviation" withDeviation 141 | ghci> n = fromIntegral $ (fst $ D.dimensions df) - 1 142 | ghci> sqrt (sumOfSquareDifferences / n) 143 | 115395.6158744 144 | ``` 145 | The standard deviation being larger than the mean absolute deviation means we do have some outliers. However, since the difference is fairly small we can conclude that there aren't very many outliers in our dataset. 146 | 147 | We can calculate the standard deviation in one line as follows: 148 | 149 | ```haskell 150 | ghci> D.standardDeviation "median_house_value" df 151 | Just 115395.6158744 152 | ``` 153 | 154 | ## Interquartile range (IQR) 155 | A quantile is a value of the distribution such that n% of values in the distribution are smaller than that value. The quartiles are the three quantiles that divide the data into four equal parts. So the 1st quartile is a value such that 25% of values are smaller than it. The median is the second quartile.
And the third quartile is a value such that 75% of values are smaller than that value. The IQR is the difference between the 3rd and 1st quartiles. It measures how close to the middle the middle 50% of values are. 156 | 157 | The IQR is a more robust measure of spread than the variance or standard deviation. Any number of values in the top or bottom quarters of the data can be moved any distance from the median without affecting the IQR at all. More practically, a few extreme outliers have little or no effect on the IQR. 158 | 159 | For our dataset: 160 | 161 | ```haskell 162 | ghci> D.interQuartileRange "median_house_value" df 163 | Just 145158.3333333336 164 | ``` 165 | 166 | This is larger than the standard deviation but not by much. This means that outliers don't have a significant influence on the distribution and most values are close to typical. 167 | 168 | ### Variance 169 | Variance is the square of the standard deviation. It is much more sensitive to outliers. Variance does not have the same units as our original variable (it is in units squared). Therefore, it's much more difficult to interpret. 170 | 171 | In our example it's a very large number: 172 | 173 | ``` haskell 174 | ghci> D.variance "median_house_value" df 175 | Just 1.3315503000818077e10 176 | ``` 177 | 178 | The variance is more useful when comparing different datasets. If the variance of house prices in Minnesota was lower than in California this would mean there were far fewer really cheap and really expensive houses in Minnesota. 179 | 180 | ## Shape 181 | Skewness measures how left or right shifted a distribution is from a normal distribution. A positive skewness means the distribution is left shifted, a negative skew means the distribution is right shifted. 182 | 183 | The formula for skewness is the mean cubic deviation divided by the cube of the standard deviation. It captures the relationship between the mean deviation (asymmetry of the data) and the standard deviation (spread of the data).
184 | 185 | The intuition behind why a positive skew is left shifted follows from the formula. The numerator is more sensitive to outliers. So the futher left a distribution is the more the right-tail values will be exaggerated by the cube causing the skewness to be positive. 186 | 187 | A skewness score between -0.5 and 0.5 means the data has little skew. A score between -0.5 and -1 or 0.5 and 1 means the data has moderate skew. A skewness greater than 1 or less than -1 means the data is heavily skewed. 188 | 189 | ```haskell 190 | ghci> D.skewness "median_house_value" df 191 | Just 0.9776922140978703 192 | ``` 193 | So the median house value is moderately skewed to the left. That is, there are more houses that are cheaper than the mean values and a tail of expensive outliers. Having lived in California, I can confirm that this data reflects reality. 194 | 195 | 196 | ## Summarising the data 197 | 198 | We can get all these statistics with a single command: 199 | 200 | ```haskell 201 | ghci> D.summarize df 202 | ------------------------------------------------------------------------------------------------------------------------------------------ 203 | index | Statistic | median_house_value | median_income | households | population | total_rooms | housing_median_age | latitude | longitude 204 | ------|-----------|--------------------|---------------|------------|------------|-------------|--------------------|----------|---------- 205 | Int | Text | Double | Double | Double | Double | Double | Double | Double | Double 206 | ------|-----------|--------------------|---------------|------------|------------|-------------|--------------------|----------|---------- 207 | 0 | Mean | 206855.82 | 3.87 | 499.54 | 1425.48 | 2635.76 | 28.64 | 35.63 | -119.57 208 | 1 | Minimum | 14999.0 | 0.5 | 1.0 | 3.0 | 2.0 | 1.0 | 32.54 | -124.35 209 | 2 | 25% | 119600.0 | 2.56 | 280.0 | 787.0 | 1447.42 | 18.0 | 33.93 | -121.8 210 | 3 | Median | 179700.0 | 3.53 | 409.0 | 1166.0 | 2127.0 
| 29.0 | 34.26 | -118.49 211 | 4 | 75% | 264758.33 | 4.74 | 605.0 | 1725.0 | 3148.0 | 37.0 | 37.71 | -118.01 212 | 5 | Max | 500001.0 | 15.0 | 6082.0 | 35682.0 | 39320.0 | 52.0 | 41.95 | -114.31 213 | 6 | StdDev | 115395.62 | 1.9 | 382.33 | 1132.46 | 2181.62 | 12.59 | 2.14 | 2.0 214 | 7 | IQR | 145158.33 | 2.18 | 325.0 | 938.0 | 1700.58 | 19.0 | 3.78 | 3.79 215 | 8 | Skewness | 0.98 | 1.65 | 3.41 | 4.94 | 4.15 | 6.0e-2 | 0.47 | -0.3 216 | ``` 217 | 218 | As a recap we'll go over what this tells us about the data: 219 | * median_house_value: house prices tend to be close to the median but there are some pretty expensive houses. 220 | * median_income: incomes are also generally fairly typical (small standard deviation with median close to mean) but there are some really rich people (high skewness). 221 | * households: household sizes are very similar across the sample and they tend to be smaller. 222 | * population: California is generally very sparsely populated (low skewness) with some REALLY densely populated areas (high max/ low IQR). 223 | * total_rooms: a lot of the blocks have few rooms (Again sparse population) but there are some very dense areas (high max). 224 | * housing_median_age: there are as many new houses as there are old (skewness close to 0) and not many extremes (low max, standard deviation lower than IQR) 225 | * latitude: the south has slightly more people than the north (moderate skew) 226 | * longitude: most houses are in the west coast (moderate right skew) 227 | 228 | -------------------------------------------------------------------------------- /docs/haskell_for_data_analysis.md: -------------------------------------------------------------------------------- 1 | # Haskell for Data Analysis 2 | 3 | This section ports/mirrors Wes McKinney's book [Python for Data Analysis](https://wesmckinney.com/book/). Examples and organizations are drawn from there. This tutorial assumes an understanding of Haskell. 
4 | 5 | ## Data preparation 6 | Data in the wild doesn't always come in a form that's easy to work with. A data analysis tool should make preparing and cleaning data easy. There are a number of common issues that a data analysis tool must handle. We'll go through a few common ones and show how to deal with them in Haskell. 7 | 8 | ### Handling missing data 9 | In Haskell, potentially missing values are represented by a "wrapper" type called [`Maybe`](https://en.wikibooks.org/wiki/Haskell/Understanding_monads/Maybe). 10 | 11 | ``` 12 | ghci> import qualified DataFrame as D 13 | ghci> let df = D.fromColumnList [D.toColumn [Just 1, Just 1, Nothing, Nothing], D.toColumn [Just 6.5, Nothing, Nothing, Just 6.5], D.toColumn [Just 3.0, Nothing, Nothing, Just 3.0]] 14 | ghci> df 15 | --------------------------------------------------- 16 | index | 0 | 1 | 2 17 | ------|---------------|--------------|------------- 18 | Int | Maybe Integer | Maybe Double | Maybe Double 19 | ------|---------------|--------------|------------- 20 | 0 | Just 1 | Just 6.5 | Just 3.0 21 | 1 | Just 1 | Nothing | Nothing 22 | 2 | Nothing | Nothing | Nothing 23 | 3 | Nothing | Just 6.5 | Just 3.0 24 | 25 | ``` 26 | 27 | If we'd like to drop all rows with missing values in a given column we can use the `filterJust` function. 28 | 29 | ```haskell 30 | ghci> D.filterJust "0" df 31 | --------------------------------------------- 32 | index | 0 | 1 | 2 33 | ------|---------|--------------|------------- 34 | Int | Integer | Maybe Double | Maybe Double 35 | ------|---------|--------------|------------- 36 | 0 | 1 | Just 6.5 | Just 3.0 37 | 1 | 1 | Nothing | Nothing 38 | ``` 39 | 40 | The function keeps only the rows whose value in that column is not `Nothing` and "unwraps" the `Maybe` type. To filter out the `Nothing` values in every column we use the `filterAllJust` function.
41 | 42 | ```haskell 43 | ghci> D.filterAllJust df 44 | --------------------------------- 45 | index | 0 | 1 | 2 46 | ------|---------|--------|------- 47 | Int | Integer | Double | Double 48 | ------|---------|--------|------- 49 | 0 | 1 | 6.5 | 3.0 50 | ``` 51 | 52 | To fill in the missing values we use the `impute` function, which replaces all instances of `Nothing` in a column with a given value. 53 | 54 | ```haskell 55 | ghci> D.impute "0" (0 :: Integer) df 56 | --------------------------------------------- 57 | index | 0 | 1 | 2 58 | ------|---------|--------------|------------- 59 | Int | Integer | Maybe Double | Maybe Double 60 | ------|---------|--------------|------------- 61 | 0 | 1 | Just 6.5 | Just 3.0 62 | 1 | 1 | Nothing | Nothing 63 | 2 | 0 | Nothing | Nothing 64 | 3 | 0 | Just 6.5 | Just 3.0 65 | ``` 66 | 67 | There is no general way to replace ALL `Nothing` values with a default since the default depends on the type. In fact, trying to apply the wrong type to a function throws an error: 68 | 69 | ```haskell 70 | ghci> D.impute @Double "0" 0 df 71 | *** Exception: 72 | 73 | [Error]: Type Mismatch 74 | While running your code I tried to get a column of type: "Maybe Double" but column was of type: "Maybe Integer" 75 | This happened when calling function apply on the column 0 76 | 77 | 78 | 79 | Try adding a type at the end of the function e.g change 80 | apply arg1 arg2 to 81 | (apply arg1 arg2 :: ) 82 | or add {-# LANGUAGE TypeApplications #-} to the top of your file then change the call to 83 | apply @ arg1 arg2 84 | ``` 85 | 86 | In general, Haskell would usually catch this kind of mistake as a compile-time error. But because dataframes are usually run in REPL-like environments which offer immediate feedback to users, `dataframe` is fine turning these into runtime exceptions.
87 | 88 | -------------------------------------------------------------------------------- /flake.nix: -------------------------------------------------------------------------------- 1 | { 2 | description = "An intuitive, dynamically-typed DataFrame library"; 3 | 4 | inputs = { 5 | nixpkgs.url = "github:NixOS/nixpkgs/nixos-unstable"; 6 | flake-utils.url = "github:numtide/flake-utils"; 7 | }; 8 | 9 | outputs = { self, nixpkgs, flake-utils }: 10 | flake-utils.lib.eachDefaultSystem (system: 11 | let 12 | pkgs = nixpkgs.legacyPackages.${system}; 13 | 14 | hsPkgs = pkgs.haskellPackages.extend (self: super: { 15 | dataframe = self.callCabal2nix "dataframe" ./. { }; 16 | }); 17 | in 18 | { 19 | packages = { 20 | default = hsPkgs.dataframe; 21 | }; 22 | 23 | devShells.default = pkgs.mkShell { 24 | buildInputs = with pkgs; [ 25 | ghc 26 | cabal-install 27 | haskell-language-server 28 | ]; 29 | }; 30 | }); 31 | } 32 | -------------------------------------------------------------------------------- /run_compiled_repl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | cabal repl dataframe --repl-options=-fobject-code -O2 4 | -------------------------------------------------------------------------------- /run_profiling.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PROF_TYPE="" 4 | 5 | if [ -z "$1" ]; then 6 | # Default to eventlog profile if no argument given 7 | PROF_TYPE="eventlog" 8 | else 9 | case "$1" in 10 | "eventlog") 11 | PROF_TYPE="eventlog" 12 | ;; 13 | "pprof") 14 | PROF_TYPE="pprof" 15 | ;; 16 | *) 17 | echo "invalid profile type $1, should be one of 'eventlog' or 'pprof'" 18 | exit 1 19 | ;; 20 | esac 21 | fi 22 | 23 | case "$PROF_TYPE" in 24 | "eventlog") 25 | cabal v2-run --enable-profiling dataframe -- +RTS -hy -l-agu 26 | ;; 27 | "pprof") 28 | cabal v2-run --enable-profiling dataframe -- +RTS -pj -RTS 29 | ;; 30 | esac 31 | 
-------------------------------------------------------------------------------- /src/DataFrame.hs: -------------------------------------------------------------------------------- 1 | module DataFrame 2 | ( module D, 3 | (|>) 4 | ) 5 | where 6 | 7 | import DataFrame.Internal.Types as D 8 | import DataFrame.Internal.Function as D 9 | import DataFrame.Internal.Parsing as D 10 | import DataFrame.Internal.Column as D 11 | import DataFrame.Internal.DataFrame as D hiding (columnIndices, columns) 12 | import DataFrame.Internal.Row as D hiding (mkRowRep) 13 | import DataFrame.Errors as D 14 | import DataFrame.Operations.Core as D 15 | import DataFrame.Operations.Subset as D 16 | import DataFrame.Operations.Sorting as D 17 | import DataFrame.Operations.Statistics as D 18 | import DataFrame.Operations.Transformations as D 19 | import DataFrame.Operations.Typing as D 20 | import DataFrame.Operations.Aggregation as D 21 | import DataFrame.Display.Terminal.Plot as D 22 | import DataFrame.IO.CSV as D 23 | 24 | import Data.Function 25 | 26 | (|>) = (&) -------------------------------------------------------------------------------- /src/DataFrame/Display/Terminal/Colours.hs: -------------------------------------------------------------------------------- 1 | module DataFrame.Display.Terminal.Colours where 2 | 3 | -- terminal color functions 4 | red :: String -> String 5 | red s = "\ESC[31m" ++ s ++ "\ESC[0m" 6 | 7 | green :: String -> String 8 | green s = "\ESC[32m" ++ s ++ "\ESC[0m" 9 | 10 | brightGreen :: String -> String 11 | brightGreen s = "\ESC[92m" ++ s ++ "\ESC[0m" 12 | 13 | brightBlue :: String -> String 14 | brightBlue s = "\ESC[94m" ++ s ++ "\ESC[0m" 15 | -------------------------------------------------------------------------------- /src/DataFrame/Display/Terminal/PrettyPrint.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module DataFrame.Display.Terminal.PrettyPrint where 3 | 4 | 
import qualified Data.Text as T

import Data.List (transpose)

-- Utility functions to show a DataFrame as a Markdown-ish table.

-- Adapted from: https://stackoverflow.com/questions/5929377/format-list-output-in-haskell
-- a type for fill functions
type Filler = Int -> T.Text -> T.Text

-- a type for describing table columns
data ColDesc t = ColDesc
  { colTitleFill :: Filler,
    colTitle :: T.Text,
    colValueFill :: Filler
  }

-- functions that fill a string (s) to a given width (n) by adding pad
-- character (c) to align left, right, or center

-- | Left-align: the padding is appended after the text.
fillLeft :: Char -> Int -> T.Text -> T.Text
fillLeft c n s = s `T.append` T.replicate (n - T.length s) (T.singleton c)

-- | Right-align: the padding is placed before the text.
fillRight :: Char -> Int -> T.Text -> T.Text
fillRight c n s = T.replicate (n - T.length s) (T.singleton c) `T.append` s

-- | Center: the padding is split on both sides; when it is odd the extra
-- character goes on the right.
fillCenter :: Char -> Int -> T.Text -> T.Text
fillCenter c n s = T.replicate l (T.singleton c) `T.append` s `T.append` T.replicate r (T.singleton c)
  where
    x = n - T.length s
    l = x `div` 2
    r = x - l

-- functions that fill with spaces
left :: Int -> T.Text -> T.Text
left = fillLeft ' '

right :: Int -> T.Text -> T.Text
right = fillRight ' '

center :: Int -> T.Text -> T.Text
center = fillCenter ' '

-- | Render a table as text: a dashed border, the centered header row, a
-- separator, the centered type row, another separator, then the left-aligned
-- data rows. Each column is as wide as its longest cell (header and type
-- included).
showTable :: [T.Text] -> [T.Text] -> [[T.Text]] -> T.Text
showTable header types rows =
  let cs = map (\h -> ColDesc center h left) header
      widths = [maximum $ map T.length col | col <- transpose $ header : types : rows]
      border = T.intercalate "---" [T.replicate width (T.singleton '-') | width <- widths]
      separator = T.intercalate "-|-" [T.replicate width (T.singleton '-') | width <- widths]
      fillCols fill cols = T.intercalate " | " [fill c width col | (c, width, col) <- zip3 cs widths cols]
   in T.unlines $ border : fillCols colTitleFill header : separator : fillCols colTitleFill types : separator : map (fillCols colValueFill) rows

-- | Like 'showTable', but the type of each column is folded into its header
-- cell (separated by an HTML line break) so the output has a single header row
-- followed by one separator.
showTableProperMarkdown :: [T.Text] -> [T.Text] -> [[T.Text]] -> T.Text
showTableProperMarkdown header types rows =
  let -- NOTE(review): the literal was split by a raw line break in the text
      -- this was recovered from; "<br>" is the reconstructed value — confirm.
      headerWithTypes = zipWith (\h t -> h <> "<br>" <> t) header types
      cs = map (\h -> ColDesc center h left) headerWithTypes
      widths = [maximum $ map T.length col | col <- transpose $ headerWithTypes : rows]
      border = T.intercalate "---" [T.replicate width (T.singleton '-') | width <- widths]
      separator = T.intercalate "-|-" [T.replicate width (T.singleton '-') | width <- widths]
      fillCols fill cols = T.intercalate " | " [fill c width col | (c, width, col) <- zip3 cs widths cols]
   in T.unlines $ border : fillCols colTitleFill headerWithTypes : separator : map (fillCols colValueFill) rows
--------------------------------------------------------------------------------
/src/DataFrame/Errors.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE DeriveAnyClass #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE InstanceSigs #-}

module DataFrame.Errors where

import qualified Data.Text as T

import Control.Exception
import Data.Array
import DataFrame.Display.Terminal.Colours
import Data.Typeable (Typeable)
import Type.Reflection (TypeRep)

-- | Exceptions raised by dataframe operations. The 'Show' instance renders
-- them as coloured, multi-line messages.
data DataFrameException where
    TypeMismatchException :: forall a b. (Typeable a, Typeable b)
                          => TypeRep a -- ^ given type
                          -> TypeRep b -- ^ expected type
                          -> T.Text    -- ^ column name
                          -> T.Text    -- ^ call point
                          -> DataFrameException
    TypeMismatchException' :: forall a . (Typeable a)
                           => TypeRep a -- ^ expected type
                           -> String    -- ^ given type
                           -> T.Text    -- ^ column name
                           -> T.Text    -- ^ call point
                           -> DataFrameException
    ColumnNotFoundException :: T.Text -> T.Text -> [T.Text] -> DataFrameException
    deriving (Exception)

instance Show DataFrameException where
    show :: DataFrameException -> String
    show (TypeMismatchException a b columnName callPoint) = addCallPointInfo columnName (Just callPoint) (typeMismatchError a b)
    -- NOTE(review): here the TypeRep documented as "expected" is passed in the
    -- "given" position of typeMismatchError' — confirm against the call sites.
    show (TypeMismatchException' a b columnName callPoint) = addCallPointInfo columnName (Just callPoint) (typeMismatchError' (show a) b)
    show (ColumnNotFoundException columnName callPoint availableColumns) = columnNotFound columnName callPoint availableColumns

-- | Message for a missing column: the name, the operation that asked for it
-- and, when the frame has any columns, the closest column name by edit
-- distance.
columnNotFound :: T.Text -> T.Text -> [T.Text] -> String
columnNotFound name callPoint columns =
  red "\n\n[ERROR] "
    ++ "Column not found: "
    ++ T.unpack name
    ++ " for operation "
    ++ T.unpack callPoint
    ++ suggestion
  where
    -- With no candidates there is nothing to suggest; skip the hint instead of
    -- printing "Did you mean ?".
    suggestion = case columns of
      [] -> "\n\n"
      _ ->
        "\n\tDid you mean "
          ++ T.unpack (guessColumnName name columns)
          ++ "?\n\n"

-- | 'typeMismatchError'' applied to the shown form of both type
-- representations.
typeMismatchError ::
  Type.Reflection.TypeRep a ->
  Type.Reflection.TypeRep b ->
  String
typeMismatchError a b = typeMismatchError' (show a) (show b)

-- | Type-mismatch message. Both type names go through 'show', so they appear
-- in double quotes.
typeMismatchError' :: String -> String -> String
typeMismatchError' givenType expectedType =
  red $
    red "\n\n[Error]: Type Mismatch"
      ++ "\n\tWhile running your code I tried to "
      ++ "get a column of type: "
      ++ red (show givenType)
      ++ " but column was of type: "
      ++ green (show expectedType)

-- | Append the column (and, when known, the calling function) to an error
-- message, followed by the type-annotation hint.
addCallPointInfo :: T.Text -> Maybe T.Text -> String -> String
addCallPointInfo name (Just cp) err =
  err
    ++ ( "\n\tThis happened when calling function "
           ++ brightGreen (T.unpack cp)
           ++ " on the column "
           ++ brightGreen (T.unpack name)
           ++ "\n\n"
           ++ typeAnnotationSuggestion (T.unpack cp)
       )
addCallPointInfo name Nothing err =
  err
    ++ ( "\n\tOn the column "
           ++ T.unpack name
           ++ "\n\n"
           ++ typeAnnotationSuggestion ""
       )

-- | Hint showing the two ways to pin the element type at a call site: a type
-- annotation or a type application.
typeAnnotationSuggestion :: String -> String
typeAnnotationSuggestion cp =
  -- NOTE(review): the "<Type>" placeholders were missing from the recovered
  -- text (the hint read ":: )" and "@ arg1"); restored here — confirm.
  "\n\n\tTry adding a type at the end of the function e.g "
    ++ "change\n\t\t"
    ++ red (cp ++ " arg1 arg2")
    ++ " to \n\t\t"
    ++ green ("(" ++ cp ++ " arg1 arg2 :: <Type>)")
    ++ "\n\tor add "
    ++ "{-# LANGUAGE TypeApplications #-} to the top of your "
    ++ "file then change the call to \n\t\t"
    ++ brightGreen (cp ++ " @<Type> arg1 arg2")

-- | The candidate with the smallest edit distance to the input (ties are
-- broken by the ordering of the names); the empty text when there are no
-- candidates.
guessColumnName :: T.Text -> [T.Text] -> T.Text
guessColumnName userInput columns = case map (\k -> (editDistance userInput k, k)) columns of
  [] -> ""
  res -> (snd . minimum) res

-- | Edit distance (insert, delete, substitute, each costing 1) between two
-- texts, computed through a lazily filled table indexed by prefix lengths.
editDistance :: T.Text -> T.Text -> Int
editDistance xs ys = table ! (m, n)
  where
    (m, n) = (T.length xs, T.length ys)
    x = array (1, m) (zip [1 ..] (T.unpack xs))
    y = array (1, n) (zip [1 ..] (T.unpack ys))

    table :: Array (Int, Int) Int
    table = array bnds [(ij, dist ij) | ij <- range bnds]
    bnds = ((0, 0), (m, n))

    dist (0, j) = j
    dist (i, 0) = i
    dist (i, j) =
      minimum
        [ table ! (i - 1, j) + 1,
          table ! (i, j - 1) + 1,
          if x ! i == y ! j then table ! (i - 1, j - 1) else 1 + table ! (i - 1, j - 1)
        ]
--------------------------------------------------------------------------------
/src/DataFrame/IO/CSV.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE BangPatterns #-}
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE Strict #-}
module DataFrame.IO.CSV where

import qualified Data.ByteString.Char8 as C
import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.IO as TLIO
import qualified Data.Text.IO as TIO
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Mutable as VM
import qualified Data.Vector.Unboxed.Mutable as VUM

import Control.Applicative ((<$>), (<|>), (<*>), (<*), (*>), many)
import Control.Monad (forM_, zipWithM_, unless, void)
import Data.Attoparsec.Text
import Data.Char
import DataFrame.Internal.Column (Column(..), freezeColumn', writeColumn, columnLength)
import DataFrame.Internal.DataFrame (DataFrame(..))
import DataFrame.Internal.Parsing
import DataFrame.Operations.Typing
import Data.Foldable (fold)
import Data.Function (on)
import Data.IORef
import Data.Maybe
import Data.Text.Encoding (decodeUtf8Lenient)
import Data.Type.Equality
  ( TestEquality (testEquality),
    type (:~:) (Refl)
  )
import GHC.IO.Handle (Handle)
import Prelude hiding (concat, takeWhile)
import System.IO
import Type.Reflection

-- | Record for CSV read options.
data ReadOptions = ReadOptions {
    -- | Whether the first row holds the column names.
    hasHeader :: Bool,
    inferTypes :: Bool,
    safeRead :: Bool
}

-- | By default we assume the file has a header, we infer the types on read
-- and we convert any rows with nullish objects into Maybe (safeRead).
defaultOptions :: ReadOptions
defaultOptions = ReadOptions { hasHeader = True, inferTypes = True, safeRead = True }

-- | Reads a CSV file from the given path.
-- Note this file stores intermediate temporary files
-- while converting the CSV from a row to a columnar format.
readCsv :: String -> IO DataFrame
readCsv = readSeparated ',' defaultOptions

-- | Reads a tab separated file from the given path.
-- Note this file stores intermediate temporary files
-- while converting the CSV from a row to a columnar format.
readTsv :: String -> IO DataFrame
readTsv = readSeparated '\t' defaultOptions

-- | Reads a character separated file into a dataframe using mutable vectors.
--
-- The file is read twice: once to count the rows so the column vectors can be
-- allocated up front, and once to fill them. When 'hasHeader' is false the
-- columns are named by position: @"0"@, @"1"@, ...
--
-- NOTE(review): the first data row is read with 'TIO.hGetLine', so a file that
-- contains only a header row fails with an end-of-file error.
readSeparated :: Char -> ReadOptions -> String -> IO DataFrame
readSeparated c opts path = do
    totalRows <- countRows c path
    withFile path ReadMode $ \handle -> do
        firstRow <- map T.strip . parseSep c <$> TIO.hGetLine handle
        let columnNames = if hasHeader opts
                            then map (T.filter (/= '\"')) firstRow
                            -- Decimal position of the column. 'intToDigit' is
                            -- hexadecimal and undefined above 15, so it cannot
                            -- name wide files.
                            else map (T.pack . show) [0..(length firstRow - 1)]
        -- If there was no header rewind the file cursor.
        unless (hasHeader opts) $ hSeek handle AbsoluteSeek 0

        -- Initialize mutable vectors for each column
        let numColumns = length columnNames
        let numRows = if hasHeader opts then totalRows - 1 else totalRows
        -- Use this row to infer the types of the rest of the column.
        -- TODO: this isn't robust but in so far as this is a guess anyway
        -- it's probably fine. But we should probably sample n rows and pick
        -- the most likely type from the sample.
        dataRow <- map T.strip . parseSep c <$> TIO.hGetLine handle

        -- This array will track the indices of all null values for each column.
        -- If any exist then the column will be an optional type.
        nullIndices <- VM.unsafeNew numColumns
        VM.set nullIndices []
        mutableCols <- VM.unsafeNew numColumns
        getInitialDataVectors numRows mutableCols dataRow

        -- Read rows into the mutable vectors
        fillColumns numRows c mutableCols nullIndices handle

        -- Freeze the mutable vectors into immutable ones
        nulls' <- V.unsafeFreeze nullIndices
        cols <- V.mapM (freezeColumn mutableCols nulls' opts) (V.generate numColumns id)
        -- Safe lookup: report zero rows rather than crash if there is no
        -- first column.
        let rowCount = case cols V.!? 0 of
                            Just (Just col) -> columnLength col
                            _ -> 0
        return $ DataFrame {
                columns = cols,
                freeIndices = [],
                columnIndices = M.fromList (zip columnNames [0..]),
                dataframeDimensions = (rowCount, V.length cols)
            }
{-# INLINE readSeparated #-}

-- | Allocates one mutable column of length @n@ per field of the given row,
-- choosing the element type from that field ('inferValueType'), and stores the
-- field at row 0.
getInitialDataVectors :: Int -> VM.IOVector Column -> [T.Text] -> IO ()
getInitialDataVectors n mCol xs = do
    -- The indices are bounded by the number of columns so that a first row
    -- wider than the header cannot make 'VM.unsafeWrite' write past the vector.
    forM_ (zip [0 .. VM.length mCol - 1] xs) $ \(i, x) -> do
        col <- case inferValueType x of
                "Int" -> MutableUnboxedColumn <$> ((VUM.unsafeNew n :: IO (VUM.IOVector Int)) >>= \c -> VUM.unsafeWrite c 0 (fromMaybe 0 $ readInt x) >> return c)
                "Double" -> MutableUnboxedColumn <$> ((VUM.unsafeNew n :: IO (VUM.IOVector Double)) >>= \c -> VUM.unsafeWrite c 0 (fromMaybe 0 $ readDouble x) >> return c)
                _ -> MutableBoxedColumn <$> ((VM.unsafeNew n :: IO (VM.IOVector T.Text)) >>= \c -> VM.unsafeWrite c 0 x >> return c)
        VM.unsafeWrite mCol i col
{-# INLINE getInitialDataVectors #-}

-- | Classifies a field as @"Int"@, @"Double"@ or @"Other"@ by trying
-- 'readInt' and then 'readDouble'.
inferValueType :: T.Text -> T.Text
inferValueType s = let
        example = s
    in case readInt example of
        Just _ -> "Int"
        Nothing -> case readDouble example of
            Just _ -> "Double"
            Nothing -> "Other"
{-# INLINE inferValueType #-}

-- | Reads rows from the handle and stores values in mutable vectors.
-- Rows are written at indices 1..n (row 0 is written by
-- 'getInitialDataVectors'); an iteration that finds the handle at end of file
-- with no leftover input does nothing.
fillColumns :: Int -> Char -> VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> Handle -> IO ()
fillColumns n c mutableCols nullIndices handle = do
    -- Input left unconsumed by the previous row's parse.
    input <- newIORef (mempty :: T.Text)
    forM_ [1..n] $ \i -> do
        isEOF <- hIsEOF handle
        input' <- readIORef input
        unless (isEOF && input' == mempty) $ do
            parseWith (TIO.hGetChunk handle) (parseRow c) input' >>= \case
                Fail _ ctx er -> do
                    erpos <- hTell handle
                    fail $ "Failed to parse CSV file around " <> show erpos <> " byte; due: "
                        <> show er <> "; context: " <> show ctx
                Partial _ -> do
                    fail "Partial handler is called"
                Done (unconsumed :: T.Text) (row :: [T.Text]) -> do
                    writeIORef input unconsumed
                    zipWithM_ (writeValue mutableCols nullIndices i) [0..] row
{-# INLINE fillColumns #-}

-- | Writes a value into the appropriate column, resizing the vector if necessary.
writeValue :: VM.IOVector Column -> VM.IOVector [(Int, T.Text)] -> Int -> Int -> T.Text -> IO ()
writeValue mutableCols nullIndices count colIndex value = do
    col <- VM.unsafeRead mutableCols colIndex
    res <- writeColumn count value col
    -- A Left result is recorded, with its row index, in this column's list.
    let modify value = VM.unsafeModify nullIndices ((count, value) :) colIndex
    either modify (const (return ())) res
{-# INLINE writeValue #-}

-- | Freezes a mutable vector into an immutable one, trimming it to the actual row count.
-- The options argument is not consulted here.
freezeColumn :: VM.IOVector Column -> V.Vector [(Int, T.Text)] -> ReadOptions -> Int -> IO (Maybe Column)
freezeColumn mutableCols nulls opts colIndex = do
    col <- VM.unsafeRead mutableCols colIndex
    Just <$> freezeColumn' (nulls V.! colIndex) col
{-# INLINE freezeColumn #-}

-- | Splits one line into fields; calls 'error' if the line does not parse.
parseSep :: Char -> T.Text -> [T.Text]
parseSep c s = either error id (parseOnly (record c) s)
{-# INLINE parseSep #-}

-- | One or more fields separated by the given character.
record :: Char -> Parser [T.Text]
record c =
  field c `sepBy1` char c
    <?> "record"
{-# INLINE record #-}

-- | A record followed by a line ending or the end of the input.
parseRow :: Char -> Parser [T.Text]
parseRow c = (record c <* lineEnd) <?> "record-new-line"

-- | A quoted field or, failing that, an unquoted one.
field :: Char -> Parser T.Text
field c =
  quotedField <|> unquotedField c
    <?> "field"
{-# INLINE field #-}

-- | Characters that end an unquoted field: the separator, line breaks and the
-- double quote.
unquotedTerminators :: Char -> S.Set Char
unquotedTerminators sep = S.fromList [sep, '\n', '\r', '"']

-- | Everything up to the next terminator; may be empty.
unquotedField :: Char -> Parser T.Text
unquotedField sep =
  takeWhile (not . (`S.member` terminators)) <?> "unquoted field"
  where terminators = unquotedTerminators sep
{-# INLINE unquotedField #-}

-- | A field in double quotes. Inside it a backslash escapes a backslash or a
-- double quote.
quotedField :: Parser T.Text
quotedField = char '"' *> contents <* char '"' <?> "quoted field"
  where
    contents = fold <$> many (unquote <|> unescape)
      where
        unquote = takeWhile1 (notInClass "\"\\")
        unescape = char '\\' *> do
          T.singleton <$> do
            char '\\' <|> char '"'
{-# INLINE quotedField #-}

lineEnd :: Parser ()
lineEnd =
  (endOfLine <|> endOfInput)
    <?> "end of line"
{-# INLINE lineEnd #-}

-- | First pass to count rows for exact allocation
countRows :: Char -> FilePath -> IO Int
countRows c path = withFile path ReadMode $! go 0 ""
   where
      -- n: rows parsed so far; input: text left over from the previous parse.
      go !n !input h = do
         isEOF <- hIsEOF h
         if isEOF && input == mempty
            then pure n
            else
               parseWith (TIO.hGetChunk h) (parseRow c) input >>= \case
                  Fail unconsumed ctx er -> do
                    erpos <- hTell h
                    fail $ "Failed to parse CSV file around " <> show erpos <> " byte; due: "
                      <> show er <> "; context: " <> show ctx <> " " <> show unconsumed
                  Partial _ -> do
                    fail $ "Partial handler is called; n = " <> show n
                  Done (unconsumed :: T.Text) _ ->
                    go (n + 1) unconsumed h
{-# INLINE countRows #-}

-- | Writes the dataframe to the given path as comma separated text.
writeCsv :: String -> DataFrame -> IO ()
writeCsv = writeSeparated ','

-- | Writes a header line with the column names (in column-index order) and
-- then one line per row, joining the fields with the given separator.
-- Fields are written as they are, without quoting.
writeSeparated :: Char      -- ^ Separator
               -> String    -- ^ Path to write to
               -> DataFrame
               -> IO ()
writeSeparated c filepath df = withFile filepath WriteMode $ \handle -> do
    let (rows, _) = dataframeDimensions df
    let headers = map fst (L.sortBy (compare `on` snd) (M.toList (columnIndices df)))
    -- Join with the requested separator; the previous hard-coded ", " ignored it.
    let sep = T.singleton c
    TIO.hPutStrLn handle (T.intercalate sep headers)
    forM_ [0..(rows - 1)] $ \i -> do
        let row = getRowAsText df i
        TIO.hPutStrLn handle (T.intercalate sep row)

-- | The text of every populated column at row @i@, in column-vector order.
-- Text values are written as they are and other values through 'show'.
-- In a boxed column of 'Maybe' values, present values lose their @Just@ and
-- missing ones become @"null"@.
getRowAsText :: DataFrame -> Int -> [T.Text]
getRowAsText df i = V.ifoldr go [] (columns df)
  where
    indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
    -- Failure shared by every column kind: the column has no element at row i.
    tooShort k =
      error $
        "Column "
          ++ T.unpack (indexMap M.! k)
          ++ " has less items than "
          ++ "the other columns at index "
          ++ show i
    go k Nothing acc = acc
    go k (Just (BoxedColumn (c :: V.Vector a))) acc = case c V.!? i of
      Just e -> textRep : acc
        where
          textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
            Just Refl -> e
            Nothing -> case typeRep @a of
              App t1 t2 -> case eqTypeRep t1 (typeRep @Maybe) of
                Just HRefl -> case testEquality t2 (typeRep @T.Text) of
                  Just Refl -> fromMaybe "null" e
                  Nothing -> (fromOptional . (T.pack . show)) e
                    where
                      fromOptional s
                        | T.isPrefixOf "Just " s = T.drop (T.length "Just ") s
                        | otherwise = "null"
                Nothing -> (T.pack . show) e
              _ -> (T.pack . show) e
      Nothing -> tooShort k
    go k (Just (UnboxedColumn c)) acc = case c VU.!? i of
      Just e -> T.pack (show e) : acc
      Nothing -> tooShort k
    go k (Just (OptionalColumn (c :: V.Vector (Maybe a)))) acc = case c V.!? i of
      Just e -> textRep : acc
        where
          textRep = case testEquality (typeRep @a) (typeRep @T.Text) of
            Just Refl -> fromMaybe "Nothing" e
            Nothing -> (T.pack . show) e
      Nothing -> tooShort k
--------------------------------------------------------------------------------
/src/DataFrame/Internal/DataFrame.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE StrictData #-}
module DataFrame.Internal.DataFrame where

import qualified Data.Map as M
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU

import Control.Monad (join)
import DataFrame.Display.Terminal.PrettyPrint
import DataFrame.Internal.Column
import Data.Function (on)
import Data.List (sortBy, transpose)
import Data.Maybe (isJust)
import Data.Type.Equality (type (:~:)(Refl), TestEquality (testEquality))
import Type.Reflection (typeRep)

data DataFrame = DataFrame
  { -- | Our main data structure stores a dataframe as
    -- a vector of columns. A slot holds 'Nothing' when it has no column.
    columns :: V.Vector (Maybe Column),
    -- | Keeps the column names in the order they were inserted in.
    columnIndices :: M.Map T.Text Int,
    -- | Next free index that we insert a column into.
    freeIndices :: [Int],
    dataframeDimensions :: (Int, Int)
  }

-- | Two dataframes are equal when they have the same column names and every
-- column equals the column of the same name in the other frame.
instance Eq DataFrame where
  (==) :: DataFrame -> DataFrame -> Bool
  a == b = M.keys (columnIndices a) == M.keys (columnIndices b) &&
           all sameColumn (M.toList $ columnIndices a)
    where
      -- The lookup in b cannot fail: the key sets were compared first.
      sameColumn (name, index) = columns a V.!? index == (columns b V.!? (columnIndices b M.! name))

instance Show DataFrame where
  show :: DataFrame -> String
  show d = T.unpack (asText d False)

-- | Renders the dataframe as a table with a leading @index@ column. The flag
-- selects 'showTableProperMarkdown' over 'showTable'.
asText :: DataFrame -> Bool -> T.Text
asText d properMarkdown =
  let header = "index" : map fst (sortBy (compare `on` snd) $ M.toList (columnIndices d))
      types = V.toList $ V.filter (/= "") $ V.map getType (columns d)
      getType Nothing = ""
      getType (Just (BoxedColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
      getType (Just (UnboxedColumn (column :: VU.Vector a))) = T.pack $ show (typeRep @a)
      getType (Just (OptionalColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
      getType (Just (GroupedBoxedColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
      getType (Just (GroupedUnboxedColumn (column :: V.Vector a))) = T.pack $ show (typeRep @a)
      -- Separate out cases dynamically so we don't end up making round trip string
      -- copies.
      get (Just (BoxedColumn (column :: V.Vector a))) = case testEquality (typeRep @a) (typeRep @T.Text) of
        Just Refl -> column
        Nothing -> case testEquality (typeRep @a) (typeRep @String) of
          Just Refl -> V.map T.pack column
          Nothing -> V.map (T.pack . show) column
      get (Just (UnboxedColumn column)) = V.map (T.pack . show) (V.convert column)
      get (Just (OptionalColumn column)) = V.map (T.pack . show) column
      get (Just (GroupedBoxedColumn column)) = V.map (T.pack . show) column
      get (Just (GroupedUnboxedColumn column)) = V.map (T.pack . show) column
      -- Position 0 is the synthetic index column; the others are looked up by name.
      getTextColumnFromFrame df (i, name) = if i == 0
        then V.fromList (map (T.pack . show) [0..(fst (dataframeDimensions df) - 1)])
        else get $ (V.!) (columns d) ((M.!) (columnIndices d) name)
      rows =
        transpose $
          zipWith (curry (V.toList . getTextColumnFromFrame d)) [0..] header
   in (if properMarkdown then showTableProperMarkdown else showTable) header ("Int":types) rows

-- | O(1) Creates an empty dataframe
empty :: DataFrame
empty = DataFrame {columns = V.replicate initialColumnSize Nothing,
                   columnIndices = M.empty,
                   freeIndices = [0..(initialColumnSize - 1)],
                   dataframeDimensions = (0, 0) }

-- | Number of column slots allocated for an empty dataframe.
initialColumnSize :: Int
initialColumnSize = 8

-- | Looks a column up by name; 'Nothing' when the name is unknown or its slot
-- is empty.
getColumn :: T.Text -> DataFrame -> Maybe Column
getColumn name df = do
  i <- columnIndices df M.!? name
  join $ columns df V.!? i

-- | True when the recorded dimensions are (0, 0).
null :: DataFrame -> Bool
null df = dataframeDimensions df == (0, 0)

-- | The name-to-index map, the version string of every column slot, the free
-- indices and the dimensions, one per line.
metadata :: DataFrame -> String
metadata df = show (columnIndices df) ++ "\n" ++
              show (V.map (fmap columnVersionString) (columns df)) ++ "\n" ++
              show (freeIndices df) ++ "\n" ++
              show (dataframeDimensions df)
--------------------------------------------------------------------------------
/src/DataFrame/Internal/Function.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE FlexibleInstances #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE ViewPatterns #-}
{-# LANGUAGE PatternSynonyms #-}

module DataFrame.Internal.Function where

import qualified Data.Text as T
import qualified Data.Vector as V

import DataFrame.Internal.Types
import Data.Typeable ( Typeable, type (:~:)(Refl) )
import Data.Type.Equality (TestEquality(testEquality))
import Type.Reflection (typeRep, typeOf)

-- A GADT to wrap functions so we can have heterogeneous lists of functions.
data Function where
    F1 :: forall a b . (Columnable a, Columnable b) => (a -> b) -> Function
    F2 :: forall a b c . (Columnable a, Columnable b, Columnable c) => (a -> b -> c) -> Function
    F3 :: forall a b c d . (Columnable a, Columnable b, Columnable c, Columnable d) => (a -> b -> c -> d) -> Function
    F4 :: forall a b c d e . (Columnable a, Columnable b, Columnable c, Columnable d, Columnable e) => (a -> b -> c -> d -> e) -> Function
    Cond :: forall a . (Columnable a) => (a -> Bool) -> Function
    ICond :: forall a . (Columnable a) => (Int -> a -> Bool) -> Function

-- Helper class to do the actual wrapping
class WrapFunction a where
    wrapFunction :: a -> Function

-- Instance for 1-argument functions
instance (Columnable a, Columnable b) => WrapFunction (a -> b) where
    wrapFunction :: (Columnable a, Columnable b) => (a -> b) -> Function
    wrapFunction = F1

-- Instance for 2-argument functions
instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c) => WrapFunction (a -> b -> c) where
    wrapFunction :: (Columnable a, Columnable b, Columnable c) => (a -> b -> c) -> Function
    wrapFunction = F2

-- Instance for 3-argument functions
instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c, Columnable d) => WrapFunction (a -> b -> c -> d) where
    wrapFunction :: (Columnable a, Columnable b, Columnable c, Columnable d) => (a -> b -> c -> d) -> Function
    wrapFunction = F3

-- Instance for 4-argument functions
instance {-# INCOHERENT #-} (Columnable a, Columnable b, Columnable c, Columnable d, Columnable e) => WrapFunction (a -> b -> c -> d -> e) where
    wrapFunction :: (Columnable a, Columnable b, Columnable c, Columnable d, Columnable e) => (a -> b -> c -> d -> e) -> Function
    wrapFunction = F4

-- The main function that wraps arbitrary functions
func :: forall fn . WrapFunction fn => fn -> Function
func = wrapFunction

-- | Matches (and builds) the empty vector.
pattern Empty :: V.Vector a
pattern Empty <- (V.null -> True) where Empty = V.empty

-- | Splits a vector into its first element and the rest; 'Nothing' when empty.
uncons :: V.Vector a -> Maybe (a, V.Vector a)
uncons Empty = Nothing
uncons v = Just (V.unsafeHead v, V.unsafeTail v)

-- | Matches a non-empty vector as head and tail.
pattern (:<|) :: a -> V.Vector a -> V.Vector a
pattern x :<| xs <- (uncons -> Just (x, xs))

-- | Applies a wrapped function to a row of arguments, one argument at a time,
-- checking each argument type and the result type at run time. Calls 'error'
-- on an empty row, a type mismatch, or arguments that do not fit the function.
funcApply :: forall c . (Columnable c) => V.Vector RowValue -> Function -> c
funcApply Empty _ = error "Empty args"
funcApply (Value (x :: a') :<| Empty) (F1 (f :: (a -> b))) = case testEquality (typeRep @a') (typeRep @a) of
    Just Refl -> case testEquality (typeOf (f x)) (typeRep @c) of
        Just Refl -> f x
        Nothing -> error "Result type mismatch"
    Nothing -> error "Arg type mismatch"
funcApply (Value (x :: a') :<| xs) (F2 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
    Just Refl -> funcApply xs (F1 (f x))
    Nothing -> error "Arg type mismatch"
funcApply (Value (x :: a') :<| xs) (F3 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
    Just Refl -> funcApply xs (F2 (f x))
    Nothing -> error "Arg type mismatch"
funcApply (Value (x :: a') :<| xs) (F4 (f :: (a -> b))) = case testEquality (typeOf x) (typeRep @a) of
    Just Refl -> funcApply xs (F3 (f x))
    Nothing -> error "Arg type mismatch"
-- Anything else (too many arguments for the function, or a Cond/ICond
-- wrapper) used to fall through as a pattern-match failure.
funcApply _ _ = error "funcApply: arguments do not match the wrapped function"
--------------------------------------------------------------------------------
/src/DataFrame/Internal/Parsing.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE Strict #-}
module DataFrame.Internal.Parsing where

import qualified Data.ByteString.Char8 as C
import qualified Data.Set as S
import qualified Data.Text as T

import Data.Text.Read
import Data.Maybe (fromMaybe)
import GHC.Stack (HasCallStack)
import Text.Read (readMaybe)

-- | True for the texts treated as missing values.
isNullish :: T.Text -> Bool
isNullish s = s `S.member` S.fromList ["Nothing", "NULL", "", " ", "nan"]

-- | Reads a value with its 'Read' instance; calls 'error' when it does not
-- parse.
readValue :: (HasCallStack, Read a) => T.Text -> a
readValue s = case readMaybe (T.unpack s) of
  Nothing -> error $ "Could not read value: " ++ T.unpack s
  Just value -> value

-- | Parses a signed decimal integer. Surrounding whitespace is ignored and the
-- whole remaining text must be consumed.
readInteger :: (HasCallStack) => T.Text -> Maybe Integer
readInteger s = case signed decimal (T.strip s) of
  Left _ -> Nothing
  Right (value, "") -> Just value
  Right (value, _) -> Nothing

-- | 'readInteger' for 'Int'.
readInt :: (HasCallStack) => T.Text -> Maybe Int
readInt s = case signed decimal (T.strip s) of
  Left _ -> Nothing
  Right (value, "") -> Just value
  Right (value, _) -> Nothing
{-# INLINE readInt #-}

-- | 'readInt' for byte strings.
readByteStringInt :: (HasCallStack) => C.ByteString -> Maybe Int
readByteStringInt s = case C.readInt (C.strip s) of
  Nothing -> Nothing
  Just (value, "") -> Just value
  Just (value, _) -> Nothing
{-# INLINE readByteStringInt #-}

-- | Parses a signed floating point number. Surrounding whitespace is ignored,
-- as in the integer readers, and the whole remaining text must be consumed.
readDouble :: (HasCallStack) => T.Text -> Maybe Double
readDouble s =
  case signed double (T.strip s) of
    Left _ -> Nothing
    Right (value, "") -> Just value
    Right (value, _) -> Nothing
{-# INLINE readDouble #-}

-- | Like 'readInteger', but returns the original text on failure.
readIntegerEither :: (HasCallStack) => T.Text -> Either T.Text Integer
readIntegerEither s = case signed decimal (T.strip s) of
  Left _ -> Left s
  Right (value, "") -> Right value
  Right (value, _) -> Left s
{-# INLINE readIntegerEither #-}

-- | Like 'readInt', but returns the original text on failure.
readIntEither :: (HasCallStack) => T.Text -> Either T.Text Int
readIntEither s = case signed decimal (T.strip s) of
  Left _ -> Left s
  Right (value, "") -> Right value
  Right (value, _) -> Left s
{-# INLINE readIntEither #-}

-- | Like 'readDouble', but returns the original (unstripped) text on failure.
readDoubleEither :: (HasCallStack) => T.Text -> Either T.Text Double
readDoubleEither s =
  case signed double (T.strip s) of
    Left _ -> Left s
    Right (value, "") -> Right value
    Right (value, _) -> Left s
{-# INLINE readDoubleEither #-}

-- | Reads a value with its 'Read' instance; 'Nothing' when it does not parse.
safeReadValue :: (Read a) => T.Text -> Maybe a
safeReadValue s = readMaybe (T.unpack s)

-- | Reads a value with its 'Read' instance, falling back to the given default
-- when it does not parse.
readWithDefault :: (HasCallStack, Read a) => a -> T.Text -> a
readWithDefault v s = fromMaybe v (readMaybe (T.unpack s))
--------------------------------------------------------------------------------
/src/DataFrame/Internal/Row.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE OverloadedStrings #-}
module DataFrame.Internal.Row where

import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Generic as VG
import qualified Data.Vector.Unboxed as VU
import qualified Data.Vector.Algorithms.Merge as VA

import Control.Exception (throw)
import Control.Monad.ST (runST)
import DataFrame.Errors (DataFrameException(..))
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame
import DataFrame.Internal.Types
import Data.Function (on)

-- | A row is a vector of type-erased values.
type Row = V.Vector RowValue

-- | Every row of the dataframe, restricted to the named columns, as a list.
toRowList :: [T.Text] -> DataFrame -> [Row]
toRowList names df = let
    nameSet = S.fromList names
  in map (mkRowRep df nameSet) [0..(fst (dataframeDimensions df) - 1)]

-- | Every row of the dataframe, restricted to the named columns, as a vector.
toRowVector :: [T.Text] -> DataFrame -> V.Vector Row
toRowVector names df = let
    nameSet = S.fromList names
  in V.generate (fst (dataframeDimensions df)) (mkRowRep df nameSet)

-- | The values of the named columns at row @i@, in the order the names are
-- given. Throws 'ColumnNotFoundException' for a name that is not in the frame.
mkRowFromArgs :: [T.Text] -> DataFrame -> Int -> Row
mkRowFromArgs names df i = V.map get (V.fromList names)
  where
    get name = case getColumn name df of
      Nothing -> throw $ ColumnNotFoundException name "[INTERNAL] mkRowFromArgs" (map fst $ M.toList $ columnIndices df)
      Just (BoxedColumn column) -> toRowValue (column V.! i)
      Just (UnboxedColumn column) -> toRowValue (column VU.! i)
      Just (OptionalColumn column) -> toRowValue (column V.! i)
      -- Grouped columns are handled the same way as in 'mkRowRep'.
      Just (GroupedBoxedColumn column) -> toRowValue (column V.! i)
      Just (GroupedUnboxedColumn column) -> toRowValue (column V.! i)

-- | The values of the named columns at row @i@, ordered by column index (not
-- by the order of the names). Throws 'ColumnNotFoundException' for a name that
-- is not in the frame, and calls 'error' when a column has no element at @i@.
mkRowRep :: DataFrame -> S.Set T.Text -> Int -> Row
mkRowRep df names i = case filter (`M.notMember` columnIndices df) (S.toList names) of
    -- An unknown name used to surface as an out-of-bounds index, because the
    -- row was sized by the requested names while names' only holds those that
    -- exist in the frame.
    (missing : _) -> throw $ ColumnNotFoundException missing "[INTERNAL] mkRowRep" (M.keys (columnIndices df))
    [] -> V.map get names'
  where
    inOrderIndexes = map fst $ L.sortBy (compare `on` snd) $ M.toList (columnIndices df)
    names' = V.fromList [n | n <- inOrderIndexes, S.member n names]
    throwError name = error $ "Column "
                ++ T.unpack name
                ++ " has less items than "
                ++ "the other columns at index "
                ++ show i
    get name = case getColumn name df of
      Just (BoxedColumn c) -> case c V.!? i of
        Just e -> toRowValue e
        Nothing -> throwError name
      Just (OptionalColumn c) -> case c V.!? i of
        Just e -> toRowValue e
        Nothing -> throwError name
      Just (UnboxedColumn c) -> case c VU.!? i of
        Just e -> toRowValue e
        Nothing -> throwError name
      Just (GroupedBoxedColumn c) -> case c V.!? i of
        Just e -> toRowValue e
        Nothing -> throwError name
      Just (GroupedUnboxedColumn c) -> case c V.!? i of
        Just e -> toRowValue e
        Nothing -> throwError name

-- | The original positions of the rows after sorting them, ascending when the
-- flag is true and descending otherwise.
sortedIndexes' :: Bool -> V.Vector Row -> VU.Vector Int
sortedIndexes' asc rows = runST $ do
  withIndexes <- VG.thaw (V.indexed rows)
  VA.sortBy ((if asc then compare else flip compare) `on` snd) withIndexes
  sorted <- VG.unsafeFreeze withIndexes
  return $ VU.generate (VG.length rows) (\i -> fst (sorted VG.! i))
--------------------------------------------------------------------------------
/src/DataFrame/Internal/Types.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE ConstraintKinds #-}
{-# LANGUAGE DataKinds #-}
{-# LANGUAGE ExistentialQuantification #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE InstanceSigs #-}
{-# LANGUAGE KindSignatures #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE TypeOperators #-}
{-# LANGUAGE Strict #-}
module DataFrame.Internal.Types where

import Data.Int ( Int8, Int16, Int32, Int64 )
import Data.Kind (Type)
import Data.Maybe (fromMaybe)
import Data.Typeable (Typeable, type (:~:) (..))
import Data.Word ( Word8, Word16, Word32, Word64 )
import Type.Reflection (TypeRep, typeOf, typeRep)
import Data.Type.Equality (TestEquality(..))

-- We need an "Object" type as an intermediate representation
-- for rows. Useful for things like sorting and function application.
type Columnable a = (Typeable a, Show a, Ord a, Eq a)

-- | A value of any 'Columnable' type with its type hidden.
data RowValue where
    Value :: (Columnable a) => a -> RowValue

-- | Values of different types are never equal.
instance Eq RowValue where
    (==) :: RowValue -> RowValue -> Bool
    (Value a) == (Value b) = fromMaybe False $ do
        Refl <- testEquality (typeOf a) (typeOf b)
        return $ a == b

-- | Values of different types are not '<=' in either direction.
instance Ord RowValue where
    (<=) :: RowValue -> RowValue -> Bool
    (Value a) <= (Value b) = fromMaybe False $ do
        Refl <- testEquality (typeOf a) (typeOf b)
        return $ a <= b

instance Show RowValue where
    show :: RowValue -> String
    show (Value a) = show a

-- | Wraps a value as a 'RowValue'.
toRowValue :: forall a . (Columnable a) => a -> RowValue
toRowValue = Value

-- | Essentially a "functor" instance of our type-erased Column.
class Transformable a where
    transform :: forall b c .
(Columnable b, Columnable c) => (b -> c) -> a -> Maybe a 51 | 52 | -- Convenience functions for types. 53 | unboxableTypes :: TypeRepList '[Int, Int8, Int16, Int32, Int64, 54 | Word, Word8, Word16, Word32, Word64, 55 | Char, Double, Float, Bool] 56 | unboxableTypes = Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep Nil))))))))))))) 57 | 58 | numericTypes :: TypeRepList '[Int, Int8, Int16, Int32, Int64, Double, Float] 59 | numericTypes = Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep (Cons typeRep Nil)))))) 60 | 61 | data TypeRepList (xs :: [Type]) where 62 | Nil :: TypeRepList '[] 63 | Cons :: Typeable x => TypeRep x -> TypeRepList xs -> TypeRepList (x ': xs) 64 | 65 | matchesAnyType :: forall a xs. (Typeable a) => TypeRepList xs -> TypeRep a -> Bool 66 | matchesAnyType Nil _ = False 67 | matchesAnyType (Cons ty tys) rep = 68 | case testEquality ty rep of 69 | Just Refl -> True 70 | Nothing -> matchesAnyType tys rep 71 | 72 | testUnboxable :: forall a . Typeable a => TypeRep a -> Bool 73 | testUnboxable x = matchesAnyType unboxableTypes (typeRep @a) 74 | 75 | testNumeric :: forall a . 
Typeable a => TypeRep a -> Bool 76 | testNumeric x = matchesAnyType numericTypes (typeRep @a) 77 | -------------------------------------------------------------------------------- /src/DataFrame/Operations/Aggregation.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ExplicitNamespaces #-} 2 | {-# LANGUAGE GADTs #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE RankNTypes #-} 5 | {-# LANGUAGE ScopedTypeVariables #-} 6 | {-# LANGUAGE TypeApplications #-} 7 | module DataFrame.Operations.Aggregation where 8 | 9 | import qualified Data.Set as S 10 | 11 | import qualified Data.List as L 12 | import qualified Data.Map as M 13 | import qualified Data.Map.Strict as MS 14 | import qualified Data.Text as T 15 | import qualified Data.Vector.Generic as VG 16 | import qualified Data.Vector as V 17 | import qualified Data.Vector.Mutable as VM 18 | import qualified Data.Vector.Unboxed as VU 19 | import qualified Statistics.Quantile as SS 20 | import qualified Statistics.Sample as SS 21 | 22 | import Control.Exception (throw) 23 | import Control.Monad (foldM_) 24 | import Control.Monad.ST (runST) 25 | import DataFrame.Internal.Column (Column(..), toColumn', getIndicesUnboxed, getIndices) 26 | import DataFrame.Internal.DataFrame (DataFrame(..), empty, getColumn) 27 | import DataFrame.Internal.Parsing 28 | import DataFrame.Internal.Types 29 | import DataFrame.Errors 30 | import DataFrame.Operations.Core 31 | import DataFrame.Operations.Subset 32 | import Data.Function ((&)) 33 | import Data.Hashable 34 | import Data.Maybe 35 | import Data.Type.Equality (type (:~:)(Refl), TestEquality(..)) 36 | import Type.Reflection (typeRep, typeOf) 37 | 38 | -- | O(k * n) groups the dataframe by the given rows aggregating the remaining rows 39 | -- into vector that should be reduced later. 
groupBy ::
  [T.Text] ->
  DataFrame ->
  DataFrame
groupBy names df
  | any (`notElem` columnNames df) names = throw $ ColumnNotFoundException (T.pack $ show $ names L.\\ columnNames df) "groupBy" (columnNames df)
  | otherwise = L.foldl' insertColumns initDf groupingColumns
  where
    -- Single map traversal: start a new index list for an unseen key,
    -- otherwise add the index while keeping the smallest one at the head.
    insertOrAdjust k v m = MS.insertWith (\_ seen -> appendWithFrontMin v seen) k [v] m
    -- Compute a hash of the key columns for each row.
    -- NOTE(review): rows are grouped by this hash alone, so two different
    -- keys whose hashes collide would land in the same group.
    values = V.generate (fst (dimensions df)) (mkRowRep df (S.fromList names))
    -- Create a mapping from the row hash to the list of indices that
    -- have that hash. This allows us to combine the indexes
    -- where the rows are the same.
    valueIndices = V.ifoldl' (\m index rowRep -> insertOrAdjust rowRep index m) M.empty values
    -- One index vector per group, with the smallest row index at the head.
    -- The groups are written in the order in which the map is folded, i.e.
    -- ascending hash order, not in the order of first appearance; the
    -- commented-out line below is the earlier variant that sorted by the
    -- smallest index.
    -- valueIndicesInitOrder = L.sortBy (compare `on` snd) $! MS.toList $ MS.map VU.head valueIndices
    valueIndicesInitOrder = runST $ do
      v <- VM.new (MS.size valueIndices)
      foldM_ (\i idxs -> VM.write v i (VU.fromList idxs) >> return (i + 1)) 0 valueIndices
      V.unsafeFreeze v

    -- These are the indexes of the grouping/key rows i.e the minimum elements
    -- of the list.
    keyIndices = VU.generate (VG.length valueIndicesInitOrder) (\i -> VG.head $ valueIndicesInitOrder VG.! i)
    -- This will be our main worker function in the fold that takes all
    -- indices and replaces each value in a column with a list of
    -- the elements with the indices where the grouped row
    -- values are the same.
    insertColumns = groupColumns valueIndicesInitOrder df
    -- Our initial DF will just be all the key columns added to an
    -- empty dataframe. The entries are deduped and follow the same group
    -- order as valueIndicesInitOrder.
    initDf = L.foldl' (mkGroupedColumns keyIndices df) empty names
    -- All the remaining columns, i.e. the ones we are not grouping by.
    groupingColumns = columnNames df L.\\ names

-- | Hashes the values that row @i@ holds in the columns listed in @names@.
--
-- The function returns a closure over the row index so that the
-- index-to-name map is built once when it is partially applied (as
-- 'groupBy' does) instead of once per row.
--
-- Calls 'error' when a key column is shorter than @i + 1@ or when a key
-- column is neither boxed, optional nor unboxed.
mkRowRep :: DataFrame -> S.Set T.Text -> Int -> Int
mkRowRep df names = \i -> hash $ V.ifoldl' (go i) [] (columns df)
  where
    indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
    nameAt k = indexMap M.! k
    tooShort i k =
      error $
        "Column "
          ++ T.unpack (nameAt k)
          ++ " has less items than "
          ++ "the other columns at index "
          ++ show i
    go :: Int -> [Double] -> Int -> Maybe Column -> [Double]
    go _ acc _ Nothing = acc
    go i acc k (Just col)
      | S.notMember (nameAt k) names = acc
      | otherwise = case col of
          BoxedColumn (c :: V.Vector a) ->
            maybe (tooShort i k) (\e -> hash' @a e : acc) (c V.!? i)
          OptionalColumn (c :: V.Vector (Maybe a)) ->
            maybe (tooShort i k) (\e -> hash' @(Maybe a) e : acc) (c V.!? i)
          UnboxedColumn (c :: VU.Vector a) ->
            maybe (tooShort i k) (\e -> hash' @a e : acc) (c VU.!? i)
          _ ->
            error $
              "Cannot group by column "
                ++ T.unpack (nameAt k)
                ++ ": unsupported column representation"

-- | Maps a value to a 'Double' for row hashing: a 'Double' is returned as
-- is, an 'Int' is converted, 'T.Text' is replaced by its hash and any other
-- type by the hash of its 'show' output.
hash' :: Columnable a => a -> Double
hash' value = case testEquality (typeOf value) (typeRep @Double) of
  Just Refl -> value
  Nothing -> case testEquality (typeOf value) (typeRep @Int) of
    Just Refl -> fromIntegral value
    Nothing -> case testEquality (typeOf value) (typeRep @T.Text) of
      Just Refl -> fromIntegral $ hash value
      Nothing -> fromIntegral $ hash (show value)

-- | Fold step that copies the key column @name@ of @df@, restricted to the
-- given row indices, into the accumulated dataframe.
mkGroupedColumns :: VU.Vector Int -> DataFrame -> DataFrame -> T.Text -> DataFrame
mkGroupedColumns indices df acc name =
  case (V.!) (columns df) (columnIndices df M.! name) of
    Nothing -> error "Unexpected"
    (Just (BoxedColumn column)) ->
      let vs = indices `getIndices` column
       in insertColumn name vs acc
    (Just (OptionalColumn column)) ->
      let vs = indices `getIndices` column
       in insertColumn name vs acc
    (Just (UnboxedColumn column)) ->
      let vs = indices `getIndicesUnboxed` column
       in insertUnboxedColumn name vs acc
    _ -> error $ "Unsupported column type for grouping key: " ++ T.unpack name

-- | Fold step that replaces the column @name@ of @df@ by one vector of
-- values per group (one entry of @indices@ per group) and inserts it into
-- the accumulated dataframe.
groupColumns :: V.Vector (VU.Vector Int) -> DataFrame -> DataFrame -> T.Text -> DataFrame
groupColumns indices df acc name =
  case (V.!) (columns df) (columnIndices df M.! name) of
    -- Nothing to add for an empty slot: keep what has been accumulated so
    -- far (returning the source dataframe would discard it).
    Nothing -> acc
    (Just (BoxedColumn column)) ->
      let vs = V.map (`getIndices` column) indices
       in insertColumn' name (Just $ GroupedBoxedColumn vs) acc
    (Just (OptionalColumn column)) ->
      let vs = V.map (`getIndices` column) indices
       in insertColumn' name (Just $ GroupedBoxedColumn vs) acc
    (Just (UnboxedColumn column)) ->
      let vs = V.map (`getIndicesUnboxed` column) indices
       in insertColumn' name (Just $ GroupedUnboxedColumn vs) acc
    _ -> error $ "Unsupported column type for grouping: " ++ T.unpack name

-- | The aggregations understood by 'groupByAgg', 'reduceByAgg' and
-- 'aggregate'.
data Aggregation = Count
                 | Mean
                 | Minimum
                 | Median
                 | Maximum
                 | Sum deriving (Show, Eq)

-- | Groups by the given columns and applies the aggregation. Only 'Count'
-- is implemented; any other aggregation calls 'error'.
groupByAgg :: Aggregation -> [T.Text] -> DataFrame -> DataFrame
groupByAgg agg columnNames df = case agg of
  Count -> insertColumnWithDefault @Int 1 (T.pack (show agg)) V.empty df
         & groupBy columnNames
         & reduceBy @Int VG.length "Count"
  _ -> error "UNIMPLEMENTED"

-- | O (k * n) Reduces a vector-valued column with a given function.
-- Calls 'error' when the column is not grouped or when its element type is
-- not @a@.
reduceBy ::
  forall a b . (Columnable a, Columnable b) =>
  (forall v . (VG.Vector v a) => v a -> b) ->
  T.Text ->
  DataFrame ->
  DataFrame
reduceBy f name df = case getColumn name df of
    Just ((GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) -> case testEquality (typeRep @a) (typeRep @a') of
      Just Refl -> insertColumn' name (Just $ toColumn' (VG.map f column)) df
      Nothing -> error "Type error"
    Just ((GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) -> case testEquality (typeRep @a) (typeRep @a') of
      Just Refl -> insertColumn' name (Just $ toColumn' (VG.map f column)) df
      Nothing -> error "Type error"
    _ -> error "Column is ungrouped"

-- | Reduces the grouped column @name@ with the given aggregation.
--
-- 'Mean' supports 'Int', 'Double' and 'Float' elements; 'Sum' supports
-- 'Int' and 'Double'. 'Median' is not implemented. Every unsupported case,
-- including a column that is missing or not grouped, calls 'error'.
reduceByAgg :: Aggregation
            -> T.Text
            -> DataFrame
            -> DataFrame
reduceByAgg agg name df = case agg of
  Count   -> case getColumn name df of
    Just ((GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) ->  insertColumn' name (Just $ toColumn' (VG.map VG.length column)) df
    Just ((GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) ->  insertColumn' name (Just $ toColumn' (VG.map VG.length column)) df
    _ -> error $ "Cannot count ungrouped Column: " ++ T.unpack name
  Mean    -> case getColumn name df of
    Just ((GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) -> case testEquality (typeRep @a') (typeRep @Int) of
      Just Refl -> insertColumn' name (Just $ toColumn' (VG.map (SS.mean . VG.map fromIntegral) column)) df
      Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
        Just Refl -> insertColumn' name (Just $ toColumn' (VG.map SS.mean column)) df
        Nothing -> case testEquality (typeRep @a') (typeRep @Float) of
          Just Refl -> insertColumn' name (Just $ toColumn' (VG.map (SS.mean . VG.map realToFrac) column)) df
          Nothing -> error $ "Cannot get mean of non-numeric column: " ++ T.unpack name -- Not sure what to do with no numeric - return nothing???
    Just ((GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) -> case testEquality (typeRep @a') (typeRep @Int) of
      Just Refl -> insertColumn' name (Just $ toColumn' (VG.map (SS.mean . VG.map fromIntegral) column)) df
      Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
        Just Refl -> insertColumn' name (Just $ toColumn' (VG.map SS.mean column)) df
        Nothing -> case testEquality (typeRep @a') (typeRep @Float) of
          Just Refl -> insertColumn' name (Just $ toColumn' (VG.map (SS.mean . VG.map realToFrac) column)) df
          Nothing -> error $ "Cannot get mean of non-numeric column: " ++ T.unpack name -- Not sure what to do with no numeric - return nothing???
    _ -> error $ "Cannot get mean of ungrouped Column: " ++ T.unpack name
  Minimum -> case getColumn name df of
    Just ((GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) ->  insertColumn' name (Just $ toColumn' (VG.map VG.minimum column)) df
    Just ((GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) ->  insertColumn' name (Just $ toColumn' (VG.map VG.minimum column)) df
    _ -> error $ "Cannot get minimum of ungrouped Column: " ++ T.unpack name
  Maximum -> case getColumn name df of
    Just ((GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) ->  insertColumn' name (Just $ toColumn' (VG.map VG.maximum column)) df
    Just ((GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) ->  insertColumn' name (Just $ toColumn' (VG.map VG.maximum column)) df
    _ -> error $ "Cannot get maximum of ungrouped Column: " ++ T.unpack name
  Sum -> case getColumn name df of
    Just ((GroupedBoxedColumn (column :: V.Vector (V.Vector a')))) -> case testEquality (typeRep @a') (typeRep @Int) of
      Just Refl -> insertColumn' name (Just $ toColumn' (VG.map VG.sum column)) df
      Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
        Just Refl -> insertColumn' name (Just $ toColumn' (VG.map VG.sum column)) df
        Nothing -> error $ "Cannot get sum of non-numeric column: " ++ T.unpack name -- Not sure what to do with no numeric - return nothing???
    Just ((GroupedUnboxedColumn (column :: V.Vector (VU.Vector a')))) -> case testEquality (typeRep @a') (typeRep @Int) of
      Just Refl -> insertColumn' name (Just $ toColumn' (VG.map VG.sum column)) df
      Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
        Just Refl -> insertColumn' name (Just $ toColumn' (VG.map VG.sum column)) df
        Nothing -> error $ "Cannot get sum of non-numeric column: " ++ T.unpack name -- Not sure what to do with no numeric - return nothing???
    _ -> error $ "Cannot get sum of ungrouped Column: " ++ T.unpack name
  _ -> error "UNIMPLEMENTED"

-- | Applies each (column, aggregation) pair to a grouped dataframe. Every
-- result is stored under the alias @<Aggregation>_<column>@ and the source
-- columns are excluded from the result.
aggregate :: [(T.Text, Aggregation)] -> DataFrame -> DataFrame
aggregate aggs df = let
    f (name, agg) d = cloneColumn name alias d & reduceByAgg agg alias
      where alias = (T.pack . show) agg <> "_" <> name
  in fold f aggs df & exclude (map fst aggs)


-- | Adds an element to a list whose head is its minimum, keeping the
-- minimum at the head. The order of the remaining elements is not sorted.
appendWithFrontMin :: (Ord a) => a -> [a] -> [a]
appendWithFrontMin x [] = [x]
appendWithFrontMin x xs@(f:rest)
  | x < f = x:xs
  | otherwise = f:x:rest
{-# INLINE appendWithFrontMin #-}

-- | Groups by every column of the dataframe, which leaves one row per
-- distinct combination of values.
distinct :: DataFrame -> DataFrame
distinct df = groupBy (columnNames df) df
--------------------------------------------------------------------------------
/src/DataFrame/Operations/Core.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE FlexibleContexts #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE BangPatterns #-}
module DataFrame.Operations.Core where

import qualified Data.List as L
import qualified Data.Map as M
import qualified Data.Map.Strict as MS
import qualified Data.Set as S
import qualified Data.Text as T
import qualified Data.Vector.Generic as VG
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU

import Control.Exception ( throw )
import DataFrame.Errors
import DataFrame.Internal.Column ( Column(..), toColumn', toColumn, columnLength, columnTypeString, expandColumn )
import DataFrame.Internal.DataFrame (DataFrame(..), getColumn, null, empty)
import DataFrame.Internal.Parsing (isNullish)
import DataFrame.Internal.Types (Columnable)
import Data.Either
import Data.Function (on, (&))
import Data.Maybe
import Data.Type.Equality (type (:~:)(Refl), TestEquality(..))
import Type.Reflection
import Prelude hiding (null)

-- | O(1) Get DataFrame dimensions i.e. (rows, columns)
dimensions :: DataFrame -> (Int, Int)
dimensions = dataframeDimensions
{-# INLINE dimensions #-}

-- | O(k) Get column names of the DataFrame in order of insertion.
columnNames :: DataFrame -> [T.Text]
columnNames = map fst . L.sortBy (compare `on` snd). M.toList . columnIndices
{-# INLINE columnNames #-}

-- | /O(n)/ Adds a vector to the dataframe.
insertColumn ::
  forall a.
  (Columnable a) =>
  -- | Column Name
  T.Text ->
  -- | Vector to add to column
  V.Vector a ->
  -- | DataFrame to add column to
  DataFrame ->
  DataFrame
insertColumn name xs = insertColumn' name (Just (toColumn' xs))
{-# INLINE insertColumn #-}

-- | Inserts a copy of the column @original@ under the name @new@. Throws
-- 'ColumnNotFoundException' when @original@ does not exist.
cloneColumn :: T.Text -> T.Text -> DataFrame -> DataFrame
cloneColumn original new df = fromMaybe (throw $ ColumnNotFoundException original "cloneColumn" (map fst $ M.toList $ columnIndices df)) $ do
  column <- getColumn original df
  return $ insertColumn' new (Just column) df

-- | /O(n)/ Adds an unboxed vector to the dataframe.
insertUnboxedColumn ::
  forall a.
  (Columnable a, VU.Unbox a) =>
  -- | Column Name
  T.Text ->
  -- | Unboxed vector to add to column
  VU.Vector a ->
  -- | DataFrame to add to column
  DataFrame ->
  DataFrame
insertUnboxedColumn name xs = insertColumn' name (Just (UnboxedColumn xs))

-- | /O(n)/ Add a column to the dataframe. Not meant for external use.
--
-- An existing column with the same name is replaced in place. A new column
-- takes the next free slot; when there is none the slot vector is grown
-- (doubled, and by at least one slot so that a zero-capacity vector also
-- works). Columns of differing lengths are expanded with 'expandColumn'.
insertColumn' ::
  -- | Column Name
  T.Text ->
  -- | Column to add
  Maybe Column ->
  -- | DataFrame to add to column
  DataFrame ->
  DataFrame
insertColumn' _ Nothing d = d
insertColumn' name optCol@(Just column) d
    | M.member name (columnIndices d) = let
        i = (M.!) (columnIndices d) name
      in d { columns = columns d V.// [(i, optCol)] }
    | otherwise = insertNewColumn
    where
        l = columnLength column
        (r, c) = dataframeDimensions d
        diff = abs (l - r)
        insertNewColumn
            -- If we have a non-empty dataframe and we have more rows in the new column than the other column
            -- we should make all the other columns have null and then add the new column.
            | r > 0 && l > r = let
                indexes = (map snd . L.sortBy (compare `on` snd). M.toList . columnIndices) d
                nonEmptyColumns = map (\i -> maybe (error "Unexpected") (expandColumn diff) (columns d V.! i)) indexes
              in fromList (zip (columnNames d ++ [name]) (nonEmptyColumns ++ [column]))
            | otherwise = let
                capacity = VG.length (columns d)
                -- Grow by at least one slot so a slot is always available.
                growth = max 1 capacity
                (n, rest, columns') = case freeIndices d of
                    [] -> ( capacity
                          , [capacity + 1 .. capacity + growth - 1]
                          , columns d V.++ V.replicate growth Nothing
                          )
                    (x:xs) -> (x, xs, columns d)
                xs'
                    | diff <= 0 || null d = optCol
                    | otherwise = expandColumn diff <$> optCol
              in d
                  { columns = columns' V.// [(n, xs')],
                    columnIndices = M.insert name n (columnIndices d),
                    freeIndices = rest,
                    dataframeDimensions = (max l r, c + 1)
                  }

-- | /O(k)/ Add a column to the dataframe providing a default.
-- This constructs a new vector and also may convert it
-- to an unboxed vector if necessary. Since columns are usually
-- large the runtime is dominated by the length of the list, k.
insertColumnWithDefault ::
  forall a.
  (Columnable a) =>
  -- | Default Value
  a ->
  -- | Column name
  T.Text ->
  -- | Data to add to column
  V.Vector a ->
  -- | DataFrame to add to column
  DataFrame ->
  DataFrame
insertColumnWithDefault defaultValue name xs d =
  let (rows, _) = dataframeDimensions d
      values = xs V.++ V.replicate (rows - V.length xs) defaultValue
   in insertColumn' name (Just $ toColumn' values) d

-- | Renames the column @orig@ to @new@. Throws 'ColumnNotFoundException'
-- when @orig@ does not exist.
--
-- TODO: Add an existence check for @new@; an existing entry for @new@ in
-- the name-to-index map is currently overwritten.
rename :: T.Text -> T.Text -> DataFrame -> DataFrame
rename orig new df = fromMaybe (throw $ ColumnNotFoundException orig "rename" (map fst $ M.toList $ columnIndices df)) $ do
  columnIndex <- M.lookup orig (columnIndices df)
  let origRemoved = M.delete orig (columnIndices df)
  let newAdded = M.insert new columnIndex origRemoved
  return df { columnIndices = newAdded }

-- | O(1) Get the number of elements in a given column.
columnSize :: T.Text -> DataFrame -> Maybe Int
columnSize name df = columnLength <$> getColumn name df

-- | Summary of a single column as reported by 'columnInfo'.
data ColumnInfo = ColumnInfo {
    nameOfColumn :: !T.Text,
    nonNullValues :: !Int,
    nullValues :: !Int,
    partiallyParsedValues :: !Int,
    uniqueValues :: !Int,
    typeOfColumn :: !T.Text
  }

-- | O(n) Returns, for each column, the number of non-null, null, partially
-- parsed and unique values together with the type associated with the
-- column. The result is built from the per-column infos sorted by their
-- number of non-null values.
columnInfo :: DataFrame -> DataFrame
columnInfo df = empty & insertColumn' "Column Name" (Just $! toColumn (map nameOfColumn infos))
                      & insertColumn' "# Non-null Values" (Just $! toColumn (map nonNullValues infos))
                      & insertColumn' "# Null Values" (Just $! toColumn (map nullValues infos))
                      & insertColumn' "# Partially parsed" (Just $! toColumn (map partiallyParsedValues infos))
                      & insertColumn' "# Unique Values" (Just $! toColumn (map uniqueValues infos))
                      & insertColumn' "Type" (Just $! toColumn (map typeOfColumn infos))
  where
    infos = L.sortBy (compare `on` nonNullValues) (V.ifoldl' go [] (columns df)) :: [ColumnInfo]
    indexMap = M.fromList (map (\(a, b) -> (b, a)) $ M.toList (columnIndices df))
    columnName i = M.lookup i indexMap
    go acc i Nothing = acc
    go acc i (Just col@(OptionalColumn (c :: V.Vector a))) = let
        cname = columnName i
        countNulls = nulls col
        countPartial = partiallyParsed col
        columnType = T.pack $ show $ typeRep @a
        unique = S.size $ VG.foldr S.insert S.empty c
      in if isNothing cname then acc else ColumnInfo (fromMaybe "" cname) (columnLength col - countNulls) countNulls countPartial unique columnType : acc
    go acc i (Just col@(BoxedColumn (c :: V.Vector a))) = let
        cname = columnName i
        countPartial = partiallyParsed col
        columnType = T.pack $ show $ typeRep @a
        unique = S.size $ VG.foldr S.insert S.empty c
      in if isNothing cname then acc else ColumnInfo (fromMaybe "" cname) (columnLength col) 0 countPartial unique columnType : acc
    go acc i (Just col@(UnboxedColumn c)) = let
        cname = columnName i
        columnType = T.pack $ columnTypeString col
        unique = S.size $ VG.foldr S.insert S.empty c
        -- Unboxed columns cannot have nulls since Maybe
        -- is not an instance of Unbox a
      in if isNothing cname then acc else ColumnInfo (fromMaybe "" cname) (columnLength col) 0 0 unique columnType : acc

-- | Counts the null values of a column: 'Nothing' entries of optional
-- columns and of boxed @Maybe@ columns, and nullish entries (see
-- 'isNullish') of boxed 'T.Text' and 'String' columns. Everything else
-- counts as 0.
nulls :: Column -> Int
nulls (OptionalColumn xs) = VG.length $ VG.filter isNothing xs
nulls (BoxedColumn (xs :: V.Vector a)) = case testEquality (typeRep @a) (typeRep @T.Text) of
  Just Refl -> VG.length $ VG.filter isNullish xs
  Nothing -> case testEquality (typeRep @a) (typeRep @String) of
    Just Refl -> VG.length $ VG.filter (isNullish . T.pack) xs
    Nothing -> case typeRep @a of
      App t1 t2 -> case eqTypeRep t1 (typeRep @Maybe) of
        Just HRefl -> VG.length $ VG.filter isNothing xs
        Nothing -> 0
      _ -> 0
nulls _ = 0

-- | Counts the 'Left' entries of a boxed @Either@ column; 0 for every other
-- column.
partiallyParsed :: Column -> Int
partiallyParsed (BoxedColumn (xs :: V.Vector a)) =
  case typeRep @a of
    App (App tycon t1) t2 -> case eqTypeRep tycon (typeRep @Either) of
      Just HRefl -> VG.length $ VG.filter isLeft xs
      Nothing -> 0
    _ -> 0
partiallyParsed _ = 0

-- | Builds a dataframe by inserting the named columns from left to right.
fromList :: [(T.Text, Column)] -> DataFrame
fromList = L.foldl' (\df (!name, !column) -> insertColumn' name (Just $! column) df) empty

-- | Like 'fromList', naming the columns "0", "1", ...
fromColumnList :: [Column] -> DataFrame
fromColumnList = fromList . zip (map (T.pack . show) [0..])

-- | O (k * n) Counts the occurrences of each value in a given column,
-- returned in ascending order of the values.
--
-- Throws 'ColumnNotFoundException' when the column does not exist and
-- 'TypeMismatchException' when its element type is not @a@.
valueCounts :: forall a. (Columnable a) => T.Text -> DataFrame -> [(a, Int)]
valueCounts columnName df = case getColumn columnName df of
      Nothing -> throw $ ColumnNotFoundException columnName "valueCounts" (map fst $ M.toList $ columnIndices df)
      Just (BoxedColumn (column' :: V.Vector c)) ->
        let
          column = V.foldl' (\m v -> MS.insertWith (+) v (1 :: Int) m) M.empty column'
        in case (typeRep @a) `testEquality` (typeRep @c) of
              Nothing -> throw $ TypeMismatchException (typeRep @a) (typeRep @c) columnName "valueCounts"
              Just Refl -> M.toAscList column
      Just (OptionalColumn (column' :: V.Vector c)) ->
        let
          column = V.foldl' (\m v -> MS.insertWith (+) v (1 :: Int) m) M.empty column'
        in case (typeRep @a) `testEquality` (typeRep @c) of
              Nothing -> throw $ TypeMismatchException (typeRep @a) (typeRep @c) columnName "valueCounts"
              Just Refl -> M.toAscList column
      Just (UnboxedColumn (column' :: VU.Vector c)) -> let
          column = V.foldl' (\m v -> MS.insertWith (+) v (1 :: Int) m) M.empty (V.convert column')
        in case (typeRep @a) `testEquality` (typeRep @c) of
          Nothing -> throw $ TypeMismatchException (typeRep @a) (typeRep @c) columnName "valueCounts"
          Just Refl -> M.toAscList column

-- | Applies the step function for each list element, left to right, to the
-- dataframe.
fold :: (a -> DataFrame -> DataFrame) -> [a] -> DataFrame -> DataFrame
fold f xs acc = L.foldl' (flip f) acc xs
--------------------------------------------------------------------------------
/src/DataFrame/Operations/Sorting.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE OverloadedStrings #-}
module DataFrame.Operations.Sorting where

import qualified Data.List as L
import qualified Data.Text as T
import qualified Data.Vector as V

import Control.Exception (throw)
import DataFrame.Errors (DataFrameException(..))
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame (DataFrame(..), getColumn)
import DataFrame.Internal.Row
import DataFrame.Operations.Core

-- | Sort order taken as a parameter by the sortBy function.
data SortOrder = Ascending | Descending deriving (Eq)

-- | O(k log n) Sorts the dataframe by the given columns. Throws
-- 'ColumnNotFoundException' when any of the names is not a column.
--
-- > sortBy Ascending ["Age"] df
sortBy ::
  SortOrder ->
  [T.Text] ->
  DataFrame ->
  DataFrame
sortBy order names df
  | any (`notElem` columnNames df) names = throw $ ColumnNotFoundException (T.pack $ show $ names L.\\ columnNames df) "sortBy" (columnNames df)
  | otherwise = let
      -- TODO: Remove the SortOrder definition from operations so we can share it between here and internal and
      -- we don't have to do this Bool mapping.
      indexes = sortedIndexes' (order == Ascending) (toRowVector names df)
      pick idxs col = atIndicesStable idxs <$> col
    in df {columns = V.map (pick indexes) (columns df)}
--------------------------------------------------------------------------------
/src/DataFrame/Operations/Statistics.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE ExplicitNamespaces #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE StrictData #-}
module DataFrame.Operations.Statistics where

import qualified Data.List as L
import qualified Data.Text as T
import qualified Data.Vector.Generic as VG
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import qualified Statistics.Quantile as SS
import qualified Statistics.Sample as SS

import Prelude as P

import Control.Exception (throw)
import DataFrame.Errors (DataFrameException(..))
import DataFrame.Internal.Column
import DataFrame.Internal.DataFrame (DataFrame(..), getColumn, empty)
import DataFrame.Internal.Types (Columnable, transform)
import DataFrame.Operations.Core
import Data.Foldable (asum)
import Data.Maybe (isJust, fromMaybe)
import Data.Function ((&))
import Data.Type.Equality (type (:~:)(Refl), TestEquality (testEquality))
import Type.Reflection (typeRep)


-- | Builds a dataframe with one column per distinct value of the given
-- column, holding that value's count and its integer percentage of the
-- total. Throws 'ColumnNotFoundException' when the column does not exist.
frequencies :: T.Text -> DataFrame -> DataFrame
frequencies name df = case getColumn name df of
  Just ((BoxedColumn (_ :: V.Vector a))) -> frequencyTable @a name df
  Just ((OptionalColumn (_ :: V.Vector a))) -> frequencyTable @a name df
  Just ((UnboxedColumn (_ :: VU.Vector a))) -> frequencyTable @a name df
  Nothing -> throw $ ColumnNotFoundException name "frequencies" (columnNames df)

-- | Worker for 'frequencies' once the element type of the column is known.
frequencyTable :: forall a . (Columnable a) => T.Text -> DataFrame -> DataFrame
frequencyTable name df = L.foldl' addValue initDf counts
  where
    counts = valueCounts @a name df
    total = P.sum $ map snd counts
    -- Text and String values are used as column names directly; every other
    -- type goes through 'show'.
    vText :: a -> T.Text
    vText c' = case testEquality (typeRep @a) (typeRep @T.Text) of
      Just Refl -> c'
      Nothing -> case testEquality (typeRep @a) (typeRep @String) of
        Just Refl -> T.pack c'
        Nothing -> (T.pack . show) c'
    initDf = empty & insertColumn "Statistic" (V.fromList ["Count" :: T.Text, "Percentage (%)"])
    addValue acc (col, k) = insertColumn (vText col) (V.fromList [k, k * 100 `div` total]) acc

mean :: T.Text -> DataFrame -> Maybe Double
mean = applyStatistic SS.mean

median :: T.Text -> DataFrame -> Maybe Double
median = applyStatistic (SS.median SS.medianUnbiased)

standardDeviation :: T.Text -> DataFrame -> Maybe Double
standardDeviation = applyStatistic SS.fastStdDev

skewness :: T.Text -> DataFrame -> Maybe Double
skewness = applyStatistic SS.skewness

variance :: T.Text -> DataFrame -> Maybe Double
variance = applyStatistic SS.variance

interQuartileRange :: T.Text -> DataFrame -> Maybe Double
interQuartileRange = applyStatistic (SS.midspread SS.medianUnbiased 4)

-- | Correlation of two columns; 'Nothing' unless both are unboxed 'Double'
-- or 'Int' columns.
correlation :: T.Text -> T.Text -> DataFrame -> Maybe Double
correlation first second df = do
  f <- _getColumnAsDouble first df
  s <- _getColumnAsDouble second df
  return $ SS.correlation (VG.zip f s)

-- | Returns an unboxed 'Double' or 'Int' column as a vector of 'Double';
-- 'Nothing' for anything else.
_getColumnAsDouble :: T.Text -> DataFrame -> Maybe (VU.Vector Double)
_getColumnAsDouble name df = case getColumn name df of
  Just (UnboxedColumn (f :: VU.Vector a)) -> case testEquality (typeRep @a) (typeRep @Double) of
    Just Refl -> Just f
    Nothing -> case testEquality (typeRep @a) (typeRep @Int) of
      Just Refl -> Just $ VU.map fromIntegral f
      Nothing -> Nothing
  _ -> Nothing

-- | Sum of an unboxed 'Int' or 'Double' column. Returns 'Nothing' for a
-- missing column and for any other kind of column.
sum :: T.Text -> DataFrame -> Maybe Double
sum name df = case getColumn name df of
  Just ((UnboxedColumn (column :: VU.Vector a'))) -> case testEquality (typeRep @a') (typeRep @Int) of
    Just Refl -> Just $ VG.sum (VU.map fromIntegral column)
    Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
      Just Refl -> Just $ VG.sum column
      Nothing -> Nothing
  -- Covers both a missing column and a column that is not unboxed.
  _ -> Nothing

-- | Applies a statistic to a column. Columns whose type string is not
-- "Double" are first run through the 'Int', 'Integer' and 'Float'
-- conversions, falling back to the column itself.
applyStatistic :: (VU.Vector Double -> Double) -> T.Text -> DataFrame -> Maybe Double
applyStatistic f name df = do
      column <- getColumn name df
      if columnTypeString column == "Double"
      then safeReduceColumn f column
      else do
        matching <- asum [transform (fromIntegral :: Int -> Double) column,
                          transform (fromIntegral :: Integer -> Double) column,
                          transform (realToFrac :: Float -> Double) column,
                          Just column ]
        safeReduceColumn f matching

-- | Applies a vector-valued statistic to an unboxed 'Int', 'Double' or
-- 'Float' column; 'Nothing' for anything else.
applyStatistics :: (VU.Vector Double -> VU.Vector Double) -> T.Text -> DataFrame -> Maybe (VU.Vector Double)
applyStatistics f name df = case getColumn name df of
  Just ((UnboxedColumn (column :: VU.Vector a'))) -> case testEquality (typeRep @a') (typeRep @Int) of
    Just Refl -> Just $! f (VU.map fromIntegral column)
    Nothing -> case testEquality (typeRep @a') (typeRep @Double) of
      Just Refl -> Just $! f column
      Nothing -> case testEquality (typeRep @a') (typeRep @Float) of
        Just Refl -> Just $! f (VG.map realToFrac column)
        Nothing -> Nothing
  _ -> Nothing

134 | summarize :: DataFrame -> DataFrame 135 | summarize df = fold columnStats (columnNames df) (fromList [("Statistic", toColumn ["Mean" :: T.Text, "Minimum", "25%" ,"Median", "75%", "Max", "StdDev", "IQR", "Skewness"])]) 136 | where columnStats name d = if all isJust (stats name) then insertUnboxedColumn name (VU.fromList (map (roundTo 2 . fromMaybe 0) $ stats name)) d else d 137 | stats name = let 138 | quantiles = applyStatistics (SS.quantilesVec SS.medianUnbiased (VU.fromList [0,1,2,3,4]) 4) name df 139 | min' = flip (VG.!) 0 <$> quantiles 140 | quartile1 = flip (VG.!) 1 <$> quantiles 141 | median' = flip (VG.!) 2 <$> quantiles 142 | quartile3 = flip (VG.!) 3 <$> quantiles 143 | max' = flip (VG.!) 
4 <$> quantiles 144 | iqr = (-) <$> quartile3 <*> quartile1 145 | in [mean name df, 146 | min', 147 | quartile1, 148 | median', 149 | quartile3, 150 | max', 151 | standardDeviation name df, 152 | iqr, 153 | skewness name df] 154 | roundTo :: Int -> Double -> Double 155 | roundTo n x = fromInteger (round $ x * (10^n)) / (10.0^^n) 156 | -------------------------------------------------------------------------------- /src/DataFrame/Operations/Subset.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE RankNTypes #-} 4 | {-# LANGUAGE ScopedTypeVariables #-} 5 | {-# LANGUAGE TypeApplications #-} 6 | {-# LANGUAGE GADTs #-} 7 | module DataFrame.Operations.Subset where 8 | 9 | import qualified Data.List as L 10 | import qualified Data.Map as M 11 | import qualified Data.Set as S 12 | import qualified Data.Text as T 13 | import qualified Data.Vector as V 14 | import qualified Data.Vector.Unboxed as VU 15 | import qualified Data.Vector.Generic as VG 16 | import qualified Prelude 17 | 18 | import Control.Exception (throw) 19 | import DataFrame.Errors (DataFrameException(..)) 20 | import DataFrame.Internal.Column 21 | import DataFrame.Internal.DataFrame (DataFrame(..), getColumn, empty) 22 | import DataFrame.Internal.Function 23 | import DataFrame.Internal.Row (mkRowFromArgs) 24 | import DataFrame.Internal.Types (Columnable, RowValue, toRowValue) 25 | import DataFrame.Operations.Core 26 | import DataFrame.Operations.Transformations (apply) 27 | import Data.Function ((&)) 28 | import Data.Maybe (isJust, fromJust, fromMaybe) 29 | import Prelude hiding (filter, take) 30 | import Type.Reflection 31 | 32 | -- | O(k * n) Take the first n rows of a DataFrame. 
take :: Int -> DataFrame -> DataFrame
take n d = d {columns = V.map (takeColumn n' <$>) (columns d), dataframeDimensions = (n', c)}
  where
    (r, c) = dataframeDimensions d
    -- Negative or oversized requests are clipped to [0, r].
    n' = clip n 0 r

-- | Keep the last n rows of a DataFrame; n is clipped to [0, row count].
takeLast :: Int -> DataFrame -> DataFrame
takeLast n d = d {columns = V.map (takeLastColumn n' <$>) (columns d), dataframeDimensions = (n', c)}
  where
    (r, c) = dataframeDimensions d
    n' = clip n 0 r

-- | Remove the first n rows of a DataFrame; n is clipped to [0, row count].
drop :: Int -> DataFrame -> DataFrame
drop n d = d {columns = V.map (sliceColumn n' remaining <$>) (columns d), dataframeDimensions = (remaining, c)}
  where
    (r, c) = dataframeDimensions d
    n' = clip n 0 r
    remaining = max (r - n') 0

-- | Remove the last n rows of a DataFrame, keeping the first
-- @row count - n@ rows (clipped to [0, row count]).
dropLast :: Int -> DataFrame -> DataFrame
dropLast n d = d {columns = V.map (sliceColumn 0 n' <$>) (columns d), dataframeDimensions = (n', c)}
  where
    (r, c) = dataframeDimensions d
    n' = clip (r - n) 0 r

-- | O(k * n) Take a range of rows of a DataFrame.
--
-- The slice starts at @start@ clipped to [0, row count] and asks for
-- @end - start@ rows. That length is now bounded by the rows remaining after
-- the clipped start; it used to be bounded by the total row count, so a
-- request such as @range (5, 20)@ on a 10-row frame asked 'sliceColumn' for
-- 10 rows starting at row 5.
range :: (Int, Int) -> DataFrame -> DataFrame
range (start, end) d = d {columns = V.map (sliceColumn start' n' <$>) (columns d), dataframeDimensions = (n', c)}
  where
    (r, c) = dataframeDimensions d
    start' = clip start 0 r
    n' = clip (end - start) 0 (r - start')

-- | Clamp @n@ into the closed interval [left, right].
clip :: Int -> Int -> Int -> Int
clip n left right = min right $ max n left

-- | O(n * k) Filter rows by a given condition.
--
-- > filter "x" even df
--
-- Throws 'ColumnNotFoundException' when the column is missing and
-- 'TypeMismatchException'' when the predicate's argument type does not match
-- the column.
filter ::
  forall a.
  (Columnable a) =>
  -- | Column to filter by
  T.Text ->
  -- | Filter condition
  (a -> Bool) ->
  -- | Dataframe to filter
  DataFrame ->
  DataFrame
filter filterColumnName condition df = case getColumn filterColumnName df of
  Nothing -> throw $ ColumnNotFoundException filterColumnName "filter" (map fst $ M.toList $ columnIndices df)
  Just column -> case ifoldlColumn (\s i v -> if condition v then S.insert i s else s) S.empty column of
    Nothing -> throw $ TypeMismatchException' (typeRep @a) (columnTypeString column) filterColumnName "filter"
    Just indexes ->
      let c' = snd $ dataframeDimensions df
          pick idxs col = atIndices idxs <$> col
       in df {columns = V.map (pick indexes) (columns df), dataframeDimensions = (S.size indexes, c')}

-- | A version of 'filter' where the predicate comes first.
--
-- > filterBy even "x" df
filterBy :: (Columnable a) => (a -> Bool) -> T.Text -> DataFrame -> DataFrame
filterBy = flip filter

-- | Filters the dataframe with a row predicate. The arguments in the function
-- must appear in the same order as they do in the list.
--
-- > filterWhere (["x", "y"], func (\x y -> x + y > 5)) df
filterWhere :: ([T.Text], Function) -> DataFrame -> DataFrame
filterWhere (args, f) df =
  let rows = V.generate (fst (dimensions df)) (mkRowFromArgs args df)
      indexes = VG.ifoldl' (\s i row -> if funcApply @Bool row f then S.insert i s else s) S.empty rows
      c' = snd $ dataframeDimensions df
      pick idxs col = atIndices idxs <$> col
   in df {columns = V.map (pick indexes) (columns df), dataframeDimensions = (S.size indexes, c')}


-- | Removes all rows with `Nothing` in a given column from the dataframe and
-- unwraps the remaining values. Columns that are not optional are returned
-- unchanged.
--
-- > filterJust "x" df
--
-- Throws 'ColumnNotFoundException' when the column is missing.
filterJust :: T.Text -> DataFrame -> DataFrame
filterJust name df = case getColumn name df of
  Nothing -> throw $ ColumnNotFoundException name "filterJust" (map fst $ M.toList $ columnIndices df)
  Just (OptionalColumn (_ :: V.Vector (Maybe a))) -> filter @(Maybe a) name isJust df & apply @(Maybe a) fromJust name
  Just _ -> df

-- | O(n * k) removes all rows with `Nothing` from the dataframe by applying
-- 'filterJust' to every column.
--
-- > filterAllJust df
filterAllJust :: DataFrame -> DataFrame
filterAllJust df = foldr filterJust df (columnNames df)

-- | Cuts the dataframe in a cube of size (a, b) where
-- a is the length and b is the width.
--
-- > cube (10, 5) df
cube :: (Int, Int) -> DataFrame -> DataFrame
cube (len, width) = take len . selectIntRange (0, width - 1)

-- | O(n) Selects a number of columns in a given dataframe.
--
-- > select ["name", "age"] df
--
-- An empty list yields the 'empty' dataframe. Throws
-- 'ColumnNotFoundException' (listing the unknown names) when any requested
-- column does not exist.
select ::
  [T.Text] ->
  DataFrame ->
  DataFrame
select cs df
  | L.null cs = empty
  | any (`notElem` columnNames df) cs = throw $ ColumnNotFoundException (T.pack $ show $ cs L.\\ columnNames df) "select" (columnNames df)
  | otherwise =
      -- The result does not depend on the individual names, so it is built
      -- once rather than re-built in a fold over @cs@.
      empty
        { columns = V.imap (\i v -> if i `notElem` indexes then Nothing else v) (columns df)
        , columnIndices = M.fromList remaining
        , freeIndices = map snd removed ++ freeIndices df
        , dataframeDimensions = (r, L.length remaining)
        }
  where
    cIndexAssoc = M.toList $ columnIndices df
    remaining = L.filter (\(!c, _) -> c `elem` cs) cIndexAssoc
    removed = cIndexAssoc L.\\ remaining
    indexes = map snd remaining
    (r, _) = dataframeDimensions df

-- | O(n) select columns by index range of column names.
selectIntRange :: (Int, Int) -> DataFrame -> DataFrame
selectIntRange (from, to) df = select wanted df
  where
    -- Inclusive on both ends, hence the + 1.
    width = to - from + 1
    wanted = Prelude.take width (Prelude.drop from (columnNames df))

-- | O(n) select the columns from the first occurrence of the first name up to
-- the last occurrence of the second name, in 'columnNames' order.
selectRange :: (T.Text, T.Text) -> DataFrame -> DataFrame
selectRange (from, to) df = select bounded df
  where
    fromStart = dropWhile (from /=) (columnNames df)
    -- Trim everything after the last occurrence of @to@.
    bounded = L.dropWhileEnd (to /=) fromStart

-- | O(n) select the columns whose name satisfies the predicate.
selectBy :: (T.Text -> Bool) -> DataFrame -> DataFrame
selectBy f df = select [name | name <- columnNames df, f name] df

-- | O(n) inverse of select
--
-- > exclude ["Name"] df
exclude ::
  [T.Text] ->
  DataFrame ->
  DataFrame
exclude cs df = select keysToKeep df
  where
    keysToKeep = columnNames df L.\\ cs

--------------------------------------------------------------------------------
/src/DataFrame/Operations/Transformations.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE RankNTypes #-}
{-# LANGUAGE ScopedTypeVariables #-}
{-# LANGUAGE TypeApplications #-}
module DataFrame.Operations.Transformations where

import qualified Data.List as L
import qualified Data.Text as T
import qualified Data.Map as M
import qualified Data.Vector.Generic as VG
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU

import Control.Exception (throw)
import DataFrame.Errors (DataFrameException(..))
import DataFrame.Internal.Column (Column(..), columnTypeString, itransform, ifoldrColumn)
import DataFrame.Internal.DataFrame (DataFrame(..), getColumn)
import DataFrame.Internal.Function (Function(..), funcApply)
import DataFrame.Internal.Row (mkRowFromArgs)
import DataFrame.Internal.Types (Columnable, RowValue, toRowValue, transform)
import DataFrame.Operations.Core
import Data.Maybe
import Type.Reflection (typeRep, typeOf)

-- | O(k) Apply a function to a given column in a dataframe, replacing the
-- column in place.
--
-- Throws 'ColumnNotFoundException' when the column is missing and
-- 'TypeMismatchException'' when 'transform' rejects the function for that
-- column.
apply ::
  forall b c.
  (Columnable b, Columnable c) =>
  -- | function to apply
  (b -> c) ->
  -- | Column name
  T.Text ->
  -- | DataFrame to apply operation to
  DataFrame ->
  DataFrame
apply f columnName d = case getColumn columnName d of
  Nothing -> throw $ ColumnNotFoundException columnName "apply" (map fst $ M.toList $ columnIndices d)
  Just column -> case transform f column of
    Nothing -> throw $ TypeMismatchException' (typeRep @b) (columnTypeString column) columnName "apply"
    column' -> insertColumn' columnName column' d

-- | O(k) Apply a function to a combination of columns in a dataframe and
-- add the result into `alias` column.
--
-- One row is built per dataframe row from @args@ (via 'mkRowFromArgs') and
-- the function is applied to it with 'funcApply'; the result type is taken
-- from the arity-specific 'Function' constructor.
deriveFrom :: ([T.Text], Function) -> T.Text -> DataFrame -> DataFrame
deriveFrom (args, f) name df = case f of
  (F4 (_ :: a -> b -> c -> d -> e)) -> insertColumn name (VG.map (\row -> funcApply @e row f) rows) df
  (F3 (_ :: a -> b -> c -> d)) -> insertColumn name (VG.map (\row -> funcApply @d row f) rows) df
  (F2 (_ :: a -> b -> c)) -> insertColumn name (VG.map (\row -> funcApply @c row f) rows) df
  (F1 (_ :: a -> b)) -> insertColumn name (VG.map (\row -> funcApply @b row f) rows) df
  where
    -- The argument rows are the same for every arity, so build them once.
    rows = V.generate (fst (dimensions df)) (mkRowFromArgs args df)

-- | O(k) Apply a function to a given column in a dataframe and
-- add the result into alias column.
--
-- Throws 'ColumnNotFoundException' when the source column is missing and
-- 'TypeMismatchException' when 'transform' rejects the function.
derive ::
  forall b c.
  (Columnable b, Columnable c) =>
  -- | New name
  T.Text ->
  -- | function to apply
  (b -> c) ->
  -- | Derivative column name
  T.Text ->
  -- | DataFrame to apply operation to
  DataFrame ->
  DataFrame
derive alias f columnName d = case getColumn columnName d of
  Nothing -> throw $ ColumnNotFoundException columnName "derive" (map fst $ M.toList $ columnIndices d)
  Just column -> case transform f column of
    -- NOTE(review): 'typeOf column' is the type of the 'Column' wrapper, not
    -- of its elements; the sibling functions report 'columnTypeString column'
    -- through TypeMismatchException'. Left unchanged in case callers or tests
    -- depend on the current message -- confirm before aligning.
    Nothing -> throw $ TypeMismatchException (typeOf column) (typeRep @b) columnName "derive"
    Just res -> insertColumn' alias (Just res) d

-- | O(k * n) Apply a function to given column names in a dataframe.
applyMany ::
  (Columnable b, Columnable c) =>
  (b -> c) ->
  [T.Text] ->
  DataFrame ->
  DataFrame
applyMany f names df = L.foldl' (flip (apply f)) df names

-- | O(k) Convenience function that applies to an int column.
applyInt ::
  (Columnable b) =>
  -- | function to apply
  (Int -> b) ->
  -- | Column name
  T.Text ->
  -- | DataFrame to apply operation to
  DataFrame ->
  DataFrame
applyInt = apply

-- | O(k) Convenience function that applies to an double column.
applyDouble ::
  (Columnable b) =>
  -- | function to apply
  (Double -> b) ->
  -- | Column name
  T.Text ->
  -- | DataFrame to apply operation to
  DataFrame ->
  DataFrame
applyDouble = apply

-- | O(k * n) Apply a function to a column only if there is another column
-- value that matches the given criterion.
--
-- > applyWhere (<20) "Age" (const "Gen-Z") "Generation" df
--
-- When no row matches, the dataframe is returned unchanged and the target
-- column is never looked up.
applyWhere ::
  forall a b .
  (Columnable a, Columnable b) =>
  (a -> Bool) -> -- Filter condition
  T.Text -> -- Criterion Column
  (b -> b) -> -- function to apply
  T.Text -> -- Column name
  DataFrame -> -- DataFrame to apply operation to
  DataFrame
applyWhere condition filterColumnName f columnName df = case getColumn filterColumnName df of
  Nothing -> throw $ ColumnNotFoundException filterColumnName "applyWhere" (map fst $ M.toList $ columnIndices df)
  Just column -> case ifoldrColumn (\i val acc -> if condition val then V.cons i acc else acc) V.empty column of
    Nothing -> throw $ TypeMismatchException' (typeRep @a) (columnTypeString column) filterColumnName "applyWhere"
    Just indexes ->
      if V.null indexes
        then df
        -- NOTE(review): each matching index triggers a separate
        -- 'applyAtIndex' call, i.e. one column transform per match.
        else L.foldl' (\d i -> applyAtIndex i f columnName d) df indexes

-- | O(k) Apply a function to the column at a given index.
--
-- Throws 'ColumnNotFoundException' when the column is missing and
-- 'TypeMismatchException'' when 'itransform' rejects the function.
applyAtIndex ::
  forall a.
  (Columnable a) =>
  -- | Index
  Int ->
  -- | function to apply
  (a -> a) ->
  -- | Column name
  T.Text ->
  -- | DataFrame to apply operation to
  DataFrame ->
  DataFrame
applyAtIndex i f columnName df = case getColumn columnName df of
  Nothing -> throw $ ColumnNotFoundException columnName "applyAtIndex" (map fst $ M.toList $ columnIndices df)
  Just column -> case itransform (\index value -> if index == i then f value else value) column of
    Nothing -> throw $ TypeMismatchException' (typeRep @a) (columnTypeString column) columnName "applyAtIndex"
    column' -> insertColumn' columnName column' df

impute ::
  forall b .
155 | (Columnable b) => 156 | T.Text -> 157 | b -> 158 | DataFrame -> 159 | DataFrame 160 | impute columnName value df = case getColumn columnName df of 161 | Nothing -> throw $ ColumnNotFoundException columnName "impute" (map fst $ M.toList $ columnIndices df) 162 | Just (OptionalColumn _) -> apply (fromMaybe value) columnName df 163 | _ -> error "Cannot impute to a non-Empty column" 164 | -------------------------------------------------------------------------------- /src/DataFrame/Operations/Typing.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE ExplicitNamespaces #-} 2 | {-# LANGUAGE GADTs #-} 3 | {-# LANGUAGE OverloadedStrings #-} 4 | {-# LANGUAGE ScopedTypeVariables #-} 5 | {-# LANGUAGE TypeApplications #-} 6 | module DataFrame.Operations.Typing where 7 | 8 | import qualified Data.Set as S 9 | import qualified Data.Text as T 10 | import qualified Data.Vector as V 11 | import qualified Data.Vector.Unboxed as VU 12 | 13 | import DataFrame.Internal.Column (Column(..)) 14 | import DataFrame.Internal.DataFrame (DataFrame(..)) 15 | import DataFrame.Internal.Parsing 16 | import Data.Either 17 | import Data.Maybe 18 | import Data.Time 19 | import Data.Type.Equality (type (:~:)(Refl), TestEquality(..)) 20 | import Type.Reflection (typeRep) 21 | 22 | parseDefaults :: Bool -> DataFrame -> DataFrame 23 | parseDefaults safeRead df = df {columns = V.map (parseDefault safeRead) (columns df)} 24 | 25 | parseDefault :: Bool -> Maybe Column -> Maybe Column 26 | parseDefault _ Nothing = Nothing 27 | parseDefault safeRead (Just (BoxedColumn (c :: V.Vector a))) = let 28 | parseTimeOpt s = parseTimeM {- Accept leading/trailing whitespace -} True defaultTimeLocale "%Y-%m-%d" (T.unpack s) :: Maybe Day 29 | unsafeParseTime s = parseTimeOrError {- Accept leading/trailing whitespace -} True defaultTimeLocale "%Y-%m-%d" (T.unpack s) :: Day 30 | in case (typeRep @a) `testEquality` (typeRep @T.Text) of 31 | Nothing -> case (typeRep 
@a) `testEquality` (typeRep @String) of 32 | Just Refl -> let 33 | emptyToNothing v = if isNullish (T.pack v) then Nothing else Just v 34 | safeVector = V.map emptyToNothing c 35 | hasNulls = V.foldl' (\acc v -> if isNothing v then acc || True else acc) False safeVector 36 | in Just $ if safeRead && hasNulls then BoxedColumn safeVector else BoxedColumn c 37 | Nothing -> Just $ BoxedColumn c 38 | Just Refl -> 39 | let example = T.strip (V.head c) 40 | emptyToNothing v = if isNullish v then Nothing else Just v 41 | in case readInt example of 42 | Just _ -> 43 | let safeVector = V.map ((=<<) readInt . emptyToNothing) c 44 | hasNulls = V.elem Nothing safeVector 45 | in Just $ if safeRead && hasNulls then BoxedColumn safeVector else UnboxedColumn (VU.generate (V.length c) (fromMaybe 0 . (safeVector V.!))) 46 | Nothing -> case readDouble example of 47 | Just _ -> 48 | let safeVector = V.map ((=<<) readDouble . emptyToNothing) c 49 | hasNulls = V.elem Nothing safeVector 50 | in Just $ if safeRead && hasNulls then BoxedColumn safeVector else UnboxedColumn (VU.generate (V.length c) (fromMaybe 0 . (safeVector V.!))) 51 | Nothing -> case parseTimeOpt example of 52 | Just d -> let 53 | -- failed parse should be Either, nullish should be Maybe 54 | emptyToNothing' v = if isNullish v then Left v else Right v 55 | parseTimeEither v = case parseTimeOpt v of 56 | Just v' -> Right v' 57 | Nothing -> Left v 58 | safeVector = V.map ((=<<) parseTimeEither . emptyToNothing') c 59 | toMaybe (Left _) = Nothing 60 | toMaybe (Right value) = Just value 61 | lefts = V.filter isLeft safeVector 62 | onlyNulls = (not (V.null lefts) && V.all (isNullish . 
fromLeft "non-null") lefts) 63 | in Just $ if safeRead 64 | then if onlyNulls 65 | then BoxedColumn (V.map toMaybe safeVector) 66 | else if V.any isLeft safeVector 67 | then BoxedColumn safeVector 68 | else BoxedColumn (V.map unsafeParseTime c) 69 | else BoxedColumn (V.map unsafeParseTime c) 70 | Nothing -> let 71 | safeVector = V.map emptyToNothing c 72 | hasNulls = V.any isNullish c 73 | in Just $ if safeRead && hasNulls then BoxedColumn safeVector else BoxedColumn c 74 | parseDefault safeRead column = column 75 | -------------------------------------------------------------------------------- /static/example.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mchav/dataframe/3f09cd47ff4d235360a0832635b140270bc20b96/static/example.gif -------------------------------------------------------------------------------- /test_coverage.md: -------------------------------------------------------------------------------- 1 | # Test Coverage 2 | 3 | ## Properties 4 | * Empty dataframe 5 | - Has dimensions (0, 0) 6 | - Has 8 empty vectors 7 | - No column indices 8 | 9 | ## Operations 10 | 11 | * addColumn 12 | - Adding a boxed vector to an empty dataframe creates a new column boxed containing the vector elements. DONE 13 | - Adding a boxed vector with a boxed type (Int/Double) to an empty dataframe creates a new column unboxed containing the vector elements. DONE 14 | - Adding columns > initial vector size gracefully adds a column that we can retrieve. DONE 15 | - Adding columns > initial vector size gracefully adds a column updates dimentions. DONE 16 | - Adding a column with the same name as an existing column overwrites the contents. DONE 17 | - Adding a column with more values than the current DF dimensions throws an exception. DONE 18 | - Adding a column with less values than the current DF dimensions adds column with optionals. 
DONE 19 | 20 | * addColumnWithDefault 21 | - Adding a column with less values than the current DF dimensions adds column with optionals. DONE 22 | - Adding a column with as many values is a no-op. DONE 23 | 24 | * apply 25 | - Applying to an existing column maps function to all values. DONE 26 | - Applying to non-existent column throws column not found exception. DONE 27 | - Applying function of wrong type throws exception. DONE 28 | 29 | * applyMany 30 | - Applying many does same transformation to all columns. DONE 31 | - Applying many doesn't change unrelated fields. DONE 32 | - Applying many fails if any of the columns are not found. DONE 33 | - Applying many throws exception when the function type doesn't match the column type. DONE 34 | 35 | * applyWhere 36 | - Applies function when target column criteria is met. DONE 37 | - When criterion column doesn't exist throw an error. DONE 38 | - When target column doesn't exist throw an error. DONE 39 | - When the type of the criterion column doesn't exist throw an error. DONE 40 | - When the type of the target column doesn't exist throw an error. DONE 41 | - When the criterion function has the wrong type throw an error. DONE 42 | - When the target function has the wrong type throw an error. DONE 43 | 44 | * derive 45 | - Applies function to given column and adds it to alias. DONE 46 | - When column doesn't exist throw an error. DONE 47 | 48 | * applyAtIndex 49 | - Applies function to row at index. 50 | - Does nothing if index is out of range. 51 | - Throws an error if the column doesn't exist. 52 | 53 | * take 54 | - Takes correct number of elements. DONE 55 | - If # elements is less than n then don't change the column. DONE 56 | - If arg is negative then don't change the dimensions of the frame. DONE 57 | 58 | * filter 59 | - Filters column as expected. DONE 60 | - Filter on non-existent values returns dataframe with (0,0) dimensions. DONE 61 | - Filter on non-existent type throws exception. 
DONE 62 | 63 | * valueCounts 64 | - Counts values as expected. 65 | - Throws error when column doesn't exist. 66 | 67 | * select 68 | - Selects a subset of the columns on select 69 | - Check that dimensions update after select 70 | - Add new column to result of selected column 71 | - Updates free indices on select 72 | 73 | * exclude 74 | - Drops a subset of the columns on exclude 75 | - Check that dimensions update after exclude 76 | - Add new column to result of exclude column 77 | - Updates free indices on exclude 78 | 79 | * groupBy 80 | - Groups by a column if it exists and other columns are vectors of vectors DONE 81 | - Groups by a number of columns if they exist and other columns are vectors of vectors DONE 82 | - If any column doesn't exist throw an error. DONE 83 | 84 | * reduceBy 85 | - Reduces by a vector column 86 | - Throws an exception when the column doesn't exist. 87 | - Throws an error when the wrong type is passed into the function 88 | - Throws an error when the vector is of the wrong type. 89 | 90 | * parseDefault 91 | - unsigned integer defaults to int 92 | - decimal point number defaults to double. 93 | - Fallback to text. 94 | 95 | * sortBy 96 | - Sorts by a given column in ascending order. DONE 97 | - Sorts by a given column in descending order. DONE 98 | - Sorts by multiple columns in ascending order. 99 | - Sorts by multiple columns in descending order. 100 | - Throws an error if it doesn't exist. DONE 101 | 102 | * columnInfo 103 | - Return correct types and lengths. 

## Plotting


## CSV I/O

--------------------------------------------------------------------------------
/tests/Assertions.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE ScopedTypeVariables #-}
module Assertions where

import qualified Data.List as L

import Control.Exception
import Test.HUnit

-- Adapted from: https://github.com/BartMassey/chunk/blob/1ee4bd6545e0db6b8b5f4935d97e7606708eacc9/hunit.hs#L29
-- | Assert that running the action throws an exception whose 'show' output
-- contains @expected@. The assertion fails, prefixed with @preface@, when no
-- exception is thrown or when the rendered exception lacks that text.
assertExpectException :: String -> String ->
                         IO a -> Assertion
assertExpectException preface expected action = do
  outcome <- try action
  case outcome of
    Right _ -> assertFailure $ preface ++ ": " ++ "no exception thrown"
    Left (e :: SomeException)
      | expected `L.isInfixOf` show e -> return ()
      | otherwise ->
          assertFailure $
            preface ++ ": " ++ "wrong exception detail, expected " ++
              expected ++ ", got: " ++ show e
--------------------------------------------------------------------------------
/tests/Main.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ScopedTypeVariables #-}
module Main where

import qualified DataFrame as D
import qualified DataFrame as DI
import qualified Data.List as L
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU
import qualified System.Exit as Exit

import Control.Exception
import Data.Time
import Test.HUnit

import Assertions

import qualified Operations.Apply
import qualified Operations.Derive
import qualified Operations.Filter
import qualified Operations.GroupBy
import qualified Operations.InsertColumn
import qualified Operations.Sort
import qualified Operations.Take

-- | Two-column fixture: the integers 1..26 and the characters 'a'..'z'.
testData :: D.DataFrame
testData = D.fromList [ ("test1", DI.toColumn ([1..26] :: [Int]))
                      , ("test2", DI.toColumn ['a'..'z'])
                      ]

-- Dimensions
correctDimensions :: Test
correctDimensions = TestCase $ assertEqual "should be (26, 2)" (26, 2) (D.dimensions testData)

emptyDataframeDimensions :: Test
emptyDataframeDimensions = TestCase $ assertEqual "should be (0, 0)" (0, 0) (D.dimensions D.empty)

dimensionsTest :: [Test]
dimensionsTest = [ TestLabel "dimensions_correctDimensions" correctDimensions
                 , TestLabel "dimensions_emptyDataframeDimensions" emptyDataframeDimensions
                 ]

-- parsing.
-- | Well-formed ISO dates parse to a boxed column of days.
parseDate :: Test
parseDate = TestCase (assertEqual "Correctly parses gregorian date" expected actual)
  where
    expected = Just $ DI.BoxedColumn (V.fromList [fromGregorian 2020 02 14, fromGregorian 2021 02 14, fromGregorian 2022 02 14])
    actual = D.parseDefault True $ Just $ DI.toColumn' (V.fromList ["2020-02-14" :: T.Text, "2021-02-14", "2022-02-14"])

-- | A malformed, non-empty entry is kept as a 'Left' next to the parsed days.
incompleteDataParseEither :: Test
incompleteDataParseEither = TestCase (assertEqual "Parses Either for gregorian date" expected actual)
  where
    expected = Just $ DI.BoxedColumn (V.fromList [Right $ fromGregorian 2020 02 14, Left ("2021-02-" :: T.Text), Right $ fromGregorian 2022 02 14])
    actual = D.parseDefault True $ Just $ DI.toColumn' (V.fromList ["2020-02-14" :: T.Text, "2021-02-", "2022-02-14"])

-- | An empty entry becomes 'Nothing' next to the parsed days.
incompleteDataParseMaybe :: Test
incompleteDataParseMaybe = TestCase (assertEqual "Parses Maybe for gregorian date with null/empty" expected actual)
  where
    expected = Just $ DI.BoxedColumn (V.fromList [Just $ fromGregorian 2020 02 14, Nothing, Just $ fromGregorian 2022 02 14])
    actual = D.parseDefault True $ Just $ DI.toColumn' (V.fromList ["2020-02-14" :: T.Text, "", "2022-02-14"])

parseTests :: [Test]
parseTests =
  [ TestLabel "parseDate" parseDate
  , TestLabel "incompleteDataParseMaybe" incompleteDataParseMaybe
  , TestLabel "incompleteDataParseEither" incompleteDataParseEither
  ]

-- | Every suite, in the order it is run.
tests :: Test
tests = TestList $ concat
  [ dimensionsTest
  , Operations.Apply.tests
  , Operations.Derive.tests
  , Operations.Filter.tests
  , Operations.GroupBy.tests
  , Operations.InsertColumn.tests
  , Operations.Sort.tests
  , Operations.Take.tests
  , parseTests
  ]

-- | Run the suite and exit non-zero on any failure or error.
main :: IO ()
main = do
  result <- runTestTT tests
  let failed = failures result > 0 || errors result > 0
  if failed then Exit.exitFailure else Exit.exitSuccess

--------------------------------------------------------------------------------
/tests/Operations/Apply.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE TypeApplications #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
module Operations.Apply where

import qualified DataFrame as D
import qualified DataFrame as DI
import qualified DataFrame as DE
import qualified Data.Text as T
import qualified Data.Vector as V
import qualified Data.Vector.Unboxed as VU

import Assertions
import Test.HUnit
import Type.Reflection (typeRep)

-- | Eight fixture columns: odd-numbered ones hold 1..26; @test2@ holds the
-- 'show'n characters and the other even-numbered ones hold 'a'..'z'.
values :: [(T.Text, DI.Column)]
values = [ ("test1", DI.toColumn ([1..26] :: [Int]))
         , ("test2", DI.toColumn (map show ['a'..'z']))
         , ("test3", DI.toColumn ([1..26] :: [Int]))
         , ("test4", DI.toColumn ['a'..'z'])
         , ("test5", DI.toColumn ([1..26] :: [Int]))
         , ("test6", DI.toColumn ['a'..'z'])
         , ("test7", DI.toColumn ([1..26] :: [Int]))
         , ("test8", DI.toColumn ['a'..'z'])
         ]

testData :: D.DataFrame
testData = D.fromList values

applyBoxedToUnboxed :: Test
applyBoxedToUnboxed = TestCase (assertEqual "Boxed apply
unboxed when result is unboxed" 33 | (Just $ DI.UnboxedColumn (VU.fromList (replicate 26 (1 :: Int)))) 34 | (DI.getColumn "test2" $ D.apply @String (const (1::Int)) "test2" testData)) 35 | 36 | applyBoxedToBoxed :: Test 37 | applyBoxedToBoxed = TestCase (assertEqual "Boxed apply remains in boxed vector" 38 | (Just $ DI.BoxedColumn (V.fromList (replicate 26 (1 :: Integer)))) 39 | (DI.getColumn "test2" $ D.apply @String (const (1::Integer)) "test2" testData)) 40 | 41 | applyWrongType :: Test 42 | applyWrongType = TestCase (assertExpectException "[Error Case]" 43 | (DE.typeMismatchError (typeRep @Char) (typeRep @[Char])) 44 | (print $ DI.getColumn "test2" $ D.apply @Char (const (1::Int)) "test2" testData)) 45 | 46 | applyUnknownColumn :: Test 47 | applyUnknownColumn = TestCase (assertExpectException "[Error Case]" 48 | (DE.columnNotFound "test9" "apply" (D.columnNames testData)) 49 | (print $ D.apply @[Char] (const (1::Int)) "test9" testData)) 50 | 51 | applyManyOnlyGivenFields :: Test 52 | applyManyOnlyGivenFields = TestCase (assertEqual "Applies function to many fields" 53 | (D.fromList (map (, D.toColumn $ replicate 26 (1 :: Integer)) ["test4", "test6"] ++ 54 | -- All other fields should have their original values. 
55 | filter (\(name, col) -> name /= "test4" && name /= "test6") values)) 56 | (D.applyMany @Char (const (1::Integer)) 57 | ["test4", "test6"] testData)) 58 | 59 | applyManyBoxedToBoxed :: Test 60 | applyManyBoxedToBoxed = TestCase (assertEqual "Applies function to many fields" 61 | (D.fromList (map (, D.toColumn $ replicate 26 (1 :: Integer)) ["test4", "test6", "test8"])) 62 | (D.select ["test4", "test6", "test8"] $ D.applyMany @Char (const (1::Integer)) 63 | ["test4", "test6", "test8"] testData)) 64 | 65 | applyManyBoxedToUnboxed :: Test 66 | applyManyBoxedToUnboxed = TestCase (assertEqual "Unboxes fields when necessary" 67 | (D.fromList (map (, D.toColumn $ replicate 26 (1 :: Int)) ["test4", "test6", "test8"])) 68 | (D.select ["test4", "test6", "test8"] $ D.applyMany @Char (const (1::Int)) 69 | ["test4", "test6", "test8"] testData)) 70 | 71 | applyManyColumnNotFound :: Test 72 | applyManyColumnNotFound = TestCase (assertExpectException "[Error Case]" 73 | (DE.columnNotFound "test0" "apply" (D.columnNames testData)) 74 | (print $ D.applyMany @Char (const (1::Integer)) 75 | ["test0", "test6", "test8"] testData)) 76 | 77 | applyManyWrongType :: Test 78 | applyManyWrongType = TestCase (assertExpectException "[Error Case]" 79 | (DE.typeMismatchError (typeRep @Char) (typeRep @[Char])) 80 | (print $ DI.getColumn "test2" $ D.applyMany @Char (const (1::Int)) ["test2"] testData)) 81 | 82 | applyWhereWrongConditionType :: Test 83 | applyWhereWrongConditionType = TestCase (assertExpectException "[Error Case]" 84 | (DE.typeMismatchError (typeRep @Integer) (typeRep @Int)) 85 | (print $ D.applyWhere (even @Integer) "test1" ((+1) :: Int -> Int) "test5" testData)) 86 | 87 | applyWhereWrongTargetType :: Test 88 | applyWhereWrongTargetType = TestCase (assertExpectException "[Error Case]" 89 | (DE.typeMismatchError (typeRep @Float) (typeRep @Int)) 90 | (print $ D.applyWhere (even @Int) "test1" ((+1) :: Float -> Float) "test5" testData)) 91 | 92 | applyWhereConditionColumnNotFound 
:: Test 93 | applyWhereConditionColumnNotFound = TestCase (assertExpectException "[Error Case]" 94 | (DE.columnNotFound "test0" "applyWhere" (D.columnNames testData)) 95 | (print $ D.applyWhere (even @Int) "test0" ((+1) :: Int -> Int) "test5" testData)) 96 | 97 | applyWhereTargetColumnNotFound :: Test 98 | applyWhereTargetColumnNotFound = TestCase (assertExpectException "[Error Case]" 99 | (DE.columnNotFound "test0" "applyAtIndex" (D.columnNames testData)) 100 | (print $ D.applyWhere (even @Int) "test1" ((+1) :: Int -> Int) "test0" testData)) 101 | 102 | applyWhereWAI :: Test 103 | applyWhereWAI = TestCase (assertEqual "applyWhere works as intended" 104 | (Just $ DI.UnboxedColumn (VU.fromList (zipWith ($) (cycle [id, (+1)]) [(1 :: Int)..26]))) 105 | (D.getColumn "test5" $ D.applyWhere (even @Int) "test1" ((+1) :: Int -> Int) "test5" testData)) 106 | 107 | tests :: [Test] 108 | tests = [ TestLabel "applyBoxedToUnboxed" applyBoxedToUnboxed 109 | , TestLabel "applyWrongType" applyWrongType 110 | , TestLabel "applyUnknownColumn" applyUnknownColumn 111 | , TestLabel "applyBoxedToBoxed" applyBoxedToBoxed 112 | , TestLabel "applyManyBoxedToBoxed" applyManyBoxedToBoxed 113 | , TestLabel "applyManyOnlyGivenFields" applyManyOnlyGivenFields 114 | , TestLabel "applyManyBoxedToUnboxed" applyManyBoxedToUnboxed 115 | , TestLabel "applyManyColumnNotFound" applyManyColumnNotFound 116 | , TestLabel "applyManyWrongType" applyManyWrongType 117 | , TestLabel "applyWhereWrongConditionType" applyWhereWrongConditionType 118 | , TestLabel "applyWhereWrongTargetType" applyWhereWrongTargetType 119 | , TestLabel "applyWhereConditionColumnNotFound" applyWhereConditionColumnNotFound 120 | , TestLabel "applyWhereTargetColumnNotFound" applyWhereTargetColumnNotFound 121 | , TestLabel "applyWhereWAI" applyWhereWAI 122 | ] 123 | -------------------------------------------------------------------------------- /tests/Operations/Derive.hs: 
-------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeApplications #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | {-# LANGUAGE ScopedTypeVariables #-} 4 | module Operations.Derive where 5 | 6 | import qualified DataFrame as D 7 | import qualified DataFrame as DI 8 | import qualified DataFrame as DE 9 | import qualified Data.Text as T 10 | import qualified Data.Vector as V 11 | import qualified Data.Vector.Unboxed as VU 12 | 13 | import Assertions 14 | import Test.HUnit 15 | import Type.Reflection (typeRep) 16 | 17 | values :: [(T.Text, DI.Column)] 18 | values = [ ("test1", DI.toColumn ([1..26] :: [Int])) 19 | , ("test2", DI.toColumn (map show ['a'..'z'])) 20 | , ("test3", DI.toColumn ['a'..'z']) 21 | ] 22 | 23 | testData :: D.DataFrame 24 | testData = D.fromList values 25 | 26 | deriveFromWAI :: Test 27 | deriveFromWAI = TestCase (assertEqual "deriveFrom works when function args align" 28 | (Just $ DI.BoxedColumn (V.fromList (zipWith (\n c -> show n ++ [c]) [1..26] ['a'..'z']))) 29 | (DI.getColumn "test4" $ D.deriveFrom ( 30 | ["test1", "test3"], 31 | D.func (\(n :: Int) (c :: Char) -> show n ++ [c])) "test4" testData)) 32 | 33 | tests :: [Test] 34 | tests = [ TestLabel "deriveFromWAI" deriveFromWAI 35 | ] -------------------------------------------------------------------------------- /tests/Operations/Filter.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeApplications #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Operations.Filter where 4 | 5 | import qualified DataFrame as D 6 | import qualified DataFrame as DI 7 | import qualified DataFrame as DE 8 | import qualified Data.Text as T 9 | import qualified Data.Vector as V 10 | import qualified Data.Vector.Unboxed as VU 11 | 12 | import Assertions 13 | import Test.HUnit 14 | import Type.Reflection (typeRep) 15 | 16 | values :: [(T.Text, DI.Column)] 17 | values = [ ("test1", DI.toColumn ([1..26] :: 
[Int])) 18 | , ("test2", DI.toColumn (map show ['a'..'z'])) 19 | , ("test3", DI.toColumn ([1..26] :: [Int])) 20 | , ("test4", DI.toColumn ['a'..'z']) 21 | , ("test5", DI.toColumn ([1..26] :: [Int])) 22 | , ("test6", DI.toColumn ['a'..'z']) 23 | , ("test7", DI.toColumn ([1..26] :: [Int])) 24 | , ("test8", DI.toColumn ['a'..'z']) 25 | ] 26 | 27 | testData :: D.DataFrame 28 | testData = D.fromList values 29 | 30 | filterColumnDoesNotExist :: Test 31 | filterColumnDoesNotExist = TestCase (assertExpectException "[Error Case]" 32 | (DE.columnNotFound "test0" "filter" (D.columnNames testData)) 33 | (print $ D.filter @Int "test0" even testData)) 34 | 35 | filterColumnWrongType :: Test 36 | filterColumnWrongType = TestCase (assertExpectException "[Error Case]" 37 | (DE.typeMismatchError (typeRep @Integer) (typeRep @Int)) 38 | (print $ D.filter @Integer "test1" even testData)) 39 | 40 | filterByColumnDoesNotExist :: Test 41 | filterByColumnDoesNotExist = TestCase (assertExpectException "[Error Case]" 42 | (DE.columnNotFound "test0" "filter" (D.columnNames testData)) 43 | (print $ D.filterBy @Int even "test0" testData)) 44 | 45 | filterByColumnWrongType :: Test 46 | filterByColumnWrongType = TestCase (assertExpectException "[Error Case]" 47 | (DE.typeMismatchError (typeRep @Integer) (typeRep @Int)) 48 | (print $ D.filterBy @Integer even "test1" testData)) 49 | 50 | filterColumnInexistentValues :: Test 51 | filterColumnInexistentValues = TestCase (assertEqual "Non existent filter value returns no rows" 52 | (0, 8) 53 | (D.dimensions $ D.filter @Int "test1" (<0) testData)) 54 | 55 | filterColumnAllValues :: Test 56 | filterColumnAllValues = TestCase (assertEqual "Filters all columns" 57 | (26, 8) 58 | (D.dimensions $ D.filter @Int "test1" (const True) testData)) 59 | 60 | filterJustWAI :: Test 61 | filterJustWAI = TestCase (assertEqual "Filters out Nothing and unwraps Maybe" 62 | (D.fromList [("test", D.toColumn $ replicate 5 (1 :: Int))]) 63 | (D.filterJust "test" (D.fromList 
[("test", D.toColumn $ take 10 $ cycle [Just (1 :: Int), Nothing])]))) 64 | 65 | tests :: [Test] 66 | tests = [ TestLabel "filterColumnDoesNotExist" filterColumnDoesNotExist 67 | , TestLabel "filterColumnWrongType" filterColumnWrongType 68 | , TestLabel "filterByColumnDoesNotExist" filterByColumnDoesNotExist 69 | , TestLabel "filterByColumnWrongType" filterByColumnWrongType 70 | , TestLabel "filterColumnInexistentValues" filterColumnInexistentValues 71 | , TestLabel "filterColumnAllValues" filterColumnAllValues 72 | , TestLabel "filterJustWAI" filterJustWAI 73 | ] -------------------------------------------------------------------------------- /tests/Operations/GroupBy.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module Operations.GroupBy where 3 | 4 | import qualified DataFrame as D 5 | import qualified DataFrame as DI 6 | import qualified DataFrame as DE 7 | import qualified Data.Text as T 8 | import qualified Data.Vector as V 9 | import qualified Data.Vector.Unboxed as VU 10 | 11 | import Assertions 12 | import Test.HUnit 13 | 14 | values :: [(T.Text, DI.Column)] 15 | values = [ ("test1", DI.toColumn (concatMap (replicate 10) [1 :: Int, 2, 3, 4])) 16 | , ("test2", DI.toColumn (take 40 $ cycle [1 :: Int,2])) 17 | , ("test3", DI.toColumn [(1 :: Int)..40]) 18 | , ("test4", DI.toColumn (reverse [(1 :: Int)..40])) 19 | ] 20 | 21 | testData :: D.DataFrame 22 | testData = D.fromList values 23 | 24 | groupBySingleRowWAI :: Test 25 | groupBySingleRowWAI = TestCase (assertEqual "Groups by single column" 26 | (D.fromList [("test1", DI.toColumn [(1::Int)..4]), 27 | -- This just makes rows with [1, 2] for every unique test1 row 28 | ("test2", DI.GroupedUnboxedColumn (V.replicate 4 $ VU.fromList (take 10 $ cycle [1 :: Int, 2]))), 29 | ("test3", DI.GroupedUnboxedColumn (V.generate 4 (\i -> VU.fromList [(i * 10 + 1)..((i + 1) * 10)]))), 30 | ("test4", DI.GroupedUnboxedColumn (V.generate 4 (\i -> 
VU.fromList [(((3 - i) + 1) * 10),(((3 - i) + 1) * 10 - 1)..((3 - i) * 10 + 1)]))) 31 | ]) 32 | (D.groupBy ["test1"] testData D.|> D.sortBy D.Ascending ["test1"])) 33 | 34 | groupByMultipleRowsWAI :: Test 35 | groupByMultipleRowsWAI = TestCase (assertEqual "Groups by multiple columns" 36 | (D.fromList [("test1", DI.toColumn $ concatMap (replicate 2) [(1::Int)..4]), 37 | ("test2", DI.toColumn (take 8 $ cycle [1 :: Int, 2])), 38 | ("test3", DI.GroupedUnboxedColumn (V.fromList [ 39 | VU.fromList [1 :: Int,3..9], 40 | VU.fromList [2,4..10], 41 | VU.fromList [11,13..19], 42 | VU.fromList [12,14..20], 43 | VU.fromList [21,23..29], 44 | VU.fromList [22,24..30], 45 | VU.fromList [31,33..39], 46 | VU.fromList [32,34..40] 47 | ])), 48 | ("test4", DI.GroupedUnboxedColumn (V.fromList $ reverse [ 49 | VU.fromList [1 :: Int,3..9], 50 | VU.fromList [2,4..10], 51 | VU.fromList [11,13..19], 52 | VU.fromList [12,14..20], 53 | VU.fromList [21,23..29], 54 | VU.fromList [22,24..30], 55 | VU.fromList [31,33..39], 56 | VU.fromList [32,34..40] 57 | ])) 58 | ]) 59 | (D.groupBy ["test1", "test2"] testData D.|> D.sortBy D.Ascending ["test1", "test2"])) 60 | 61 | groupByColumnDoesNotExist :: Test 62 | groupByColumnDoesNotExist = TestCase (assertExpectException "[Error Case]" 63 | (DE.columnNotFound "[\"test0\"]" "groupBy" (D.columnNames testData)) 64 | (print $ D.groupBy ["test0"] testData)) 65 | 66 | tests :: [Test] 67 | tests = [ TestLabel "groupBySingleRowWAI" groupBySingleRowWAI 68 | , TestLabel "groupByMultipleRowsWAI" groupByMultipleRowsWAI 69 | , TestLabel "groupByColumnDoesNotExist" groupByColumnDoesNotExist 70 | ] 71 | 72 | -------------------------------------------------------------------------------- /tests/Operations/InsertColumn.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE TypeApplications #-} 2 | {-# LANGUAGE OverloadedStrings #-} 3 | module Operations.InsertColumn where 4 | 5 | import qualified DataFrame as D 6 | import 
qualified DataFrame as DI 7 | import qualified Data.Text as T 8 | import qualified Data.Vector as V 9 | import qualified Data.Vector.Unboxed as VU 10 | 11 | import Assertions 12 | import Test.HUnit 13 | 14 | testData :: D.DataFrame 15 | testData = D.fromList [ ("test1", DI.toColumn ([1..26] :: [Int])) 16 | , ("test2", DI.toColumn ['a'..'z']) 17 | , ("test3", DI.toColumn ([1..26] :: [Int])) 18 | , ("test4", DI.toColumn ['a'..'z']) 19 | , ("test5", DI.toColumn ([1..26] :: [Int])) 20 | , ("test6", DI.toColumn ['a'..'z']) 21 | , ("test7", DI.toColumn ([1..26] :: [Int])) 22 | , ("test8", DI.toColumn ['a'..'z']) 23 | ] 24 | 25 | -- Adding a boxed vector to an empty dataframe creates a new boxed column containing the vector elements. 26 | addBoxedColumn :: Test 27 | addBoxedColumn = TestCase (assertEqual "Two columns should be equal" 28 | (Just $ DI.BoxedColumn (V.fromList ["Thuba" :: T.Text, "Zodwa", "Themba"])) 29 | (DI.getColumn "new" $ D.insertColumn "new" (V.fromList ["Thuba" :: T.Text, "Zodwa", "Themba"]) D.empty)) 30 | 31 | addBoxedColumn' :: Test 32 | addBoxedColumn' = TestCase (assertEqual "Two columns should be equal" 33 | (Just $ DI.toColumn ["Thuba" :: T.Text, "Zodwa", "Themba"]) 34 | (DI.getColumn "new" $ D.insertColumn' "new" (Just $ DI.toColumn ["Thuba" :: T.Text, "Zodwa", "Themba"]) D.empty)) 35 | 36 | -- Adding a boxed vector with an unboxable type (Int/Double) to an empty dataframe creates a new unboxed column containing the vector elements. 
37 | addUnboxedColumn :: Test 38 | addUnboxedColumn = TestCase (assertEqual "Value should be unboxed" 39 | (Just $ DI.UnboxedColumn (VU.fromList [1 :: Int, 2, 3])) 40 | (DI.getColumn "new" $ D.insertColumn "new" (V.fromList [1 :: Int, 2, 3]) D.empty)) 41 | 42 | addUnboxedColumn' :: Test 43 | addUnboxedColumn' = TestCase (assertEqual "Value should be unboxed" 44 | (Just $ DI.toColumn [1 :: Int, 2, 3]) 45 | (DI.getColumn "new" $ D.insertColumn' "new" (Just $ DI.toColumn [1 :: Int, 2, 3]) D.empty)) 46 | 47 | -- Adding a column with fewer values than the current DF dimensions adds a column with optionals. 48 | addSmallerColumnBoxed :: Test 49 | addSmallerColumnBoxed = TestCase ( 50 | assertEqual "Missing values should be replaced with Nothing" 51 | (Just $ DI.OptionalColumn (V.fromList [Just "a" :: Maybe T.Text, Just "b", Just "c", Nothing, Nothing])) 52 | (DI.getColumn "newer" $ D.insertColumn "newer" (V.fromList ["a" :: T.Text, "b", "c"]) $ D.insertColumn "new" (V.fromList ["a" :: T.Text, "b", "c", "d", "e"]) D.empty) 53 | ) 54 | 55 | addSmallerColumnUnboxed :: Test 56 | addSmallerColumnUnboxed = TestCase ( 57 | assertEqual "Missing values should be replaced with Nothing" 58 | (Just $ DI.OptionalColumn (V.fromList [Just 1 :: Maybe Int, Just 2, Just 3, Nothing, Nothing])) 59 | (DI.getColumn "newer" $ D.insertColumn "newer" (V.fromList [1 :: Int, 2, 3]) $ D.insertColumn "new" (V.fromList [1 :: Int, 2, 3, 4, 5]) D.empty) 60 | ) 61 | 62 | insertColumnWithDefaultFillsWithDefault :: Test 63 | insertColumnWithDefaultFillsWithDefault = TestCase ( 64 | assertEqual "Missing values should be replaced with the default value" 65 | (Just $ DI.UnboxedColumn (VU.fromList [1 :: Int, 2, 3, 0, 0])) 66 | (DI.getColumn "newer" $ D.insertColumnWithDefault 0 "newer" (V.fromList [1 :: Int, 2, 3]) $ D.insertColumn "new" (V.fromList [1 :: Int, 2, 3, 4, 5]) D.empty) 67 | ) 68 | 69 | insertColumnWithDefaultFillsLargerNoop :: Test 70 | insertColumnWithDefaultFillsLargerNoop = TestCase ( 71 | assertEqual "Lists 
should be the same size" 72 | (Just $ DI.UnboxedColumn (VU.fromList [(6 :: Int)..10])) 73 | (DI.getColumn "newer" $ D.insertColumnWithDefault 0 "newer" (V.fromList [(6 :: Int)..10]) $ D.insertColumn "new" (V.fromList [1 :: Int, 2, 3, 4, 5]) D.empty) 74 | ) 75 | 76 | addLargerColumnBoxed :: Test 77 | addLargerColumnBoxed = 78 | TestCase (assertEqual "Smaller lists should grow and contain optionals" 79 | (D.fromList [("new", D.toColumn [Just "a" :: Maybe T.Text, Just "b", Just "c", Nothing, Nothing]), 80 | ("newer", D.toColumn ["a" :: T.Text, "b", "c", "d", "e"])]) 81 | (D.insertColumn "newer" (V.fromList ["a" :: T.Text, "b", "c", "d", "e"]) 82 | $ D.insertColumn "new" (V.fromList ["a" :: T.Text, "b", "c"]) D.empty)) 83 | addLargerColumnUnboxed :: Test 84 | addLargerColumnUnboxed = 85 | TestCase (assertEqual "Smaller lists should grow and contain optionals" 86 | (D.fromList [("old", D.toColumn [Just 1 :: Maybe Int, Just 2, Nothing, Nothing, Nothing]), 87 | ("new", D.toColumn [Just 1 :: Maybe Int, Just 2, Just 3, Nothing, Nothing]), 88 | ("newer", D.toColumn [1 :: Int, 2, 3, 4, 5])]) 89 | (D.insertColumn "newer" (V.fromList [1 :: Int, 2, 3, 4, 5]) 90 | $ D.insertColumn "new" (V.fromList [1 :: Int, 2, 3]) $ 91 | D.insertColumn "old" (V.fromList [1 :: Int, 2]) D.empty)) 92 | 93 | dimensionsChangeAfterAdd :: Test 94 | dimensionsChangeAfterAdd = TestCase (assertEqual "should be (26, 9)" 95 | (26, 9) 96 | (D.dimensions $ D.insertColumn @Int "new" (V.fromList [1..26]) testData)) 97 | 98 | dimensionsNotChangedAfterDuplicate :: Test 99 | dimensionsNotChangedAfterDuplicate = TestCase (assertEqual "should be (26, 9)" 100 | (26, 9) 101 | (D.dimensions $ D.insertColumn @Int "new" (V.fromList [1..26]) 102 | $ D.insertColumn @Int "new" (V.fromList [1..26]) testData)) 103 | 104 | 105 | tests :: [Test] 106 | tests = [ 107 | TestLabel "dimensionsChangeAfterAdd" dimensionsChangeAfterAdd 108 | , TestLabel "dimensionsNotChangedAfterDuplicate" dimensionsNotChangedAfterDuplicate 109 | , 
TestLabel "addBoxedColumnToEmpty" addBoxedColumn 110 | , TestLabel "addBoxedColumnToEmpty'" addBoxedColumn' 111 | , TestLabel "addBoxedColumnAutoUnboxes" addUnboxedColumn 112 | , TestLabel "addBoxedColumnAutoUnboxes'" addUnboxedColumn' 113 | , TestLabel "addSmallerColumnBoxed" addSmallerColumnBoxed 114 | , TestLabel "addSmallerColumnUnboxed" addSmallerColumnUnboxed 115 | , TestLabel "addLargerColumnBoxed" addLargerColumnBoxed 116 | , TestLabel "addLargerColumnUnboxed" addLargerColumnUnboxed 117 | , TestLabel "insertColumnWithDefaultFillsWithDefault" insertColumnWithDefaultFillsWithDefault 118 | , TestLabel "insertColumnWithDefaultFillsLargerNoop" insertColumnWithDefaultFillsLargerNoop 119 | ] 120 | -------------------------------------------------------------------------------- /tests/Operations/Sort.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module Operations.Sort where 3 | 4 | import qualified DataFrame as D 5 | import qualified DataFrame as DI 6 | import qualified DataFrame as DE 7 | import qualified Data.Text as T 8 | import qualified Data.Vector as V 9 | import qualified Data.Vector.Unboxed as VU 10 | 11 | import Assertions 12 | import Control.Monad 13 | import Data.Char 14 | import System.Random 15 | import System.Random.Shuffle (shuffle') 16 | import Test.HUnit 17 | 18 | values :: [(T.Text, DI.Column)] 19 | values = let 20 | ns = shuffle' [(1::Int)..26] 26 $ mkStdGen 252 21 | in [ ("test1", DI.toColumn ns) 22 | , ("test2", DI.toColumn (map (chr . 
(+96)) ns)) 23 | ] 24 | 25 | testData :: D.DataFrame 26 | testData = D.fromList values 27 | 28 | sortByAscendingWAI :: Test 29 | sortByAscendingWAI = TestCase (assertEqual "Sorting rows by ascending works as intended" 30 | (D.fromList [("test1", DI.toColumn [(1::Int)..26]), 31 | ("test2", DI.toColumn ['a'..'z'])]) 32 | (D.sortBy D.Ascending ["test1"] testData)) 33 | 34 | sortByDescendingWAI :: Test 35 | sortByDescendingWAI = TestCase (assertEqual "Sorting rows by descending works as intended" 36 | (D.fromList [("test1", DI.toColumn $ reverse [(1::Int)..26]), 37 | ("test2", DI.toColumn $ reverse ['a'..'z'])]) 38 | (D.sortBy D.Descending ["test1"] testData)) 39 | 40 | sortByColumnDoesNotExist :: Test 41 | sortByColumnDoesNotExist = TestCase (assertExpectException "[Error Case]" 42 | (DE.columnNotFound "[\"test0\"]" "sortBy" (D.columnNames testData)) 43 | (print $ D.sortBy D.Ascending ["test0"] testData)) 44 | 45 | tests :: [Test] 46 | tests = [ TestLabel "sortByAscendingWAI" sortByAscendingWAI 47 | , TestLabel "sortByDescendingWAI" sortByDescendingWAI 48 | , TestLabel "sortByColumnDoesNotExist" sortByColumnDoesNotExist 49 | ] 50 | 51 | -------------------------------------------------------------------------------- /tests/Operations/Take.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE OverloadedStrings #-} 2 | module Operations.Take where 3 | 4 | import qualified DataFrame as D 5 | import qualified DataFrame as DI 6 | 7 | import Test.HUnit 8 | 9 | testData :: D.DataFrame 10 | testData = D.fromList [ ("test1", DI.toColumn ([1..26] :: [Int])) 11 | , ("test2", DI.toColumn ['a'..'z']) 12 | ] 13 | 14 | 15 | takeWAI :: Test 16 | takeWAI = TestCase (assertEqual "Gets first 10 numbers" (Just $ D.toColumn [(1 :: Int)..10]) (D.getColumn "test1" $ D.take 10 testData)) 17 | 18 | takeLastWAI :: Test 19 | takeLastWAI = TestCase (assertEqual "Gets last 10 numbers" (Just $ D.toColumn [(17 :: Int)..26]) (D.getColumn "test1" $ 
D.takeLast 10 testData)) 20 | 21 | lengthEqualsTakeParam :: Test 22 | lengthEqualsTakeParam = TestCase (assertEqual "should be (5, 2)" (5, 2) (D.dimensions $ D.take 5 testData)) 23 | 24 | lengthGreaterThanTakeParam :: Test 25 | lengthGreaterThanTakeParam = TestCase (assertEqual "should be (26, 2)" (26, 2) (D.dimensions $ D.take 30 testData)) 26 | 27 | emptyIsZero :: Test 28 | emptyIsZero = TestCase (assertEqual "should be (0, 0)" (0, 0) (D.dimensions $ D.take 5 D.empty)) 29 | 30 | negativeIsZero :: Test 31 | negativeIsZero = TestCase (assertEqual "should be (0, 2)" (0, 2) (D.dimensions $ D.take (-1) testData)) 32 | 33 | lengthEqualsTakeLastParam :: Test 34 | lengthEqualsTakeLastParam = TestCase (assertEqual "should be (5, 2)" (5, 2) (D.dimensions $ D.takeLast 5 testData)) 35 | 36 | lengthGreaterThanTakeLastParam :: Test 37 | lengthGreaterThanTakeLastParam = TestCase (assertEqual "should be (26, 2)" (26, 2) (D.dimensions $ D.takeLast 30 testData)) 38 | 39 | emptyIsZeroTakeLast :: Test 40 | emptyIsZeroTakeLast = TestCase (assertEqual "should be (0, 0)" (0, 0) (D.dimensions $ D.takeLast 5 D.empty)) 41 | 42 | negativeIsZeroTakeLast :: Test 43 | negativeIsZeroTakeLast = TestCase (assertEqual "should be (0, 2)" (0, 2) (D.dimensions $ D.takeLast (-1) testData)) 44 | 45 | tests :: [Test] 46 | tests = [ TestLabel "takeWAI" takeWAI 47 | , TestLabel "takeLastWAI" takeLastWAI 48 | , TestLabel "lengthEqualsTakeParam" lengthEqualsTakeParam 49 | , TestLabel "lengthGreaterThanTakeParam" lengthGreaterThanTakeParam 50 | , TestLabel "emptyIsZero" emptyIsZero 51 | , TestLabel "negativeIsZero" negativeIsZero 52 | , TestLabel "lengthEqualsTakeLastParam" lengthEqualsTakeLastParam 53 | , TestLabel "lengthGreaterThanTakeLastParam" lengthGreaterThanTakeLastParam 54 | , TestLabel "emptyIsZeroTakeLast" emptyIsZeroTakeLast 55 | , TestLabel "negativeIsZeroTakeLast" negativeIsZeroTakeLast 56 | ] 57 | --------------------------------------------------------------------------------