├── .ghci ├── .github └── workflows │ ├── cabal.yml │ └── stack.yml ├── .gitignore ├── LICENSE ├── README.md ├── Setup.hs ├── changelog.md ├── csv-conduit.cabal ├── src └── Data │ └── CSV │ ├── Conduit.hs │ └── Conduit │ ├── Conversion.hs │ ├── Conversion │ └── Internal.hs │ ├── Monoid.hs │ ├── Parser │ ├── ByteString.hs │ └── Text.hs │ └── Types.hs ├── stack.yaml └── test ├── AdHoc.hs ├── Test.hs ├── test-mac-excel.csv ├── test-windows-excel.csv ├── test.csv └── test.xls /.ghci: -------------------------------------------------------------------------------- 1 | :set -isrc 2 | -------------------------------------------------------------------------------- /.github/workflows/cabal.yml: -------------------------------------------------------------------------------- 1 | # This GitHub workflow config has been generated by a script via 2 | # 3 | # haskell-ci 'github' 'csv-conduit.cabal' 4 | # 5 | # To regenerate the script (for example after adjusting tested-with) run 6 | # 7 | # haskell-ci regenerate 8 | # 9 | # For more information, see https://github.com/haskell-CI/haskell-ci 10 | # 11 | # version: 0.19.20240608 12 | # 13 | # REGENDATA ("0.19.20240608",["github","csv-conduit.cabal"]) 14 | # 15 | name: cabal 16 | on: 17 | - push 18 | - pull_request 19 | jobs: 20 | linux: 21 | name: ${{ matrix.compiler }} 22 | runs-on: ubuntu-20.04 23 | timeout-minutes: 24 | 60 25 | container: 26 | image: buildpack-deps:jammy 27 | continue-on-error: ${{ matrix.allow-failure }} 28 | strategy: 29 | matrix: 30 | include: 31 | - compiler: ghc-9.6.4 32 | compilerKind: ghc 33 | compilerVersion: 9.6.4 34 | setup-method: ghcup 35 | allow-failure: false 36 | - compiler: ghc-9.0.1 37 | compilerKind: ghc 38 | compilerVersion: 9.0.1 39 | setup-method: ghcup 40 | allow-failure: false 41 | - compiler: ghc-8.10.4 42 | compilerKind: ghc 43 | compilerVersion: 8.10.4 44 | setup-method: ghcup 45 | allow-failure: false 46 | - compiler: ghc-8.8.4 47 | compilerKind: ghc 48 | compilerVersion: 8.8.4 49 | 
setup-method: ghcup 50 | allow-failure: false 51 | - compiler: ghc-8.8.3 52 | compilerKind: ghc 53 | compilerVersion: 8.8.3 54 | setup-method: ghcup 55 | allow-failure: false 56 | - compiler: ghc-8.6.5 57 | compilerKind: ghc 58 | compilerVersion: 8.6.5 59 | setup-method: ghcup 60 | allow-failure: false 61 | - compiler: ghc-8.4.4 62 | compilerKind: ghc 63 | compilerVersion: 8.4.4 64 | setup-method: ghcup 65 | allow-failure: false 66 | - compiler: ghc-8.2.2 67 | compilerKind: ghc 68 | compilerVersion: 8.2.2 69 | setup-method: ghcup 70 | allow-failure: false 71 | fail-fast: false 72 | steps: 73 | - name: apt 74 | run: | 75 | apt-get update 76 | apt-get install -y --no-install-recommends gnupg ca-certificates dirmngr curl git software-properties-common libtinfo5 libnuma-dev 77 | mkdir -p "$HOME/.ghcup/bin" 78 | curl -sL https://downloads.haskell.org/ghcup/0.1.20.0/x86_64-linux-ghcup-0.1.20.0 > "$HOME/.ghcup/bin/ghcup" 79 | chmod a+x "$HOME/.ghcup/bin/ghcup" 80 | "$HOME/.ghcup/bin/ghcup" install ghc "$HCVER" || (cat "$HOME"/.ghcup/logs/*.* && false) 81 | "$HOME/.ghcup/bin/ghcup" install cabal 3.10.2.0 || (cat "$HOME"/.ghcup/logs/*.* && false) 82 | env: 83 | HCKIND: ${{ matrix.compilerKind }} 84 | HCNAME: ${{ matrix.compiler }} 85 | HCVER: ${{ matrix.compilerVersion }} 86 | - name: Set PATH and environment variables 87 | run: | 88 | echo "$HOME/.cabal/bin" >> $GITHUB_PATH 89 | echo "LANG=C.UTF-8" >> "$GITHUB_ENV" 90 | echo "CABAL_DIR=$HOME/.cabal" >> "$GITHUB_ENV" 91 | echo "CABAL_CONFIG=$HOME/.cabal/config" >> "$GITHUB_ENV" 92 | HCDIR=/opt/$HCKIND/$HCVER 93 | HC=$("$HOME/.ghcup/bin/ghcup" whereis ghc "$HCVER") 94 | HCPKG=$(echo "$HC" | sed 's#ghc$#ghc-pkg#') 95 | HADDOCK=$(echo "$HC" | sed 's#ghc$#haddock#') 96 | echo "HC=$HC" >> "$GITHUB_ENV" 97 | echo "HCPKG=$HCPKG" >> "$GITHUB_ENV" 98 | echo "HADDOCK=$HADDOCK" >> "$GITHUB_ENV" 99 | echo "CABAL=$HOME/.ghcup/bin/cabal-3.10.2.0 -vnormal+nowrap" >> "$GITHUB_ENV" 100 | HCNUMVER=$(${HC} --numeric-version|perl -ne 
'/^(\d+)\.(\d+)\.(\d+)(\.(\d+))?$/; print(10000 * $1 + 100 * $2 + ($3 == 0 ? $5 != 1 : $3))') 101 | echo "HCNUMVER=$HCNUMVER" >> "$GITHUB_ENV" 102 | echo "ARG_TESTS=--enable-tests" >> "$GITHUB_ENV" 103 | echo "ARG_BENCH=--enable-benchmarks" >> "$GITHUB_ENV" 104 | echo "HEADHACKAGE=false" >> "$GITHUB_ENV" 105 | echo "ARG_COMPILER=--$HCKIND --with-compiler=$HC" >> "$GITHUB_ENV" 106 | echo "GHCJSARITH=0" >> "$GITHUB_ENV" 107 | env: 108 | HCKIND: ${{ matrix.compilerKind }} 109 | HCNAME: ${{ matrix.compiler }} 110 | HCVER: ${{ matrix.compilerVersion }} 111 | - name: env 112 | run: | 113 | env 114 | - name: write cabal config 115 | run: | 116 | mkdir -p $CABAL_DIR 117 | cat >> $CABAL_CONFIG <> $CABAL_CONFIG < cabal-plan.xz 150 | echo 'f62ccb2971567a5f638f2005ad3173dba14693a45154c1508645c52289714cb2 cabal-plan.xz' | sha256sum -c - 151 | xz -d < cabal-plan.xz > $HOME/.cabal/bin/cabal-plan 152 | rm -f cabal-plan.xz 153 | chmod a+x $HOME/.cabal/bin/cabal-plan 154 | cabal-plan --version 155 | - name: checkout 156 | uses: actions/checkout@v4 157 | with: 158 | path: source 159 | - name: initial cabal.project for sdist 160 | run: | 161 | touch cabal.project 162 | echo "packages: $GITHUB_WORKSPACE/source/." 
>> cabal.project 163 | cat cabal.project 164 | - name: sdist 165 | run: | 166 | mkdir -p sdist 167 | $CABAL sdist all --output-dir $GITHUB_WORKSPACE/sdist 168 | - name: unpack 169 | run: | 170 | mkdir -p unpacked 171 | find sdist -maxdepth 1 -type f -name '*.tar.gz' -exec tar -C $GITHUB_WORKSPACE/unpacked -xzvf {} \; 172 | - name: generate cabal.project 173 | run: | 174 | PKGDIR_csv_conduit="$(find "$GITHUB_WORKSPACE/unpacked" -maxdepth 1 -type d -regex '.*/csv-conduit-[0-9.]*')" 175 | echo "PKGDIR_csv_conduit=${PKGDIR_csv_conduit}" >> "$GITHUB_ENV" 176 | rm -f cabal.project cabal.project.local 177 | touch cabal.project 178 | touch cabal.project.local 179 | echo "packages: ${PKGDIR_csv_conduit}" >> cabal.project 180 | echo "package csv-conduit" >> cabal.project 181 | echo " ghc-options: -Werror=missing-methods" >> cabal.project 182 | cat >> cabal.project <> cabal.project.local 185 | cat cabal.project 186 | cat cabal.project.local 187 | - name: dump install plan 188 | run: | 189 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dry-run all 190 | cabal-plan 191 | - name: restore cache 192 | uses: actions/cache/restore@v4 193 | with: 194 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 195 | path: ~/.cabal/store 196 | restore-keys: ${{ runner.os }}-${{ matrix.compiler }}- 197 | - name: install dependencies 198 | run: | 199 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks --dependencies-only -j2 all 200 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH --dependencies-only -j2 all 201 | - name: build w/o tests 202 | run: | 203 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 204 | - name: build 205 | run: | 206 | $CABAL v2-build $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --write-ghc-environment-files=always 207 | - name: tests 208 | run: | 209 | $CABAL v2-test $ARG_COMPILER $ARG_TESTS $ARG_BENCH all --test-show-details=direct 210 | - name: cabal check 211 | run: | 212 | cd ${PKGDIR_csv_conduit} || false 
213 | ${CABAL} -vnormal check 214 | - name: haddock 215 | run: | 216 | $CABAL v2-haddock --disable-documentation --haddock-all $ARG_COMPILER --with-haddock $HADDOCK $ARG_TESTS $ARG_BENCH all 217 | - name: unconstrained build 218 | run: | 219 | rm -f cabal.project.local 220 | $CABAL v2-build $ARG_COMPILER --disable-tests --disable-benchmarks all 221 | - name: save cache 222 | uses: actions/cache/save@v4 223 | if: always() 224 | with: 225 | key: ${{ runner.os }}-${{ matrix.compiler }}-${{ github.sha }} 226 | path: ~/.cabal/store 227 | -------------------------------------------------------------------------------- /.github/workflows/stack.yml: -------------------------------------------------------------------------------- 1 | name: stack 2 | on: 3 | - push 4 | - pull_request 5 | 6 | jobs: 7 | linux: 8 | name: ${{ matrix.resolver }} 9 | runs-on: ubuntu-latest 10 | timeout-minutes: 60 11 | continue-on-error: false 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | resolver: 16 | - "lts-22.34" 17 | - "lts-21.25" 18 | - "lts-20.26" 19 | - "lts-19.33" 20 | - "lts-18.28" 21 | steps: 22 | - name: Clone project 23 | uses: actions/checkout@v4 24 | 25 | - name: Cache dependencies 26 | uses: actions/cache@v4 27 | with: 28 | path: ~/.stack 29 | key: ${{ runner.os }}-${{ matrix.resolver }}-${{ hashFiles('stack.yaml') }} 30 | restore-keys: ${{ runner.os }}-${{ matrix.resolver }} 31 | 32 | - name: Build and run tests 33 | shell: bash 34 | run: | 35 | set -ex 36 | stack --version 37 | stack test --fast --no-terminal --resolver ${{ matrix.resolver }} 38 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.hi 3 | dist/ 4 | cabal-dev/ 5 | .DS* 6 | TAGS 7 | *.prof 8 | *.hi 9 | *.o 10 | *.ps 11 | *.aux 12 | *.hp 13 | .cabal-sandbox 14 | cabal.sandbox.config 15 | .stack-work/ 16 | stack.yaml.lock 17 | dist-newstyle 18 | 
-------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c)2024, Daniel Vianna 2 | Copyright (c)2013, Ozgun Ataman 3 | Copyright (c)2012, Johan Tibell 4 | 5 | All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without 8 | modification, are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright 11 | notice, this list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above 14 | copyright notice, this list of conditions and the following 15 | disclaimer in the documentation and/or other materials provided 16 | with the distribution. 17 | 18 | * Neither the name of Ozgun Ataman nor the names of other 19 | contributors may be used to endorse or promote products derived 20 | from this software without specific prior written permission. 21 | 22 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 25 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 26 | OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 27 | SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 28 | LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 29 | DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 30 | THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 31 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 32 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
33 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # README 2 | [![cabal 3 | build](https://github.com/ozataman/csv-conduit/actions/workflows/cabal.yml/badge.svg)](https://github.com/ozataman/csv-conduit/actions) 4 | [![stack build](https://github.com/ozataman/csv-conduit/actions/workflows/stack.yml/badge.svg)](https://github.com/ozataman/csv-conduit/actions) 5 | 6 | ## CSV Files and Haskell 7 | 8 | CSV files are the de-facto standard in many cases of data transfer, 9 | particularly when dealing with enterprise application or disparate database 10 | systems. 11 | 12 | While there are a number of csv libraries in Haskell, at the time of 13 | this project's start, there wasn't one that provided all of the 14 | following: 15 | 16 | * Full flexibility in quote characters, separators, input/output 17 | * Constant space operation 18 | * Robust parsing and error resiliency 19 | * Battle-tested reliability in real-world datasets 20 | * Fast operation 21 | * Convenient interface that supports a variety of use cases 22 | 23 | Over time, people created other plausible CSV packages like cassava. 24 | The major benefit from this library remains to be: 25 | 26 | * Direct participation in the conduit ecosystem, which is now quite 27 | large, and all the benefits that come with it. 28 | * Flexibility in CSV format definition. 29 | * Resiliency to errors in the input data. 30 | 31 | 32 | ## This package 33 | 34 | csv-conduit is a conduit-based CSV parsing library that is easy to 35 | use, flexible and fast. It leverages the conduit infrastructure to 36 | provide constant-space operation, which is quite critical in many real 37 | world use cases. 
38 | 39 | For example, you can use http-conduit to download a CSV file from the 40 | internet and plug its Source into intoCSV to stream-convert the 41 | download into the Row data type and do something with it as the data 42 | streams, that is without having to download the entire file to disk 43 | first. 44 | 45 | 46 | ## Author & Contributors 47 | 48 | - Ozgun Ataman (@ozataman) 49 | - Daniel Bergey (@bergey) 50 | - BJTerry (@BJTerry) 51 | - Mike Craig (@mkscrg) 52 | - Daniel Corson (@dancor) 53 | - Dmitry Dzhus (@dzhus) 54 | - Niklas Hambüchen (@nh2) 55 | - Facundo Domínguez (@facundominguez) 56 | - Daniel Vianna (@dmvianna) 57 | 58 | ### Introduction 59 | 60 | * The CSVeable typeclass implements the key operations. 61 | * CSVeable is parameterized on both a stream type and a target CSV row type. 62 | * There are 2 basic row types and they implement *exactly* the same operations, 63 | so you can chose the right one for the job at hand: 64 | - `type MapRow t = Map t t` 65 | - `type Row t = [t]` 66 | * You basically use the Conduits defined in this library to do the 67 | parsing from a CSV stream and rendering back into a CSV stream. 68 | * Use the full flexibility and modularity of conduits for sources and sinks. 69 | 70 | ### Speed 71 | 72 | While fast operation is of concern, I have so far cared more about correct 73 | operation and a flexible API. Please let me know if you notice any performance 74 | regressions or optimization opportunities. 
75 | 76 | 77 | ### Usage Examples 78 | 79 | 80 | #### Example #1: Basics Using Convenience API 81 | 82 | ```haskell 83 | {-# LANGUAGE OverloadedStrings #-} 84 | 85 | import Data.Conduit 86 | import Data.Conduit.Binary 87 | import Data.Conduit.List as CL 88 | import Data.CSV.Conduit 89 | import Data.Text (Text) 90 | 91 | -- Just reverse te columns 92 | myProcessor :: Monad m => Conduit (Row Text) m (Row Text) 93 | myProcessor = CL.map reverse 94 | 95 | test :: IO () 96 | test = runResourceT $ 97 | transformCSV defCSVSettings 98 | (sourceFile "input.csv") 99 | myProcessor 100 | (sinkFile "output.csv") 101 | ``` 102 | 103 | #### Example #2: Basics Using Conduit API 104 | 105 | ```haskell 106 | {-# LANGUAGE OverloadedStrings #-} 107 | 108 | import Data.Conduit 109 | import Data.Conduit.Binary 110 | import Data.CSV.Conduit 111 | import Data.Text (Text) 112 | 113 | myProcessor :: Monad m => Conduit (Row Text) m (Row Text) 114 | myProcessor = awaitForever $ yield 115 | 116 | -- Let's simply stream from a file, parse the CSV, reserialize it 117 | -- and push back into another file. 
118 | test :: IO () 119 | test = runResourceT $ 120 | sourceFile "test/BigFile.csv" $= 121 | intoCSV defCSVSettings $= 122 | myProcessor $= 123 | fromCSV defCSVSettings $$ 124 | sinkFile "test/BigFileOut.csv" 125 | ``` 126 | -------------------------------------------------------------------------------- /Setup.hs: -------------------------------------------------------------------------------- 1 | import Distribution.Simple 2 | main = defaultMain 3 | -------------------------------------------------------------------------------- /changelog.md: -------------------------------------------------------------------------------- 1 | 1.0.1.1 2 | * Fix test suite to build with text 2.1.2 and ghc 9.12.2, resolving [#62](https://github.com/ozataman/csv-conduit/issues/62) 3 | 4 | 1.0.1.0 5 | * Use ConduitT instead of ConduitM (prettier type inference with newer conduit imports) 6 | 7 | 1.0.0.2 8 | * Fixed [#17](https://github.com/ozataman/csv-conduit/issues/17), 9 | where CSV created with Excel in Mac OS failed to parse due to its 10 | newline characters. 11 | 12 | 1.0.0.1 13 | * Removed dependencies: mmorph, monad-control, mtl, 14 | unordered-containers, primitive 15 | 16 | 1.0.0.0 17 | * Removed `return` from the `Monad` instance for `Parser`, and 18 | transfered its definition to `pure` in the `Applicative` instance of 19 | `Parser`. This was necessary to support GHC 9.6.4. 20 | * Create new API to choose whether to handle empty CSV cells as empty 21 | strings or NULLs. 22 | * Added imports that were removed from `Prelude` in GHC 9.6.4. 23 | * Bumped the default Stack resolver to LTS-22.20. 24 | 25 | 0.7.3.0 26 | * Add ordered versions of named records for consistent, controllable header column ordering. [PR 44](https://github.com/ozataman/csv-conduit/pull/44) 27 | * Add support for GHC 9.0.1 28 | 29 | 0.7.2.0 30 | * Remove some dependency upper bounds for forward compatibility. 31 | 32 | 0.7.1.0 33 | * Add MonadFail instance for Parser. 
[PR 38](https://github.com/ozataman/csv-conduit/pull/38) 34 | 35 | 0.7.0.0 36 | * BREAKING: Switch from partial Monoid instance on Parser to total Semigroup instance. 37 | * Compatibility with GHC 8.4.x/base-4.11.1.0 38 | 39 | 0.6.8.1 40 | * Fix documentation mistake in FromNamedRecord/ToNamedRecord examples. 41 | 42 | 0.6.8 43 | * Haddocks improvements 44 | * Fix inlining and specialization rules around formatDecimal 45 | * Updates to permit newest conduit/resourcet packages 46 | 47 | 0.6.7 48 | * Fix build for GHC 8.0.1 49 | -------------------------------------------------------------------------------- /csv-conduit.cabal: -------------------------------------------------------------------------------- 1 | name: csv-conduit 2 | version: 1.0.1.1 3 | synopsis: 4 | A flexible, fast, conduit-based CSV parser library for Haskell. 5 | 6 | homepage: http://github.com/ozataman/csv-conduit 7 | license: BSD3 8 | license-file: LICENSE 9 | author: Ozgun Ataman 10 | maintainer: Daniel Vianna 11 | category: Data, Conduit, CSV, Text 12 | build-type: Simple 13 | cabal-version: >=1.10 14 | tested-with: 15 | GHC ==8.2.2 16 | || ==8.4.4 17 | || ==8.6.5 18 | || ==8.8.3 19 | || ==8.8.4 20 | || ==8.10.4 21 | || ==9.0.1 22 | || ==9.6.4 23 | 24 | description: 25 | CSV files are the de-facto standard in many situations involving data transfer, 26 | particularly when dealing with enterprise application or disparate database 27 | systems. 28 | . 29 | While there are a number of CSV libraries in Haskell, at the time of this 30 | project's start in 2010, there wasn't one that provided all of the following: 31 | . 32 | * Full flexibility in quote characters, separators, input/output 33 | . 34 | * Constant space operation 35 | . 36 | * Robust parsing, correctness and error resiliency 37 | . 38 | * Convenient interface that supports a variety of use cases 39 | . 40 | * Fast operation 41 | . 42 | This library is an attempt to close these gaps. 
Please note that 43 | this library started its life based on the enumerator package and 44 | has recently been ported to work with conduits instead. In the 45 | process, it has been greatly simplified thanks to the modular nature 46 | of the conduits library. 47 | . 48 | Following the port to conduits, the library has also gained the 49 | ability to parameterize on the stream type and work both with 50 | ByteString and Text. 51 | . 52 | For more documentation and examples, check out the README at: 53 | . 54 | 55 | . 56 | 57 | extra-source-files: 58 | changelog.md 59 | README.md 60 | test/test-mac-excel.csv 61 | test/test-windows-excel.csv 62 | test/test.csv 63 | test/Test.hs 64 | test/test.xls 65 | 66 | flag lib-Werror 67 | default: False 68 | manual: True 69 | 70 | library 71 | default-language: Haskell2010 72 | exposed-modules: 73 | Data.CSV.Conduit 74 | Data.CSV.Conduit.Conversion 75 | Data.CSV.Conduit.Parser.ByteString 76 | Data.CSV.Conduit.Parser.Text 77 | Data.CSV.Conduit.Types 78 | 79 | other-modules: 80 | Data.CSV.Conduit.Conversion.Internal 81 | Data.CSV.Conduit.Monoid 82 | 83 | ghc-options: -Wall -funbox-strict-fields 84 | 85 | if flag(lib-werror) 86 | ghc-options: -Werror 87 | 88 | hs-source-dirs: src 89 | build-depends: 90 | array 91 | , attoparsec >=0.10 92 | , base >=4 && <5 93 | , blaze-builder 94 | , bytestring 95 | , conduit >=1.3.0 && <2.0 96 | , conduit-extra 97 | , containers >=0.3 98 | , data-default 99 | , exceptions >=0.3 100 | , ordered-containers 101 | , primitive 102 | , resourcet >=1.1.2.1 103 | , text 104 | , transformers 105 | , vector 106 | 107 | test-suite test 108 | default-language: Haskell2010 109 | type: exitcode-stdio-1.0 110 | main-is: Test.hs 111 | ghc-options: -Wall -Wunused-packages 112 | 113 | if flag(lib-werror) 114 | ghc-options: -Werror 115 | 116 | hs-source-dirs: test 117 | build-depends: 118 | base >=4 && <5 119 | , bytestring 120 | , conduit >=1.3.0 121 | , containers >=0.3 122 | , csv-conduit 123 | , directory 124 | , 
HUnit >=1.2 125 | , ordered-containers 126 | , test-framework 127 | , test-framework-hunit 128 | , text 129 | , vector 130 | 131 | source-repository head 132 | type: git 133 | location: git://github.com/ozataman/csv-conduit.git 134 | -------------------------------------------------------------------------------- /src/Data/CSV/Conduit.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns #-} 2 | {-# LANGUAGE CPP #-} 3 | {-# LANGUAGE FlexibleContexts #-} 4 | {-# LANGUAGE FlexibleInstances #-} 5 | {-# LANGUAGE MultiParamTypeClasses #-} 6 | {-# LANGUAGE OverloadedStrings #-} 7 | {-# LANGUAGE RankNTypes #-} 8 | {-# LANGUAGE ScopedTypeVariables #-} 9 | {-# LANGUAGE UndecidableInstances #-} 10 | 11 | module Data.CSV.Conduit 12 | ( 13 | 14 | -- * Main Interface 15 | decodeCSV 16 | , readCSVFile 17 | , writeCSVFile 18 | , transformCSV 19 | , transformCSV' 20 | , mapCSVFile 21 | , writeHeaders 22 | , writeHeadersOrdered 23 | 24 | -- Types 25 | , CSV (..) 26 | , CSVSettings (..) 
27 | , defCSVSettings 28 | , MapRow 29 | , OrderedMapRow 30 | , Row 31 | 32 | -- * Re-exported For Convenience 33 | , runResourceT 34 | ) where 35 | 36 | ------------------------------------------------------------------------------- 37 | import Control.Exception 38 | import Control.Monad.Catch.Pure (runCatchT, CatchT) 39 | import Control.Monad.IO.Class (MonadIO (liftIO)) 40 | import Control.Monad.Primitive 41 | import Control.Monad.ST 42 | import Control.Monad.Trans.Class (MonadTrans(lift)) 43 | import Control.Monad.Trans.Except (ExceptT(..), runExceptT) 44 | import Control.Monad.Trans.Resource (MonadResource, MonadThrow, 45 | runResourceT) 46 | import Data.Attoparsec.Types (Parser) 47 | import qualified Data.ByteString as B 48 | import Data.ByteString.Char8 (ByteString) 49 | import qualified Data.ByteString.Char8 as B8 50 | import Data.ByteString.Internal (c2w) 51 | import Data.Conduit 52 | import Data.Conduit.Attoparsec 53 | import Data.Conduit.Binary (sinkFile, sinkIOHandle, 54 | sourceFile) 55 | import qualified Data.Conduit.List as C 56 | import qualified Data.Map as M 57 | import qualified Data.Map.Ordered as MO 58 | import Data.String 59 | import Data.Text (Text) 60 | import qualified Data.Text as T 61 | import qualified Data.Text.Encoding as T 62 | import qualified Data.Vector as V 63 | import qualified Data.Vector.Generic as GV 64 | import qualified Data.Vector.Generic.Mutable as GMV 65 | import Data.Void as Void 66 | import System.IO 67 | ------------------------------------------------------------------------------- 68 | import Data.CSV.Conduit.Conversion (FromNamedRecord (..), 69 | FromNamedRecordOrdered (..), 70 | Named (..), 71 | NamedOrdered (..), 72 | ToNamedRecord (..), 73 | ToNamedRecordOrdered (..), 74 | runParser) 75 | import qualified Data.CSV.Conduit.Parser.ByteString as BSP 76 | import qualified Data.CSV.Conduit.Parser.Text as TP 77 | import Data.CSV.Conduit.Types 78 | 
------------------------------------------------------------------------------- 79 | 80 | 81 | ------------------------------------------------------------------------------- 82 | -- | Represents types 'r' that are CSV-like and can be converted 83 | -- to/from an underlying stream of type 's'. There is nothing scary 84 | -- about the type: 85 | -- 86 | -- @s@ represents stream types that can be converted to\/from CSV rows. 87 | -- Examples are 'ByteString', 'Text' and 'String'. 88 | -- 89 | -- @r@ represents the target CSV row representations that this library 90 | -- can work with. Examples are the 'Row' types, the 'Record' type and 91 | -- the 'MapRow' family of types. We can also convert directly to 92 | -- complex Haskell types using the 'Data.CSV.Conduit.Conversion' 93 | -- module that was borrowed from the cassava package, which was itself 94 | -- inspired by the aeson package. 95 | -- 96 | -- 97 | -- Example #1: Basics Using Convenience API 98 | -- 99 | -- >import Data.Conduit 100 | -- >import Data.Conduit.Binary 101 | -- >import Data.Conduit.List as CL 102 | -- >import Data.CSV.Conduit 103 | -- > 104 | -- >myProcessor :: Conduit (Row Text) m (Row Text) 105 | -- >myProcessor = CL.map reverse 106 | -- > 107 | -- >test = runResourceT $ 108 | -- > transformCSV defCSVSettings 109 | -- > (sourceFile "input.csv") 110 | -- > myProcessor 111 | -- > (sinkFile "output.csv") 112 | -- 113 | -- 114 | -- Example #2: Basics Using Conduit API 115 | -- 116 | -- >import Data.Conduit 117 | -- >import Data.Conduit.Binary 118 | -- >import Data.CSV.Conduit 119 | -- > 120 | -- >myProcessor :: Conduit (MapRow Text) m (MapRow Text) 121 | -- >myProcessor = undefined 122 | -- > 123 | -- >test = runResourceT $ runConduit $ 124 | -- > sourceFile "test/BigFile.csv" .| 125 | -- > intoCSV defCSVSettings .| 126 | -- > myProcessor .| 127 | -- > (writeHeaders defCSVSettings >> fromCSV defCSVSettings) .| 128 | -- > sinkFile "test/BigFileOut.csv" 129 | class CSV s r where 130 | 131 | 
----------------------------------------------------------------------------- 132 | -- | Convert a CSV row into strict ByteString equivalent. 133 | rowToStr :: CSVSettings -> r -> s 134 | 135 | ----------------------------------------------------------------------------- 136 | -- | Turn a stream of 's' into a stream of CSV row type. An example 137 | -- would be parsing a ByteString stream as rows of 'MapRow' 'Text'. 138 | intoCSV :: (MonadThrow m) => CSVSettings -> ConduitT s r m () 139 | 140 | ----------------------------------------------------------------------------- 141 | -- | Turn a stream of CSV row type back into a stream of 's'. An 142 | -- example would be rendering a stream of 'Row' 'ByteString' rows as 143 | -- 'Text'. 144 | fromCSV :: Monad m => CSVSettings -> ConduitT r s m () 145 | 146 | 147 | 148 | 149 | 150 | ------------------------------------------------------------------------------ 151 | -- | 'Row' instance using 'ByteString' 152 | instance CSV ByteString (Row ByteString) where 153 | rowToStr s !r = 154 | let 155 | sep = B.pack [c2w (csvSep s)] 156 | wrapField !f = case csvQuoteCharAndStyle s of 157 | Just (x, quoteEmpty) -> 158 | case quoteEmpty == DoQuoteEmpty || B8.length f /= 0 of 159 | True -> (x `B8.cons` escape x f) `B8.snoc` x 160 | False -> f 161 | Nothing -> f 162 | escape c str = B8.intercalate (B8.pack [c,c]) $ B8.split c str 163 | in B.intercalate sep . 
map wrapField $ r 164 | 165 | intoCSV set = intoCSVRow (BSP.row set) 166 | fromCSV set = fromCSVRow set 167 | 168 | 169 | ------------------------------------------------------------------------------ 170 | -- | 'Row' instance using 'Text' 171 | instance CSV Text (Row Text) where 172 | rowToStr s !r = 173 | let 174 | sep = T.pack [csvSep s] 175 | wrapField !f = case csvQuoteCharAndStyle s of 176 | Just (x, quoteEmpty) -> case quoteEmpty == DoQuoteEmpty || not (T.null f) of 177 | True -> x `T.cons` escape x f `T.snoc` x 178 | False -> f 179 | Nothing -> f 180 | escape c str = T.intercalate (T.pack [c,c]) $ T.split (== c) str 181 | in T.intercalate sep . map wrapField $ r 182 | 183 | intoCSV set = intoCSVRow (TP.row set) 184 | fromCSV set = fromCSVRow set 185 | 186 | 187 | ------------------------------------------------------------------------------- 188 | -- | 'Row' instance using 'Text' based on 'ByteString' stream 189 | instance CSV ByteString (Row Text) where 190 | rowToStr s r = T.encodeUtf8 $ rowToStr s r 191 | intoCSV set = intoCSV set .| C.map (map T.decodeUtf8) 192 | fromCSV set = fromCSV set .| C.map T.encodeUtf8 193 | 194 | 195 | 196 | ------------------------------------------------------------------------------- 197 | -- | 'Row' instance using 'String' based on 'ByteString' stream. 198 | -- Please note this uses the ByteString operations underneath and has 199 | -- lots of unnecessary overhead. Included for convenience. 200 | instance CSV ByteString (Row String) where 201 | rowToStr s r = rowToStr s $ map B8.pack r 202 | intoCSV set = intoCSV set .| C.map (map B8.unpack) 203 | fromCSV set = C.map (map B8.pack) .| fromCSV set 204 | 205 | 206 | -- | Support for parsing rows in the 'Vector' form. 207 | instance (CSV s (Row s)) => CSV s (V.Vector s) where 208 | rowToStr s r = rowToStr s . 
V.toList $ r 209 | intoCSV set = intoCSV set .| C.map (V.fromList) 210 | fromCSV set = C.map (V.toList) .| fromCSV set 211 | 212 | 213 | 214 | ------------------------------------------------------------------------------- 215 | fromCSVRow :: (Monad m, IsString s, CSV s r) 216 | => CSVSettings -> ConduitT r s m () 217 | fromCSVRow set = awaitForever $ \row -> mapM_ yield [rowToStr set row, "\n"] 218 | 219 | 220 | 221 | ------------------------------------------------------------------------------- 222 | intoCSVRow :: (MonadThrow m, AttoparsecInput i) => Parser i (Maybe o) -> ConduitT i o m () 223 | intoCSVRow p = parse .| puller 224 | where 225 | parse = {-# SCC "conduitParser_p" #-} conduitParser p 226 | puller = {-# SCC "puller" #-} 227 | awaitForever $ \ (_, mrow) -> maybe (return ()) yield mrow 228 | 229 | 230 | ------------------------------------------------------------------------------- 231 | -- | Generic 'MapRow' instance; any stream type with a 'Row' instance 232 | -- automatically gets a 'MapRow' instance. 233 | instance (CSV s (Row s'), Ord s', IsString s) => CSV s (MapRow s') where 234 | rowToStr s r = rowToStr s . M.elems $ r 235 | intoCSV set = intoCSVMap set 236 | fromCSV set = fromCSVMap set 237 | 238 | instance (CSV s (Row s'), Ord s', IsString s) => CSV s (OrderedMapRow s') where 239 | rowToStr s r = rowToStr s . (map snd . MO.assocs) $ r 240 | intoCSV set = intoCSVMapOrdered set 241 | fromCSV set = fromCSVMapOrdered set 242 | 243 | 244 | ------------------------------------------------------------------------------- 245 | intoCSVMap :: (Ord a, MonadThrow m, CSV s [a]) 246 | => CSVSettings -> ConduitT s (MapRow a) m () 247 | intoCSVMap set = intoCSV set .| (headers >>= converter) 248 | where 249 | headers = do 250 | mrow <- await 251 | case mrow of 252 | Nothing -> return [] 253 | Just [] -> headers 254 | Just hs -> return hs 255 | converter hs = awaitForever $ yield . 
toMapCSV hs 256 | toMapCSV !hs !fs = M.fromList $ zip hs fs 257 | 258 | intoCSVMapOrdered :: (Ord a, MonadThrow m, CSV s [a]) 259 | => CSVSettings -> ConduitM s (OrderedMapRow a) m () 260 | intoCSVMapOrdered set = intoCSV set .| (headers >>= converter) 261 | where 262 | headers = do 263 | mrow <- await 264 | case mrow of 265 | Nothing -> return [] 266 | Just [] -> headers 267 | Just hs -> return hs 268 | converter hs = awaitForever $ yield . toMapCSV hs 269 | toMapCSV !hs !fs = MO.fromList $ zip hs fs 270 | 271 | 272 | -- | Conversion of stream directly to/from a custom complex haskell 273 | -- type. 274 | instance (FromNamedRecord a, ToNamedRecord a, CSV s (MapRow ByteString)) => 275 | CSV s (Named a) where 276 | rowToStr s a = rowToStr s . toNamedRecord . getNamed $ a 277 | intoCSV set = intoCSV set .| C.mapMaybe go 278 | where 279 | go x = either (const Nothing) (Just . Named) $ 280 | runParser (parseNamedRecord x) 281 | 282 | fromCSV set = C.map go .| fromCSV set 283 | where 284 | go = toNamedRecord . getNamed 285 | 286 | instance (FromNamedRecordOrdered a, ToNamedRecordOrdered a, CSV s (OrderedMapRow ByteString)) => 287 | CSV s (NamedOrdered a) where 288 | rowToStr s a = rowToStr s . toNamedRecordOrdered . getNamedOrdered $ a 289 | intoCSV set = intoCSV set .| C.mapMaybe go 290 | where 291 | go x = either (const Nothing) (Just . NamedOrdered) $ 292 | runParser (parseNamedRecordOrdered x) 293 | 294 | fromCSV set = C.map go .| fromCSV set 295 | where 296 | go = toNamedRecordOrdered . 
getNamedOrdered 297 | 298 | 299 | ------------------------------------------------------------------------------- 300 | fromCSVMap :: (Monad m, IsString s, CSV s [a]) 301 | => CSVSettings -> ConduitT (M.Map k a) s m () 302 | fromCSVMap set = awaitForever push 303 | where 304 | push r = mapM_ yield [rowToStr set (M.elems r), "\n"] 305 | 306 | fromCSVMapOrdered :: (Monad m, IsString s, CSV s [a]) 307 | => CSVSettings -> ConduitM (MO.OMap k a) s m () 308 | fromCSVMapOrdered set = awaitForever push 309 | where 310 | push r = mapM_ yield [rowToStr set (map snd $ MO.assocs r), "\n"] 311 | 312 | 313 | ------------------------------------------------------------------------------- 314 | -- | Write headers AND the row into the output stream, once. If you 315 | -- don't call this while using 'MapRow' family of row types, then your 316 | -- resulting output will NOT have any headers in it. 317 | -- 318 | -- Usage: Just chain this using the 'Monad' instance in your pipeline: 319 | -- 320 | -- > runConduit $ ... .| writeHeaders settings >> fromCSV settings .| sinkFile "..." 
321 | writeHeaders 322 | :: (Monad m, CSV s (Row r), IsString s) 323 | => CSVSettings 324 | -> ConduitT (MapRow r) s m () 325 | writeHeaders set = do 326 | mrow <- await 327 | case mrow of 328 | Nothing -> return () 329 | Just row -> mapM_ yield [ rowToStr set (M.keys row) 330 | , "\n" 331 | , rowToStr set (M.elems row) 332 | , "\n" ] 333 | 334 | writeHeadersOrdered 335 | :: (Monad m, CSV s (Row r), IsString s) 336 | => CSVSettings 337 | -> ConduitM (OrderedMapRow r) s m () 338 | writeHeadersOrdered set = do 339 | mrow <- await 340 | case mrow of 341 | Nothing -> return () 342 | Just row -> mapM_ yield [ rowToStr set (map fst $ MO.assocs row) 343 | , "\n" 344 | , rowToStr set (map snd $ MO.assocs row) 345 | , "\n" ] 346 | 347 | 348 | --------------------------- 349 | -- Convenience Functions -- 350 | --------------------------- 351 | 352 | 353 | ------------------------------------------------------------------------------- 354 | -- | Read the entire contents of a CSV file into memory. 355 | readCSVFile 356 | :: (MonadIO m, CSV ByteString a) 357 | => CSVSettings -- ^ Settings to use in deciphering stream 358 | -> FilePath -- ^ Input file 359 | -> m (V.Vector a) 360 | readCSVFile set fp = liftIO . runResourceT $ runConduit $ sourceFile fp .| intoCSV set .| transPipe lift (sinkVector growthFactor) 361 | where 362 | growthFactor = 10 363 | 364 | 365 | ------------------------------------------------------------------------------- 366 | -- | A simple way to decode a CSV string. Don't be alarmed by the 367 | -- polymorphic nature of the signature. 's' is the type for the string 368 | -- and 'v' is a kind of 'Vector' here. 369 | -- 370 | -- For example for 'ByteString': 371 | -- 372 | -- >>> s <- LB.readFile "my.csv" 373 | -- >>> decodeCSV defCSVSettings s :: Either SomeException (Vector (Vector ByteString)) 374 | -- 375 | -- will work as long as the data is comma separated. 376 | decodeCSV 377 | :: forall v a s. 
(GV.Vector v a, CSV s a) 378 | => CSVSettings 379 | -> s 380 | -> Either SomeException (v a) 381 | decodeCSV set bs = runST $ runExceptT pipeline 382 | where 383 | src :: ConduitT () s (ExceptT SomeException (ST s1)) () 384 | src = C.sourceList [bs] 385 | csvConvert :: ConduitT s a (ExceptT SomeException (ST s1)) () 386 | csvConvert = transPipe (ExceptT . runCatchT) csvConvert' 387 | csvConvert' :: ConduitT s a (CatchT (ST s1)) () 388 | csvConvert' = intoCSV set 389 | growthFactor = 10 390 | sink :: ConduitT a Void.Void (ExceptT SomeException (ST s1)) (v a) 391 | sink = sinkVector growthFactor 392 | pipeline :: ExceptT SomeException (ST s1) (v a) 393 | pipeline = runConduit (src .| csvConvert .| sink) 394 | 395 | 396 | 397 | ------------------------------------------------------------------------------- 398 | -- | Write CSV data into file. As we use a 'ByteString' sink, you'll 399 | -- need to get your data into a 'ByteString' stream type. 400 | writeCSVFile 401 | :: (CSV ByteString a) 402 | => CSVSettings 403 | -- ^ CSV Settings 404 | -> FilePath 405 | -- ^ Target file 406 | -> IOMode 407 | -- ^ Write vs. append mode 408 | -> [a] 409 | -- ^ List of rows 410 | -> IO () 411 | writeCSVFile set fo fmode rows = runResourceT $ runConduit $ do 412 | C.sourceList rows .| fromCSV set .| 413 | sinkIOHandle (openFile fo fmode) 414 | 415 | 416 | ------------------------------------------------------------------------------- 417 | -- | Map over the rows of a CSV file. Provided for convenience for 418 | -- historical reasons. 419 | -- 420 | -- An easy way to run this function would be 'runResourceT' after 421 | -- feeding it all the arguments. 
422 | mapCSVFile 423 | :: ( MonadResource m 424 | , CSV ByteString a 425 | , CSV ByteString b 426 | # if MIN_VERSION_resourcet(1,2,0) 427 | , MonadThrow m 428 | #endif 429 | ) 430 | => CSVSettings 431 | -- ^ Settings to use both for both input and output 432 | -> (a -> [b]) 433 | -- ^ A mapping function 434 | -> FilePath 435 | -- ^ Input file 436 | -> FilePath 437 | -- ^ Output file 438 | -> m () 439 | mapCSVFile set f fi fo = 440 | transformCSV set (sourceFile fi) (C.concatMap f) (sinkFile fo) 441 | 442 | 443 | ------------------------------------------------------------------------------- 444 | -- | Like transformCSV' but uses the same settings for both input and 445 | -- output. 446 | transformCSV 447 | :: (MonadThrow m, CSV s a, CSV s' b) 448 | => CSVSettings 449 | -- ^ Settings to be used for both input and output 450 | -> ConduitT () s m () 451 | -- ^ A raw stream data source. Ex: 'sourceFile inFile' 452 | -> ConduitT a b m () 453 | -- ^ A transforming conduit 454 | -> ConduitT s' Void.Void m () 455 | -- ^ A raw stream data sink. Ex: 'sinkFile outFile' 456 | -> m () 457 | transformCSV set = transformCSV' set set 458 | 459 | 460 | ------------------------------------------------------------------------------- 461 | -- | General purpose CSV transformer. Apply a list-like processing 462 | -- function from 'Data.Conduit.List' to the rows of a CSV stream. You 463 | -- need to provide a stream data source, a transformer and a stream 464 | -- data sink. 465 | -- 466 | -- An easy way to run this function would be 'runResourceT' after 467 | -- feeding it all the arguments. 
468 | -- 469 | -- Example - map a function over the rows of a CSV file: 470 | -- 471 | -- > transformCSV setIn setOut (sourceFile inFile) (C.map f) (sinkFile outFile) 472 | transformCSV' 473 | :: (MonadThrow m, CSV s a, CSV s' b) 474 | => CSVSettings 475 | -- ^ Settings to be used for input 476 | -> CSVSettings 477 | -- ^ Settings to be used for output 478 | -> ConduitT () s m () 479 | -- ^ A raw stream data source. Ex: 'sourceFile inFile' 480 | -> ConduitT a b m () 481 | -- ^ A transforming conduit 482 | -> ConduitT s' Void.Void m () 483 | -- ^ A raw stream data sink. Ex: 'sinkFile outFile' 484 | -> m () 485 | transformCSV' setIn setOut source c sink = runConduit $ 486 | source .| 487 | intoCSV setIn .| 488 | c .| 489 | fromCSV setOut .| 490 | sink 491 | 492 | 493 | 494 | 495 | ------------------ 496 | -- Vector Utils -- 497 | ------------------ 498 | 499 | 500 | 501 | ------------------------------------------------------------------------------- 502 | -- | An efficient sink that incrementally grows a vector from the input stream 503 | sinkVector :: (PrimMonad m, GV.Vector v a) => Int -> ConduitT a o m (v a) 504 | sinkVector by = do 505 | v <- lift $ GMV.new by 506 | go 0 v 507 | where 508 | -- i is the index of the next element to be written by go 509 | -- also exactly the number of elements in v so far 510 | go i v = do 511 | res <- await 512 | case res of 513 | Nothing -> do 514 | v' <- lift $ GV.freeze $ GMV.slice 0 i v 515 | return $! 
v' 516 | Just x -> do 517 | v' <- case GMV.length v == i of 518 | True -> lift $ GMV.grow v by 519 | False -> return v 520 | lift $ GMV.write v' i x 521 | go (i+1) v' 522 | -------------------------------------------------------------------------------- /src/Data/CSV/Conduit/Conversion.hs: -------------------------------------------------------------------------------- 1 | {-# LANGUAGE BangPatterns, CPP, FlexibleInstances, OverloadedStrings, 2 | Rank2Types #-} 3 | #ifdef GENERICS 4 | {-# LANGUAGE DefaultSignatures, TypeOperators, KindSignatures, FlexibleContexts, 5 | MultiParamTypeClasses, UndecidableInstances, ScopedTypeVariables, 6 | DataKinds #-} 7 | #endif 8 | 9 | ----------------------------------------------------------------------------- 10 | -- | 11 | -- Module : Data.CSV.Conduit.Conversion 12 | -- Copyright : Ozgun Ataman, Johan Tibell 13 | -- License : BSD3 14 | -- 15 | -- Maintainer : Ozgun Ataman 16 | -- Stability : experimental 17 | -- 18 | -- This module has been shamelessly taken from Johan Tibell's nicely 19 | -- put together cassava package, which itself borrows the approach 20 | -- from Bryan O'Sullivan's widely used aeson package. 21 | -- 22 | -- We make the necessary adjustments and some simplifications here to 23 | -- bolt this parsing interface onto our underlying "CSV" typeclass. 24 | ---------------------------------------------------------------------------- 25 | 26 | module Data.CSV.Conduit.Conversion 27 | ( 28 | -- * Type conversion 29 | Only(..) 30 | , Named (..) 31 | , NamedOrdered (..) 32 | , Record 33 | , NamedRecord 34 | , NamedRecordOrdered 35 | , FromRecord(..) 36 | , FromNamedRecord(..) 37 | , FromNamedRecordOrdered(..) 38 | , ToNamedRecord(..) 39 | , ToNamedRecordOrdered(..) 40 | , FromField(..) 41 | , ToRecord(..) 42 | , ToField(..) 43 | , Field 44 | 45 | -- * Parser 46 | , Parser 47 | , runParser 48 | 49 | -- * Accessors 50 | , index 51 | , (.!) 
52 | , unsafeIndex 53 | , lookup 54 | , lookupOrdered 55 | , (.:) 56 | , namedField 57 | , (.=) 58 | , record 59 | , namedRecord 60 | , namedRecordOrdered 61 | ) where 62 | 63 | import Control.Applicative as A 64 | import Control.Monad (MonadPlus, mplus, mzero) 65 | import Data.Attoparsec.ByteString.Char8 (double, parseOnly) 66 | import qualified Data.Attoparsec.ByteString.Char8 as A8 67 | import qualified Data.ByteString as B 68 | import qualified Data.ByteString.Char8 as B8 69 | import qualified Data.ByteString.Lazy as L 70 | import Data.Int (Int8, Int16, Int32, Int64) 71 | import qualified Data.Map as M 72 | import qualified Data.Map.Ordered as MO 73 | import Data.Semigroup as Semigroup 74 | import qualified Data.Text as T 75 | import qualified Data.Text.Encoding as T 76 | import qualified Data.Text.Lazy as LT 77 | import qualified Data.Text.Lazy.Encoding as LT 78 | import Data.Traversable as DT 79 | import Data.Vector (Vector, (!)) 80 | import qualified Data.Vector as V 81 | import qualified Data.Vector.Unboxed as U 82 | import Data.Word as W 83 | import GHC.Float (double2Float) 84 | import Prelude hiding (lookup, takeWhile) 85 | 86 | #ifdef GENERICS 87 | import GHC.Generics 88 | import qualified Data.IntMap as IM 89 | #endif 90 | 91 | import Data.CSV.Conduit.Conversion.Internal 92 | 93 | 94 | ------------------------------------------------------------------------ 95 | -- bytestring compatibility 96 | 97 | toStrict :: L.ByteString -> B.ByteString 98 | fromStrict :: B.ByteString -> L.ByteString 99 | #if MIN_VERSION_bytestring(0,10,0) 100 | toStrict = L.toStrict 101 | fromStrict = L.fromStrict 102 | #else 103 | toStrict = B.concat . L.toChunks 104 | fromStrict = L.fromChunks . 
(:[]) 105 | #endif 106 | {-# INLINE toStrict #-} 107 | {-# INLINE fromStrict #-} 108 | 109 | ------------------------------------------------------------------------ 110 | -- Type conversion 111 | 112 | 113 | 114 | -- | A shorthand for the ByteString case of 'MapRow' 115 | type NamedRecord = M.Map B8.ByteString B8.ByteString 116 | type NamedRecordOrdered = MO.OMap B8.ByteString B8.ByteString 117 | 118 | 119 | -- | A wrapper around custom haskell types that can directly be 120 | -- converted/parsed from an incoming CSV stream. 121 | -- 122 | -- We define this wrapper to stop GHC from complaining 123 | -- about overlapping instances. Just use 'getNamed' to get your 124 | -- object out of the wrapper. 125 | newtype Named a = Named { getNamed :: a } deriving (Eq,Show,Read,Ord) 126 | newtype NamedOrdered a = NamedOrdered { getNamedOrdered :: a } deriving (Eq,Show,Read,Ord) 127 | 128 | -- | A record corresponds to a single line in a CSV file. 129 | type Record = Vector B8.ByteString 130 | 131 | -- | A single field within a record. 132 | type Field = B8.ByteString 133 | 134 | 135 | ------------------------------------------------------------------------ 136 | -- Index-based conversion 137 | 138 | -- | A type that can be converted from a single CSV record, with the 139 | -- possibility of failure. 140 | -- 141 | -- When writing an instance, use 'empty', 'mzero', or 'fail' to make a 142 | -- conversion fail, e.g. if a 'Record' has the wrong number of 143 | -- columns. 144 | -- 145 | -- Given this example data: 146 | -- 147 | -- > John,56 148 | -- > Jane,55 149 | -- 150 | -- here's an example type and instance: 151 | -- 152 | -- > data Person = Person { name :: !Text, age :: !Int } 153 | -- > 154 | -- > instance FromRecord Person where 155 | -- > parseRecord v 156 | -- > | length v == 2 = Person <$> 157 | -- > v .! 0 <*> 158 | -- > v .! 
1 159 | -- > | otherwise = mzero 160 | class FromRecord a where 161 | parseRecord :: Record -> Parser a 162 | 163 | #ifdef GENERICS 164 | default parseRecord :: (Generic a, GFromRecord (Rep a)) => Record -> Parser a 165 | parseRecord r = to A.<$> gparseRecord r 166 | #endif 167 | 168 | -- | Haskell lacks a single-element tuple type, so if you CSV data 169 | -- with just one column you can use the 'Only' type to represent a 170 | -- single-column result. 171 | newtype Only a = Only { 172 | fromOnly :: a 173 | } deriving (Eq, Ord, Read, Show) 174 | 175 | -- | A type that can be converted to a single CSV record. 176 | -- 177 | -- An example type and instance: 178 | -- 179 | -- > data Person = Person { name :: !Text, age :: !Int } 180 | -- > 181 | -- > instance ToRecord Person where 182 | -- > toRecord (Person name age) = record [ 183 | -- > toField name, toField age] 184 | -- 185 | -- Outputs data on this form: 186 | -- 187 | -- > John,56 188 | -- > Jane,55 189 | class ToRecord a where 190 | toRecord :: a -> Record 191 | 192 | #ifdef GENERICS 193 | default toRecord :: (Generic a, GToRecord (Rep a) Field) => a -> Record 194 | toRecord = V.fromList . gtoRecord . from 195 | #endif 196 | 197 | instance FromField a => FromRecord (Only a) where 198 | parseRecord v 199 | | n == 1 = Only <$> unsafeIndex v 0 200 | | otherwise = lengthMismatch 1 v 201 | where 202 | n = V.length v 203 | 204 | -- TODO: Check if we want all toRecord conversions to be stricter. 205 | 206 | instance ToField a => ToRecord (Only a) where 207 | toRecord = V.singleton . toField . 
fromOnly 208 | 209 | instance (FromField a, FromField b) => FromRecord (a, b) where 210 | parseRecord v 211 | | n == 2 = (,) <$> unsafeIndex v 0 212 | <*> unsafeIndex v 1 213 | | otherwise = lengthMismatch 2 v 214 | where 215 | n = V.length v 216 | 217 | instance (ToField a, ToField b) => ToRecord (a, b) where 218 | toRecord (a, b) = V.fromList [toField a, toField b] 219 | 220 | instance (FromField a, FromField b, FromField c) => FromRecord (a, b, c) where 221 | parseRecord v 222 | | n == 3 = (,,) <$> unsafeIndex v 0 223 | <*> unsafeIndex v 1 224 | <*> unsafeIndex v 2 225 | | otherwise = lengthMismatch 3 v 226 | where 227 | n = V.length v 228 | 229 | instance (ToField a, ToField b, ToField c) => 230 | ToRecord (a, b, c) where 231 | toRecord (a, b, c) = V.fromList [toField a, toField b, toField c] 232 | 233 | instance (FromField a, FromField b, FromField c, FromField d) => 234 | FromRecord (a, b, c, d) where 235 | parseRecord v 236 | | n == 4 = (,,,) <$> unsafeIndex v 0 237 | <*> unsafeIndex v 1 238 | <*> unsafeIndex v 2 239 | <*> unsafeIndex v 3 240 | | otherwise = lengthMismatch 4 v 241 | where 242 | n = V.length v 243 | 244 | instance (ToField a, ToField b, ToField c, ToField d) => 245 | ToRecord (a, b, c, d) where 246 | toRecord (a, b, c, d) = V.fromList [ 247 | toField a, toField b, toField c, toField d] 248 | 249 | instance (FromField a, FromField b, FromField c, FromField d, FromField e) => 250 | FromRecord (a, b, c, d, e) where 251 | parseRecord v 252 | | n == 5 = (,,,,) <$> unsafeIndex v 0 253 | <*> unsafeIndex v 1 254 | <*> unsafeIndex v 2 255 | <*> unsafeIndex v 3 256 | <*> unsafeIndex v 4 257 | | otherwise = lengthMismatch 5 v 258 | where 259 | n = V.length v 260 | 261 | instance (ToField a, ToField b, ToField c, ToField d, ToField e) => 262 | ToRecord (a, b, c, d, e) where 263 | toRecord (a, b, c, d, e) = V.fromList [ 264 | toField a, toField b, toField c, toField d, toField e] 265 | 266 | instance (FromField a, FromField b, FromField c, FromField d, 
FromField e, 267 | FromField f) => 268 | FromRecord (a, b, c, d, e, f) where 269 | parseRecord v 270 | | n == 6 = (,,,,,) <$> unsafeIndex v 0 271 | <*> unsafeIndex v 1 272 | <*> unsafeIndex v 2 273 | <*> unsafeIndex v 3 274 | <*> unsafeIndex v 4 275 | <*> unsafeIndex v 5 276 | | otherwise = lengthMismatch 6 v 277 | where 278 | n = V.length v 279 | 280 | instance (ToField a, ToField b, ToField c, ToField d, ToField e, ToField f) => 281 | ToRecord (a, b, c, d, e, f) where 282 | toRecord (a, b, c, d, e, f) = V.fromList [ 283 | toField a, toField b, toField c, toField d, toField e, toField f] 284 | 285 | instance (FromField a, FromField b, FromField c, FromField d, FromField e, 286 | FromField f, FromField g) => 287 | FromRecord (a, b, c, d, e, f, g) where 288 | parseRecord v 289 | | n == 7 = (,,,,,,) <$> unsafeIndex v 0 290 | <*> unsafeIndex v 1 291 | <*> unsafeIndex v 2 292 | <*> unsafeIndex v 3 293 | <*> unsafeIndex v 4 294 | <*> unsafeIndex v 5 295 | <*> unsafeIndex v 6 296 | | otherwise = lengthMismatch 7 v 297 | where 298 | n = V.length v 299 | 300 | instance (ToField a, ToField b, ToField c, ToField d, ToField e, ToField f, 301 | ToField g) => 302 | ToRecord (a, b, c, d, e, f, g) where 303 | toRecord (a, b, c, d, e, f, g) = V.fromList [ 304 | toField a, toField b, toField c, toField d, toField e, toField f, 305 | toField g] 306 | 307 | lengthMismatch :: Int -> Record -> Parser a 308 | lengthMismatch expected v = 309 | fail $ "cannot unpack array of length " ++ 310 | show n ++ " into a " ++ desired ++ ". Input record: " ++ 311 | show v 312 | where 313 | n = V.length v 314 | desired | expected == 1 = "Only" 315 | | expected == 2 = "pair" 316 | | otherwise = show expected ++ "-tuple" 317 | 318 | instance FromField a => FromRecord [a] where 319 | parseRecord = DT.traverse parseField . V.toList 320 | 321 | instance ToField a => ToRecord [a] where 322 | toRecord = V.fromList . 
map toField 323 | 324 | instance FromField a => FromRecord (V.Vector a) where 325 | parseRecord = traverse parseField 326 | 327 | instance ToField a => ToRecord (Vector a) where 328 | toRecord = V.map toField 329 | 330 | instance (FromField a, U.Unbox a) => FromRecord (U.Vector a) where 331 | parseRecord = fmap U.convert . traverse parseField 332 | 333 | instance (ToField a, U.Unbox a) => ToRecord (U.Vector a) where 334 | toRecord = V.map toField . U.convert 335 | 336 | ------------------------------------------------------------------------ 337 | -- Name-based conversion 338 | 339 | -- | A type that can be converted from a single CSV record, with the 340 | -- possibility of failure. 341 | -- 342 | -- When writing an instance, use 'empty', 'mzero', or 'fail' to make a 343 | -- conversion fail, e.g. if a 'Record' has the wrong number of 344 | -- columns. 345 | -- 346 | -- Given this example data: 347 | -- 348 | -- > name,age 349 | -- > John,56 350 | -- > Jane,55 351 | -- 352 | -- here's an example type and instance: 353 | -- 354 | -- > {-# LANGUAGE OverloadedStrings #-} 355 | -- > 356 | -- > data Person = Person { name :: !Text, age :: !Int } 357 | -- > 358 | -- > instance FromNamedRecord Person where 359 | -- > parseNamedRecord m = Person <$> 360 | -- > m .: "name" <*> 361 | -- > m .: "age" 362 | -- 363 | -- Note the use of the @OverloadedStrings@ language extension which 364 | -- enables 'B8.ByteString' values to be written as string literals. 365 | class FromNamedRecord a where 366 | parseNamedRecord :: NamedRecord -> Parser a 367 | 368 | #ifdef GENERICS 369 | default parseNamedRecord :: (Generic a, GFromNamedRecord (Rep a)) => NamedRecord -> Parser a 370 | parseNamedRecord r = to <$> gparseNamedRecord r 371 | #endif 372 | 373 | class FromNamedRecordOrdered a where 374 | parseNamedRecordOrdered :: NamedRecordOrdered -> Parser a 375 | 376 | -- | A type that can be converted to a single CSV record. 
377 | -- 378 | -- An example type and instance: 379 | -- 380 | -- > data Person = Person { name :: !Text, age :: !Int } 381 | -- > 382 | -- > instance ToNamedRecord Person where 383 | -- > toNamedRecord (Person name age) = namedRecord [ 384 | -- > "name" .= name, "age" .= age] 385 | class ToNamedRecord a where 386 | toNamedRecord :: a -> NamedRecord 387 | 388 | #ifdef GENERICS 389 | default toNamedRecord :: (Generic a, GToRecord (Rep a) (B.ByteString, B.ByteString)) => a -> NamedRecord 390 | toNamedRecord = namedRecord . gtoRecord . from 391 | #endif 392 | 393 | class ToNamedRecordOrdered a where 394 | toNamedRecordOrdered :: a -> NamedRecordOrdered 395 | 396 | instance FromField a => FromNamedRecord (M.Map B.ByteString a) where 397 | parseNamedRecord m = traverse parseField m 398 | 399 | instance FromField a => FromNamedRecordOrdered (MO.OMap B.ByteString a) where 400 | parseNamedRecordOrdered m = traverse parseField m 401 | 402 | instance ToField a => ToNamedRecord (M.Map B.ByteString a) where 403 | toNamedRecord = M.map toField 404 | 405 | instance ToField a => ToNamedRecordOrdered (MO.OMap B.ByteString a) where 406 | toNamedRecordOrdered a = MO.fromList $ map (fmap toField) $ MO.assocs a 407 | 408 | -- instance FromField a => FromNamedRecord (HM.HashMap B.ByteString a) where 409 | -- parseNamedRecord m = traverse (\ s -> parseField s) m 410 | 411 | -- instance ToField a => ToNamedRecord (HM.HashMap B.ByteString a) where 412 | -- toNamedRecord = HM.map toField 413 | 414 | ------------------------------------------------------------------------ 415 | -- Individual field conversion 416 | 417 | -- | A type that can be converted from a single CSV field, with the 418 | -- possibility of failure. 419 | -- 420 | -- When writing an instance, use 'empty', 'mzero', or 'fail' to make a 421 | -- conversion fail, e.g. if a 'Field' can't be converted to the given 422 | -- type. 
423 | -- 424 | -- Example type and instance: 425 | -- 426 | -- > {-# LANGUAGE OverloadedStrings #-} 427 | -- > 428 | -- > data Color = Red | Green | Blue 429 | -- > 430 | -- > instance FromField Color where 431 | -- > parseField s 432 | -- > | s == "R" = pure Red 433 | -- > | s == "G" = pure Green 434 | -- > | s == "B" = pure Blue 435 | -- > | otherwise = mzero 436 | class FromField a where 437 | parseField :: Field -> Parser a 438 | 439 | -- | A type that can be converted to a single CSV field. 440 | -- 441 | -- Example type and instance: 442 | -- 443 | -- > {-# LANGUAGE OverloadedStrings #-} 444 | -- > 445 | -- > data Color = Red | Green | Blue 446 | -- > 447 | -- > instance ToField Color where 448 | -- > toField Red = "R" 449 | -- > toField Green = "G" 450 | -- > toField Blue = "B" 451 | class ToField a where 452 | toField :: a -> Field 453 | 454 | -- | 'Nothing' if the 'Field' is 'B.empty', 'Just' otherwise. 455 | instance FromField a => FromField (Maybe a) where 456 | parseField s 457 | | B.null s = pure Nothing 458 | | otherwise = Just <$> parseField s 459 | {-# INLINE parseField #-} 460 | 461 | -- | 'Nothing' is encoded as an 'B.empty' field. 462 | instance ToField a => ToField (Maybe a) where 463 | toField = maybe B.empty toField 464 | {-# INLINE toField #-} 465 | 466 | -- | Ignores the 'Field'. Always succeeds. 467 | instance FromField () where 468 | parseField _ = pure () 469 | {-# INLINE parseField #-} 470 | 471 | -- | Assumes UTF-8 encoding. 472 | instance FromField Char where 473 | parseField s = 474 | case T.decodeUtf8' s of 475 | Left e -> fail $ show e 476 | Right t 477 | | T.compareLength t 1 == EQ -> pure (T.head t) 478 | | otherwise -> typeError "Char" s Nothing 479 | {-# INLINE parseField #-} 480 | 481 | -- | Uses UTF-8 encoding. 482 | instance ToField Char where 483 | toField = toField . T.encodeUtf8 . T.singleton 484 | {-# INLINE toField #-} 485 | 486 | -- | Accepts same syntax as 'rational'. 
487 | instance FromField Double where 488 | parseField = parseDouble 489 | {-# INLINE parseField #-} 490 | 491 | -- | Uses decimal notation or scientific notation, depending on the 492 | -- number. 493 | instance ToField Double where 494 | toField = realFloat 495 | {-# INLINE toField #-} 496 | 497 | -- | Accepts same syntax as 'rational'. 498 | instance FromField Float where 499 | parseField s = double2Float <$> parseDouble s 500 | {-# INLINE parseField #-} 501 | 502 | -- | Uses decimal notation or scientific notation, depending on the 503 | -- number. 504 | instance ToField Float where 505 | toField = realFloat 506 | {-# INLINE toField #-} 507 | 508 | parseDouble :: B.ByteString -> Parser Double 509 | parseDouble s = case parseOnly double s of 510 | Left err -> typeError "Double" s (Just err) 511 | Right n -> pure n 512 | {-# INLINE parseDouble #-} 513 | 514 | -- | Accepts a signed decimal number. 515 | instance FromField Int where 516 | parseField = parseSigned "Int" 517 | {-# INLINE parseField #-} 518 | 519 | -- | Uses decimal encoding with optional sign. 520 | instance ToField Int where 521 | toField = decimal 522 | {-# INLINE toField #-} 523 | 524 | -- | Accepts a signed decimal number. 525 | instance FromField Integer where 526 | parseField = parseSigned "Integer" 527 | {-# INLINE parseField #-} 528 | 529 | -- | Uses decimal encoding with optional sign. 530 | instance ToField Integer where 531 | toField = decimal 532 | {-# INLINE toField #-} 533 | 534 | -- | Accepts a signed decimal number. 535 | instance FromField Int8 where 536 | parseField = parseSigned "Int8" 537 | {-# INLINE parseField #-} 538 | 539 | -- | Uses decimal encoding with optional sign. 540 | instance ToField Int8 where 541 | toField = decimal 542 | {-# INLINE toField #-} 543 | 544 | -- | Accepts a signed decimal number. 545 | instance FromField Int16 where 546 | parseField = parseSigned "Int16" 547 | {-# INLINE parseField #-} 548 | 549 | -- | Uses decimal encoding with optional sign. 
550 | instance ToField Int16 where 551 | toField = decimal 552 | {-# INLINE toField #-} 553 | 554 | -- | Accepts a signed decimal number. 555 | instance FromField Int32 where 556 | parseField = parseSigned "Int32" 557 | {-# INLINE parseField #-} 558 | 559 | -- | Uses decimal encoding with optional sign. 560 | instance ToField Int32 where 561 | toField = decimal 562 | {-# INLINE toField #-} 563 | 564 | -- | Accepts a signed decimal number. 565 | instance FromField Int64 where 566 | parseField = parseSigned "Int64" 567 | {-# INLINE parseField #-} 568 | 569 | -- | Uses decimal encoding with optional sign. 570 | instance ToField Int64 where 571 | toField = decimal 572 | {-# INLINE toField #-} 573 | 574 | -- | Accepts an unsigned decimal number. 575 | instance FromField W.Word where 576 | parseField = parseUnsigned "Word" 577 | {-# INLINE parseField #-} 578 | 579 | -- | Uses decimal encoding. 580 | instance ToField Word where 581 | toField = decimal 582 | {-# INLINE toField #-} 583 | 584 | -- | Accepts an unsigned decimal number. 585 | instance FromField Word8 where 586 | parseField = parseUnsigned "Word8" 587 | {-# INLINE parseField #-} 588 | 589 | -- | Uses decimal encoding. 590 | instance ToField Word8 where 591 | toField = decimal 592 | {-# INLINE toField #-} 593 | 594 | -- | Accepts an unsigned decimal number. 595 | instance FromField Word16 where 596 | parseField = parseUnsigned "Word16" 597 | {-# INLINE parseField #-} 598 | 599 | -- | Uses decimal encoding. 600 | instance ToField Word16 where 601 | toField = decimal 602 | {-# INLINE toField #-} 603 | 604 | -- | Accepts an unsigned decimal number. 605 | instance FromField Word32 where 606 | parseField = parseUnsigned "Word32" 607 | {-# INLINE parseField #-} 608 | 609 | -- | Uses decimal encoding. 610 | instance ToField Word32 where 611 | toField = decimal 612 | {-# INLINE toField #-} 613 | 614 | -- | Accepts an unsigned decimal number. 
615 | instance FromField Word64 where 616 | parseField = parseUnsigned "Word64" 617 | {-# INLINE parseField #-} 618 | 619 | -- | Uses decimal encoding. 620 | instance ToField Word64 where 621 | toField = decimal 622 | {-# INLINE toField #-} 623 | 624 | instance FromField B.ByteString where 625 | parseField = pure 626 | {-# INLINE parseField #-} 627 | 628 | instance ToField B.ByteString where 629 | toField = id 630 | {-# INLINE toField #-} 631 | 632 | instance FromField L.ByteString where 633 | parseField = pure . fromStrict 634 | {-# INLINE parseField #-} 635 | 636 | instance ToField L.ByteString where 637 | toField = toStrict 638 | {-# INLINE toField #-} 639 | 640 | -- | Assumes UTF-8 encoding. Fails on invalid byte sequences. 641 | instance FromField T.Text where 642 | parseField = either (fail . show) pure . T.decodeUtf8' 643 | {-# INLINE parseField #-} 644 | 645 | -- | Uses UTF-8 encoding. 646 | instance ToField T.Text where 647 | toField = toField . T.encodeUtf8 648 | {-# INLINE toField #-} 649 | 650 | -- | Assumes UTF-8 encoding. Fails on invalid byte sequences. 651 | instance FromField LT.Text where 652 | parseField = either (fail . show) (pure . LT.fromStrict) . T.decodeUtf8' 653 | {-# INLINE parseField #-} 654 | 655 | -- | Uses UTF-8 encoding. 656 | instance ToField LT.Text where 657 | toField = toField . toStrict . LT.encodeUtf8 658 | {-# INLINE toField #-} 659 | 660 | -- | Assumes UTF-8 encoding. Fails on invalid byte sequences. 661 | instance FromField [Char] where 662 | parseField = fmap T.unpack . parseField 663 | {-# INLINE parseField #-} 664 | 665 | -- | Uses UTF-8 encoding. 666 | instance ToField [Char] where 667 | toField = toField . 
T.pack 668 | {-# INLINE toField #-} 669 | 670 | parseSigned :: (Integral a) => String -> B.ByteString -> Parser a 671 | parseSigned typ s = case parseOnly (A8.signed A8.decimal) s of 672 | Left err -> typeError typ s (Just err) 673 | Right n -> pure n 674 | {-# INLINE parseSigned #-} 675 | 676 | parseUnsigned :: Integral a => String -> B.ByteString -> Parser a 677 | parseUnsigned typ s = case parseOnly A8.decimal s of 678 | Left err -> typeError typ s (Just err) 679 | Right n -> pure n 680 | {-# INLINE parseUnsigned #-} 681 | 682 | typeError :: String -> B.ByteString -> Maybe String -> Parser a 683 | typeError typ s mmsg = 684 | fail $ "expected " ++ typ ++ ", got " ++ show (B8.unpack s) ++ cause 685 | where 686 | cause = case mmsg of 687 | Just msg -> " (" ++ msg ++ ")" 688 | Nothing -> "" 689 | 690 | ------------------------------------------------------------------------ 691 | -- Constructors and accessors 692 | 693 | -- | Retrieve the /n/th field in the given record. The result is 694 | -- 'empty' if the value cannot be converted to the desired type. 695 | -- Raises an exception if the index is out of bounds. 696 | -- 697 | -- 'index' is a simple convenience function that is equivalent to 698 | -- @'parseField' (v '!' idx)@. If you're certain that the index is not 699 | -- out of bounds, using 'unsafeIndex' is somewhat faster. 700 | index :: FromField a => Record -> Int -> Parser a 701 | index v idx = parseField (v ! idx) 702 | {-# INLINE index #-} 703 | 704 | -- | Alias for 'index'. 705 | (.!) :: FromField a => Record -> Int -> Parser a 706 | (.!) = index 707 | {-# INLINE (.!) #-} 708 | infixl 9 .! 709 | 710 | -- | Like 'index' but without bounds checking. 711 | unsafeIndex :: FromField a => Record -> Int -> Parser a 712 | unsafeIndex v idx = parseField (V.unsafeIndex v idx) 713 | {-# INLINE unsafeIndex #-} 714 | 715 | -- | Retrieve a field in the given record by name. 
The result is 716 | -- 'empty' if the field is missing or if the value cannot be converted 717 | -- to the desired type. 718 | lookup :: FromField a => NamedRecord -> B.ByteString -> Parser a 719 | lookup m name = maybe (fail err) parseField $ M.lookup name m 720 | where err = "no field named " ++ show (B8.unpack name) 721 | {-# INLINE lookup #-} 722 | 723 | lookupOrdered :: FromField a => NamedRecordOrdered -> B.ByteString -> Parser a 724 | lookupOrdered m name = maybe (fail err) parseField $ MO.lookup name m 725 | where err = "no field named " ++ show (B8.unpack name) 726 | {-# INLINE lookupOrdered #-} 727 | 728 | -- | Alias for 'lookup'. 729 | (.:) :: FromField a => NamedRecord -> B.ByteString -> Parser a 730 | (.:) = lookup 731 | {-# INLINE (.:) #-} 732 | 733 | -- | Construct a pair from a name and a value. For use with 734 | -- 'namedRecord'. 735 | namedField :: ToField a => B.ByteString -> a -> (B.ByteString, B.ByteString) 736 | namedField name val = (name, toField val) 737 | {-# INLINE namedField #-} 738 | 739 | -- | Alias for 'namedField'. 740 | (.=) :: ToField a => B.ByteString -> a -> (B.ByteString, B.ByteString) 741 | (.=) = namedField 742 | {-# INLINE (.=) #-} 743 | 744 | -- | Construct a record from a list of 'B.ByteString's. Use 'toField' 745 | -- to convert values to 'B.ByteString's for use with 'record'. 746 | record :: [B.ByteString] -> Record 747 | record = V.fromList 748 | 749 | -- | Construct a named record from a list of name-value 'B.ByteString' 750 | -- pairs. Use '.=' to construct such a pair from a name and a value. 751 | namedRecord :: [(B.ByteString, B.ByteString)] -> NamedRecord 752 | namedRecord = M.fromList 753 | 754 | namedRecordOrdered :: [(B.ByteString, B.ByteString)] -> NamedRecordOrdered 755 | namedRecordOrdered = MO.fromList 756 | 757 | ------------------------------------------------------------------------ 758 | -- Parser for converting records to data types 759 | 760 | -- | Failure continuation. 
type Failure f r   = String -> f r
-- | Success continuation.
type Success a f r = a -> f r

-- | Conversion of a field to a value might fail e.g. if the field is
-- malformed.  This possibility is captured by the 'Parser' type, which
-- lets you compose several field conversions together in such a way
-- that if any of them fail, the whole record conversion fails.
--
-- Internally this is a CPS parser: it is given a failure and a
-- success continuation and calls exactly one of them.
newtype Parser a = Parser
    { unParser :: forall f r.
                  Failure f r
               -> Success a f r
               -> f r
    }

instance Monad Parser where
    m >>= g = Parser $ \onFail onSucc ->
        unParser m onFail (\a -> unParser (g a) onFail onSucc)
    {-# INLINE (>>=) #-}

#if MIN_VERSION_base(4,13,0)
instance MonadFail Parser where
#endif

    -- With base >= 4.13 this defines 'MonadFail.fail'; with an older
    -- base the instance head above is CPP'd away and this same
    -- equation belongs to the 'Monad' instance instead.
    fail msg = Parser $ \onFail _onSucc -> onFail msg
    {-# INLINE fail #-}

instance Functor Parser where
    fmap f m = Parser $ \onFail onSucc ->
        unParser m onFail (onSucc . f)
    {-# INLINE fmap #-}

instance Applicative Parser where

    (<*>) = apP
    {-# INLINE (<*>) #-}
    pure a = Parser $ \_onFail onSucc -> onSucc a
    {-# INLINE pure #-}

instance Alternative Parser where
    empty = fail "empty"
    {-# INLINE empty #-}
    (<|>) = mplus
    {-# INLINE (<|>) #-}

instance MonadPlus Parser where
    mzero = fail "mzero"
    {-# INLINE mzero #-}
    -- Try @a@; on failure discard its error and run @b@ instead.
    mplus a b = Parser $ \onFail onSucc ->
        unParser a (\_ -> unParser b onFail onSucc) onSucc
    {-# INLINE mplus #-}

instance Semigroup.Semigroup (Parser a) where
    (<>) = mplus
    {-# INLINE (<>) #-}

-- | '<*>' expressed via the 'Monad' instance so both agree.
apP :: Parser (a -> b) -> Parser a -> Parser b
apP d e = do
    b <- d
    a <- e
    return (b a)
{-# INLINE apP #-}

-- | Run a 'Parser', returning either @'Left' errMsg@ or @'Right'
-- result@. Forces the value in the 'Left' or 'Right' constructors to
-- weak head normal form.
--
-- You most likely won't need to use this function directly, but it's
-- included for completeness.
runParser :: Parser a -> Either String a
runParser p = unParser p left right
  where
    -- Bang patterns force the payload to WHNF before wrapping.
    left !errMsg = Left errMsg
    right !x     = Right x
{-# INLINE runParser #-}

#ifdef GENERICS

-- | Generic derivation of positional-record parsing.
class GFromRecord f where
    gparseRecord :: Record -> Parser (f p)

instance GFromRecordSum f Record => GFromRecord (M1 i n f) where
    gparseRecord v =
        case IM.lookup n gparseRecordSum of
          Nothing -> lengthMismatch n v
          Just p  -> M1 <$> p v
      where
        n = V.length v

-- | Generic derivation of named-record parsing.
class GFromNamedRecord f where
    gparseNamedRecord :: NamedRecord -> Parser (f p)

instance GFromRecordSum f NamedRecord => GFromNamedRecord (M1 i n f) where
    gparseNamedRecord v =
        foldr (\f p -> p <|> M1 <$> f v) empty (IM.elems gparseRecordSum)

-- | Maps each constructor's field count to a parser for it, so the
-- record length selects the constructor.
class GFromRecordSum f r where
    gparseRecordSum :: IM.IntMap (r -> Parser (f p))

instance (GFromRecordSum a r, GFromRecordSum b r) => GFromRecordSum (a :+: b) r where
    gparseRecordSum =
        IM.unionWith (\a b r -> a r <|> b r)
            (fmap (L1 <$>) <$> gparseRecordSum)
            (fmap (R1 <$>) <$> gparseRecordSum)

instance GFromRecordProd f r => GFromRecordSum (M1 i n f) r where
    gparseRecordSum = IM.singleton n (fmap (M1 <$>) f)
      where
        (n, f) = gparseRecordProd 0

-- | Parses a product of fields; threads the next field index through.
class GFromRecordProd f r where
    gparseRecordProd :: Int -> (Int, r -> Parser (f p))

instance GFromRecordProd U1 r where
    gparseRecordProd n = (n, const (pure U1))

instance (GFromRecordProd a r, GFromRecordProd b r) => GFromRecordProd (a :*: b) r where
    gparseRecordProd n0 = (n2, f)
      where
        f r = (:*:) <$> fa r <*> fb r
        (n1, fa) = gparseRecordProd n0
        (n2, fb) = gparseRecordProd n1

instance GFromRecordProd f Record => GFromRecordProd (M1 i n f) Record where
    gparseRecordProd n = fmap (M1 <$>) <$> gparseRecordProd n

instance FromField a => GFromRecordProd (K1 i a) Record where
    gparseRecordProd n = (n + 1, \v -> K1 <$> parseField (V.unsafeIndex v n))

#if MIN_VERSION_base(4,9,0)
data Proxy (s :: Meta) (f :: Type -> Type) a = Proxy
#else
data Proxy s (f :: * -> *) a = Proxy
#endif

instance (FromField a, Selector s) => GFromRecordProd (M1 S s (K1 i a)) NamedRecord where
    gparseRecordProd n = (n + 1, \v -> (M1 . K1) <$> v .: name)
      where
        -- Field name is taken from the record selector's name.
        name = T.encodeUtf8 (T.pack (selName (Proxy :: Proxy s f a)))


-- | Generic conversion of a value to a list of fields (@f@ is a bare
-- 'Field' for positional output or a name\/value pair for named
-- output).
class GToRecord a f where
    gtoRecord :: a p -> [f]

instance GToRecord U1 f where
    gtoRecord U1 = []

instance (GToRecord a f, GToRecord b f) => GToRecord (a :*: b) f where
    gtoRecord (a :*: b) = gtoRecord a ++ gtoRecord b

instance (GToRecord a f, GToRecord b f) => GToRecord (a :+: b) f where
    gtoRecord (L1 a) = gtoRecord a
    gtoRecord (R1 b) = gtoRecord b

instance GToRecord a f => GToRecord (M1 D c a) f where
    gtoRecord (M1 a) = gtoRecord a

instance GToRecord a f => GToRecord (M1 C c a) f where
    gtoRecord (M1 a) = gtoRecord a

instance GToRecord a Field => GToRecord (M1 S c a) Field where
    gtoRecord (M1 a) = gtoRecord a

instance ToField a => GToRecord (K1 i a) Field where
    gtoRecord (K1 a) = [toField a]

instance (ToField a, Selector s) => GToRecord (M1 S s (K1 i a)) (B.ByteString, B.ByteString) where
    gtoRecord m@(M1 (K1 a)) = [T.encodeUtf8 (T.pack (selName m)) .= toField a]

#endif
--------------------------------------------------------------------------------
-- /src/Data/CSV/Conduit/Conversion/Internal.hs:
--------------------------------------------------------------------------------
module Data.CSV.Conduit.Conversion.Internal
    (
decimal 3 | , realFloat 4 | ) where 5 | 6 | import Blaze.ByteString.Builder 7 | import Blaze.ByteString.Builder.Char8 8 | import Data.Array.Base (unsafeAt) 9 | import Data.Array.IArray 10 | import qualified Data.ByteString as B 11 | import Data.Char (ord) 12 | import Data.Int 13 | import Data.Word 14 | 15 | import Data.CSV.Conduit.Monoid as Monoid ((<>)) 16 | 17 | ------------------------------------------------------------------------ 18 | -- Integers 19 | 20 | decimal :: Integral a => a -> B.ByteString 21 | decimal = toByteString . formatDecimal 22 | {-# INLINE decimal #-} 23 | 24 | -- TODO: Add an optimized version for Integer. 25 | 26 | formatDecimal :: Integral a => a -> Builder 27 | {-# RULES "formatDecimal/Int" formatDecimal = formatBoundedSigned 28 | :: Int -> Builder #-} 29 | {-# RULES "formatDecimal/Int8" formatDecimal = formatBoundedSigned 30 | :: Int8 -> Builder #-} 31 | {-# RULES "formatDecimal/Int16" formatDecimal = formatBoundedSigned 32 | :: Int16 -> Builder #-} 33 | {-# RULES "formatDecimal/Int32" formatDecimal = formatBoundedSigned 34 | :: Int32 -> Builder #-} 35 | {-# RULES "formatDecimal/Int64" formatDecimal = formatBoundedSigned 36 | :: Int64 -> Builder #-} 37 | {-# RULES "formatDecimal/Word" formatDecimal = formatPositive 38 | :: Word -> Builder #-} 39 | {-# RULES "formatDecimal/Word8" formatDecimal = formatPositive 40 | :: Word8 -> Builder #-} 41 | {-# RULES "formatDecimal/Word16" formatDecimal = formatPositive 42 | :: Word16 -> Builder #-} 43 | {-# RULES "formatDecimal/Word32" formatDecimal = formatPositive 44 | :: Word32 -> Builder #-} 45 | {-# RULES "formatDecimal/Word64" formatDecimal = formatPositive 46 | :: Word64 -> Builder #-} 47 | {-# NOINLINE formatDecimal #-} 48 | formatDecimal i 49 | | i < 0 = minus Monoid.<> 50 | if i <= -128 51 | then formatPositive (-(i `quot` 10)) <> digit (-(i `rem` 10)) 52 | else formatPositive (-i) 53 | | otherwise = formatPositive i 54 | 55 | formatBoundedSigned :: (Integral a, Bounded a) => a -> Builder 
56 | {-# SPECIALIZE formatBoundedSigned :: Int -> Builder #-} 57 | {-# SPECIALIZE formatBoundedSigned :: Int8 -> Builder #-} 58 | {-# SPECIALIZE formatBoundedSigned :: Int16 -> Builder #-} 59 | {-# SPECIALIZE formatBoundedSigned :: Int32 -> Builder #-} 60 | {-# SPECIALIZE formatBoundedSigned :: Int64 -> Builder #-} 61 | formatBoundedSigned i 62 | | i < 0 = minus <> 63 | if i == minBound 64 | then formatPositive (-(i `quot` 10)) <> digit (-(i `rem` 10)) 65 | else formatPositive (-i) 66 | | otherwise = formatPositive i 67 | 68 | formatPositive :: Integral a => a -> Builder 69 | {-# SPECIALIZE formatPositive :: Int -> Builder #-} 70 | {-# SPECIALIZE formatPositive :: Int8 -> Builder #-} 71 | {-# SPECIALIZE formatPositive :: Int16 -> Builder #-} 72 | {-# SPECIALIZE formatPositive :: Int32 -> Builder #-} 73 | {-# SPECIALIZE formatPositive :: Int64 -> Builder #-} 74 | {-# SPECIALIZE formatPositive :: Word -> Builder #-} 75 | {-# SPECIALIZE formatPositive :: Word8 -> Builder #-} 76 | {-# SPECIALIZE formatPositive :: Word16 -> Builder #-} 77 | {-# SPECIALIZE formatPositive :: Word32 -> Builder #-} 78 | {-# SPECIALIZE formatPositive :: Word64 -> Builder #-} 79 | formatPositive = go 80 | where go n | n < 10 = digit n 81 | | otherwise = go (n `quot` 10) <> digit (n `rem` 10) 82 | 83 | minus :: Builder 84 | minus = fromWord8 45 85 | 86 | zero :: Word8 87 | zero = 48 88 | 89 | digit :: Integral a => a -> Builder 90 | digit n = fromWord8 $! i2w (fromIntegral n) 91 | {-# INLINE digit #-} 92 | 93 | i2w :: Int -> Word8 94 | i2w i = zero + fromIntegral i 95 | {-# INLINE i2w #-} 96 | 97 | ------------------------------------------------------------------------ 98 | -- Floating point numbers 99 | 100 | realFloat :: RealFloat a => a -> B.ByteString 101 | {-# SPECIALIZE realFloat :: Float -> B.ByteString #-} 102 | {-# SPECIALIZE realFloat :: Double -> B.ByteString #-} 103 | realFloat = toByteString . 
formatRealFloat Generic 104 | 105 | -- | Control the rendering of floating point numbers. 106 | data FPFormat = Exponent 107 | -- ^ Scientific notation (e.g. @2.3e123@). 108 | | Fixed 109 | -- ^ Standard decimal notation. 110 | | Generic 111 | -- ^ Use decimal notation for values between @0.1@ and 112 | -- @9,999,999@, and scientific notation otherwise. 113 | deriving (Enum, Read, Show) 114 | 115 | formatRealFloat :: RealFloat a => FPFormat -> a -> Builder 116 | {-# SPECIALIZE formatRealFloat :: FPFormat -> Float -> Builder #-} 117 | {-# SPECIALIZE formatRealFloat :: FPFormat -> Double -> Builder #-} 118 | formatRealFloat fmt x 119 | | isNaN x = fromString "NaN" 120 | | isInfinite x = if x < 0 121 | then fromString "-Infinity" 122 | else fromString "Infinity" 123 | | x < 0 || isNegativeZero x = minus <> doFmt fmt (floatToDigits (-x)) 124 | | otherwise = doFmt fmt (floatToDigits x) 125 | where 126 | doFmt format (is, e) = 127 | let ds = map i2d is in 128 | case format of 129 | Generic -> 130 | doFmt (if e < 0 || e > 7 then Exponent else Fixed) 131 | (is,e) 132 | Exponent -> 133 | let show_e' = formatDecimal (e-1) in 134 | case ds of 135 | [48] -> fromString "0.0e0" 136 | [d] -> fromWord8 d <> fromString ".0e" <> show_e' 137 | (d:ds') -> fromWord8 d <> fromChar '.' <> fromWord8s ds' <> 138 | fromChar 'e' <> show_e' 139 | [] -> error "formatRealFloat/doFmt/Exponent: []" 140 | Fixed 141 | | e <= 0 -> fromString "0." <> 142 | fromByteString (B.replicate (-e) zero) <> 143 | fromWord8s ds 144 | | otherwise -> 145 | let 146 | f 0 s rs = mk0 (reverse s) <> fromChar '.' <> mk0 rs 147 | f n s [] = f (n-1) (zero:s) [] 148 | f n s (r:rs) = f (n-1) (r:s) rs 149 | in 150 | f e [] ds 151 | where mk0 ls = case ls of { [] -> fromWord8 zero ; _ -> fromWord8s ls} 152 | 153 | -- Based on "Printing Floating-Point Numbers Quickly and Accurately" 154 | -- by R.G. Burger and R.K. Dybvig in PLDI 96. 155 | -- This version uses a much slower logarithm estimator. It should be improved. 
-- | 'floatToDigits' takes a non-negative 'RealFloat' number and
-- returns its decimal digits and a base-10 exponent.  (Unlike GHC's
-- @Numeric.floatToDigits@ this version has no base parameter; the
-- output base is always 10.)  In particular, if @x>=0@, and
--
-- > floatToDigits x = ([d1,d2,...,dn], e)
--
-- then
--
-- (1) @n >= 1@
--
-- (2) @x = 0.d1d2...dn * (10**e)@
--
-- (3) @0 <= di <= 9@

floatToDigits :: (RealFloat a) => a -> ([Int], Int)
{-# SPECIALIZE floatToDigits :: Float -> ([Int], Int) #-}
{-# SPECIALIZE floatToDigits :: Double -> ([Int], Int) #-}
floatToDigits 0 = ([0], 0)
floatToDigits x =
    let
        (f0, e0)     = decodeFloat x
        (minExp0, _) = floatRange x
        p = floatDigits x
        b = floatRadix x
        minExp = minExp0 - p -- the real minimum exponent
        -- Haskell requires that f be adjusted so denormalized numbers
        -- will have an impossibly low exponent. Adjust for this.
        (f, e) =
            let n = minExp - e0 in
            if n > 0 then (f0 `quot` (expt b n), e0+n) else (f0, e0)
        (r, s, mUp, mDn) =
            if e >= 0 then
                let be = expt b e in
                if f == expt b (p-1) then
                    (f*be*b*2, 2*b, be*b, be) -- according to Burger and Dybvig
                else
                    (f*be*2, 2, be, be)
            else
                if e > minExp && f == expt b (p-1) then
                    (f*b*2, expt b (-e+1)*2, b, 1)
                else
                    (f*2, expt b (-e)*2, 1, 1)
        k :: Int
        k =
            let
                k0 :: Int
                k0 =
                    if b == 2 then
                        -- logBase 10 2 is very slightly larger than 8651/28738
                        -- (about 5.3558e-10), so if log x >= 0, the approximation
                        -- k1 is too small, hence we add one and need one fixup step less.
                        -- If log x < 0, the approximation errs rather on the high side.
                        -- That is usually more than compensated for by ignoring the
                        -- fractional part of logBase 2 x, but when x is a power of 1/2
                        -- or slightly larger and the exponent is a multiple of the
                        -- denominator of the rational approximation to logBase 10 2,
                        -- k1 is larger than logBase 10 x. If k1 > 1 + logBase 10 x,
                        -- we get a leading zero-digit we don't want.
                        -- With the approximation 3/10, this happened for
                        -- 0.5^1030, 0.5^1040, ..., 0.5^1070 and values close above.
                        -- The approximation 8651/28738 guarantees k1 < 1 + logBase 10 x
                        -- for IEEE-ish floating point types with exponent fields
                        -- <= 17 bits and mantissae of several thousand bits, earlier
                        -- convergents to logBase 10 2 would fail for long double.
                        -- Using quot instead of div is a little faster and requires
                        -- fewer fixup steps for negative lx.
                        let lx = p - 1 + e0
                            k1 = (lx * 8651) `quot` 28738
                        in if lx >= 0 then k1 + 1 else k1
                    else
                        -- f :: Integer, log :: Float -> Float,
                        -- ceiling :: Float -> Int
                        ceiling ((log (fromInteger (f+1) :: Float) +
                                  fromIntegral e * log (fromInteger b)) /
                                 log 10)

                fixup n =
                    if n >= 0 then
                        if r + mUp <= expt 10 n * s then n else fixup (n+1)
                    else
                        if expt 10 (-n) * (r + mUp) <= s then n else fixup (n+1)
            in
                fixup k0

        -- Burger-Dybvig digit generation: emit digits until the
        -- remainder is within the rounding margins, then round.
        gen ds rn sN mUpN mDnN =
            let
                (dn, rn') = (rn * 10) `quotRem` sN
                mUpN' = mUpN * 10
                mDnN' = mDnN * 10
            in
                case (rn' < mDnN', rn' + mUpN' > sN) of
                  (True,  False) -> dn : ds
                  (False, True)  -> dn+1 : ds
                  (True,  True)  -> if rn' * 2 < sN then dn : ds else dn+1 : ds
                  (False, False) -> gen (dn:ds) rn' sN mUpN' mDnN'

        rds =
            if k >= 0 then
                gen [] r (s * expt 10 k) mUp mDn
            else
                let bk = expt 10 (-k) in
                gen [] (r * bk) s (mUp * bk) (mDn * bk)
    in
        (map fromIntegral (reverse rds), k)

-- Exponentiation with a cache for the most common numbers.
minExpt, maxExpt :: Int
minExpt = 0
maxExpt = 1100

-- | @expt base n@ = @base ^ n@, served from a cache for base 2 and 10.
-- FIX: the base-10 branch previously omitted the @n >= minExpt@ lower
-- bound, so a negative @n@ would have reached 'unsafeAt' with a
-- negative index (unchecked out-of-bounds read).  All current callers
-- pass non-negative exponents, so behavior is unchanged for them.
expt :: Integer -> Int -> Integer
expt base n
    | base == 2  && n >= minExpt && n <= maxExpt   = expts `unsafeAt` n
    | base == 10 && n >= minExpt && n <= maxExpt10 = expts10 `unsafeAt` n
    | otherwise                                    = base^n

expts :: Array Int Integer
expts = array (minExpt,maxExpt) [(n,2^n) | n <- [minExpt .. maxExpt]]

maxExpt10 :: Int
maxExpt10 = 324

expts10 :: Array Int Integer
expts10 = array (minExpt,maxExpt10) [(n,10^n) | n <- [minExpt .. maxExpt10]]

-- | Unsafe conversion for decimal digits.
{-# INLINE i2d #-}
i2d :: Int -> Word8
i2d i = fromIntegral (ord '0' + i)
--------------------------------------------------------------------------------
-- /src/Data/CSV/Conduit/Monoid.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE CPP #-}
{-# OPTIONS_GHC -fno-warn-unused-imports #-}

-- | Compatibility shim: re-exports '<>' and, for base < 4.5 (which
-- predates it), defines it as 'mappend'.
module Data.CSV.Conduit.Monoid
    ( (<>)
    ) where

import Data.Monoid

#if !MIN_VERSION_base(4,5,0)
infixr 6 <>
-- | An infix synonym for 'mappend'.
(<>) :: Monoid m => m -> m -> m
(<>) = mappend
{-# INLINE (<>) #-}
#endif
--------------------------------------------------------------------------------
-- /src/Data/CSV/Conduit/Parser/ByteString.hs:
--------------------------------------------------------------------------------
{-|

This module exports the underlying Attoparsec row parser. This is helpful if
you want to do some ad-hoc CSV string parsing.

-}

module Data.CSV.Conduit.Parser.ByteString
    ( parseCSV
    , parseRow
    , row
    , csv
    ) where

-------------------------------------------------------------------------------
import           Control.Applicative
import           Control.Monad                    (mzero)
import           Data.Attoparsec.ByteString       as P hiding (take)
import qualified Data.Attoparsec.ByteString.Char8 as C8
import           Data.ByteString.Char8            (ByteString)
import qualified Data.ByteString.Char8            as B8
import           Data.Word                        (Word8)
-------------------------------------------------------------------------------
import           Data.CSV.Conduit.Types


------------------------------------------------------------------------------
-- | Try to parse given string as CSV
parseCSV :: CSVSettings -> ByteString -> Either String [Row ByteString]
parseCSV s = parseOnly $ csv s


------------------------------------------------------------------------------
-- | Try to parse given string as 'Row ByteString'
parseRow :: CSVSettings -> ByteString -> Either String (Maybe (Row ByteString))
parseRow s = parseOnly $ row s


------------------------------------------------------------------------------
-- | Parse CSV: consume rows until end of input, dropping rows that
-- came back as 'Nothing' (unparseable lines).
csv :: CSVSettings -> Parser [Row ByteString]
csv s = do
    mrow <- row s
    let prepend = maybe id (:) mrow
    done <- atEnd
    if done
      then return (prepend [])
      else prepend <$> csv s


------------------------------------------------------------------------------
-- | Parse a single CSV row; yields 'Nothing' for a malformed line.
row :: CSVSettings -> Parser (Maybe (Row ByteString))
row csvs = csvrow csvs <|> badrow

-- | Accept LF, CRLF or a lone CR as the line terminator.
csvEndOfLine :: Parser ()
csvEndOfLine =
        (word8 10 >> return ())
    <|> ((word8 13 *> word8 10) >> return ())
    <|> (word8 13 >> return ())

-- | Skip everything up to (and including) the end of the line and
-- report the row as unparseable.
badrow :: Parser (Maybe (Row ByteString))
badrow = P.takeWhile (not . C8.isEndOfLine) *>
         (csvEndOfLine <|> C8.endOfInput) *> return Nothing

-- | A well-formed row: quoted or plain fields separated by the
-- configured separator, terminated by end of line or input.
csvrow :: CSVSettings -> Parser (Maybe (Row ByteString))
csvrow c =
    let rowbody      = (quotedField' <|> field c) `sepBy` C8.char (csvSep c)
        properrow    = rowbody <* (csvEndOfLine <|> P.endOfInput)
        quotedField' = case csvQuoteChar c of
            Nothing -> mzero
            Just q' -> try (quotedField q')
    in Just <$> properrow

-- | An unquoted field: everything up to a separator, quote or newline.
field :: CSVSettings -> Parser ByteString
field s = P.takeWhile (isFieldChar s)

isFieldChar :: CSVSettings -> Word8 -> Bool
isFieldChar s = notInClass xs'
  where
    xs  = csvSep s : "\n\r"
    xs' = maybe xs (: xs) (csvQuoteChar s)

-- | A quoted field; a doubled quote char inside stands for a literal
-- quote.
quotedField :: Char -> Parser ByteString
quotedField c = do
    let escapedQuote = string dbl *> return c
        dbl          = B8.pack [c, c]
    _ <- C8.char c
    f <- many (C8.notChar c <|> escapedQuote)
    _ <- C8.char c
    return $ B8.pack f
--------------------------------------------------------------------------------
-- /src/Data/CSV/Conduit/Parser/Text.hs:
--------------------------------------------------------------------------------
{-|

This module exports the underlying Attoparsec row parser. This is helpful if
you want to do some ad-hoc CSV string parsing.

-}

module Data.CSV.Conduit.Parser.Text
    ( parseCSV
    , parseRow
    , row
    , csv
    ) where

-------------------------------------------------------------------------------
import           Control.Applicative
import           Control.Monad        (mzero)
import           Data.Attoparsec.Text as P hiding (take)
import qualified Data.Attoparsec.Text as T
import           Data.Text            (Text)
import qualified Data.Text            as T
-------------------------------------------------------------------------------
import           Data.CSV.Conduit.Types
-------------------------------------------------------------------------------


------------------------------------------------------------------------------
-- | Try to parse given string as CSV
parseCSV :: CSVSettings -> Text -> Either String [Row Text]
parseCSV s = parseOnly $ csv s


------------------------------------------------------------------------------
-- | Try to parse given string as 'Row Text'
parseRow :: CSVSettings -> Text -> Either String (Maybe (Row Text))
parseRow s = parseOnly $ row s


------------------------------------------------------------------------------
-- | Parse CSV: consume rows until end of input, dropping rows that
-- came back as 'Nothing' (unparseable lines).
csv :: CSVSettings -> Parser [Row Text]
csv s = do
    mrow <- row s
    let prepend = maybe id (:) mrow
    done <- atEnd
    if done
      then return (prepend [])
      else prepend <$> csv s


------------------------------------------------------------------------------
-- | Parse a single CSV row; yields 'Nothing' for a malformed line.
row :: CSVSettings -> Parser (Maybe (Row Text))
row csvs = csvrow csvs <|> badrow

-- | Accept LF, CRLF or a lone CR as the line terminator.
csvEndOfLine :: Parser ()
csvEndOfLine =
        (char '\n' >> return ())
    <|> (string (T.pack "\r\n") >> return ())
    <|> (char '\r' >> return ())

-- | Skip everything up to (and including) the end of the line and
-- report the row as unparseable.
badrow :: Parser (Maybe (Row Text))
badrow = P.takeWhile (not . T.isEndOfLine) *>
         (csvEndOfLine <|> T.endOfInput) *> return Nothing

-- | A well-formed row: quoted or plain fields separated by the
-- configured separator, terminated by end of line or input.
csvrow :: CSVSettings -> Parser (Maybe (Row Text))
csvrow c =
    let rowbody      = (quotedField' <|> field c) `sepBy` T.char (csvSep c)
        properrow    = rowbody <* (csvEndOfLine <|> P.endOfInput)
        quotedField' = case csvQuoteChar c of
            Nothing -> mzero
            Just q' -> try (quotedField q')
    in Just <$> properrow

-- | An unquoted field: everything up to a separator, quote or newline.
field :: CSVSettings -> Parser Text
field s = P.takeWhile (isFieldChar s)

isFieldChar :: CSVSettings -> Char -> Bool
isFieldChar s = notInClass xs'
  where
    xs  = csvSep s : "\n\r"
    xs' = maybe xs (: xs) (csvQuoteChar s)

-- | A quoted field; a doubled quote char inside stands for a literal
-- quote.
quotedField :: Char -> Parser Text
quotedField c = do
    let escapedQuote = string dbl *> return c
        dbl          = T.pack [c, c]
    _ <- T.char c
    f <- many (T.notChar c <|> escapedQuote)
    _ <- T.char c
    return $ T.pack f


--------------------------------------------------------------------------------
-- /src/Data/CSV/Conduit/Types.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE OverloadedStrings    #-}
{-# LANGUAGE TypeSynonymInstances #-}

module Data.CSV.Conduit.Types where

-------------------------------------------------------------------------------
import           Data.Default
import qualified Data.Map         as M
import qualified Data.Map.Ordered as MO
-------------------------------------------------------------------------------

-- | Whether an empty field should still be wrapped in quote
-- characters on output.
data QuoteEmpty = DoQuoteEmpty | DontQuoteEmpty deriving (Show, Eq)

-------------------------------------------------------------------------------
-- | Settings for a CSV file. This library is intended to be flexible
-- and offer a way to process the majority of text data files out
-- there.
data CSVSettings = CSVSettings
    { -- | Separator character to be used in between fields
      csvSep               :: !Char

      -- | Quote character that may sometimes be present around fields.
      -- If 'Nothing' is given, the library will never expect quotation
      -- even if it is present.
    , csvQuoteCharAndStyle :: !(Maybe (Char, QuoteEmpty))
    } deriving (Show, Eq)


-- | The quote character alone, without the empty-field policy.
csvQuoteChar :: CSVSettings -> Maybe Char
csvQuoteChar = (fst <$>) . csvQuoteCharAndStyle

-------------------------------------------------------------------------------
-- | Default settings for a CSV file.
--
-- > csvSep               = ','
-- > csvQuoteCharAndStyle = Just ('"', DoQuoteEmpty)
--
defCSVSettings :: CSVSettings
defCSVSettings = CSVSettings
    { csvSep               = ','
    , csvQuoteCharAndStyle = Just ('"', DoQuoteEmpty)
    }

-- | Like 'defCSVSettings', but empty fields are written without
-- quotes.
defDontQuoteEmptyCSVSettings :: CSVSettings
defDontQuoteEmptyCSVSettings = CSVSettings
    { csvSep               = ','
    , csvQuoteCharAndStyle = Just ('"', DontQuoteEmpty)
    }

instance Default CSVSettings where
    def = defCSVSettings

-------------------------------------------------------------------------------
-- | A 'Row' is just a list of fields
type Row a = [a]

-------------------------------------------------------------------------------
-- | A 'MapRow' is a dictionary based on 'Data.Map' where column names
-- are keys and row's individual cell values are the values of the
-- 'Map'.
type MapRow a = M.Map a a

-- | An 'OrderedMapRow' is a dictionary based on 'Data.Map.Ordered' where column
-- names are keys and row's individual cell values are the values of the 'OMap'.
-- Unlike 'MapRow', 'OrderedMapRow' preserves the insertion ordering of columns.
-- 'OrderedMapRow' is a reasonable default in most cases.
type OrderedMapRow a = MO.OMap a a
--------------------------------------------------------------------------------
/stack.yaml:
--------------------------------------------------------------------------------
resolver: lts-22.34
packages:
  - .
flags:
  csv-conduit:
    lib-Werror: true
extra-package-dbs: []
--------------------------------------------------------------------------------
/test/AdHoc.hs:
--------------------------------------------------------------------------------
module Main where


import qualified Data.ByteString.Char8 as B
import           Data.Conduit
import           Data.Conduit.Binary
import           Data.CSV.Conduit
import           Data.Map              ((!))
import           System.Directory
import           System.Environment


-------------------------------------------------------------------------------
-- Ad-hoc driver: stream CSV file (first arg) to a new file (second
-- arg) through the identity row transform.
main = do
    fi : fo : _ <- getArgs
    runResourceT $ mapCSVFile defCSVSettings f fi fo
    -- runResourceT $ sourceFile fi $$ sinkFile fo


f :: Row B.ByteString -> [Row B.ByteString]
f = return
--------------------------------------------------------------------------------
-- /test/Test.hs:
--------------------------------------------------------------------------------
{-# LANGUAGE MultiParamTypeClasses #-}
{-# LANGUAGE OverloadedStrings     #-}
{-# LANGUAGE ScopedTypeVariables   #-}

module Main where

import qualified Conduit                        as C
import           Control.Exception
import qualified Data.ByteString.Char8          as B
import           Data.CSV.Conduit
import           Data.CSV.Conduit.Conversion
import           Data.CSV.Conduit.Types
import qualified Data.Map                       as Map
import qualified Data.Map.Ordered               as OMap
import           Data.Monoid                    as M
import           Data.Text                      (Text)
import qualified Data.Vector                    as V
import           System.Directory
import           Test.Framework                 (Test, defaultMain, testGroup)
import           Test.Framework.Providers.HUnit
import           Test.HUnit                     (assertFailure, (@=?),
(@?=)) 22 | 23 | main :: IO () 24 | main = defaultMain tests 25 | 26 | tests :: [Test] 27 | tests = 28 | [ testGroup "Basic Ops" baseTests, 29 | testGroup "decodeCSV" decodeCSVTests 30 | ] 31 | 32 | baseTests :: [Test] 33 | baseTests = 34 | [ testCase "mapping with id works" test_identityMap, 35 | testCase "simple parsing works" (test_simpleParse testFile1), 36 | testCase "simple parsing works for Mac-Excel" (test_simpleParse testFile3), 37 | testCase "simple parsing works for Windows-Excel" (test_simpleParse testFile4), 38 | testCase "fails parsing gracefully" test_parseFail, 39 | testCase "OrderedMap" test_orderedMap 40 | ] 41 | 42 | decodeCSVTests :: [Test] 43 | decodeCSVTests = 44 | [ testCase "parses a CSV" $ do 45 | let efoos = decodeCSV defCSVSettings ("Foo\nfoo" :: B.ByteString) 46 | case efoos :: Either SomeException (V.Vector (Named Foo)) of 47 | Left e -> assertFailure (show e) 48 | Right foos -> V.fromList [Named Foo] @=? foos, 49 | testCase "eats parse errors, evidently" $ do 50 | let efoos = decodeCSV defCSVSettings ("Foo\nbad" :: B.ByteString) 51 | case efoos :: Either SomeException (V.Vector (Named Foo)) of 52 | Left e -> assertFailure (show e) 53 | Right foos -> M.mempty @=? foos 54 | ] 55 | 56 | data Foo = Foo deriving (Show, Eq) 57 | 58 | instance FromNamedRecord Foo where 59 | parseNamedRecord nr = do 60 | s <- nr .: "Foo" 61 | case s of 62 | "foo" -> pure Foo 63 | _ -> fail ("Expected \"foo\" but got " <> B.unpack s) 64 | 65 | instance ToNamedRecord Foo where 66 | toNamedRecord Foo = namedRecord ["Foo" .= ("foo" :: B.ByteString)] 67 | 68 | test_identityMap :: IO () 69 | test_identityMap = do 70 | _ <- runResourceT $ mapCSVFile csvSettings f testFile2 outFile 71 | f1 <- readFile testFile2 72 | f2 <- readFile outFile 73 | f1 @=? 
f2 74 | removeFile outFile 75 | where 76 | outFile = "test/testOut.csv" 77 | f :: Row Text -> [Row Text] 78 | f = return 79 | 80 | test_simpleParse :: FilePath -> IO () 81 | test_simpleParse fp = do 82 | (d :: V.Vector (MapRow B.ByteString)) <- readCSVFile csvSettings fp 83 | V.mapM_ assertRow d 84 | where 85 | assertRow r = v3 @=? (v1 + v2) 86 | where 87 | v1 = readBS $ r Map.! "Col2" 88 | v2 = readBS $ r Map.! "Col3" 89 | v3 = readBS $ r Map.! "Sum" 90 | 91 | test_parseFail :: IO () 92 | test_parseFail = do 93 | (d :: V.Vector (MapRow B.ByteString)) <- readCSVFile csvSettings testXLS 94 | errored <- catch (V.mapM_ assertRow d >> pure False) handler 95 | if errored then pure () else assertFailure "readCSVFile shouldn't read XLS" 96 | where 97 | handler :: ErrorCall -> IO Bool 98 | handler _ = pure True 99 | assertRow r = v3 @=? (v1 + v2) 100 | where 101 | v1 = readBS $ r Map.! "Col2" 102 | v2 = readBS $ r Map.! "Col3" 103 | v3 = readBS $ r Map.! "Sum" 104 | 105 | test_orderedMap :: IO () 106 | test_orderedMap = do 107 | unorderedRes <- 108 | C.runConduit $ 109 | C.yieldMany [unorderedRow] 110 | C..| writeHeaders defCSVSettings 111 | C..| C.foldC 112 | unorderedRes @?= ("\"a\",\"b\"\n\"aval\",\"bval\"\n" :: B.ByteString) 113 | orderedRes <- 114 | C.runConduit $ 115 | C.yieldMany [orderedRow] 116 | C..| writeHeadersOrdered defCSVSettings 117 | C..| C.foldC 118 | orderedRes @?= ("\"b\",\"a\"\n\"bval\",\"aval\"\n" :: B.ByteString) 119 | where 120 | orderedRow :: OrderedMapRow Text 121 | orderedRow = OMap.fromList pairs 122 | unorderedRow :: MapRow Text 123 | unorderedRow = Map.fromList pairs 124 | pairs = [("b", "bval"), ("a", "aval")] 125 | 126 | csvSettings :: CSVSettings 127 | csvSettings = defCSVSettings {csvQuoteCharAndStyle = Just ('`', DontQuoteEmpty)} 128 | 129 | testFile1, testFile2, testFile3, testFile4 :: FilePath 130 | testFile1 = "test/test.csv" 131 | testFile2 = "test/test.csv" 132 | testFile3 = "test/test-mac-excel.csv" 133 | testFile4 = 
"test/test-windows-excel.csv" 134 | 135 | testXLS :: FilePath 136 | testXLS = "test/test.xls" 137 | 138 | readBS :: B.ByteString -> Int 139 | readBS = read . B.unpack 140 | -------------------------------------------------------------------------------- /test/test-mac-excel.csv: -------------------------------------------------------------------------------- 1 | `Col1`,`Col2`,`Col3`,`Sum` `A`,`2`,`3`,`5` `B`,`3`,`4`,`7` `Field using the quote char ``this is the in-quoted value```,`4`,`5`,`9` -------------------------------------------------------------------------------- /test/test-windows-excel.csv: -------------------------------------------------------------------------------- 1 | Col1,Col2,Col3,Sum 2 | A,2,3,5 3 | B,3,4,7 4 | "Field using the quote char ""this is the in-quoted value""",4,5,9 5 | -------------------------------------------------------------------------------- /test/test.csv: -------------------------------------------------------------------------------- 1 | `Col1`,`Col2`,`Col3`,`Sum` 2 | `A`,`2`,`3`,`5` 3 | `B`,`3`,`4`,`7` 4 | `Field using the quote char ``this is the in-quoted value```,`4`,`5`,`9` 5 | -------------------------------------------------------------------------------- /test/test.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ozataman/csv-conduit/5656585611b06f8a97cea9570fb35dba910e9b48/test/test.xls --------------------------------------------------------------------------------