├── .github └── workflows │ ├── build.yml │ └── docs.yml ├── .gitignore ├── LICENSE ├── README.md ├── examples ├── eng.freq.csv ├── eng.nim └── eng.spell.csv ├── src └── tabby.nim ├── tabby.nimble └── tests ├── bench.nim ├── config.nims ├── eng.freq.csv ├── eng.spell.csv └── test.nim /.github/workflows/build.yml: -------------------------------------------------------------------------------- 1 | name: Github Actions 2 | on: [push, pull_request] 3 | jobs: 4 | build: 5 | strategy: 6 | fail-fast: false 7 | matrix: 8 | os: [ubuntu-latest, windows-latest] 9 | 10 | runs-on: ${{ matrix.os }} 11 | 12 | steps: 13 | - uses: actions/checkout@v3 14 | - uses: jiro4989/setup-nim-action@v1 15 | with: 16 | repo-token: ${{ secrets.GITHUB_TOKEN }} 17 | - run: nimble test -y 18 | - run: nimble test --gc:orc -y 19 | -------------------------------------------------------------------------------- /.github/workflows/docs.yml: -------------------------------------------------------------------------------- 1 | name: docs 2 | on: 3 | push: 4 | branches: 5 | - master 6 | env: 7 | nim-version: 'stable' 8 | nim-src: src/${{ github.event.repository.name }}.nim 9 | deploy-dir: .gh-pages 10 | jobs: 11 | docs: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v3 15 | - uses: jiro4989/setup-nim-action@v1 16 | with: 17 | nim-version: ${{ env.nim-version }} 18 | - run: nimble install -Y 19 | - run: nimble doc --index:on --project --git.url:https://github.com/${{ github.repository }} --git.commit:master --out:${{ env.deploy-dir }} ${{ env.nim-src }} 20 | - name: "Copy to index.html" 21 | run: cp ${{ env.deploy-dir }}/${{ github.event.repository.name }}.html ${{ env.deploy-dir }}/index.html 22 | - name: Deploy documents 23 | uses: peaceiris/actions-gh-pages@v3 24 | with: 25 | github_token: ${{ secrets.GITHUB_TOKEN }} 26 | publish_dir: ${{ env.deploy-dir }} 27 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # ignore files with no extention: 2 | * 3 | !*/ 4 | !*.* 5 | 6 | # normal ignores: 7 | *.exe 8 | nimcache 9 | *.pdb 10 | *.ilk 11 | .* 12 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2021 Andre von Houck 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 6 | 7 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Tabby - Fast CSV parser with hooks. 2 | 3 | `nimble install tabby` 4 | 5 | ![Github Actions](https://github.com/treeform/tabby/workflows/Github%20Actions/badge.svg) 6 | 7 | [API reference](https://treeform.github.io/tabby) 8 | 9 | This library has no dependencies other than the Nim standard library. 
10 | 11 | ## About 12 | 13 | This library parses `.csv` or `.tsv` files directly into Nim objects. This is different from how Nim's standard library [parsecsv](https://nim-lang.org/docs/parsecsv.html) works which first parses them into an intermediate representation. This makes `tabby` generate fewer memory allocations. 14 | 15 | Tabby also has a simpler API and is easier to use with just two calls `fromCsv`/`toCsv`: 16 | ```nim 17 | let rows = strData.fromCsv(seq[RowObj]) 18 | ``` 19 | and back: 20 | ```nim 21 | echo rows.toCsv() 22 | ``` 23 | 24 | Tabby also supports arbitrary delimiters. Not only standard tab `\t` and `,` with linux `\n` and windows `\r\n` line endings, but any delimiter can be used. It's trivial to convert your data to and from any tabular format: 25 | ```nim 26 | strData.fromCsv(seq[RowObj], separator = ":", lineEnd = ";") 27 | ``` 28 | 29 | Tabby can also guess delimiters with `fromCsvGuess()` function. 30 | 31 | This library is similar to my other [jsony](https://github.com/treeform/jsony) project that is for `json`, if you like `jsony` you should like `tabby`. 
32 | 33 | 34 | ## How to use 35 | 36 | You need to have CSV strData: 37 | ``` 38 | word,count 39 | the,23135851162 40 | of,13151942776 41 | and,12997637966 42 | ``` 43 | And a regular Nim object to parse into: 44 | ```nim 45 | type FreqRow = object 46 | word: string 47 | count: int 48 | ``` 49 | 50 | Then simply read in the data: 51 | 52 | ```nim 53 | var rows = strData.fromCsv(seq[FreqRow]) 54 | ``` 55 | 56 | Compare this single line with confusing but equivalent `std/parsecsv` code: 57 | 58 | ```nim 59 | var 60 | rows: seq[FreqRow] 61 | strm = newStringStream(content) 62 | p: CsvParser 63 | p.open(strm, "tmp.csv") 64 | p.readHeaderRow() 65 | while p.readRow(): 66 | rows.add FreqRow(word: p.row[0], count: parseInt(p.row[1])) 67 | p.close() 68 | ``` 69 | 70 | ## Speed 71 | 72 | Even though tabby does not allocate intermediate objects, it does use header and field re-ordering, and parse and dump hooks, which makes its speed close to `std/parsecsv` but with a much simpler API. 73 | 74 | ``` 75 | name ............................... min time avg time std dv runs 76 | tabby ............................ 134.951 ms 141.897 ms ±7.301 x100 77 | parsecsv ......................... 129.861 ms 136.086 ms ±7.026 x100 78 | ``` 79 | 80 | ## Parse Hooks 81 | 82 | Sometimes the data you get from a csv file is not quite in the format you are looking for. You can define your own `parseHook` function to parse any data. 
83 | 84 | ```nim 85 | let csvData = """ 86 | country,budget 87 | US,$2000 88 | GB,$1000 89 | DE,$1000 90 | """ 91 | 92 | type 93 | Money = uint64 # in cents 94 | 95 | CountryMoney = object 96 | country: string 97 | budget: Money 98 | 99 | proc parseHook(p: ParseContext, name: string, v: var Money) = 100 | inc p.i # skip the $ 101 | var num: int 102 | p.parseHook(name, num) 103 | v = num.uint64 * 100 # in cents 104 | 105 | var rows = csvData.fromCsv(seq[CountryMoney]) 106 | ``` 107 | 108 | ## Dump Hooks 109 | 110 | Just like with parse hooks, sometimes the format you want to write to a csv file is not quite the format your objects are in. You can define your own `dumpHook` function to output your data in any format. 111 | 112 | ```nim 113 | proc dumpHook(d: DumpContext, v: Money) = 114 | # write the $ prefix back out 115 | d.data.add "$" 116 | d.data.add $(v div 100) 117 | 118 | echo rows.toCsv() 119 | ``` 120 | -------------------------------------------------------------------------------- /examples/eng.nim: -------------------------------------------------------------------------------- 1 | import critbits, tabby, print, strutils, algorithm, tables 2 | 3 | var spelling: CritBitTree[string] 4 | 5 | type SpellRow = object 6 | filenameId: int 7 | offsetSpan: string 8 | misspelling: string 9 | kind: string 10 | correction: string 11 | var spellRows = tabby.fromCsv($readFile("eng.spell.csv"), seq[SpellRow]) 12 | for s in spellRows: 13 | spelling[s.misspelling] = s.correction 14 | 15 | var scores: CritBitTree[int] 16 | 17 | type FreqRow = object 18 | word: string 19 | count: int 20 | var freqRows = tabby.fromCsv($readFile("eng.freq.csv"), seq[FreqRow]) 21 | for s in freqRows: 22 | scores[s.word] = s.count 23 | 24 | proc suggestions(prefix: string): seq[string] = 25 | var best: CountTable[string] 26 | for key, score in scores.pairsWithPrefix(prefix): 27 | best[key] = score 28 | 29 | for bad, good in spelling.pairsWithPrefix(prefix): 30 | best[good] = scores[good] 31 | 32 | best.sort() 33 | 34 | var i = 0 35 | 
for k, b in best: 36 | if i >= 10: 37 | break 38 | result.add(k) 39 | inc i 40 | 41 | echo suggestions("carr") 42 | echo suggestions("managery") 43 | echo suggestions("dja") 44 | echo suggestions("yi") 45 | -------------------------------------------------------------------------------- /src/tabby.nim: -------------------------------------------------------------------------------- 1 | # Fast direct CSV/TSV parser for Nim. 2 | import std/[strutils, parseutils, strformat] 3 | 4 | type TabbyError* = object of ValueError 5 | 6 | type ParseContext* = ref object 7 | ## Context for parsing CSV/TSV. 8 | i*: int 9 | header*: seq[string] 10 | data*: string 11 | lineEnd*: string 12 | separator*: string 13 | 14 | template fieldPairs3*(x: untyped): untyped = 15 | ## Helper to iterate over fields of ref object. 16 | when compiles(x[]): 17 | x[].fieldPairs 18 | else: 19 | x.fieldPairs 20 | 21 | proc hasStringAt(s: string, at: int, other: string): bool = 22 | ## Tests if the string has another string at a specific position. 23 | ## NOTE(review): no bounds check - assumes at + other.len <= s.len; verify callers. 24 | for i, c in other: 25 | if s[at + i] != c: 26 | return false 27 | return true 28 | 29 | proc error(p: ParseContext, msg: string) = 30 | ## Raises a TabbyError, echoing the offending line and a caret marker, with line/column computed from p.i. 31 | block: 32 | var 33 | at = 0 34 | atLine = 0 35 | line = 1 36 | column = 1 37 | while at < p.i: 38 | if p.data.hasStringAt(at, p.lineEnd): 39 | inc line 40 | atLine = at + 1 41 | column = 1 42 | inc column 43 | inc at 44 | var endLine = at 45 | while endLine < p.data.len: 46 | if p.data.hasStringAt(endLine, p.lineEnd): 47 | dec endLine 48 | break 49 | inc endLine 50 | echo p.data[atLine .. endLine].replace('\t', ' ') 51 | echo " ".repeat(column-2) & "^" 52 | raise newException(TabbyError, msg & " At line: " & $line & " column: " & $column) 53 | 54 | proc isNext(p: ParseContext, str: string): bool = 55 | ## Tests if str comes next at the current parse position. 
55 | for i, c in str: 56 | if p.i + i >= p.data.len: 57 | return false 58 | if p.data[p.i + i] != c: 59 | return false 60 | return true 61 | 62 | proc skipLine(p: ParseContext) = 63 | ## Skips this line. 64 | let at = p.data.find(p.lineEnd, p.i) 65 | if at != -1: 66 | p.i = at + p.lineEnd.len 67 | 68 | proc skipSpaces(p: ParseContext) = 69 | ## Skips spaces to next token. 70 | while p.i < p.data.len and p.data[p.i] == ' ': 71 | inc p.i 72 | 73 | proc skipSep(p: ParseContext) = 74 | ## Skips current separator. 75 | if p.i < p.data.len and p.isNext(p.lineEnd): 76 | return 77 | elif p.i < p.data.len and p.isNext(p.separator): 78 | p.i += p.separator.len 79 | else: 80 | if p.i >= p.data.len: 81 | p.error(&"Failed to parse, end of data reached.") 82 | else: 83 | p.error(&"Failed to parse, separator expected, got: {p.data[p.i]}.") 84 | 85 | proc parseHook*(p: ParseContext, name: string, v: var string) = 86 | ## Parse hook for string. 87 | let start = p.i 88 | if p.data[p.i] in {'"', '\''}: 89 | # "quoted string" 90 | let quote = p.data[p.i] 91 | inc p.i 92 | while p.i < p.data.len: 93 | if p.data[p.i] == quote: 94 | # handle escaped double quote "" or '' 95 | if p.i + 1 < p.data.len and p.data[p.i + 1] == quote: 96 | inc p.i 97 | v.add quote 98 | inc p.i 99 | else: 100 | break 101 | elif p.data[p.i] == '\\': 102 | # handle escape quote 103 | inc p.i 104 | let c = p.data[p.i] 105 | case c: 106 | of '"', '\\', '/': v.add(c) 107 | of 'b': v.add '\b' 108 | of 'f': v.add '\f' 109 | of 'n': v.add '\n' 110 | of 'r': v.add '\r' 111 | of 't': v.add '\t' 112 | else: v.add c 113 | inc p.i 114 | else: 115 | v.add p.data[p.i] 116 | inc p.i 117 | inc p.i 118 | else: 119 | # plain string 120 | while p.i < p.data.len and not (p.isNext(p.separator) or p.isNext(p.lineEnd)): 121 | inc p.i 122 | v = p.data[start ..< p.i].strip() 123 | 124 | proc parseHook*(p: ParseContext, name: string, v: var SomeInteger) = 125 | ## Parse hook for integer number. 
126 | var num: int 127 | let chars = parseutils.parseInt(p.data, num, p.i) 128 | if chars == 0: 129 | p.error(&"Failed to parse a \"{name}\" as integer.") 130 | p.i += chars 131 | v = num 132 | 133 | proc parseHook*(p: ParseContext, name: string, v: var SomeFloat) = 134 | ## Parse hook for floating point number. 135 | var num: float 136 | let chars = parseutils.parseFloat(p.data, num, p.i) 137 | if chars == 0: 138 | p.error(&"Failed to parse a \"{name}\" as float.") 139 | p.i += chars 140 | v = num 141 | 142 | proc parseHook*(p: ParseContext, name: string, v: var bool) = 143 | ## Parse hook for boolean (case-insensitive "true"; anything else is false). 144 | var str: string 145 | p.parseHook(name, str) 146 | v = str.toLowerAscii() == "true" 147 | 148 | proc parseHook*[T: enum](p: ParseContext, name: string, v: var T) = 149 | ## Parse hook for enum values (by name, via parseEnum). 150 | var str: string 151 | p.parseHook(name, str) 152 | v = parseEnum[T](str) 153 | 154 | proc fromCsvFast*[T]( 155 | data: string, 156 | objType: type[seq[T]], 157 | hasHeader = true, 158 | lineEnd = "\n", 159 | separator = "," 160 | ): seq[T] = 161 | ## Read data seq as a CSV, without field re-ordering. 162 | ## Objects schema must match CSV schema exactly, in order. 163 | ## * hasHeader - if true, the first line is treated as a 164 | ## header row and skipped. 165 | var p = ParseContext() 166 | p.data = data 167 | p.lineEnd = lineEnd 168 | p.separator = separator 169 | 170 | if hasHeader: 171 | p.skipLine() 172 | 173 | while p.i < p.data.len: 174 | var currentRow = T() 175 | for name, field in currentRow.fieldPairs3: 176 | parseHook(p, name, field) 177 | if p.i == p.data.len: 178 | result.add(currentRow) 179 | return 180 | p.skipSep() 181 | result.add(currentRow) 182 | p.skipLine() 183 | 184 | proc fromCsv*[T]( 185 | data: string, 186 | objType: type[seq[T]], 187 | header = newSeq[string](), 188 | hasHeader = true, 189 | lineEnd = "\n", 190 | separator = "," 191 | ): seq[T] = 192 | ## Read data seq as a CSV. 
193 | ## * header - use this header to parse 194 | ## * hasHeader - does the current data have a header, 195 | ## will be skipped if header is set. 196 | 197 | var p = ParseContext() 198 | p.data = data 199 | p.header = header 200 | var userHeader = header.len != 0 201 | p.lineEnd = lineEnd 202 | p.separator = separator 203 | 204 | if hasHeader: 205 | while not p.isNext(p.lineEnd): 206 | var name: string 207 | p.skipSpaces() 208 | p.parseHook("header", name) 209 | p.skipSpaces() 210 | if not userHeader: 211 | p.header.add(name.strip()) 212 | p.skipSep() 213 | p.skipLine() 214 | else: 215 | if not userHeader: 216 | for name, field in T().fieldPairs3: 217 | p.header.add(name) 218 | 219 | doAssert p.header.len != 0 220 | 221 | while p.i < p.data.len: 222 | var currentRow = T() 223 | for headerName in p.header: 224 | for name, field in currentRow.fieldPairs3: 225 | if headerName == name: 226 | p.skipSpaces() 227 | if not p.isNext(p.separator): 228 | p.parseHook(name, field) 229 | p.skipSpaces() 230 | if p.i == p.data.len: 231 | result.add(currentRow) 232 | return 233 | p.skipSep() 234 | break 235 | 236 | result.add(currentRow) 237 | p.skipLine() 238 | 239 | proc fromCsvGuess*[T]( 240 | data: string, 241 | objType: type[seq[T]], 242 | header = newSeq[string](), 243 | hasHeader = true, 244 | ): seq[T] = 245 | ## Read data seq as a CSV. 246 | ## Tries to guess what separators or lineEnds are used. 
247 | 248 | var separator = "," 249 | if data.count("\t") > data.count(","): 250 | separator = "\t" 251 | var lineEnd = "\n" 252 | if data.count("\r\n") > data.count("\n") div 2: 253 | lineEnd = "\r\n" 254 | 255 | return data.fromCsv(objType, header, hasHeader, lineEnd, separator) 256 | 257 | type DumpContext* = ref object 258 | header*: seq[string] 259 | data*: string 260 | lineEnd*: string 261 | separator*: string 262 | quote*: char 263 | 264 | proc dumpHook*(d: DumpContext, v: string) = 265 | var needsQuote = false 266 | for c in v: 267 | if c in {' ', '\t', '\n', '\r', '\\', ',', '\'', '"'}: 268 | needsQuote = true 269 | break 270 | if needsQuote: 271 | d.data.add d.quote 272 | for c in v: 273 | case c: 274 | of '\\': d.data.add r"\\" 275 | of '\b': d.data.add r"\b" 276 | of '\f': d.data.add r"\f" 277 | of '\n': d.data.add r"\n" 278 | of '\r': d.data.add r"\r" 279 | of '\t': d.data.add r"\t" 280 | of '"': d.data.add r"\""" 281 | of '\'': d.data.add r"\'" 282 | else: 283 | d.data.add c 284 | d.data.add d.quote 285 | else: 286 | d.data.add v 287 | 288 | proc dumpHook*[T](d: DumpContext, v: T) = 289 | d.data.add $v 290 | 291 | proc toCsv*[T]( 292 | data: seq[T], 293 | header = newSeq[string](), 294 | hasHeader = true, 295 | lineEnd = "\n", 296 | separator = ",", 297 | quote = '"' 298 | ): string = 299 | ## Writes out data seq as a CSV. 300 | ## * header - use this header to write fields in specific order. 301 | ## * hasHeader - should header row be written. 
302 | var d = DumpContext() 303 | d.header = header 304 | d.lineEnd = lineEnd 305 | d.separator = separator 306 | d.quote = quote 307 | 308 | if d.header.len == 0: 309 | for name, field in T().fieldPairs3: 310 | d.header.add(name) 311 | 312 | if hasHeader: 313 | for name in d.header: 314 | d.dumpHook(name) 315 | d.data.add d.separator 316 | d.data.removeSuffix(d.separator) 317 | d.data.add(d.lineEnd) 318 | 319 | doAssert d.header.len != 0 320 | 321 | for row in data: 322 | for headerName in d.header: 323 | var found = false 324 | for name, field in row.fieldPairs3: 325 | if headerName == name: 326 | d.dumpHook(field) 327 | d.data.add d.separator 328 | found = true 329 | break 330 | if not found: 331 | d.data.add d.separator 332 | 333 | d.data.removeSuffix(d.separator) 334 | d.data.add(d.lineEnd) 335 | 336 | return d.data 337 | -------------------------------------------------------------------------------- /tabby.nimble: -------------------------------------------------------------------------------- 1 | version = "0.6.0" 2 | author = "Andre von Houck" 3 | description = "Direct to object CSV/TSV/tabulated data parser with hooks." 
4 | license = "MIT" 5 | 6 | srcDir = "src" 7 | 8 | requires "nim >= 1.6.0" 9 | -------------------------------------------------------------------------------- /tests/bench.nim: -------------------------------------------------------------------------------- 1 | import tabby, benchy, parsecsv, streams, strutils 2 | 3 | # let content = readFile("tests/eng.spell.csv") 4 | 5 | # type SpellRow = object 6 | # filenameId: int 7 | # offsetSpan: string 8 | # misspelling: string 9 | # kind: string 10 | # correction: string 11 | 12 | # var testRows = fromCsv(content, seq[SpellRow]) 13 | 14 | # timeIt "tabby", 100: 15 | # var rows = fromCsv(content, seq[SpellRow]) 16 | # keep(rows) 17 | # doAssert testRows == rows 18 | 19 | # timeIt "parsecsv", 100: 20 | # var 21 | # rows: seq[SpellRow] 22 | # strm = newStringStream(content) 23 | # p: CsvParser 24 | # p.open(strm, "tmp.csv") 25 | # p.readHeaderRow() 26 | # while p.readRow(): 27 | # rows.add SpellRow( 28 | # filenameId: parseInt(p.row[0]), 29 | # offsetSpan: p.row[1], 30 | # misspelling: p.row[2], 31 | # kind: p.row[3], 32 | # correction: p.row[4] 33 | # ) 34 | # p.close() 35 | # keep(rows) 36 | # doAssert testRows == rows 37 | 38 | block: 39 | let content = readFile("tests/eng.freq.csv") 40 | 41 | type FreqRow = object 42 | word: string 43 | count: int 44 | 45 | var testRows = fromCsvFast(content, seq[FreqRow]) 46 | 47 | timeIt "tabby", 100: 48 | var rows = fromCsvFast(content, seq[FreqRow]) 49 | keep(rows) 50 | doAssert testRows == rows 51 | 52 | timeIt "parsecsv", 100: 53 | var 54 | rows: seq[FreqRow] 55 | strm = newStringStream(content) 56 | p: CsvParser 57 | p.open(strm, "tmp.csv") 58 | p.readHeaderRow() 59 | while p.readRow(): 60 | rows.add FreqRow( 61 | word: p.row[0], 62 | count: parseInt(p.row[1]) 63 | ) 64 | p.close() 65 | keep(rows) 66 | doAssert testRows == rows 67 | -------------------------------------------------------------------------------- /tests/config.nims: 
-------------------------------------------------------------------------------- 1 | --path:"../src" 2 | -------------------------------------------------------------------------------- /tests/test.nim: -------------------------------------------------------------------------------- 1 | ## Put your tests here. 2 | 3 | import tabby, strutils 4 | 5 | block: 6 | # Most basic parse. 7 | let csvData = """ 8 | word,count 9 | the,23135851162 10 | of,13151942776 11 | and,12997637966 12 | """ 13 | 14 | type FreqRow = object 15 | word: string 16 | count: int 17 | 18 | var rows = tabby.fromCsv(csvData, seq[FreqRow]) 19 | 20 | doAssert rows.len == 3 21 | doAssert rows[0].word == "the" 22 | doAssert rows[0].count == 23135851162 23 | doAssert rows[1].word == "of" 24 | doAssert rows[1].count == 13151942776 25 | doAssert rows[2].word == "and" 26 | doAssert rows[2].count == 12997637966 27 | 28 | doAssert rows.toCsv() == csvData 29 | 30 | block: 31 | # Object field layout does not match header layout. 32 | let csvData = """ 33 | word,count 34 | the,23135851162 35 | of,13151942776 36 | and,12997637966 37 | """ 38 | 39 | type FreqRow = object 40 | count: int 41 | extra: bool 42 | word: string 43 | 44 | var rows = tabby.fromCsv(csvData, seq[FreqRow]) 45 | doAssert rows.len == 3 46 | doAssert rows[0].word == "the" 47 | doAssert rows[0].count == 23135851162 48 | doAssert rows[1].word == "of" 49 | doAssert rows[1].count == 13151942776 50 | doAssert rows[2].word == "and" 51 | doAssert rows[2].count == 12997637966 52 | 53 | doAssert rows.toCsv(header = @["word", "count"]) == csvData 54 | 55 | 56 | block: 57 | # No header given, figure it out from object layout. 
58 | let csvData = """ 59 | the,23135851162 60 | of,13151942776 61 | and,12997637966 62 | """ 63 | 64 | type FreqRow = object 65 | word: string 66 | count: int 67 | 68 | var rows = tabby.fromCsv(csvData, seq[FreqRow], hasHeader = false) 69 | doAssert rows.len == 3 70 | doAssert rows[0].word == "the" 71 | doAssert rows[0].count == 23135851162 72 | doAssert rows[1].word == "of" 73 | doAssert rows[1].count == 13151942776 74 | doAssert rows[2].word == "and" 75 | doAssert rows[2].count == 12997637966 76 | 77 | doAssert rows.toCsv(hasHeader = false) == csvData 78 | 79 | block: 80 | # Read header but use your own. 81 | let csvData = """ 82 | w_o_r_d,c_o_u_n_t 83 | the,23135851162 84 | of,13151942776 85 | and,12997637966 86 | """ 87 | 88 | type FreqRow = object 89 | count: int 90 | extra: bool 91 | word: string 92 | 93 | var rows = tabby.fromCsv(csvData, seq[FreqRow], header = @["word", "count"]) 94 | 95 | doAssert rows.len == 3 96 | doAssert rows[0].word == "the" 97 | doAssert rows[0].count == 23135851162 98 | doAssert rows[1].word == "of" 99 | doAssert rows[1].count == 13151942776 100 | doAssert rows[2].word == "and" 101 | doAssert rows[2].count == 12997637966 102 | 103 | doAssert "w_o_r_d,c_o_u_n_t\n" & rows.toCsv( 104 | header = @["word", "count"], hasHeader = false) == csvData 105 | 106 | 107 | block: 108 | # Use tab instead of comma. 
109 | let csvData = """ 110 | wordcount 111 | the23135851162 112 | of13151942776 113 | and12997637966 114 | """.replace("", "\t") 115 | 116 | type FreqRow = object 117 | word: string 118 | count: int 119 | 120 | var rows = tabby.fromCsv( 121 | csvData, seq[FreqRow], hasHeader = true, separator = "\t" 122 | ) 123 | doAssert rows.len == 3 124 | doAssert rows[0].word == "the" 125 | doAssert rows[0].count == 23135851162 126 | doAssert rows[1].word == "of" 127 | doAssert rows[1].count == 13151942776 128 | doAssert rows[2].word == "and" 129 | doAssert rows[2].count == 12997637966 130 | 131 | doAssert rows.toCsv(separator = "\t") == csvData 132 | 133 | block: 134 | # Parse "quoted" strings. 135 | let csvData = """ 136 | "the apple",1 137 | "of,time",2 138 | "and\nthat",3 139 | "\"bye\"",4 140 | """ 141 | 142 | type TextRow = object 143 | text: string 144 | count: int 145 | 146 | var rows = tabby.fromCsv(csvData, seq[TextRow], hasHeader = false) 147 | doAssert rows.len == 4 148 | doAssert rows[0].text == "the apple" 149 | doAssert rows[0].count == 1 150 | doAssert rows[1].text == "of,time" 151 | doAssert rows[1].count == 2 152 | doAssert rows[2].text == "and\nthat" 153 | doAssert rows[2].count == 3 154 | doAssert rows[3].text == "\"bye\"" 155 | doAssert rows[3].count == 4 156 | 157 | doAssert rows.toCsv(hasHeader = false) == csvData 158 | 159 | block: 160 | # Parse 'quoted' strings. 
161 | let csvData = """ 162 | 'the apple',1 163 | 'of,time',2 164 | 'and\nthat',3 165 | '\"bye\"',4 166 | """ 167 | 168 | type TextRow = object 169 | text: string 170 | count: int 171 | 172 | var rows = tabby.fromCsv(csvData, seq[TextRow], hasHeader = false) 173 | doAssert rows.len == 4 174 | doAssert rows[0].text == "the apple" 175 | doAssert rows[0].count == 1 176 | doAssert rows[1].text == "of,time" 177 | doAssert rows[1].count == 2 178 | doAssert rows[2].text == "and\nthat" 179 | doAssert rows[2].count == 3 180 | doAssert rows[3].text == "\"bye\"" 181 | doAssert rows[3].count == 4 182 | 183 | #doAssert rows.toCsv(hasHeader = false, quote='\'') == csvData 184 | 185 | block: 186 | # Parse windows line endings. 187 | let csvData = "word\tcount\r\nthe\t23135851162\r\n" 188 | 189 | type FreqRow = object 190 | word: string 191 | count: int 192 | 193 | var rows = tabby.fromCsv( 194 | csvData, seq[FreqRow], hasHeader = true, separator = "\t", lineEnd = "\r\n" 195 | ) 196 | doAssert rows.len == 1 197 | doAssert rows[0].word == "the" 198 | doAssert rows[0].count == 23135851162 199 | 200 | doAssert rows.toCsv(separator = "\t", lineEnd = "\r\n") == csvData 201 | 202 | 203 | block: 204 | # Parse crazy separator and crazy line endings. 205 | let csvData = "word:~:count-->the:~:23135851162-->" 206 | 207 | type FreqRow = object 208 | word: string 209 | count: int 210 | 211 | var rows = tabby.fromCsv( 212 | csvData, seq[FreqRow], hasHeader = true, separator = ":~:", lineEnd = "-->" 213 | ) 214 | doAssert rows.len == 1 215 | doAssert rows[0].word == "the" 216 | doAssert rows[0].count == 23135851162 217 | 218 | doAssert rows.toCsv(separator = ":~:", lineEnd = "-->") == csvData 219 | 220 | block: 221 | # Crazy spaces between tokens. 
222 | let csvData = " word :~: count--> the:~: 23135851162 -->" 223 | 224 | type FreqRow = object 225 | word: string 226 | count: int 227 | 228 | var rows = tabby.fromCsv( 229 | csvData, seq[FreqRow], hasHeader = true, separator = ":~:", lineEnd = "-->" 230 | ) 231 | doAssert rows.len == 1 232 | doAssert rows[0].word == "the" 233 | doAssert rows[0].count == 23135851162 234 | 235 | block: 236 | # Missing last new line. 237 | let csvData = "word\tcount\r\nthe\t23135851162" 238 | 239 | type FreqRow = object 240 | word: string 241 | count: int 242 | 243 | var rows = tabby.fromCsv( 244 | csvData, seq[FreqRow], hasHeader = true, separator = "\t", lineEnd = "\r\n" 245 | ) 246 | doAssert rows.len == 1 247 | doAssert rows[0].word == "the" 248 | doAssert rows[0].count == 23135851162 249 | 250 | 251 | block: 252 | # Guess sep and newline 253 | let csvDatas = @[ 254 | "word,count\nthe,23135851162\n", 255 | "word\tcount\nthe\t23135851162\n", 256 | "word,count\r\nthe,23135851162\r\n", 257 | "word\tcount\r\nthe\t23135851162\r\n" 258 | ] 259 | 260 | type FreqRow = object 261 | word: string 262 | count: int 263 | 264 | for csvData in csvDatas: 265 | var rows = tabby.fromCsvGuess(csvData, seq[FreqRow]) 266 | doAssert rows.len == 1 267 | doAssert rows[0].word == "the" 268 | doAssert rows[0].count == 23135851162 269 | 270 | # Parse and dump hooks 271 | 272 | let csvData = """ 273 | country,budget 274 | US,$2000 275 | GB,$1000 276 | DE,$1000 277 | """ 278 | 279 | type 280 | Money = uint64 # in cents 281 | 282 | CountryMoney = object 283 | country: string 284 | budget: Money 285 | 286 | proc parseHook(p: ParseContext, name: string, v: var Money) = 287 | inc p.i # skip the $ 288 | var num: int 289 | p.parseHook(name, num) 290 | v = num.uint64 * 100 # in cents 291 | 292 | var rows = csvData.fromCsv(seq[CountryMoney]) 293 | 294 | proc dumpHook(p: DumpContext, v: Money) = 295 | # read teh % 296 | p.data.add "$" 297 | p.data.add $(v div 100) 298 | 299 | doAssert rows.toCsv() == """ 300 | 
country,budget 301 | US,$2000 302 | GB,$1000 303 | DE,$1000 304 | """ 305 | 306 | block: 307 | # One time to read two different formats: 308 | 309 | type Row = object 310 | id: int 311 | color: string 312 | date: string 313 | text: string 314 | enabled: bool 315 | 316 | let csvData = """ 317 | date,text 318 | Mar1,foo 319 | Mar2,bar 320 | Mar3,baz 321 | """ 322 | let rows = csvData.fromCsv(seq[Row]) 323 | doAssert rows.toCsv() == """ 324 | id,color,date,text,enabled 325 | 0,,Mar1,foo,false 326 | 0,,Mar2,bar,false 327 | 0,,Mar3,baz,false 328 | """ 329 | 330 | block: 331 | # Duplicate names. 332 | 333 | type Row = object 334 | id: int 335 | date1: string 336 | text1: string 337 | date2: string 338 | text2: string 339 | 340 | let csvData = """ 341 | date,text 342 | Mar1,foo 343 | Mar2,bar 344 | Mar3,baz 345 | """ 346 | let rows = csvData.fromCsv(seq[Row], header = @["date1", "text1"]) 347 | 348 | doAssert rows.toCsv() == """ 349 | id,date1,text1,date2,text2 350 | 0,Mar1,foo,, 351 | 0,Mar2,bar,, 352 | 0,Mar3,baz,, 353 | """ 354 | 355 | block: 356 | # Ignore extra spaces. 
357 | let csvData = """ 358 | word, count 359 | the,23135851162 360 | of , 13151942776 361 | and , 12997637966 362 | """ 363 | 364 | type FreqRow = object 365 | word: string 366 | count: int 367 | 368 | var rows = tabby.fromCsv(csvData, seq[FreqRow]) 369 | doAssert rows.toCsv() == """ 370 | word,count 371 | the,23135851162 372 | of,13151942776 373 | and,12997637966 374 | """ 375 | 376 | block: 377 | # Unix style escapes 378 | let csvData = """ 379 | name,count 380 | John Smith,1 381 | "John Smith",2 382 | "\rJohn\tSmith\n",3 383 | """ 384 | 385 | type NameRow = object 386 | name: string 387 | count: int 388 | 389 | var rows = tabby.fromCsv(csvData, seq[NameRow]) 390 | doAssert rows.toCsv() == """ 391 | name,count 392 | "John Smith",1 393 | "John Smith",2 394 | "\rJohn\tSmith\n",3 395 | """ 396 | 397 | block: 398 | # CSV style escapes "" and '' 399 | let csvData = """ 400 | name,count 401 | 'John''s Book',1 402 | "John's Book",2 403 | "John ""Big"" Smith",3 404 | """ 405 | 406 | type NameRow = object 407 | name: string 408 | count: int 409 | 410 | var rows = tabby.fromCsv(csvData, seq[NameRow]) 411 | doAssert rows.toCsv() == """ 412 | name,count 413 | "John\'s Book",1 414 | "John\'s Book",2 415 | "John \"Big\" Smith",3 416 | """ 417 | --------------------------------------------------------------------------------