├── tests ├── config.nims ├── test_issue20.nim ├── test_issue28.nim ├── tests.nim └── testsFormula.nim ├── src ├── datamancer.nim └── datamancer │ ├── df_types.nim │ ├── formulaNameMacro.nim │ ├── value.nim │ ├── formulaExp.nim │ ├── io.nim │ └── column.nim ├── LICENSE ├── datamancer.nimble ├── data ├── fishdata_sparse.csv ├── mpg.csv └── 03-sample_hugo.csv ├── .github └── workflows │ └── ci.yml ├── changelog.org ├── docs ├── docs.nim └── datamancer.org └── README.org /tests/config.nims: -------------------------------------------------------------------------------- 1 | switch("path", "$projectDir/../src") -------------------------------------------------------------------------------- /src/datamancer.nim: -------------------------------------------------------------------------------- 1 | ## .. include:: ./docs/datamancer.rst 2 | 3 | import datamancer / [dataframe, io] 4 | export dataframe, io 5 | -------------------------------------------------------------------------------- /tests/test_issue20.nim: -------------------------------------------------------------------------------- 1 | import datamancer 2 | import unittest 3 | 4 | test "Issue #20 - `isDigit` was removed": 5 | let n1 = %~ "1.1" 6 | let n2 = %~ "1.3e5" 7 | let n3 = %~ "aba" 8 | let n4 = %~ "1..1" 9 | let n5 = %~ "123" 10 | let n6 = %~ "100_000" 11 | let n7 = %~ "_100_000_" 12 | check not n1.isInt 13 | check not n2.isInt 14 | check not n3.isInt 15 | check not n4.isInt 16 | check n5.isInt 17 | check n6.isInt 18 | check n7.isInt # this is a little unintuitive, but a downside of our simple def. 
19 | -------------------------------------------------------------------------------- /src/datamancer/df_types.nim: -------------------------------------------------------------------------------- 1 | import tables, sets 2 | import column, value 3 | 4 | type 5 | DataFrameKind* = enum 6 | dfNormal, dfGrouped 7 | 8 | # where value is as usual 9 | # then 10 | DataFrame* = ref object 11 | len*: int 12 | data*: OrderedTable[string, Column] 13 | case kind*: DataFrameKind 14 | of dfGrouped: 15 | # a grouped data frame stores the keys of the groups and maps them to 16 | # a set of the categories 17 | groupMap*: OrderedTable[string, HashSet[Value]] 18 | else: discard 19 | -------------------------------------------------------------------------------- /tests/test_issue28.nim: -------------------------------------------------------------------------------- 1 | import datamancer 2 | import json 3 | 4 | template accept(x) = 5 | static: assert(compiles(x)) 6 | 7 | template reject(x) = 8 | static: assert(not compiles(x)) 9 | 10 | accept: 11 | let f = fn {"Channel" == "Ch 0"} 12 | accept: 13 | let f2 = fn({"Channel" == "Ch 0"}) 14 | accept: 15 | let f3 = fn: 16 | {"Channel" == "Ch 0"} 17 | 18 | # this should fail, because `json` provides a `{}` proc, which 19 | # means we do not resolve our untyped `{}` macro 20 | # But for some reason on Github Actions it passes?! 
21 | #reject: 22 | # let f = f{"Channel" == "Ch 0"} 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 SciNim team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /datamancer.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | 3 | version = "0.1.8" 4 | author = "Vindaar" 5 | description = "A dataframe library with a dplyr like API" 6 | license = "MIT" 7 | srcDir = "src" 8 | 9 | 10 | # Dependencies 11 | 12 | requires "nim >= 1.2.0" 13 | requires "https://github.com/Vindaar/seqmath >= 0.1.11" 14 | requires "arraymancer >= 0.7.1" 15 | 16 | task test, "Run standard tests": 17 | exec "nim c -r tests/testDf.nim" 18 | exec "nim c -r tests/tests.nim" 19 | exec "nim c -r tests/test_issue20.nim" 20 | exec "nim c -r tests/test_issue28.nim" 21 | exec "nim c -r tests/testsFormula.nim" 22 | 23 | import os, strutils, strformat 24 | const 25 | pkgName = "datamancer" 26 | orgFile = "docs" / (pkgName & ".org") 27 | rstFile = "docs" / (pkgName & ".rst") 28 | rstFileAuto = "docs" / (pkgName & "_autogen.rst") 29 | 30 | template canImport(x: untyped): untyped = 31 | compiles: 32 | import x 33 | 34 | when canImport(docs / docs): 35 | # can define the `gen_docs` task (docs already imported now) 36 | # this is to hack around weird nimble + nimscript behavior. 37 | # when overwriting an install nimble will try to parse the generated 38 | # nimscript file and for some reason then it won't be able to import 39 | # the module (even if it's put into `src/`). 
40 | task gen_docs, "Generate datamancer documentation": 41 | # build the actual docs and the index 42 | exec "pandoc " & orgFile & " -o " & rstFile 43 | buildDocs( 44 | "src/", "docs/", 45 | defaultFlags = "--hints:off --warnings:off" 46 | ) 47 | -------------------------------------------------------------------------------- /data/fishdata_sparse.csv: -------------------------------------------------------------------------------- 1 | fish,station,seen 2 | 4842,"Release",1 3 | 4843,"Release",1 4 | 4844,"Release",1 5 | 4845,"Release",1 6 | 4847,"Release",1 7 | 4848,"Release",1 8 | 4849,"Release",1 9 | 4850,"Release",1 10 | 4851,"Release",1 11 | 4854,"Release",1 12 | 4855,"Release",1 13 | 4857,"Release",1 14 | 4858,"Release",1 15 | 4859,"Release",1 16 | 4861,"Release",1 17 | 4862,"Release",1 18 | 4863,"Release",1 19 | 4864,"Release",1 20 | 4865,"Release",1 21 | 4842,"I80_1",1 22 | 4843,"I80_1",1 23 | 4844,"I80_1",1 24 | 4845,"I80_1",1 25 | 4847,"I80_1",1 26 | 4848,"I80_1",1 27 | 4849,"I80_1",1 28 | 4850,"I80_1",1 29 | 4851,"I80_1",1 30 | 4854,"I80_1",1 31 | 4855,"I80_1",1 32 | 4857,"I80_1",1 33 | 4858,"I80_1",1 34 | 4859,"I80_1",1 35 | 4861,"I80_1",1 36 | 4862,"I80_1",1 37 | 4863,"I80_1",1 38 | 4864,"I80_1",1 39 | 4865,"I80_1",1 40 | 4842,"Lisbon",1 41 | 4843,"Lisbon",1 42 | 4844,"Lisbon",1 43 | 4845,"Lisbon",1 44 | 4847,"Lisbon",1 45 | 4848,"Lisbon",1 46 | 4855,"Lisbon",1 47 | 4857,"Lisbon",1 48 | 4858,"Lisbon",1 49 | 4859,"Lisbon",1 50 | 4861,"Lisbon",1 51 | 4862,"Lisbon",1 52 | 4865,"Lisbon",1 53 | 4842,"Rstr",1 54 | 4843,"Rstr",1 55 | 4844,"Rstr",1 56 | 4845,"Rstr",1 57 | 4848,"Rstr",1 58 | 4850,"Rstr",1 59 | 4855,"Rstr",1 60 | 4857,"Rstr",1 61 | 4858,"Rstr",1 62 | 4859,"Rstr",1 63 | 4861,"Rstr",1 64 | 4862,"Rstr",1 65 | 4842,"Base_TD",1 66 | 4843,"Base_TD",1 67 | 4844,"Base_TD",1 68 | 4845,"Base_TD",1 69 | 4850,"Base_TD",1 70 | 4855,"Base_TD",1 71 | 4857,"Base_TD",1 72 | 4858,"Base_TD",1 73 | 4859,"Base_TD",1 74 | 4861,"Base_TD",1 75 | 4862,"Base_TD",1 76 | 
4842,"BCE",1 77 | 4843,"BCE",1 78 | 4844,"BCE",1 79 | 4850,"BCE",1 80 | 4857,"BCE",1 81 | 4858,"BCE",1 82 | 4861,"BCE",1 83 | 4862,"BCE",1 84 | 4842,"BCW",1 85 | 4843,"BCW",1 86 | 4844,"BCW",1 87 | 4850,"BCW",1 88 | 4857,"BCW",1 89 | 4858,"BCW",1 90 | 4861,"BCW",1 91 | 4862,"BCW",1 92 | 4842,"BCE2",1 93 | 4843,"BCE2",1 94 | 4844,"BCE2",1 95 | 4857,"BCE2",1 96 | 4858,"BCE2",1 97 | 4861,"BCE2",1 98 | 4862,"BCE2",1 99 | 4842,"BCW2",1 100 | 4843,"BCW2",1 101 | 4844,"BCW2",1 102 | 4857,"BCW2",1 103 | 4858,"BCW2",1 104 | 4861,"BCW2",1 105 | 4862,"BCW2",1 106 | 4842,"MAE",1 107 | 4843,"MAE",1 108 | 4844,"MAE",1 109 | 4858,"MAE",1 110 | 4861,"MAE",1 111 | 4842,"MAW",1 112 | 4843,"MAW",1 113 | 4844,"MAW",1 114 | 4858,"MAW",1 115 | 4861,"MAW",1 116 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: datamancer CI 2 | on: 3 | push: 4 | paths: 5 | - 'tests/**' 6 | - 'src/**' 7 | - 'docs/**' 8 | - 'datamancer.nimble' 9 | - '.github/workflows/ci.yml' 10 | pull_request: 11 | paths: 12 | - 'tests/**' 13 | - 'src/**' 14 | - 'docs/**' 15 | - 'datamancer.nimble' 16 | - '.github/workflows/ci.yml' 17 | 18 | jobs: 19 | build: 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | branch: [version-1-4, devel] 24 | target: [linux, macos, windows] 25 | include: 26 | - target: linux 27 | builder: ubuntu-18.04 28 | - target: macos 29 | builder: macos-10.15 30 | - target: windows 31 | builder: windows-2019 32 | name: '${{ matrix.target }} (${{ matrix.branch }})' 33 | runs-on: ${{ matrix.builder }} 34 | steps: 35 | - name: Checkout 36 | uses: actions/checkout@v2 37 | with: 38 | path: datamancer 39 | 40 | - name: Setup Nim 41 | uses: alaviss/setup-nim@0.1.1 42 | with: 43 | path: nim 44 | version: ${{ matrix.branch }} 45 | 46 | - name: Install dependencies (Ubuntu) 47 | if: ${{matrix.target == 'linux'}} 48 | run: | 49 | sudo apt-get update 50 | sudo 
apt-get install pandoc 51 | 52 | - name: Setup nimble & deps 53 | shell: bash 54 | run: | 55 | cd datamancer 56 | nimble refresh -y 57 | nimble install -y 58 | 59 | - name: Run tests 60 | shell: bash 61 | run: | 62 | cd datamancer 63 | nimble -y test 64 | 65 | - name: Build docs 66 | if: > 67 | github.event_name == 'push' && github.ref == 'refs/heads/master' && 68 | matrix.target == 'linux' && matrix.branch == 'devel' 69 | shell: bash 70 | run: | 71 | cd datamancer 72 | # **HAVE** to call `develop`, cuz we're getting screwed by 73 | # logic otherwise 74 | nimble develop -y 75 | nimble gen_docs 76 | # TODO: fix this, need to iterate over all files, do similar to arraymancer docs 77 | # Ignore failures for older Nim 78 | cp docs/{the,}index.html || true 79 | 80 | - name: Publish docs 81 | if: > 82 | github.event_name == 'push' && github.ref == 'refs/heads/master' && 83 | matrix.target == 'linux' && matrix.branch == 'devel' 84 | uses: crazy-max/ghaction-github-pages@v1 85 | with: 86 | build_dir: datamancer/docs 87 | env: 88 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 89 | -------------------------------------------------------------------------------- /changelog.org: -------------------------------------------------------------------------------- 1 | * v0.1.9 2 | - add basic implementation of =spread= (inverse of =gather=; similar 3 | to dplyr =pivot_wider=). The current implementation is rather basic 4 | and performance may be suboptimal for very large data frames. 5 | - add =null= helper to create a =VNull Value= 6 | - significantly improve the docs of the =dataframe.nim= module. 7 | - fixes an issue where unique column reference names were combined 8 | into the same column due to a bad name generation algorithm 9 | - significantly improves performance in applications in which 10 | allocation of memory is a bottleneck (tensors were zero 11 | initialized). 12 | - disable formula output at CT by default. Compile with 13 | =-d:echoFormulas= to see the output. 
14 | - remove CT warnings for unrelated stuff (node kinds) 15 | * v0.1.8 16 | - avoid some object conversions in column operations (ref #11) 17 | - add ~[]=~ overloads for columns for slice assignments 18 | - *significantly* improve performance of =mutate/transmute= operations 19 | for grouped dataframes (O(150,000) groups in < 0.5 s possible now) 20 | - fixes #12 by avoiding hashing of columns. Some performance 21 | regression in =innerJoin=, =setDiff= (~2x slower in bad cases). 22 | * v0.1.7 23 | - allow assignment of constants in =seqsToDf= 24 | - allow assignment of scalars to DF as column directly 25 | - add filename argument to =showBrowser= 26 | - make =compileFormulaImpl= actually typed to make formulas work 27 | correctly inside of generics (ref =ggplotnim= 28 | https://github.com/Vindaar/ggplotnim/issues/116 29 | - change internal macro type logic to use strings 30 | 31 | * v0.1.6 32 | - fix slicing of constant columns 33 | 34 | * v0.1.5 35 | - fully qualify =Value= on scalar formula construction 36 | 37 | * v0.1.4 38 | - fix formulas (and type deduction) for certain use cases involving 39 | =nnkBracketExpr= that are *not* references to columns 40 | 41 | * v0.1.3 42 | - improve type deduction capabilities for infix nodes 43 | - add overload for =drop= that doesn't just work on a mutable data 44 | frame 45 | - fix reference semantics issues if DF is modified and visible in 46 | result (only data is shared, but columns should be respected) 47 | - =arrange= now also takes a =varargs[string]= instead of a 48 | =seq=. While there is still a bug of not properly being able to use 49 | varargs, at least an array is possible (and hopefully at some point 50 | proper varargs). 51 | 52 | * v0.1.2 53 | - CSV parser is more robust, can handle unnammed columns 54 | - explicit types in =idx=, =col= column reference finally works 55 | (e.g. 
=idx("foo", float)= accesses the column "foo" as a float 56 | tensor overwriting type deductions and type hints) 57 | 58 | * v0.1.1 59 | - allow =nnkMacroDef= in =findType= 60 | - add development notes and ideas about rewrite of formula macro in =notes/formula_dev_notes.org= 61 | 62 | * v0.1.0 63 | 64 | - initial version of Datamancer based on =ggplotnim= data frame with 65 | major formula macro rewrite 66 | -------------------------------------------------------------------------------- /src/datamancer/formulaNameMacro.nim: -------------------------------------------------------------------------------- 1 | import strformat, macros, strutils 2 | 3 | proc build(n: NimNode): string 4 | proc buildArgs(n: NimNode, head = ""): string = 5 | if result.len == 0 and head.len > 0: 6 | result = &"{head}" 7 | for i in 0 ..< n.len: 8 | if result.len == 0: 9 | result = &"({build(n[i])}" 10 | else: 11 | result.add &" {build(n[i])}" 12 | result.add ")" 13 | 14 | proc build(n: NimNode): string = 15 | # convert to lisp representation 16 | case n.kind 17 | of nnkInfix: 18 | result = &"({n[0].strVal} {build(n[1])} {build(n[2])})" 19 | of nnkIntLit .. 
nnkFloat64Lit: 20 | result = n.repr 21 | of nnkStrLit, nnkRStrLit: 22 | result = n.strVal 23 | of nnkIdent, nnkSym: 24 | # should correspond to a known identifier in the calling scope 25 | result = n.strVal 26 | of nnkPar, nnkCall, nnkCommand: 27 | result = buildArgs(n) 28 | of nnkDotExpr, nnkBracketExpr: 29 | result = n.repr 30 | of nnkPrefix: 31 | when (NimMajor, NimMinor, NimPatch) < (1, 5, 0): 32 | if n[0].strVal == "-": 33 | result = &"-{build(n[1])}" 34 | else: 35 | result = &"({n[0].strVal} {build(n[1])})" 36 | else: 37 | result = &"({n[0].strVal} {build(n[1])})" 38 | of nnkAccQuoted: 39 | result = build(n[0]) 40 | of nnkCallStrLit: 41 | result = n[1].strVal 42 | of nnkCurly: 43 | result = "({}" 44 | for ch in n: 45 | result.add &" {build(ch)}" 46 | result.add ")" 47 | of nnkBracket: 48 | result = "([]" 49 | for ch in n: 50 | result.add &" {build(ch)}" 51 | result.add ")" 52 | of nnkIfExpr: 53 | result = "(if" 54 | for arg in n: 55 | result.add &" {build(arg)}" 56 | result.add ")" 57 | of nnkElifExpr: 58 | result = buildArgs(n, head = "(elif") 59 | of nnkElseExpr: 60 | result = buildArgs(n, head = "(else") 61 | of nnkStmtList: 62 | for ch in n: 63 | if result.len == 0 and n.len > 1: 64 | result = &"({buildArgs(ch)}" 65 | elif result.len == 0: 66 | result = &"{buildArgs(ch)}" 67 | else: 68 | result.add &" {buildArgs(ch)}" 69 | if n.len > 1: 70 | result.add ")" 71 | of nnkOpenSymChoice, nnkClosedSymChoice: 72 | result = n[0].strVal # take first symbol name 73 | of nnkCheckedFieldExpr: 74 | ## TODO: check if this is reasonable. It seems that this node contains 75 | ## the original node as [0] and then the "environment" as [1]?? 76 | result = build(n[0]) 77 | else: 78 | result = n.repr 79 | warning("Node kind " & $n.kind & " not implemented " & 80 | "for FormulaNode string representation. Node is:\n" & $(n.treeRepr)) 81 | 82 | proc buildName*(n: NimNode): string = 83 | ## Builds the formula name in a lisp like representation. 
Only for debugging 84 | ## and printing purposes. 85 | result = build(n) 86 | 87 | proc buildResultColName*(n: NimNode): NimNode = 88 | ## Builds the name of the resulting column name of a formula. Mainly it simply uses the node 89 | ## as is, except for column references via accented quotes and call string literals, in which 90 | ## case we simply use the string values underlying. 91 | ## We need to be able to use symbols from the local scope (or possible proc calls) to determine 92 | ## the resulting column name at runtime. 93 | case n.kind 94 | of nnkAccQuoted: result = newLit(n[0].strVal) 95 | of nnkCallStrLit: result = newLit(n[1].strVal) 96 | else: result = n 97 | -------------------------------------------------------------------------------- /docs/docs.nim: -------------------------------------------------------------------------------- 1 | import macros, strformat, strutils, sequtils, sets, tables, algorithm 2 | 3 | from os import parentDir, getCurrentCompilerExe, DirSep, extractFilename, `/`, setCurrentDir 4 | 5 | # NOTE: 6 | # for some time on devel 1.3.x `paramCount` and `paramStr` had to be imported 7 | # os, because they were removed for nimscript. This was reverted in: 8 | # https://github.com/nim-lang/Nim/pull/14658 9 | # For `nimdoc` we still have to import those from `os`! 
10 | when defined(nimdoc): 11 | from os import getCurrentDir, paramCount, paramStr 12 | 13 | #[ 14 | This file is a slightly modified version of the same file of `nimterop`: 15 | https://github.com/nimterop/nimterop/blob/master/nimterop/docs.nim 16 | ]# 17 | 18 | 19 | proc getNimRootDir(): string = 20 | #[ 21 | hack, but works 22 | alternatively (but more complex), use (from a nim file, not nims otherwise 23 | you get Error: ambiguous call; both system.fileExists): 24 | import "$nim/testament/lib/stdtest/specialpaths.nim" 25 | nimRootDir 26 | ]# 27 | fmt"{currentSourcePath}".parentDir.parentDir.parentDir 28 | 29 | const 30 | DirSep = when defined(windows): '\\' else: '/' 31 | 32 | proc execAction(cmd: string): string = 33 | var 34 | ccmd = "" 35 | ret = 0 36 | when defined(Windows): 37 | ccmd = "cmd /c " & cmd 38 | elif defined(posix): 39 | ccmd = cmd 40 | else: 41 | doAssert false 42 | 43 | (result, ret) = gorgeEx(ccmd) 44 | doAssert ret == 0, "Command failed: " & $ret & "\ncmd: " & ccmd & "\nresult:\n" & result 45 | 46 | template genRemove(name: untyped): untyped = 47 | proc `name`(s, toRemove: string): string = 48 | result = s 49 | result.`name`(toRemove) 50 | genRemove(removePrefix) 51 | genRemove(removeSuffix) 52 | 53 | proc getFiles*(path: string): seq[string] = 54 | # Add files and dirs here, which should be skipped. 
55 | #const excludeDirs = [] 56 | #let ExcludeDirSet = toSet(excludeDirs) 57 | #if path.extractFilename in ExcludeDirSet: return 58 | # The files below are not valid by themselves, they are only included 59 | # from other files 60 | #const excludeFiles = [] 61 | #let ExcludeFileSet = toSet(excludeFiles) 62 | 63 | for file in listFiles(path): 64 | if file.endsWith(".nim"): # and file.extractFilename notin ExcludeFileSet: 65 | result.add file 66 | for dir in listDirs(path): 67 | result.add getFiles(dir) 68 | 69 | proc buildDocs*(path: string, docPath: string, 70 | defaultFlags = "", 71 | masterBranch = "master", 72 | defines: openArray[string] = @[]) = 73 | ## Generate docs for all nim files in `path` and output all HTML files to the 74 | ## `docPath` in a flattened form (subdirectories are removed). 75 | ## 76 | ## If duplicate filenames are detected, they will be printed at the end. 77 | ## 78 | ## WARNING: not in use! `baseDir` is the project path by default and `files` and `path` are relative 79 | ## to that directory. Set to "" if using absolute paths. 80 | ## 81 | ## `masterBranch` is the name of the default branch to which the docs should link 82 | ## when clicking the `Source` button below a procedure etc. 83 | ## 84 | ## `defines` is a list of `-d:xxx` define flags (the `xxx` part) that should be passed 85 | ## to `nim doc` so that `getHeader()` is invoked correctly. 86 | ## 87 | ## Use the `--publish` flag with nimble to publish docs contained in 88 | ## `path` to Github in the `gh-pages` branch. This requires the ghp-import 89 | ## package for Python: `pip install ghp-import` 90 | ## 91 | ## WARNING: `--publish` will destroy any existing content in this branch. 92 | ## 93 | ## NOTE: `buildDocs()` only works correctly on Windows with Nim 1.0+ since 94 | ## https://github.com/nim-lang/Nim/pull/11814 is required. 
95 | ## 96 | ## 97 | const gitUrl = "https://github.com/Vindaar/datamancer" 98 | ## WARNING: this means `gen_docs` *only* works if you use `nimble develop` on 99 | ## the repository. Nimble cannot deal with ****. This is frustrating. Thanks. 100 | let baseDir = execAction("nimble path datamancer").parentDir & $DirSep 101 | when defined(windows) and (NimMajor, NimMinor, NimPatch) < (1, 0, 0): 102 | echo "buildDocs() unsupported on Windows for Nim < 1.0 - requires PR #11814" 103 | else: 104 | let 105 | docPath = baseDir & docPath 106 | path = baseDir & path 107 | defStr = block: 108 | var defStr = " " & defaultFlags 109 | for def in defines: 110 | defStr &= " -d:" & def 111 | defStr 112 | nim = getCurrentCompilerExe() 113 | 114 | # now we walk the whole `path` and build the documentation for each `.nim` file. 115 | # While doing that we flatten the directory structure for the generated HTML files. 116 | # `src/foo/bar/baz.nim` just becomes 117 | # `docPath/baz.html`. 118 | # This allows for all files to be in the `docPath` directory, which means each 119 | # file will be able to find the `dochack.js` file, which will be put into 120 | # the `docPath` directory, too (the inclusion of the `dochack.js` is done statically 121 | # via our generated nimdoc.cfg file and is fixed for each generated HTML). 
122 | let files = getFiles(path) 123 | var idx = 0 124 | var fileSet = initHashSet[string]() 125 | var duplSet = initHashSet[string]() 126 | for file in files: 127 | let baseName = file.extractFilename() 128 | let relPath = file.removePrefix(path).removeSuffix(baseName) 129 | let prefix = relPath.strip(chars = {'/'}) # remove possible trailing `/` 130 | .split('/') # split path parts 131 | .join(".") # concat by `.` instead 132 | var outfile = baseName.replace(".nim", ".html") 133 | if outfile in fileSet: 134 | duplSet.incl outfile 135 | else: 136 | fileSet.incl outfile 137 | outfile = docPath / outfile 138 | echo "Processing: ", outfile, " [", idx, "/", files.len, "]" 139 | # NOTE: Changing the current working directory to the project path is required in order for 140 | # `git.commit:` to work! Otherwise we sit in `docs` and for some reason the relative path 141 | # will eat one piece of the resulting `source` links and thereby removing the actual branch 142 | # and we end up with a broken link! 143 | echo execAction(&"cd {baseDir} && {nim} doc {defStr} --git.url:{gitUrl} --git.commit:{masterBranch} --git.devel:{masterBranch} -o:{outfile} --index:on {file}") 144 | inc idx 145 | ## now build the index 146 | echo execAction(&"{nim} buildIndex -o:{docPath}/theindex.html {docPath}") 147 | when declared(getNimRootDir): 148 | #[ 149 | NOTE: running it locally doesn't work anymore on modern chromium browser, 150 | because they block "access from origin 'null' due to CORS policy". 
151 | this enables doc search, works at least locally with: 152 | cd {docPath} && python -m SimpleHTTPServer 9009 153 | ]# 154 | echo execAction(&"{nim} js -o:{docPath}/dochack.js {getNimRootDir()}/tools/dochack/dochack.nim") 155 | 156 | # echo "Processed files: ", fileSet 157 | if duplSet.card > 0: 158 | echo "WARNING: Duplicate filenames detected: ", duplSet 159 | -------------------------------------------------------------------------------- /tests/tests.nim: -------------------------------------------------------------------------------- 1 | import unittest 2 | import datamancer 3 | 4 | import tables, sets 5 | import sequtils, seqmath 6 | import math 7 | 8 | proc almostEq(a, b: float, epsilon = 1e-8): bool = 9 | ## version of `almostEqual` for testing, which prints the values, if 10 | ## they mismatch 11 | result = almostEqual(a, b, epsilon) 12 | if not result: 13 | echo "Comparison failed: a = ", a, ", b = ", b 14 | 15 | suite "Value": 16 | let 17 | v1 = %~ 1 18 | v2 = %~ 1.5 19 | v3 = %~ true 20 | v4 = %~ 'a' 21 | # `v5` itself is already a test, whether we can hash `Value` 22 | v5 = %~ { "test" : v1, 23 | "some" : v2, 24 | "heterogeneous" : v3, 25 | "fields" : v4 }.toOrderedTable 26 | v6 = Value(kind: VNull) 27 | 28 | test "Storing in sets": 29 | var valueSet = initHashSet[Value]() 30 | valueSet.incl v1 31 | valueSet.incl v2 32 | valueSet.incl v3 33 | valueSet.incl v4 34 | valueSet.incl v5 35 | valueSet.incl v6 36 | check v1 in valueSet 37 | check v2 in valueSet 38 | check v3 in valueSet 39 | check v4 in valueSet 40 | check v5 in valueSet 41 | check v6 in valueSet 42 | check valueSet.card == 6 43 | 44 | test "Storing in tables": 45 | var tab = initTable[string, Value]() 46 | tab["v1"] = v1 47 | tab["v2"] = v2 48 | tab["v3"] = v3 49 | tab["v4"] = v4 # is converted to string! 
50 | tab["v5"] = v5 51 | tab["v6"] = v6 52 | check tab.len == 6 53 | check tab["v1"] == v1 54 | check tab["v2"] == v2 55 | check tab["v3"] == v3 56 | check tab["v4"] == v4 57 | check tab["v5"] == v5 58 | check tab["v6"] == v6 59 | 60 | test "Extracting values": 61 | check v1.toInt == 1 62 | check v2.toFloat == 1.5 63 | check v3.toBool == true 64 | check v4.toStr == "a" 65 | check v1.toStr == "1" 66 | check v2.toStr == "1.5" 67 | check v3.toStr == "true" 68 | expect(ValueError): 69 | discard v5.toStr 70 | expect(ValueError): 71 | discard v6.toStr 72 | 73 | test "Direct `isNumber` check": 74 | # Note: this test checks basically whether the content of a `Value` 75 | # to be echoed is recognized as a number (in which case it's engulfed 76 | # by literal ``"``) or a normal string (no ``"``) 77 | let n1 = "1.1" 78 | let n2 = "1.3e5" 79 | let n3 = "aba" 80 | let n4 = "1..1" 81 | let n5 = "1.123" 82 | let n6 = "1.5e5E5" 83 | let n7 = "e" 84 | let n8 = "E" 85 | let n9 = "." 86 | let n10 = "1e" 87 | let n11 = "1E" 88 | let n12 = "1." 
89 | let n13 = "e1" 90 | let n14 = "E1" 91 | let n15 = ".1" 92 | # and some actually valid floats 93 | let n16 = "6.084E+01" 94 | let n17 = "1.676E+01" 95 | let n18 = "6.863E+00" 96 | let n19 = "2.007E+00" 97 | let n20 = "9.329E-01" 98 | let n21 = "2.441E-04" 99 | let n22 = "-2.441E-04" 100 | let n23 = "--2.441" 101 | let n24 = "-6.836E-04 " 102 | let n25 = "2.930E-04 " 103 | let n26 = "2.930E-04 d " 104 | check n1.isNumber 105 | check n2.isNumber 106 | check not n3.isNumber 107 | check not n4.isNumber 108 | check n5.isNumber 109 | check not n6.isNumber 110 | check not n7.isNumber 111 | check not n8.isNumber 112 | check not n9.isNumber 113 | check not n10.isNumber 114 | check not n11.isNumber 115 | check n12.isNumber 116 | check not n13.isNumber 117 | check not n14.isNumber 118 | check not n15.isNumber 119 | check n16.isNumber 120 | check n17.isNumber 121 | check n18.isNumber 122 | check n19.isNumber 123 | check n20.isNumber 124 | check n21.isNumber 125 | check n22.isNumber 126 | check not n23.isNumber 127 | check n24.isNumber 128 | check n25.isNumber 129 | check not n26.isNumber 130 | 131 | test "String conversion": 132 | # Note: this test checks basically whether the content of a `Value` 133 | # to be echoed is recognized as a number (in which case it's engulfed 134 | # by literal ``"``) or a normal string (no ``"``) 135 | # This uses `isNumber` internally. 136 | let n1 = %~ "1.1" 137 | let n2 = %~ "1.3e5" 138 | let n3 = %~ "aba" 139 | let n4 = %~ "1..1" 140 | let n5 = %~ "1.123" 141 | let n6 = %~ "1.5e5E5" 142 | let n7 = %~ "e" 143 | let n8 = %~ "E" 144 | let n9 = %~ "." 145 | let n10 = %~ "1e" 146 | let n11 = %~ "1E" 147 | let n12 = %~ "1." 
148 | let n13 = %~ "e1" 149 | let n14 = %~ "E1" 150 | let n15 = %~ ".1" 151 | # and some actually valid floats 152 | let n16 = %~ "6.084E+01" 153 | let n17 = %~ "1.676E+01" 154 | let n18 = %~ "6.863E+00" 155 | let n19 = %~ "2.007E+00" 156 | let n20 = %~ "9.329E-01" 157 | let n21 = %~ "2.441E-04" 158 | let n22 = %~ "-2.441E-04" 159 | check $n1 == "\"1.1\"" 160 | check $n2 == "\"1.3e5\"" 161 | check $n3 == "aba" 162 | check $n4 == "1..1" 163 | check $n5 == "\"1.123\"" 164 | check $n6 == "1.5e5E5" 165 | check $n7 == "e" 166 | check $n8 == "E" 167 | check $n9 == "." 168 | check $n10 == "1e" 169 | check $n11 == "1E" 170 | check $n12 == "\"1.\"" 171 | check $n13 == "e1" 172 | check $n14 == "E1" 173 | check $n15 == ".1" 174 | check $n16 == "\"6.084E+01\"" 175 | check $n17 == "\"1.676E+01\"" 176 | check $n18 == "\"6.863E+00\"" 177 | check $n19 == "\"2.007E+00\"" 178 | check $n20 == "\"9.329E-01\"" 179 | check $n21 == "\"2.441E-04\"" 180 | check $n22 == "\"-2.441E-04\"" 181 | 182 | # check that `emphStrNumber` can be disabled 183 | check n16.pretty(emphStrNumber = false) == "6.084E+01" 184 | check n17.pretty(emphStrNumber = false) == "1.676E+01" 185 | check n18.pretty(emphStrNumber = false) == "6.863E+00" 186 | check n19.pretty(emphStrNumber = false) == "2.007E+00" 187 | check n20.pretty(emphStrNumber = false) == "9.329E-01" 188 | check n21.pretty(emphStrNumber = false) == "2.441E-04" 189 | check n22.pretty(emphStrNumber = false) == "-2.441E-04" 190 | 191 | 192 | test "Math with Values": 193 | check (v1 * v2).kind == VFloat 194 | check (v1 + v1).kind == VFloat 195 | check (v1 + v1) == %~ 2 196 | check (v1 * v1).kind == VFloat 197 | check almostEq((v1 * v2).toFloat, 1.5) 198 | check almostEq((v1 / v2).toFloat, 2.0 / 3.0) 199 | check v1 * v6 == Value(kind: VNull) 200 | 201 | suite "Formula": 202 | test "Testing ~ formula creation using f{} macro": 203 | let f = f{"meanCty" ~ (c"hwy" + c"cty")} 204 | # manual parens still appear in `name`! 
205 | check f.name == "(~ meanCty ((+ hwy cty)))" 206 | when defined(defaultBackend): 207 | let g = meanCty ~ hwy + cty 208 | check $f == $g 209 | # TODO: Add more tests here... 210 | # create with `.` access 211 | let tup = (a: 5.5, b: "ok") 212 | let h = f{%~ tup.a == %~ tup.b} 213 | check h.kind == fkVariable 214 | check h.val == %~ false 215 | check h.name == "(== (%~ tup.a) (%~ tup.b))" 216 | 217 | let f2 = f{float: "min" << min(c"runTimes")} 218 | check $f2 == "min" # LHS of formula 219 | check f2.name == "(<< min (min runTimes))" 220 | 221 | 222 | test "Evaluate raw formula (no DF column dependency)": 223 | # arithmetic works 224 | check evaluate(f{1 + 2}) == %~ 3 225 | # parens work in arithmetic 226 | check evaluate(f{2 * (5 - 3)}) == %~ 4 227 | check evaluate(f{10 / 10}) == %~ 1 228 | # strings are evaluated to themseles 229 | check evaluate(f{"hwy"}) == %~ "hwy" 230 | 231 | test "Formula, literal on RHS": 232 | let f = f{"from" ~ 0} 233 | check f.name == "(~ from 0)" 234 | 235 | test "Test formula creation of type `fkVariable`": 236 | let f1 = f{"Test"} 237 | let f2 = f{1.1} 238 | let f3 = f{4} 239 | let f4 = f{true} 240 | check f1.kind == fkVariable 241 | check f2.kind == fkVariable 242 | check f3.kind == fkVariable 243 | check f4.kind == fkVariable 244 | check $f1 == "Test" 245 | check $f2 == "1.1" 246 | check $f3 == "4" 247 | check $f4 == "true" 248 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | 2 | * Datamancer 3 | [[https://github.com/SciNim/datamancer/workflows/datamancer%20CI/badge.svg]] 4 | 5 | #+ATTR_HTML: title="Join the chat at https://gitter.im/SciNim/Community" 6 | [[https://gitter.im/SciNim/Community][file:https://badges.gitter.im/SciNim/Community.svg]] 7 | 8 | ** Comparison to other dataframe libraries 9 | 10 | Check out the following gist for a comparison of this library with 11 | dplyr (R) and pandas 
(Python): 12 | 13 | https://gist.github.com/Vindaar/6908c038707c7d8293049edb3d204f84 14 | 15 | 16 | ** Documentation 17 | 18 | The documentation is found at: 19 | 20 | https://scinim.github.io/Datamancer/ 21 | 22 | with a short introduction under: 23 | 24 | https://scinim.github.io/Datamancer/datamancer.html 25 | 26 | ** Installation & dependencies 27 | 28 | Installation should be just a 29 | #+BEGIN_SRC sh 30 | nimble install datamancer 31 | #+END_SRC 32 | away. 33 | 34 | ** Features and formulas 35 | 36 | The data frame provides the "5 verbs" of [[https://dplyr.tidyverse.org/][dplyr]] and more. Main implemented functions: 37 | - =filter= 38 | - =mutate=, =transmute= 39 | - =select=, =rename= 40 | - =arrange= 41 | - =summarize= 42 | - =group_by= 43 | - =arrange= 44 | - =inner_join= 45 | - =set_diff= 46 | - =count= 47 | - =bind_rows= 48 | - =gather= 49 | - =unique=, 50 | which are all based on the =FormulaNode= object. Basically they all 51 | receive =varargs[FormulaNode]=, which is evaluated in context of the 52 | given dataframe. 53 | 54 | ** A few words on the =f{}= macro to create formulas 55 | 56 | Use: 57 | - no infix symbol and only code, which does not involve a column in 58 | the sense defined below in [[Column access]]: 59 | #+BEGIN_SRC nim 60 | f{1 + 2} 61 | f{"foo"} 62 | f{true} 63 | #+END_SRC 64 | a =FormulaNode= of kind =fkVariable=. Stores the values as a =Value= 65 | variant object. 66 | - =<-= for assignment 67 | #+BEGIN_SRC nim 68 | f{"newName" <- "oldName"} 69 | #+END_SRC 70 | a =FormulaNode= of kind =fkAssign=. 71 | This does not involve a closure and is just a simple object storing 72 | a LHS as a string and the RHS as a =Value= (to also support constant 73 | columns via =f{"constantCol" <- 5}=). 74 | Typically used for =rename= or as an argument for =transmute= and 75 | =mutate= to just rename a column or to assign a constant column. 
76 | - =<<= for reduce operations 77 | #+BEGIN_SRC nim 78 | f{"meanHwy" << mean(`hwy`)} 79 | #+END_SRC 80 | a =FormulaNode= of kind =fkScalar=. 81 | Used only for =summarize= and means we reduce a full column to a 82 | single =Value=. This generates a closure, which computes the RHS and 83 | assigns it to a result variable of type =Value=. Type hints are 84 | required (for now) if only a single proc call is involved on the 85 | RHS to tell the macro as what to read the column "hwy" and what the 86 | result variable is. 87 | - =~= for vector like proc 88 | #+BEGIN_SRC nim 89 | f{"xSquared" ~ `x` * `x`} 90 | #+END_SRC 91 | a =FormulaNode= of kind =fkVector=. 92 | Used in =mutate=, =transmute= to calculate a full column. This also 93 | generates a closure as the reduce operations =<<= does, except here 94 | we loop over the length of the DF and access each read tensor via =[idx]=. 95 | - a formula without any infix symbols will be considered: 96 | - =fkVariable= if no column involved 97 | - =fkVector= else 98 | 99 | *** Column access 100 | To access columns in the context of formula, the biggest change 101 | occured. In the old formula system, a literal string was attempted to 102 | be resolved as a DF column dynamically. Since the new formulas are 103 | compiled to closures, this would involve overhead and is thus avoided 104 | for clearer separation between columns and real strings. This also 105 | helps readers of a formula. This means: 106 | - =`columnName`=: accented quotes refer to a DF column. Be careful to 107 | only use this for simple letters (no non letter characters or spaces). 108 | - =c"columnName"= : call string literals (by convention use a =c= 109 | before the string) are interpreted as a column in the same way as 110 | accented quotes, but allow for column names with spaces / non letter 111 | characters. 
112 | - =idx("columnName"), idx(`columnName`), idx(nimExpressionReturningString)=: 113 | refers to a specific element of the referred column 114 | - =col("columnName"), col(`columnName`), col(nimExpressionReturningString)=: 115 | refers to the full tensor of the referred column 116 | - or directly via: =df[nimExpressionReturningString] / 117 | df[nimExpressionReturningString][idx]=: to access columns / indices using 118 | identifiers / symbols / general expressions that return a string 119 | (accented quotes, call string literals or just string literals). This is 120 | equivalent to =idx= / =col=, so the latter are preferred. 121 | 122 | The closures take a data frame as an argument, which is named 123 | =df=. The =df["columnName"]= refers to that argument, although not 124 | literally (it is gen'symmed and =df["columnName"]= refers to a 125 | =Column=). From that column we get the underlying =Tensor=. 126 | 127 | In the context of calling procedures, e.g.: 128 | #+BEGIN_SRC nim 129 | f{someProc(`columnName`)} 130 | #+END_SRC 131 | it may not be clear whether the procedure is supposed to take the 132 | whole tensor as an argument or hand each element of the tensor in a 133 | loop. Internally the macro tries to determine a suitable call for 134 | either a scalar or tensor argument. If the called procedure is unique 135 | this will likely succeed. In case of heavily overloaded symbols 136 | (e.g. =max=) it also tries to determine a match from (if any) 137 | additional arguments given to that procedure (and uses their types if 138 | they are not column references). 139 | 140 | In case it cannot be resolved, you will get an error at compile time 141 | to specify =idx= (per index access) or =col= (full column access) of the column. 142 | 143 | So for example: 144 | #+BEGIN_SRC nim 145 | f{"asFloat" ~ parseFloat(idx("colName"))} 146 | #+END_SRC 147 | where =parseFloat= acts on each element individually. 
If there is only 148 | a single overload (as in case of =parseFloat=), the input and output 149 | types are inferred automatically to be: 150 | - read tensor =colName= as a =string= 151 | - result type is =float= 152 | 153 | *** Type hints 154 | Type hints are required if the formula macro cannot determine the type 155 | required, either input or output. This is usually the case for 156 | ambiguous operations (overloaded procedures, only a single column 157 | without any operations, etc.). They are of 158 | the form: 159 | - =<dtype>: <formula>=: simple type hint for the type of the 160 | underlying tensor of the columns involved in the formula. 161 | - =<dtype> -> <outDtype>: <formula>=: full type for closure. 162 | =<dtype>= is the dtype used for input tensors, =<outDtype>= the resulting 163 | type. 164 | For example: 165 | #+begin_src nim 166 | f{int -> int: `x` * `y`} 167 | # ^--- type of the tensors involved on the RHS. Will be read as integers 168 | # ^--- type of the resulting tensor 169 | #+end_src 170 | In this case the type would be determined to be float by the macro, so 171 | type hints are required in case we need them to be integers. 172 | 173 | *NOTE:* it is not possible to include tensors of different data types 174 | in a single formula using type hints. However, if they appear in 175 | different branches of the formula AST and the types are determined 176 | automatically, this is possible. All input tensors of a computation will be read 177 | either by the automatically deduced data type or the =<dtype>= argument 178 | mentioned here. If an underlying tensor is not actually of the given 179 | data type, it will be converted via =T(val)=, where =T= is the type or 180 | if the conversion is not possible a runtime exception will be thrown. 181 | 182 | In addition to looking at symbols in the scope, there is a step 183 | involving some simple heuristic rules, e.g. if =*=, =/= is involved, it's 184 | assumed that the input tensors are floats and the output as well. 
If 185 | =&= or =$= is involved, it's assumed to be strings. 186 | Finally if =and= and other logic keywords are used, the result is 187 | assumed to be =bool= (not the input though!). 188 | 189 | #+BEGIN_SRC nim 190 | const floatSet = toSet(@["+", "-", "*", "/", "mod"]) 191 | const stringSet = toSet(@["&", "$"]) 192 | const boolSet = toSet(@["and", "or", "xor", ">", "<", ">=", "<=", "==", "!=", 193 | "true", "false", "in", "notin"]) 194 | #+END_SRC 195 | 196 | *** Notes on formula macro internals 197 | 198 | For an insight into the implementation details, ideas and development 199 | notes, check out the following document: 200 | 201 | https://github.com/SciNim/Datamancer/blob/master/notes/formula_dev_notes.org 202 | -------------------------------------------------------------------------------- /tests/testsFormula.nim: -------------------------------------------------------------------------------- 1 | import datamancer, unittest, sequtils, math, strutils, streams, sugar 2 | import seqmath 3 | 4 | type 5 | Foo = object 6 | fd: float 7 | 8 | suite "Formulas": 9 | let a = [1, 2, 3] 10 | let b = [3, 4, 5] 11 | let c = [4, 5, 6] 12 | let d = [8, 9, 10] 13 | let e = [11, 12, 13] 14 | let f = [false, true, false] 15 | let g = ["hello", "world", "foo"] 16 | let h = [2.5, 7.5, NaN] 17 | let i = ["5", "6", "7"] 18 | let df = seqsToDf(a, b, c, d, e, f, g, h, i) 19 | test "Basic `idx` tests with automatic type deduction from context": 20 | block: 21 | # - infix, "a" read as integer automatically 22 | let fn = f{ idx("a") == 5 } 23 | check fn.evaluate(df).bCol == [false, false, false].toTensor 24 | block: 25 | # - infix, a read as float automatically 26 | let fn = f{ idx("a") == 5.5 } 27 | check fn.evaluate(df).bCol == [false, false, false].toTensor 28 | block: 29 | # - infix involving `in`, type conversion on `idx` and set 30 | let fn = f{ idx("a").int8 in {1'i8, 3, 5, 7} } 31 | check fn.evaluate(df).bCol == [true, false, true].toTensor 32 | block: 33 | # - infix of `>` 
works 34 | # - type determined automatically 35 | let fn = f{ 5 > idx("a") } 36 | check fn.evaluate(df).bCol == [true, true, true].toTensor 37 | block: 38 | # - infix of `>` works w/ order switched around 39 | # - type determined automatically 40 | let fn = f{ idx("a") > 5 } 41 | check fn.evaluate(df).bCol == [false, false, false].toTensor 42 | block: 43 | # - type deduction on one side works with `Value` 44 | let fn = f{ idx("a") >= %~ 5.5 } 45 | check fn.evaluate(df).bCol == [false, false, false].toTensor 46 | block: 47 | # - reads data as `bool` 48 | # - runtime error due to a, b being int 49 | ## TODO: decide if this should become a CT error due to ambiguity. 50 | ## Probably yes, requires change to `assignType` I suppose (not to use 51 | ## default type info here) 52 | expect(ValueError): 53 | let fn = f{ idx("a") > idx("b") } 54 | discard fn.evaluate(df) 55 | block: 56 | # - RHS is float, infix means LHS will be read as float 57 | let fn = f{idx("a") < idx("b").float } 58 | check fn.evaluate(df).bCol == [true, true, true].toTensor 59 | block: 60 | # - above works with `==` too 61 | let fn = f{ idx("a") == idx("b").float } 62 | check fn.evaluate(df).bCol == [false, false, false].toTensor 63 | block: 64 | var fm = Foo(fd: 5.2) 65 | let fn = f{ idx("a") > fm.fd } 66 | check fn.evaluate(df).bCol == [false, false, false].toTensor 67 | 68 | block: 69 | # - prefix, automatic type deduction 70 | let fn = f{ not idx("f") } 71 | check fn.evaluate(df).bCol == [true, false, true].toTensor 72 | block: 73 | let fn = f{ idx("x") >= max(col("x")) * 0.5 } 74 | 75 | block: 76 | let fn = f{ parseInt(idx("a")) > 2 } 77 | 78 | test "Basic `col` test with type deduction from context": 79 | block: 80 | ## the following fails at CT, because type of output is ambiguous (max is overloaded) 81 | # let fn = f{ col("a").max } 82 | ## This one should always work 83 | let fn2 = f{float: col("a").max } 84 | check fn2.reduce(df).toInt == 3 85 | 86 | block: 87 | # - accessing column length 
works 88 | let fn = f{float: col("a").len } 89 | check fn.reduce(df).toInt == 3 90 | 91 | block: 92 | # - accessing tensor elments with bracket 93 | let fn = f{float: col("a")[1] } 94 | check fn.reduce(df).toInt == 2 95 | 96 | test "Automatic type deduction based on nnkDotExpr w/ a (non ambiguous) proc call": 97 | block: 98 | # - examples of determining type from unique procedure in a case where 99 | # heuristic type extraction fails 100 | proc uniqueProcWithType(x: int): int = 101 | x + 5 102 | let fn = f{ idx("a").uniqueProcWithType } 103 | check fn.evaluate(df).iCol == [6, 7, 8].toTensor 104 | 105 | test "Automatic type deduction based on `idx` in argument of a call overloaded proc call": 106 | block: 107 | # - type deduction based on `idx` in specific argument of a typically overloaded 108 | # symbol. Can be deduced due to only single overload matching the arguments 109 | proc someInt(): int = 2 110 | proc max(x: int, y: string, z: float, b: int): int = 111 | result = 5 112 | let fn = f{ max(idx("a"), "hello", 5.5, someInt()) } 113 | check fn.evaluate(df).iCol == [5, 5, 5].toTensor 114 | 115 | block: 116 | # - automatically determines that `a` should be read as `int` 117 | # - formula is mapping 118 | let fn = f{ max(idx("a"), 2) } 119 | check fn.evaluate(df).iCol == [2, 2, 3].toTensor 120 | 121 | test "Formula with an if expression accessing multiple columns": 122 | block: 123 | # - formula with an if expression accessing multiple columns 124 | let fn = f{int -> int: if `a` < 2: 125 | `b` 126 | else: 127 | `c` } 128 | check fn.evaluate(df).iCol == [3, 5, 6].toTensor 129 | 130 | when (NimMajor, NimMinor, NimPatch) >= (1, 4, 0): 131 | block: 132 | ## TODO: 1. we need the parenthesis (otherwise lexer error) 133 | ## 2. return type is deduced to be bool. It should be taken from 134 | ## the if expression! `nnkIfExpr` not implemented yet. 
135 | let fn = f{float -> float: "h" ~ (if classify(idx("h")) == fcNaN: 136 | -1.0 137 | else: 138 | `h`)} 139 | check fn.evaluate(df).fCol == [2.5, 7.5, -1.0].toTensor 140 | 141 | test "Dot expression requiring `Value` input works automatically": 142 | block: 143 | # - dot call requiring `Value` argument, output is object column (because 144 | # `isNull` returns a boolean as a `Value` 145 | let fn = f{ idx("a").isNull } 146 | check fn.evaluate(df).oCol == [%~ false, %~ false, %~ false].toTensor 147 | 148 | test "Infix with `notin` and local array": 149 | block: 150 | # - `notin` works and determines `g` 151 | let existKeys = ["hello"] 152 | let fn = f{string: `g` notin existKeys} 153 | check fn.evaluate(df).bCol == [false, true, true].toTensor 154 | 155 | test "`ggplotnim` formula accessing (proc) field of an object": 156 | block: 157 | type 158 | MS = object 159 | trans: proc(x: float): float 160 | let col = %~ "a" 161 | let ms = MS(trans: (proc(x: float): float = 5.5)) 162 | let colStr = "log10(x4)" 163 | let fn = f{float: colStr ~ ms.trans( df[col.toStr][idx] ) } 164 | check fn.evaluate(df).fCol == [5.5, 5.5, 5.5].toTensor 165 | 166 | test "`max` overload is resolved in context of infix with float": 167 | block: 168 | let fn = f{ `a` >= max(`a`) * 0.5 } 169 | check fn.evaluate(df).bCol == [false, true, true].toTensor 170 | 171 | block: 172 | ## TODO: this is technically broken, because from `*` we take `float` 173 | ## as result and from the integer `-1` we determine the infix to be 174 | ## integer 175 | #let fn = f{ -1 * c"hwy"} 176 | 177 | test "Reducing formula with boolean return value": 178 | block: 179 | let df2 = seqsToDf({"var1" : toSeq(0 ..< 10)}) 180 | let fn = f{ sum(`var1`) > 20000 } 181 | check fn.reduce(df2).toBool == false 182 | 183 | test "Example of no `idx` but reducing proc (mean) as a mapping": 184 | block: 185 | ## example of a formula that contradicts our assumption that we should error in 186 | ## case the determined formula kind and the 
given one mismatch. 187 | ## In this case we might *want* to assign something + the mean for each element in 188 | ## the DF (in the context of a `group_by` call this makes sense! 189 | ## We'll turn it into a warning. 190 | ## Also: keep in mind that if the user writes something, same as with type hints, we 191 | ## should value that decision. 192 | # here we only check it compiles (no CT error anymore) 193 | let fn = f{float -> float: "subMeanHwy" ~ 0.0 + mean(col("hwy"))} 194 | 195 | test "Name test": 196 | let f = f{"meanCty" ~ (c"hwy" + c"cty")} 197 | # name is the full name. Manual parens (nnkPar) are included in representation. 198 | check f.name == "(~ meanCty ((+ hwy cty)))" 199 | 200 | test "Constant mapping of integer": 201 | let countCol = "count" 202 | let fn = f{int: countCol ~ 0} 203 | check fn.evaluate(df).iCol == [0, 0, 0].toTensor 204 | 205 | test "Name of long formula": 206 | const cut_rms_trans_low = 0.1 207 | const cut_rms_trans_high = 1.5 208 | proc inRegion(x, y: float, r: string): bool = 209 | discard 210 | 211 | let fn = f{float -> bool: 212 | `rmsTransverse` >= cut_rms_trans_low and 213 | `rmsTransverse` <= cut_rms_trans_high and 214 | inRegion(df["centerX"][idx], df["centerY"][idx], "crSilver") and 215 | `hits` < 500} 216 | 217 | check $fn == """(and (and (and (>= rmsTransverse cut_rms_trans_low) (<= rmsTransverse cut_rms_trans_high)) (inRegion df["centerX"][idx] df["centerY"][idx] crSilver)) (< hits 500))""" 218 | 219 | test "Explicit types in `col`, `idx`": 220 | block: 221 | # explicit types work 222 | let fn = f{ idx("a", int) } 223 | check fn.evaluate(df).iCol == [1, 2, 3].toTensor 224 | 225 | block: 226 | # mixing explicit types work 227 | let fn = f{ idx("a", int) + idx("i", string).parseInt} 228 | check fn.evaluate(df).iCol == [6, 8, 10].toTensor 229 | 230 | block: 231 | # type hints do ``not`` overwrite explicit types 232 | let fn = f{string -> int: ( 233 | if `g` == "hello": 234 | idx("a", int) 235 | else: 236 | idx("b", int)) } 
237 | check fn.evaluate(df).iCol == [1, 4, 5].toTensor 238 | 239 | test "Add with integer should produce integer": 240 | let fn = f{"a+5" ~ `a` + 5 } 241 | check fn.evaluate(df).kind == colInt 242 | check fn.evaluate(df).iCol == [6, 7, 8].toTensor 243 | 244 | test "Add with float should produce float": 245 | let fn = f{"a+5.0" ~ `a` + 5.0 } 246 | check fn.evaluate(df).kind == colFloat 247 | check fn.evaluate(df).fCol == [6.0, 7.0, 8.0].toTensor 248 | 249 | test "Complex reduction with multiple types and type deduction of `mean`": 250 | # this was broken up to `v0.1.3` 251 | let df = seqsToDf({ "x" : @[1, 2, 3, 4, 5], "y" : @["a", "b", "c", "d", "e"] }) 252 | block: 253 | let fn = f{"mean+ord" << mean(`x`) + ord(max(col(`y`, string))[0]).float } 254 | check fn.reduce(df).kind == VFloat 255 | check fn.reduce(df).toFloat == 104.0 256 | block: 257 | let fn = f{"mean+ord" << mean(`x`) + col(`y`, string).max[0].ord.float } 258 | check fn.reduce(df).kind == VFloat 259 | check fn.reduce(df).toFloat == 104.0 260 | 261 | test "Formula variable name generation": 262 | # this was broken up to `v0.1.8`, as all variables were turned into `colT` 263 | # (we just *removed* the part that made each column unique) 264 | let df = seqsToDf({"0" : [1,1,1], "1" : [2,2,2], "2" : [3,3,3]}) 265 | let fn = f{idx("0") + idx("1") + idx("2")} 266 | check fn.evaluate(df).toTensor(int) == toTensor [6,6,6] 267 | -------------------------------------------------------------------------------- /data/mpg.csv: -------------------------------------------------------------------------------- 1 | manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class 2 | audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact 3 | audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact 4 | audi,a4,2,2008,4,manual(m6),f,20,31,p,compact 5 | audi,a4,2,2008,4,auto(av),f,21,30,p,compact 6 | audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact 7 | audi,a4,2.8,1999,6,manual(m5),f,18,26,p,compact 8 | audi,a4,3.1,2008,6,auto(av),f,18,27,p,compact 9 
| audi,a4 quattro,1.8,1999,4,manual(m5),4,18,26,p,compact 10 | audi,a4 quattro,1.8,1999,4,auto(l5),4,16,25,p,compact 11 | audi,a4 quattro,2,2008,4,manual(m6),4,20,28,p,compact 12 | audi,a4 quattro,2,2008,4,auto(s6),4,19,27,p,compact 13 | audi,a4 quattro,2.8,1999,6,auto(l5),4,15,25,p,compact 14 | audi,a4 quattro,2.8,1999,6,manual(m5),4,17,25,p,compact 15 | audi,a4 quattro,3.1,2008,6,auto(s6),4,17,25,p,compact 16 | audi,a4 quattro,3.1,2008,6,manual(m6),4,15,25,p,compact 17 | audi,a6 quattro,2.8,1999,6,auto(l5),4,15,24,p,midsize 18 | audi,a6 quattro,3.1,2008,6,auto(s6),4,17,25,p,midsize 19 | audi,a6 quattro,4.2,2008,8,auto(s6),4,16,23,p,midsize 20 | chevrolet,c1500 suburban 2wd,5.3,2008,8,auto(l4),r,14,20,r,suv 21 | chevrolet,c1500 suburban 2wd,5.3,2008,8,auto(l4),r,11,15,e,suv 22 | chevrolet,c1500 suburban 2wd,5.3,2008,8,auto(l4),r,14,20,r,suv 23 | chevrolet,c1500 suburban 2wd,5.7,1999,8,auto(l4),r,13,17,r,suv 24 | chevrolet,c1500 suburban 2wd,6,2008,8,auto(l4),r,12,17,r,suv 25 | chevrolet,corvette,5.7,1999,8,manual(m6),r,16,26,p,2seater 26 | chevrolet,corvette,5.7,1999,8,auto(l4),r,15,23,p,2seater 27 | chevrolet,corvette,6.2,2008,8,manual(m6),r,16,26,p,2seater 28 | chevrolet,corvette,6.2,2008,8,auto(s6),r,15,25,p,2seater 29 | chevrolet,corvette,7,2008,8,manual(m6),r,15,24,p,2seater 30 | chevrolet,k1500 tahoe 4wd,5.3,2008,8,auto(l4),4,14,19,r,suv 31 | chevrolet,k1500 tahoe 4wd,5.3,2008,8,auto(l4),4,11,14,e,suv 32 | chevrolet,k1500 tahoe 4wd,5.7,1999,8,auto(l4),4,11,15,r,suv 33 | chevrolet,k1500 tahoe 4wd,6.5,1999,8,auto(l4),4,14,17,d,suv 34 | chevrolet,malibu,2.4,1999,4,auto(l4),f,19,27,r,midsize 35 | chevrolet,malibu,2.4,2008,4,auto(l4),f,22,30,r,midsize 36 | chevrolet,malibu,3.1,1999,6,auto(l4),f,18,26,r,midsize 37 | chevrolet,malibu,3.5,2008,6,auto(l4),f,18,29,r,midsize 38 | chevrolet,malibu,3.6,2008,6,auto(s6),f,17,26,r,midsize 39 | dodge,caravan 2wd,2.4,1999,4,auto(l3),f,18,24,r,minivan 40 | dodge,caravan 2wd,3,1999,6,auto(l4),f,17,24,r,minivan 41 | 
dodge,caravan 2wd,3.3,1999,6,auto(l4),f,16,22,r,minivan 42 | dodge,caravan 2wd,3.3,1999,6,auto(l4),f,16,22,r,minivan 43 | dodge,caravan 2wd,3.3,2008,6,auto(l4),f,17,24,r,minivan 44 | dodge,caravan 2wd,3.3,2008,6,auto(l4),f,17,24,r,minivan 45 | dodge,caravan 2wd,3.3,2008,6,auto(l4),f,11,17,e,minivan 46 | dodge,caravan 2wd,3.8,1999,6,auto(l4),f,15,22,r,minivan 47 | dodge,caravan 2wd,3.8,1999,6,auto(l4),f,15,21,r,minivan 48 | dodge,caravan 2wd,3.8,2008,6,auto(l6),f,16,23,r,minivan 49 | dodge,caravan 2wd,4,2008,6,auto(l6),f,16,23,r,minivan 50 | dodge,dakota pickup 4wd,3.7,2008,6,manual(m6),4,15,19,r,pickup 51 | dodge,dakota pickup 4wd,3.7,2008,6,auto(l4),4,14,18,r,pickup 52 | dodge,dakota pickup 4wd,3.9,1999,6,auto(l4),4,13,17,r,pickup 53 | dodge,dakota pickup 4wd,3.9,1999,6,manual(m5),4,14,17,r,pickup 54 | dodge,dakota pickup 4wd,4.7,2008,8,auto(l5),4,14,19,r,pickup 55 | dodge,dakota pickup 4wd,4.7,2008,8,auto(l5),4,14,19,r,pickup 56 | dodge,dakota pickup 4wd,4.7,2008,8,auto(l5),4,9,12,e,pickup 57 | dodge,dakota pickup 4wd,5.2,1999,8,manual(m5),4,11,17,r,pickup 58 | dodge,dakota pickup 4wd,5.2,1999,8,auto(l4),4,11,15,r,pickup 59 | dodge,durango 4wd,3.9,1999,6,auto(l4),4,13,17,r,suv 60 | dodge,durango 4wd,4.7,2008,8,auto(l5),4,13,17,r,suv 61 | dodge,durango 4wd,4.7,2008,8,auto(l5),4,9,12,e,suv 62 | dodge,durango 4wd,4.7,2008,8,auto(l5),4,13,17,r,suv 63 | dodge,durango 4wd,5.2,1999,8,auto(l4),4,11,16,r,suv 64 | dodge,durango 4wd,5.7,2008,8,auto(l5),4,13,18,r,suv 65 | dodge,durango 4wd,5.9,1999,8,auto(l4),4,11,15,r,suv 66 | dodge,ram 1500 pickup 4wd,4.7,2008,8,manual(m6),4,12,16,r,pickup 67 | dodge,ram 1500 pickup 4wd,4.7,2008,8,auto(l5),4,9,12,e,pickup 68 | dodge,ram 1500 pickup 4wd,4.7,2008,8,auto(l5),4,13,17,r,pickup 69 | dodge,ram 1500 pickup 4wd,4.7,2008,8,auto(l5),4,13,17,r,pickup 70 | dodge,ram 1500 pickup 4wd,4.7,2008,8,manual(m6),4,12,16,r,pickup 71 | dodge,ram 1500 pickup 4wd,4.7,2008,8,manual(m6),4,9,12,e,pickup 72 | dodge,ram 1500 pickup 
4wd,5.2,1999,8,auto(l4),4,11,15,r,pickup 73 | dodge,ram 1500 pickup 4wd,5.2,1999,8,manual(m5),4,11,16,r,pickup 74 | dodge,ram 1500 pickup 4wd,5.7,2008,8,auto(l5),4,13,17,r,pickup 75 | dodge,ram 1500 pickup 4wd,5.9,1999,8,auto(l4),4,11,15,r,pickup 76 | ford,expedition 2wd,4.6,1999,8,auto(l4),r,11,17,r,suv 77 | ford,expedition 2wd,5.4,1999,8,auto(l4),r,11,17,r,suv 78 | ford,expedition 2wd,5.4,2008,8,auto(l6),r,12,18,r,suv 79 | ford,explorer 4wd,4,1999,6,auto(l5),4,14,17,r,suv 80 | ford,explorer 4wd,4,1999,6,manual(m5),4,15,19,r,suv 81 | ford,explorer 4wd,4,1999,6,auto(l5),4,14,17,r,suv 82 | ford,explorer 4wd,4,2008,6,auto(l5),4,13,19,r,suv 83 | ford,explorer 4wd,4.6,2008,8,auto(l6),4,13,19,r,suv 84 | ford,explorer 4wd,5,1999,8,auto(l4),4,13,17,r,suv 85 | ford,f150 pickup 4wd,4.2,1999,6,auto(l4),4,14,17,r,pickup 86 | ford,f150 pickup 4wd,4.2,1999,6,manual(m5),4,14,17,r,pickup 87 | ford,f150 pickup 4wd,4.6,1999,8,manual(m5),4,13,16,r,pickup 88 | ford,f150 pickup 4wd,4.6,1999,8,auto(l4),4,13,16,r,pickup 89 | ford,f150 pickup 4wd,4.6,2008,8,auto(l4),4,13,17,r,pickup 90 | ford,f150 pickup 4wd,5.4,1999,8,auto(l4),4,11,15,r,pickup 91 | ford,f150 pickup 4wd,5.4,2008,8,auto(l4),4,13,17,r,pickup 92 | ford,mustang,3.8,1999,6,manual(m5),r,18,26,r,subcompact 93 | ford,mustang,3.8,1999,6,auto(l4),r,18,25,r,subcompact 94 | ford,mustang,4,2008,6,manual(m5),r,17,26,r,subcompact 95 | ford,mustang,4,2008,6,auto(l5),r,16,24,r,subcompact 96 | ford,mustang,4.6,1999,8,auto(l4),r,15,21,r,subcompact 97 | ford,mustang,4.6,1999,8,manual(m5),r,15,22,r,subcompact 98 | ford,mustang,4.6,2008,8,manual(m5),r,15,23,r,subcompact 99 | ford,mustang,4.6,2008,8,auto(l5),r,15,22,r,subcompact 100 | ford,mustang,5.4,2008,8,manual(m6),r,14,20,p,subcompact 101 | honda,civic,1.6,1999,4,manual(m5),f,28,33,r,subcompact 102 | honda,civic,1.6,1999,4,auto(l4),f,24,32,r,subcompact 103 | honda,civic,1.6,1999,4,manual(m5),f,25,32,r,subcompact 104 | honda,civic,1.6,1999,4,manual(m5),f,23,29,p,subcompact 105 | 
honda,civic,1.6,1999,4,auto(l4),f,24,32,r,subcompact 106 | honda,civic,1.8,2008,4,manual(m5),f,26,34,r,subcompact 107 | honda,civic,1.8,2008,4,auto(l5),f,25,36,r,subcompact 108 | honda,civic,1.8,2008,4,auto(l5),f,24,36,c,subcompact 109 | honda,civic,2,2008,4,manual(m6),f,21,29,p,subcompact 110 | hyundai,sonata,2.4,1999,4,auto(l4),f,18,26,r,midsize 111 | hyundai,sonata,2.4,1999,4,manual(m5),f,18,27,r,midsize 112 | hyundai,sonata,2.4,2008,4,auto(l4),f,21,30,r,midsize 113 | hyundai,sonata,2.4,2008,4,manual(m5),f,21,31,r,midsize 114 | hyundai,sonata,2.5,1999,6,auto(l4),f,18,26,r,midsize 115 | hyundai,sonata,2.5,1999,6,manual(m5),f,18,26,r,midsize 116 | hyundai,sonata,3.3,2008,6,auto(l5),f,19,28,r,midsize 117 | hyundai,tiburon,2,1999,4,auto(l4),f,19,26,r,subcompact 118 | hyundai,tiburon,2,1999,4,manual(m5),f,19,29,r,subcompact 119 | hyundai,tiburon,2,2008,4,manual(m5),f,20,28,r,subcompact 120 | hyundai,tiburon,2,2008,4,auto(l4),f,20,27,r,subcompact 121 | hyundai,tiburon,2.7,2008,6,auto(l4),f,17,24,r,subcompact 122 | hyundai,tiburon,2.7,2008,6,manual(m6),f,16,24,r,subcompact 123 | hyundai,tiburon,2.7,2008,6,manual(m5),f,17,24,r,subcompact 124 | jeep,grand cherokee 4wd,3,2008,6,auto(l5),4,17,22,d,suv 125 | jeep,grand cherokee 4wd,3.7,2008,6,auto(l5),4,15,19,r,suv 126 | jeep,grand cherokee 4wd,4,1999,6,auto(l4),4,15,20,r,suv 127 | jeep,grand cherokee 4wd,4.7,1999,8,auto(l4),4,14,17,r,suv 128 | jeep,grand cherokee 4wd,4.7,2008,8,auto(l5),4,9,12,e,suv 129 | jeep,grand cherokee 4wd,4.7,2008,8,auto(l5),4,14,19,r,suv 130 | jeep,grand cherokee 4wd,5.7,2008,8,auto(l5),4,13,18,r,suv 131 | jeep,grand cherokee 4wd,6.1,2008,8,auto(l5),4,11,14,p,suv 132 | land rover,range rover,4,1999,8,auto(l4),4,11,15,p,suv 133 | land rover,range rover,4.2,2008,8,auto(s6),4,12,18,r,suv 134 | land rover,range rover,4.4,2008,8,auto(s6),4,12,18,r,suv 135 | land rover,range rover,4.6,1999,8,auto(l4),4,11,15,p,suv 136 | lincoln,navigator 2wd,5.4,1999,8,auto(l4),r,11,17,r,suv 137 | lincoln,navigator 
2wd,5.4,1999,8,auto(l4),r,11,16,p,suv 138 | lincoln,navigator 2wd,5.4,2008,8,auto(l6),r,12,18,r,suv 139 | mercury,mountaineer 4wd,4,1999,6,auto(l5),4,14,17,r,suv 140 | mercury,mountaineer 4wd,4,2008,6,auto(l5),4,13,19,r,suv 141 | mercury,mountaineer 4wd,4.6,2008,8,auto(l6),4,13,19,r,suv 142 | mercury,mountaineer 4wd,5,1999,8,auto(l4),4,13,17,r,suv 143 | nissan,altima,2.4,1999,4,manual(m5),f,21,29,r,compact 144 | nissan,altima,2.4,1999,4,auto(l4),f,19,27,r,compact 145 | nissan,altima,2.5,2008,4,auto(av),f,23,31,r,midsize 146 | nissan,altima,2.5,2008,4,manual(m6),f,23,32,r,midsize 147 | nissan,altima,3.5,2008,6,manual(m6),f,19,27,p,midsize 148 | nissan,altima,3.5,2008,6,auto(av),f,19,26,p,midsize 149 | nissan,maxima,3,1999,6,auto(l4),f,18,26,r,midsize 150 | nissan,maxima,3,1999,6,manual(m5),f,19,25,r,midsize 151 | nissan,maxima,3.5,2008,6,auto(av),f,19,25,p,midsize 152 | nissan,pathfinder 4wd,3.3,1999,6,auto(l4),4,14,17,r,suv 153 | nissan,pathfinder 4wd,3.3,1999,6,manual(m5),4,15,17,r,suv 154 | nissan,pathfinder 4wd,4,2008,6,auto(l5),4,14,20,p,suv 155 | nissan,pathfinder 4wd,5.6,2008,8,auto(s5),4,12,18,p,suv 156 | pontiac,grand prix,3.1,1999,6,auto(l4),f,18,26,r,midsize 157 | pontiac,grand prix,3.8,1999,6,auto(l4),f,16,26,p,midsize 158 | pontiac,grand prix,3.8,1999,6,auto(l4),f,17,27,r,midsize 159 | pontiac,grand prix,3.8,2008,6,auto(l4),f,18,28,r,midsize 160 | pontiac,grand prix,5.3,2008,8,auto(s4),f,16,25,p,midsize 161 | subaru,forester awd,2.5,1999,4,manual(m5),4,18,25,r,suv 162 | subaru,forester awd,2.5,1999,4,auto(l4),4,18,24,r,suv 163 | subaru,forester awd,2.5,2008,4,manual(m5),4,20,27,r,suv 164 | subaru,forester awd,2.5,2008,4,manual(m5),4,19,25,p,suv 165 | subaru,forester awd,2.5,2008,4,auto(l4),4,20,26,r,suv 166 | subaru,forester awd,2.5,2008,4,auto(l4),4,18,23,p,suv 167 | subaru,impreza awd,2.2,1999,4,auto(l4),4,21,26,r,subcompact 168 | subaru,impreza awd,2.2,1999,4,manual(m5),4,19,26,r,subcompact 169 | subaru,impreza 
awd,2.5,1999,4,manual(m5),4,19,26,r,subcompact 170 | subaru,impreza awd,2.5,1999,4,auto(l4),4,19,26,r,subcompact 171 | subaru,impreza awd,2.5,2008,4,auto(s4),4,20,25,p,compact 172 | subaru,impreza awd,2.5,2008,4,auto(s4),4,20,27,r,compact 173 | subaru,impreza awd,2.5,2008,4,manual(m5),4,19,25,p,compact 174 | subaru,impreza awd,2.5,2008,4,manual(m5),4,20,27,r,compact 175 | toyota,4runner 4wd,2.7,1999,4,manual(m5),4,15,20,r,suv 176 | toyota,4runner 4wd,2.7,1999,4,auto(l4),4,16,20,r,suv 177 | toyota,4runner 4wd,3.4,1999,6,auto(l4),4,15,19,r,suv 178 | toyota,4runner 4wd,3.4,1999,6,manual(m5),4,15,17,r,suv 179 | toyota,4runner 4wd,4,2008,6,auto(l5),4,16,20,r,suv 180 | toyota,4runner 4wd,4.7,2008,8,auto(l5),4,14,17,r,suv 181 | toyota,camry,2.2,1999,4,manual(m5),f,21,29,r,midsize 182 | toyota,camry,2.2,1999,4,auto(l4),f,21,27,r,midsize 183 | toyota,camry,2.4,2008,4,manual(m5),f,21,31,r,midsize 184 | toyota,camry,2.4,2008,4,auto(l5),f,21,31,r,midsize 185 | toyota,camry,3,1999,6,auto(l4),f,18,26,r,midsize 186 | toyota,camry,3,1999,6,manual(m5),f,18,26,r,midsize 187 | toyota,camry,3.5,2008,6,auto(s6),f,19,28,r,midsize 188 | toyota,camry solara,2.2,1999,4,auto(l4),f,21,27,r,compact 189 | toyota,camry solara,2.2,1999,4,manual(m5),f,21,29,r,compact 190 | toyota,camry solara,2.4,2008,4,manual(m5),f,21,31,r,compact 191 | toyota,camry solara,2.4,2008,4,auto(s5),f,22,31,r,compact 192 | toyota,camry solara,3,1999,6,auto(l4),f,18,26,r,compact 193 | toyota,camry solara,3,1999,6,manual(m5),f,18,26,r,compact 194 | toyota,camry solara,3.3,2008,6,auto(s5),f,18,27,r,compact 195 | toyota,corolla,1.8,1999,4,auto(l3),f,24,30,r,compact 196 | toyota,corolla,1.8,1999,4,auto(l4),f,24,33,r,compact 197 | toyota,corolla,1.8,1999,4,manual(m5),f,26,35,r,compact 198 | toyota,corolla,1.8,2008,4,manual(m5),f,28,37,r,compact 199 | toyota,corolla,1.8,2008,4,auto(l4),f,26,35,r,compact 200 | toyota,land cruiser wagon 4wd,4.7,1999,8,auto(l4),4,11,15,r,suv 201 | toyota,land cruiser wagon 
4wd,5.7,2008,8,auto(s6),4,13,18,r,suv 202 | toyota,toyota tacoma 4wd,2.7,1999,4,manual(m5),4,15,20,r,pickup 203 | toyota,toyota tacoma 4wd,2.7,1999,4,auto(l4),4,16,20,r,pickup 204 | toyota,toyota tacoma 4wd,2.7,2008,4,manual(m5),4,17,22,r,pickup 205 | toyota,toyota tacoma 4wd,3.4,1999,6,manual(m5),4,15,17,r,pickup 206 | toyota,toyota tacoma 4wd,3.4,1999,6,auto(l4),4,15,19,r,pickup 207 | toyota,toyota tacoma 4wd,4,2008,6,manual(m6),4,15,18,r,pickup 208 | toyota,toyota tacoma 4wd,4,2008,6,auto(l5),4,16,20,r,pickup 209 | volkswagen,gti,2,1999,4,manual(m5),f,21,29,r,compact 210 | volkswagen,gti,2,1999,4,auto(l4),f,19,26,r,compact 211 | volkswagen,gti,2,2008,4,manual(m6),f,21,29,p,compact 212 | volkswagen,gti,2,2008,4,auto(s6),f,22,29,p,compact 213 | volkswagen,gti,2.8,1999,6,manual(m5),f,17,24,r,compact 214 | volkswagen,jetta,1.9,1999,4,manual(m5),f,33,44,d,compact 215 | volkswagen,jetta,2,1999,4,manual(m5),f,21,29,r,compact 216 | volkswagen,jetta,2,1999,4,auto(l4),f,19,26,r,compact 217 | volkswagen,jetta,2,2008,4,auto(s6),f,22,29,p,compact 218 | volkswagen,jetta,2,2008,4,manual(m6),f,21,29,p,compact 219 | volkswagen,jetta,2.5,2008,5,auto(s6),f,21,29,r,compact 220 | volkswagen,jetta,2.5,2008,5,manual(m5),f,21,29,r,compact 221 | volkswagen,jetta,2.8,1999,6,auto(l4),f,16,23,r,compact 222 | volkswagen,jetta,2.8,1999,6,manual(m5),f,17,24,r,compact 223 | volkswagen,new beetle,1.9,1999,4,manual(m5),f,35,44,d,subcompact 224 | volkswagen,new beetle,1.9,1999,4,auto(l4),f,29,41,d,subcompact 225 | volkswagen,new beetle,2,1999,4,manual(m5),f,21,29,r,subcompact 226 | volkswagen,new beetle,2,1999,4,auto(l4),f,19,26,r,subcompact 227 | volkswagen,new beetle,2.5,2008,5,manual(m5),f,20,28,r,subcompact 228 | volkswagen,new beetle,2.5,2008,5,auto(s6),f,20,29,r,subcompact 229 | volkswagen,passat,1.8,1999,4,manual(m5),f,21,29,p,midsize 230 | volkswagen,passat,1.8,1999,4,auto(l5),f,18,29,p,midsize 231 | volkswagen,passat,2,2008,4,auto(s6),f,19,28,p,midsize 232 | 
volkswagen,passat,2,2008,4,manual(m6),f,21,29,p,midsize 233 | volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize 234 | volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize 235 | volkswagen,passat,3.6,2008,6,auto(s6),f,17,26,p,midsize 236 | -------------------------------------------------------------------------------- /docs/datamancer.org: -------------------------------------------------------------------------------- 1 | * Datamancer overview 2 | 3 | The =datamancer= packages is a dataframe library, which (as a 4 | companion to [[https://github.com/Vindaar/ggplotnim][ggplotnim]]) is also heavily inspired by a specific R 5 | library. In this case [[https://dplyr.tidyverse.org/][dplyr]], a dataframe library of the tidyverse. 6 | 7 | What follows is a basic introduction covering all fundamentals. We 8 | won't cover the full API in this document, but rather aim to give the 9 | understanding so that using the full library becomes easy. 10 | 11 | ** What is a dataframe? 12 | 13 | A dataframe is a data structure that consists of multiple 1 14 | dimensional datasets of equal lengths - but possibly different types - 15 | that have names associated to them. 16 | 17 | From an abstract perspective it is a set of heterogeneous arrays (array not 18 | specifically in terms of CT sized Nim arrays, but as general flat, 19 | contiguous data structures) stored in a hash table. Each entry in the 20 | table is called a column and the keys represent the column names. 21 | 22 | In a sense then they are a data structure similar to what is 23 | represented by a spreadsheet or naturally in a CSV file. 24 | 25 | One useful distinction about the nature of individual columns of a 26 | dataframe is whether the data described in it is continuous 27 | (possibly N different values for a dataframe of length N) or discrete 28 | (small number of N different values compared to a possibly much larger 29 | M number of elements). 30 | 31 | ** What is the point of a dataframe? 
32 | 33 | A huge amount of data about the real world or most physical systems 34 | can be described by individual numbers (scalars) or sets of such. For 35 | one system or topic of interest, it is often natural (or slightly less 36 | natural, but possible) to express the state of the system by a set of 37 | numbers. Let's call that a "record". From this follows that possibly: 38 | - a set of such systems 39 | - the time evolution of such a system 40 | - ... 41 | can be described by a (possibly ordered) list of such records. Such a 42 | list directly represents a dataframe. 43 | 44 | What this implies is that as long as we can write down a number of 45 | operations we can perform on or with a dataframe, we can apply such 46 | operations to a large number of possible systems. 47 | 48 | Therefore, dataframes are a very powerful datastructure. 49 | 50 | The library inspiring datamancer defines a small set of (base) 51 | operations to perform with dataframes. A small number of operations 52 | makes it easy to reason about and combine them to produce complex 53 | operations. 54 | 55 | The five base operations ("the verbs of dplyr") are: 56 | - =mutate=: modify an existing column or add a new one 57 | - =select=: select a subset of columns 58 | - =filter=: filter out a subset of records based on one or more conditions 59 | - =summarize=: reduce one or more columns to a single scalar 60 | - =arrange=: sort the dataframe according to one or more columns 61 | 62 | For discrete columns in a dataframe one more procedure is almost as 63 | basic, namely =group_by=. It allows one to iterate over all subsets of 64 | a dataframe that have the same value 'x' in a column 'A'. 65 | 66 | The five verbs above naturally combine with =groub_by=. This means if 67 | one of these operations is performed on a grouped dataframe, the 68 | operation will be performed of each subgroup instead of the full 69 | dataframe (which may or may not produce a different result). 
70 | 71 | ** Creating a dataframe 72 | 73 | With some understanding of why we might want to bother with 74 | dataframes, we can now ask ourselves how to create one. 75 | 76 | Usage typically starts with one of the following cases: 77 | 1. data already available in =seq[T]/Tensor[T]= or some Nim object from which such 78 | can be created 79 | 2. some CSV / TSV like ascii file or a =string= representing such a thing 80 | 3. some binary file like HDF5 81 | 4. some database 82 | 83 | Note about 3 and 4: simple access (without manually reading into a 84 | =seq[T]/Tensor[T]=) is not supported for these two yet. These can be 85 | added easily (code for HDF5 exists, but not as part of this 86 | repository) if there is demand. 87 | 88 | *** Supported datatypes and internal representation 89 | 90 | The datamancer dataframes currently support the following types: 91 | - =int= 92 | - =float= 93 | - =string= 94 | - =bool= 95 | - =Value= 96 | where =Value= is a variant object that can store either of the above 97 | datatypes. That type is used in case a single column stores multiple 98 | data types. 99 | 100 | At this moment there is no direct support for =DateTime= or =Time= 101 | objects. That could be added if desired. It's a bit of work, but 102 | manageable. It's mainly missing, because so far I personally didn't 103 | really need it. 104 | 105 | Internally, one column is stored in a =Column= object. This object is 106 | a variant object, with different possible =ColumnKinds= (one kind for 107 | each of the native data types). The column stores an [[https://github.com/mratsim/Arraymancer][Arraymancer]] 108 | =Tensor[T]= for the respective type of the column. 109 | 110 | The usage of variant objects allows for a fully dynamic, runtime 111 | mutable design. 
This is a trade-off between safety and convenience, 112 | which is placed more towards convenience for the simple reason that 113 | for often recurring computations of the same kind it is highly 114 | recommended to make use of a custom datatype that allows for 115 | optimizations applicable to the specific domain. 116 | 117 | *** From =seq[T]/Tensor[T]= 118 | 119 | For the case of having the data as =seq[T]=, we just use the 120 | =seqsToDf= template to create a DF from it. The template does not care 121 | whether the input is of type =seq[T]= or =Tensor[T]=. In the future 122 | support for pointer + length pairs can be added as well. 123 | 124 | There are two ways to use =seqsToDf=. Assuming we have three sequences of possibly different types: 125 | #+BEGIN_SRC nim 126 | let s1: seq[int] = @[22, 54, 34] 127 | let s2: seq[float] = @[1.87, 1.75, 1.78] 128 | let s3: seq[string] = @["Mike", "Laura", "Sue"] 129 | #+END_SRC 130 | we can either create a DF and let the library automatically deduce the 131 | column names from the Nim identifiers of the given variables: 132 | #+BEGIN_SRC nim 133 | let dfAutoNamed = seqsToDf(s1, s2, s3) 134 | #+END_SRC 135 | which will give us a DF with column names: 136 | #+BEGIN_SRC nim 137 | "s1", "s2", "s3" 138 | #+END_SRC 139 | In many cases one might rather like a different name. In this case use the following 140 | syntax: 141 | #+BEGIN_SRC nim 142 | let df = seqsToDf({ "Age" : s1, 143 | "Height" : s2, 144 | "Name" : s3 }) 145 | #+END_SRC 146 | which will then use the given strings for the column names. 
147 | 148 | If we print this dataframe we get the following output: 149 | #+begin_src 150 | Dataframe with 3 columns and 3 rows: 151 | Idx Age Height Name 152 | dtype: int float string 153 | 0 22 1.87 Mike 154 | 1 54 1.75 Laura 155 | 2 34 1.78 Sue 156 | #+end_src 157 | 158 | We see that we get information about: 159 | - the number of columns in the dataframe 160 | - the number of rows in the dataframe 161 | - the names of each column 162 | - the data types of each column 163 | - their values with one record per row 164 | - and an additional index column 165 | 166 | *** From a CSV / TSV file 167 | 168 | The second supported case is a CSV like file. For these the library 169 | provides a generalized =readCsv= proc. Strictly speaking it can also 170 | read TSV (or any delimited ASCII file) and provides a number of 171 | different options to make it suitable to a large variety of 172 | differently organized CSV files (what a euphemism): 173 | #+BEGIN_SRC nim 174 | proc readCsv*(fname: string, 175 | sep: char = ',', 176 | header: string = "", 177 | skipLines = 0, 178 | toSkip: set[char] = {}, 179 | colNames: seq[string] = @[], 180 | skipInitialSpace = true, 181 | quote = '"', 182 | ): DataFrame 183 | #+END_SRC 184 | For a regular CSV file (comma separated) with a header line storing 185 | the column names, all we need is a filename. 186 | 187 | In addition to that the separator can be changed with =sep= and the 188 | header can have a designation (e.g. =#= indicating header lines that 189 | will be skipped automatically after the first). 190 | 191 | Further a specific number of lines can be skipped after the 192 | header. =toSkip= allows to skip any set of characters. These will be 193 | completely ignored outside of columns. 194 | 195 | If the file does not have a header =colNames= can be used to give 196 | names to the columns. 197 | 198 | =skipInitialSpace= is useful to remove whitespace that might appear in 199 | addition to a separator (e.g. 
a comma separated file that also has a 200 | space after every comma). If it is false such spaces will be parsed 201 | into the data fields. 202 | 203 | Finally, =quote= allows to ignore all characters that would otherwise 204 | act as separators, line breaks, ... within these. If ="= is used as 205 | quote and spaces to separate, there may be spaces within individual 206 | fields as long as these are within quotes. 207 | 208 | Let's use this procedure to read the supplied =mpg= dataset: 209 | #+BEGIN_SRC nim :results raw 210 | import datamancer 211 | let df = readCsv("../data/mpg.csv") 212 | #+END_SRC 213 | 214 | 215 | 216 | We're now proud owner of dataframe that's a bit more exciting than the 217 | 3 column / 3 row one from before. We'll explore it in the next section. 218 | 219 | ** Manipulating a DF 220 | 221 | Now we have a DF. What then? 222 | 223 | First of all we can look at it. Echoing a DF calls the =pretty= 224 | proc. For the DF introduced above, this looks like: 225 | #+BEGIN_SRC nim 226 | echo df 227 | #+END_SRC 228 | gives for the =mpg= dataset: 229 | #+BEGIN_SRC sh 230 | #+RESULTS: 231 | Dataframe with 11 columns and 234 rows: 232 | Idx manufacturer model displ year cyl ... drv cty hwy fl class 233 | dtype: string string float int int ... string int int string string 234 | 0 audi a4 1.8 1999 4 ... f 18 29 p compact 235 | 1 audi a4 1.8 1999 4 ... f 21 29 p compact 236 | 2 audi a4 2 2008 4 ... f 20 31 p compact 237 | 3 audi a4 2 2008 4 ... f 21 30 p compact 238 | 4 audi a4 2.8 1999 6 ... f 16 26 p compact 239 | 5 audi a4 2.8 1999 6 ... f 18 26 p compact 240 | 6 audi a4 3.1 2008 6 ... f 18 27 p compact 241 | 7 audi a4 quattro 1.8 1999 4 ... "4" 18 26 p compact 242 | 8 audi a4 quattro 1.8 1999 4 ... "4" 16 25 p compact 243 | 9 audi a4 quattro 2 2008 4 ... "4" 20 28 p compact 244 | 10 audi a4 quattro 2 2008 4 ... "4" 19 27 p compact 245 | 11 audi a4 quattro 2.8 1999 6 ... "4" 15 25 p compact 246 | 12 audi a4 quattro 2.8 1999 6 ... 
"4" 17 25 p compact 247 | 13 audi a4 quattro 3.1 2008 6 ... "4" 17 25 p compact 248 | 14 audi a4 quattro 3.1 2008 6 ... "4" 15 25 p compact 249 | 15 audi a6 quattro 2.8 1999 6 ... "4" 15 24 p midsize 250 | 16 audi a6 quattro 3.1 2008 6 ... "4" 17 25 p midsize 251 | 17 audi a6 quattro 4.2 2008 8 ... "4" 16 23 p midsize 252 | 18 chevrolet c1500 suburb... 5.3 2008 8 ... r 14 20 r suv 253 | 19 chevrolet c1500 suburb... 5.3 2008 8 ... r 11 15 e suv 254 | #+END_SRC 255 | (NOTE: I shortened the output for the docs here) 256 | Notice how in the =drv= column the 4WD entries are echoed as "4" 257 | instead of just 4. That is to highlight that those values are actually 258 | stored as strings to not confuse them with numbers. 259 | 260 | By default only the first 20 entries will be shown. For more/less 261 | elements, call =pretty= directly: 262 | #+BEGIN_SRC nim 263 | echo df.pretty(100) 264 | #+END_SRC 265 | 266 | =pretty= also takes a =precision= argument. This is given to the 267 | string conversion for =float= values to set the number of digits 268 | printed after the decimal point. However, it can also be used to 269 | change the width of the columns more generally. Note however the 270 | precision is added to a width of =6= by default. Also the column is at 271 | least as wide as the longest DF key. 272 | 273 | Let's now check which cars in the dataset have the highest and lowest 274 | city fuel economy. For that we can simply arrange the dataframe 275 | according to the =cty= column and take the tail or head of the 276 | result. 277 | #+BEGIN_SRC nim :results raw 278 | echo df.arrange("cty").head(5) 279 | #+END_SRC 280 | results in: 281 | #+BEGIN_SRC sh 282 | Dataframe with 11 columns and 5 rows: 283 | Idx manufacturer model displ ... cyl ... drv cty hwy fl class 284 | dtype: string string float int string int int string string 285 | 0 dodge dakota picku... 4.7 ... 8 ... "4" 9 12 e pickup 286 | 1 dodge durango 4wd 4.7 ... 8 ... 
"4" 9 12 e suv 287 | 2 dodge ram 1500 pic... 4.7 ... 8 ... "4" 9 12 e pickup 288 | 3 dodge ram 1500 pic... 4.7 ... 8 ... "4" 9 12 e pickup 289 | 4 jeep grand cherok... 4.7 ... 8 ... "4" 9 12 e suv 290 | #+END_SRC 291 | and looking at the tail instead: 292 | #+BEGIN_SRC nim 293 | echo df.arrange("cty").tail(5) 294 | #+END_SRC 295 | will tell us that a new beetle is the most efficient car in the dataset: 296 | #+BEGIN_SRC sh 297 | Dataframe with 11 columns and 5 rows: 298 | Idx manufacturer model displ ... cyl ... drv cty hwy fl class 299 | dtype: string string float int string int int string string 300 | 0 honda civic 1.6 ... 4 ... f 28 33 r subcompact 301 | 1 toyota corolla 1.8 ... 4 ... f 28 37 r compact 302 | 2 volkswagen new beetle 1.9 ... 4 ... f 29 41 d subcompact 303 | 3 volkswagen jetta 1.9 ... 4 ... f 33 44 d compact 304 | 4 volkswagen new beetle 1.9 ... 4 ... f 35 44 d subcompact 305 | #+END_SRC 306 | (=arrange= also takes an order argument, using the Nim stdlib's 307 | =SortOrder= enum). 308 | 309 | As another example here to showcase the usage of =FormulaNodes=, let's 310 | find some cars with an engine displacement of more than 5 L and which 311 | are 2 seaters (I wonder what car might show up...): 312 | #+BEGIN_SRC nim 313 | echo df.filter(f{`displ` > 5.0 and `class` == "2seater"}) 314 | #+END_SRC 315 | #+BEGIN_SRC sh 316 | Dataframe with 11 columns and 5 rows: 317 | Idx manufacturer model displ ... cyl ... drv cty hwy fl class 318 | dtype: string string float int string int int string string 319 | 0 chevrolet corvette 5.7 ... 8 ... r 16 26 p 2seater 320 | 1 chevrolet corvette 5.7 ... 8 ... r 15 23 p 2seater 321 | 2 chevrolet corvette 6.2 ... 8 ... r 16 26 p 2seater 322 | 3 chevrolet corvette 6.2 ... 8 ... r 15 25 p 2seater 323 | 4 chevrolet corvette 7 ... 8 ... r 15 24 p 2seater 324 | #+END_SRC 325 | Surprise, surprise we found ourselves a bunch of corvettes! 
326 | 327 | (Note: for an explanation of this mythical =f{}= thing and those 328 | accented quotes, see the =Formula= section below). 329 | 330 | Finally, let's make use of a formula, which takes an assignment. Let's 331 | say we want to convert the city fuel economy of the cars from MPG to 332 | L/100 km as is the standard in Germany. We'll do this with 333 | =mutate=. =mutate= will add an additional column to the dataframe. 334 | (well, if only it was clear whether the =mpg= given are US gallon or 335 | imperial gallon?) 336 | #+BEGIN_SRC nim :results raw 337 | import datamancer 338 | let df = readCsv("../data/mpg.csv") 339 | 340 | let dfl100km = df.filter(f{`displ` > 5.0 and `class` == "2seater"}) 341 | .mutate(f{"cty / L/100km" ~ 235 / `cty`}) 342 | echo dfl100km.pretty(5) 343 | #+END_SRC 344 | shows us: 345 | #+BEGIN_SRC sh 346 | Dataframe with 12 columns and 5 rows: 347 | Idx manufacturer model displ ... trans ... cty ... cty / L/100km 348 | dtype: string string float ... string ... int ... float 349 | 0 chevrolet corvette 5.7 ... manual(m6) ... 16 ... 14.69 350 | 1 chevrolet corvette 5.7 ... auto(l4) ... 15 ... 15.67 351 | 2 chevrolet corvette 6.2 ... manual(m6) ... 16 ... 14.69 352 | 3 chevrolet corvette 6.2 ... auto(s6) ... 15 ... 15.67 353 | 4 chevrolet corvette 7 ... manual(m6) ... 15 ... 15.67 354 | #+END_SRC 355 | where I removed a couple of columns for better visibility again. 356 | 357 | I used the chaining of =filter= and =mutate= above mainly to showcase 358 | that this works reliably. 359 | 360 | When looking at the formula above note that as in ggplot2 the tilde ~ 361 | is used to indicate a dependency or in other words a mapping of 362 | something like Tensor to Tensor. 363 | 364 | ** Formula 365 | 366 | Here will go parts of what's in the README. 
-------------------------------------------------------------------------------- /src/datamancer/value.nim: --------------------------------------------------------------------------------
import tables, strutils, math, fenv, parseutils, strformat, hashes

type
  ValueKind* = enum
    VNull,
    VBool,
    VInt,
    VFloat,
    VString,
    VObject

  Value* = object
    ## Variant type storing one of the native data types supported by the
    ## data frame, or an ordered string -> `Value` mapping (`VObject`),
    ## which represents a single row of a `DataFrame`.
    case kind*: ValueKind
    of VString:
      str*: string
    of VInt:
      num*: int #BiggestInt
    of VFloat:
      fnum*: float
    of VBool:
      bval*: bool
    of VObject:
      fields*: OrderedTable[string, Value] # alternative: `seq[(string, Value)]` pairs?
    of VNull:
      discard

# forward declaration; implemented further down in this module
proc pretty*(v: Value, precision = 4, emphStrNumber = true): string

func toValKind*[T](dtype: typedesc[T]): ValueKind =
  ## Maps a native Nim type to the corresponding `ValueKind`.
  ## NOTE: for any type other than the listed ones no branch matches and
  ## the default value `VNull` is returned.
  when T is float:
    result = VFloat
  elif T is int:
    result = VInt
  elif T is bool:
    result = VBool
  elif T is string:
    result = VString
  elif T is Value:
    result = VObject

iterator items*(row: Value): Value =
  ## Yields all values stored in the `VObject` `row`.
  doAssert row.kind == VObject
  for v in values(row.fields):
    yield v

iterator keys*(row: Value): string =
  ## Yields all keys of the `VObject` `row`.
  doAssert row.kind == VObject
  for k in keys(row.fields):
    yield k

iterator pairs*(row: Value): tuple[key: string, val: Value] =
  ## Iterator for the elements of `row`. `row` has to be a `VObject`
  ## representing a row of a `DataFrame`
  # NOTE: was `assert` and said "JObject" (copy-paste from the json module);
  # use `doAssert` for consistency with the sibling iterators above
  doAssert row.kind == VObject
  for key, val in pairs(row.fields):
    yield (key, val)

proc contains*(v: Value, key: string): bool =
  ## Returns true if the `VObject` `v` has a field named `key`.
  doAssert v.kind == VObject
  result = v.fields.hasKey(key)

proc `[]`*(v: Value, key: string): Value {.inline.} =
  ## Accesses the field `key` of the `VObject` `v`. Raises `KeyError` if absent.
  doAssert v.kind == VObject
  result = v.fields[key]

proc `[]=`*(v: var Value, key: string, val: Value) {.inline.} =
  ## Assigns `val` to the field `key` of the `VObject` `v`.
  doAssert v.kind == VObject
  v.fields[key] = val

proc `%~`*(c: char): Value =
  ## we convert a `char` to a `string`!
  result = Value(kind: VString, str: $c)

proc `%~`*(v: string): Value =
  result = Value(kind: VString, str: v)

proc `%~`*(v: SomeFloat): Value =
  result = Value(kind: VFloat, fnum: v.float)

proc `%~`*(v: SomeInteger): Value =
  result = Value(kind: VInt, num: v.int)

proc `%~`*(v: bool): Value =
  result = Value(kind: VBool, bval: v)

#proc `%~`*(v: Table[string, Value]): Value =
#  result = Value(kind: VObject, fields: v.toOrderedTable)

proc `%~`*(v: OrderedTable[string, Value]): Value =
  result = Value(kind: VObject, fields: v)

proc null*(): Value =
  ## Constructs a `VNull` value.
  Value(kind: VNull)

proc newVObject*(length = 8): Value =
  ## Constructs an empty `VObject` with its table sized for about
  ## `length` fields (rounded up to the next power of two).
  result = Value(kind: VObject)
  result.fields = initOrderedTable[string, Value](nextPowerOfTwo(length))

proc `%~`*[T: not Value](s: openArray[T]): seq[Value] =
  ## converts a `seq[T]` to a `seq[Value]`
  result = newSeq[Value](s.len)
  for i, x in s:
    result[i] = %~ x

template `%~`*(s: openArray[Value]): seq[Value] = @s

proc toObject*(s: seq[(string, Value)]): Value =
  ## converts the given sequence to an object
  ## This is only used to store the result of the `groups` iterator as a
  ## `Value`.
112 | result = Value(kind: VObject) 113 | result.fields = initOrderedTable[string, Value]() 114 | for (key, val) in s: 115 | result.fields[key] = val 116 | 117 | proc toObject*(s: (string, Value)): Value = toObject(@[s]) 118 | 119 | func isNumber*(s: string): bool = 120 | ## returns true, if `s` is a number according to our rules: 121 | ## - starts with {0..9} 122 | ## - ends with {0..9} 123 | ## - may contain a single `.` 124 | ## - may contain a single `e`, `E` 125 | ## - may contain one minus, one plus at beginning and one for exponent 126 | ## - else may only contain {0..9} 127 | ## - `e`, `+`, `-`, `.` may not appear one after another 128 | ## - may contain space before and after the number 129 | ## It is only used to decide whether the stringifaction of `s` 130 | ## will be surrounded by `"`. 131 | var idx = skipWhile(s, toSkip = {' '}) 132 | template next(checkFor: untyped): untyped = 133 | if idx < s.len - 1: 134 | s[idx + 1] in checkFor 135 | else: 136 | false 137 | var 138 | negMinus = false 139 | posPlus = false 140 | expMinus = false 141 | expPlus = false 142 | numBeforeDot = false 143 | numBeforeExp = false 144 | dot = false 145 | expE = false 146 | sinceLastSpace = -1 147 | while idx < s.len: 148 | case s[idx] 149 | of '-': 150 | if next({'-', '+', '.'}): 151 | # another `-+.` after `-` 152 | return false 153 | elif not negMinus: 154 | negMinus = true 155 | elif not expMinus: 156 | expMinus = true 157 | else: 158 | # apparently has 3 minus 159 | return false 160 | of '+': 161 | if next({'+', '-'}): 162 | # another `-+.` after `-` 163 | return false 164 | elif not posPlus: 165 | posPlus = true 166 | elif not expPlus: 167 | posPlus = true 168 | else: 169 | # apparently has 3 plus 170 | return false 171 | of '0' .. 
'9': 172 | if not dot: 173 | numBeforeDot = true 174 | if not expE: 175 | numBeforeExp = true 176 | inc idx 177 | continue 178 | of '.': 179 | if next({'e', 'E'}): 180 | # `e` after `.` 181 | return false 182 | elif not dot and numBeforeDot: 183 | dot = true 184 | else: 185 | # multiple dots or number before `dot` 186 | return false 187 | of 'e', 'E': 188 | if not next({'0'..'9', '-', '+'}): 189 | # apparently ends with an 'e', 'E' 190 | return false 191 | if not expE and numBeforeExp: 192 | expE = true 193 | else: 194 | # multiple `e` or no number before `e` 195 | return false 196 | of ' ': 197 | if sinceLastSpace == -1 or sinceLastSpace == 1: 198 | # when we encounter a space, set our `spaceCounter` to 0 to start 199 | # increasing it every itereation in main loop 200 | sinceLastSpace = 0 201 | elif sinceLastSpace > 1: 202 | # apparently something between last space and this space 203 | return false 204 | else: return false # something not part of a number 205 | inc idx 206 | if sinceLastSpace >= 0: 207 | # last iter found a space, so count spaces 208 | inc sinceLastSpace 209 | return true 210 | 211 | func isNumber*(v: Value): bool = 212 | doAssert v.kind == VString 213 | result = v.str.isNumber 214 | 215 | func isInt*(s: string): bool = 216 | ## simple "most likely int" check. If the string only contains digits and 217 | ## `_` we consider it an Int 218 | s.allCharsInSet({'0' .. '9', '_'}) 219 | 220 | func isBool*(s: string): bool = 221 | s == "true" or s == "false" 222 | 223 | func isInt*(v: Value): bool = 224 | ## checks whether the string contained in `Value` is likely an integer 225 | ## For an `isFloat` equivalent see `isNumber`. 
226 | doAssert v.kind == VString 227 | result = v.str.isInt 228 | 229 | proc toFloat*(v: Value, allowNull: static bool = false): float = 230 | when not allowNull: 231 | doAssert v.kind in {VInt, VFloat} 232 | else: 233 | doAssert v.kind in {VInt, VFloat, VNull} 234 | case v.kind 235 | of VInt: result = v.num.float 236 | of VFloat: result = v.fnum 237 | of VNull: 238 | # This branch is forbidden for `allowNull = false` due to `doAssert` at top! 239 | result = 0.0 240 | else: discard 241 | 242 | proc toInt*(v: Value): int = #BiggestInt = 243 | ## Converts a numeric value to an int. If the value is a float 244 | ## we round and convert to int 245 | doAssert v.kind in {VInt, VFloat} 246 | case v.kind 247 | of VInt: result = v.num 248 | of VFloat: result = v.fnum.round.int 249 | else: discard 250 | 251 | proc toBool*(v: Value): bool = 252 | ## Checks if the value is a bool and returns its value 253 | doAssert v.kind == VBool 254 | result = v.bval 255 | 256 | proc toStr*(v: Value): string = 257 | ## Returns the value `v` as a string. If the value is of kind `VString`, 258 | ## no conversion is required. 259 | ## This however will fail, if the input is of type 260 | ## - VNull 261 | ## - VObject 262 | ## if you want string representations of those value types, use `$` 263 | case v.kind 264 | of VInt, VFloat, VBool: result = pretty(v) 265 | of VString: result = v.str 266 | else: 267 | raise newException(ValueError, "Will not convert a Value of kind " & 268 | $v.kind & " to string! Use `$` for that!") 269 | 270 | proc to*[T: int | float | string | bool](v: Value, dtype: typedesc[T]): T = 271 | when T is int: 272 | result = v.toInt 273 | elif T is float: 274 | result = v.toFloat 275 | elif T is string: 276 | result = v.toStr 277 | elif T is bool: 278 | result = v.toBool 279 | else: 280 | doAssert false, "Impossible branch!" 

template withNative*(v: Value,
                     valName: untyped,
                     body: untyped): untyped =
  ## Injects the native value stored in `v` under the name `valName` and
  ## runs `body` with it in scope. For `VNull` the `Value` itself is injected.
  case v.kind
  of VInt:
    let `valName` {.inject.} = v.num
    body
  of VFloat:
    let `valName` {.inject.} = v.fnum
    body
  of VString:
    let `valName` {.inject.} = v.str
    body
  of VBool:
    let `valName` {.inject.} = v.bval
    body
  of VNull:
    # a null value is just a null value
    let `valName` {.inject.} = v
    body
  of VObject:
    doAssert false, "not implemented / makes no sense for current usage"

template withNativeConversion*(kind: ValueKind,
                               procName: untyped,
                               body: untyped): untyped =
  ## generates an environment, in which the correct `to*` proc
  ## is named `procName` for `kind`; also injects the matching native
  ## type as `dtype`.
  case kind
  of VInt:
    template `procName`(v: Value): untyped = v.toInt
    type dtype {.inject.} = int
    body
  of VFloat:
    template `procName`(v: Value): untyped = v.toFloat
    type dtype {.inject.} = float
    body
  of VString:
    template `procName`(v: Value): untyped = v.toStr
    type dtype {.inject.} = string
    body
  of VBool:
    template `procName`(v: Value): untyped = v.toBool
    type dtype {.inject.} = bool
    body
  of VObject, VNull:
    doAssert false, "not implemented / makes no sense for current usage"


func isNull*(v: Value): Value =
  ## returns whether `v` is a `VNull` value as a `VBool`
  result = %~ (v.kind == VNull)

func almostEqual*(a, b: float, epsilon = 1e-8): bool =
  ## Relative-error float comparison, robust near zero and for infinities.
  # taken from
  # https://floating-point-gui.de/errors/comparison/
  let
    absA = abs(a)
    absB = abs(b)
    diff = abs(a - b)
  if a == b: # shortcut, handles infinities
    result = true
  elif a == 0 or b == 0 or (absA + absB) < minimumPositiveValue(float64):
    # a or b is zero or both are extremely close to it
    # relative error is less meaningful here
    result = diff < (epsilon * minimumPositiveValue(float64))
  else:
    # use relative error
    result = diff / min(absA + absB, maximumPositiveValue(float64)) < epsilon

proc `==`*(v, w: Value): bool =
  ## checks whether the values are equal.
  ## Note: if both values are numbers of different kind (`VInt` and `VFloat`) the
  ## values are both compared as floats!
  ## The float comparison happens with a floating point comparison with relatively
  ## large epsilon (`1e-8`).
  if v.kind != w.kind and
     v.kind in {VInt, VFloat} and
     w.kind in {VInt, VFloat}:
    result = almostEqual(v.toFloat, w.toFloat)
  elif v.kind != w.kind:
    result = false
  else:
    case v.kind
    of VString:
      result = v.str == w.str
    of VInt:
      result = v.num == w.num
    of VFloat:
      result = almostEqual(v.fnum, w.fnum)
    of VBool:
      result = v.bval == w.bval
    of VObject:
      # NOTE: taken from json module
      # we cannot use OrderedTable's equality here as
      # the order does not matter for equality here.
      if v.fields.len != w.fields.len: return false
      for key, val in v.fields:
        if not w.fields.hasKey(key): return false
        if w.fields[key] != val: return false
      result = true
    of VNull:
      result = true

proc `<`*(v, w: Value): bool =
  ## checks whether the `v` is smaller than `w`
  ## Note: this is only defined for a subset of the possible types!
  ## Note2: if both are numbers of different kind (`VInt` and `VFloat`) the
  ## values are compared as a float! For very large values this would be problematic,
  ## but here we are lenient and assume the user uses `Value` for small calculations!
392 | if v.kind != w.kind and 393 | v.kind in {VFloat, VInt} and 394 | w.kind in {VFloat, VInt}: 395 | result = v.toFloat < w.toFloat 396 | elif v.kind != w.kind and 397 | v.kind in {VFloat, VInt, VString} and 398 | w.kind in {VFloat, VInt, VString}: 399 | # compare as strings 400 | result = $v < $w 401 | elif v.kind == w.kind: 402 | case v.kind 403 | of VString: 404 | result = v.str < w.str 405 | of VInt: 406 | result = v.num < w.num 407 | of VFloat: 408 | result = v.fnum < w.fnum 409 | of VBool: 410 | result = v.bval < v.bval 411 | of VObject: 412 | # checks if objects have the same field, and if so whether the 413 | # fields of `v` are smaller than those of `w` 414 | result = true 415 | for k in keys(v): 416 | if k notin w: 417 | return false 418 | if v[k] < w[k]: 419 | return true 420 | elif v[k] > w[k]: 421 | return false 422 | # else v[k] is equal to w[k], continue 423 | else: 424 | raise newException(Exception, "Comparison `<` does not make sense for " & 425 | "Value kind " & $v.kind & "!") 426 | 427 | proc `<=`*(v, w: Value): bool = 428 | ## checks whether `v` is smaller or equal than `w` 429 | if v == w: 430 | result = true 431 | elif v < w: 432 | result = true 433 | 434 | proc smallerOrFalse*(v: Value, f: float): bool {.inline.} = 435 | ## extension of `<` for `Value` to return `false` if `v` is 436 | ## not a valid VInt/VFloat. 437 | case v.kind 438 | of VInt, VFloat: result = v.toFloat < f 439 | else: result = false 440 | 441 | proc largerOrFalse*(v: Value, f: float): bool {.inline.} = 442 | ## extension of `<` for `Value` to return `false` if `v` is 443 | ## not a valid VInt/VFloat. 444 | case v.kind 445 | of VInt, VFloat: result = v.toFloat > f 446 | else: result = false 447 | 448 | template makeMath(op: untyped): untyped = 449 | proc `op`*(v, w: Value): Value = 450 | ## Adds two Values together, if they are addeable. 451 | ## These operations only work for `VInt` and `VFloat`. `VInt` is converted 452 | ## to floats for the calculation. 
The result is always a `VFloat`! 453 | if v.kind in {VFloat, VInt} and 454 | w.kind in {VFloat, VInt}: 455 | result = Value(kind: VFloat, fnum: `op`(v.toFloat, w.toFloat)) 456 | elif v.kind == VNull or w.kind == VNull: 457 | result = Value(kind: VNull) 458 | else: 459 | raise newException(Exception, "Math operation does not make sense for " & 460 | "Value kind " & $v.kind & "!") 461 | 462 | makeMath(`+`) 463 | makeMath(`-`) 464 | makeMath(`*`) 465 | makeMath(`/`) 466 | 467 | proc formatFloatValue(v: Value, precision: int): string = 468 | ## Performs the formatting of a value of kind `VFloat` to string. 469 | ## If the values are smaller < 1e-5 or > 1e5 scientific notation is 470 | ## used. 471 | doAssert v.kind == VFloat 472 | let f = v.fnum 473 | if almostEqual(abs(f), 0.0): 474 | # to make sure zero is not formatted in scientific 475 | result = f.formatBiggestFloat(format = ffDefault, 476 | precision = precision) 477 | elif abs(f) >= 1e5 or abs(f) <= 1e-5: 478 | result = f.formatBiggestFloat(format = ffScientific, 479 | precision = precision) 480 | else: 481 | result = f.formatBiggestFloat(format = ffDefault, 482 | precision = precision) 483 | result.trimZeros() 484 | 485 | proc pretty*(v: Value, precision = 4, emphStrNumber = true): string = 486 | ## converts the given value to its value as a string. For `VFloat` the 487 | ## precision can be given. 488 | ## If `emphStrNumber` is true, a number stored as a string will be emphasized 489 | ## by enclosing it with explicit `"`. This is mainly for printing DFs to show 490 | ## the user if a number is a number or a string. 
# (body of `pretty` — signature and docstring directly above)
  case v.kind
  of VInt:
    result = $v.num
  of VFloat:
    result = formatFloatValue(v, precision = precision)
  of VBool:
    result = $v.bval
  of VString:
    let vstr = v.str
    # a string that *looks* like a number (or is empty) is quoted when
    # `emphStrNumber` is set, so it cannot be mistaken for a numeric column
    if emphStrNumber and (vstr.len == 0 or vstr.isNumber):
      result = "\"" & vstr & "\""
    else:
      result = vstr
  of VObject:
    result.add "{"
    var idx = 0
    for k, x in v.fields:
      # FIX: forward `precision` and `emphStrNumber` to nested values.
      # Previously nested fields were always rendered with the defaults,
      # ignoring the caller's arguments.
      if idx == v.fields.len - 1:
        result.add (&"{k}: {pretty(x, precision, emphStrNumber)}")
      else:
        result.add (&"{k}: {pretty(x, precision, emphStrNumber)}, ")
      inc idx
    result.add "}"
  of VNull:
    result = "null"

template `$`*(v: Value): string = pretty(v)

proc hash*(x: Value): Hash =
  ## Computes a hash of `x` based on its kind and payload. `VObject` folds
  ## all keys and values into the hash; `VNull` contributes a fixed `0`.
  case x.kind
  of VInt:
    result = hash(x.num)
  of VFloat:
    result = hash(x.fnum)
  of VString:
    result = hash(x.str)
  of VBool:
    result = hash(x.bval)
  of VObject:
    for k, v in x.fields:
      result = result !& hash(k)
      result = result !& hash(v)
  of VNull:
    result = 0
  result = !$result

proc contains*(v: Value, has: Value): bool =
  ## checks whether `has` is a subset of `v` if both are `VObject`.
  ## A subset means that all keys of `has` are in `v` and their values match.
540 | ## There may be more fields in `v` than in `has` 541 | doAssert v.kind == VObject 542 | doAssert has.kind == VObject 543 | result = true 544 | for key, val in has: 545 | if key in v: result = result and val == v[key] 546 | else: result = false 547 | -------------------------------------------------------------------------------- /data/03-sample_hugo.csv: -------------------------------------------------------------------------------- 1 | ,BU,CT,IE,Cs134,Cs137,Eu154,U235,Pu239 2 | 0,23,3102.5,1.5,1.88424083219819E-07,2.84566917879002E-05,2.69679813433217E-07,8.22522171634031E-05,0.000121476067501 3 | 1,30,7482.5,1.5,5.20168249685682E-09,2.79799245808655E-05,1.52985963215638E-07,5.42325572566305E-05,0.00012902660029 4 | 2,37,4197.5,1.5,1.45199693687987E-07,4.21499771616296E-05,4.17562528731865E-07,3.59841349105039E-05,0.000134992514145 5 | 3,52.5,6752.5,1.5,2.16965292708179E-08,4.9979226189458E-05,3.50484414413579E-07,1.50402501420245E-05,0.000145445609543 6 | 4,65.5,3102.5,1.5,8.08478210919765E-07,7.74388092788155E-05,9.6942317979626E-07,7.48810453458104E-06,0.000152838359631 7 | 5,30,91.25,1.6,4.63841694081471E-06,4.46352992585013E-05,7.77532100448014E-07,6.15403234287554E-05,0.000129928345245 8 | 6,34.5,8395,1.6,2.71624118536392E-09,3.01868876171812E-05,1.5011569350238E-07,4.75259853623482E-05,0.000133654186214 9 | 7,57,1368.75,1.6,3.42660361651213E-06,7.59309237451187E-05,1.25129574849322E-06,1.35365167615274E-05,0.000148671041607 10 | 8,57,5840,1.6,5.59338990804771E-08,5.7224184909055E-05,4.66071542695002E-07,1.35888359894115E-05,0.000148643804069 11 | 9,27.5,365,1.7,3.08790706111218E-06,4.02748433202162E-05,6.37972051142385E-07,8.0098420206549E-05,0.000128426787842 12 | 10,35,7847.5,1.7,4.55574059261689E-09,3.16746996946991E-05,1.7181072908398E-07,5.23717817097577E-05,0.000134991510437 13 | 11,46.5,8942.5,1.7,2.48735805627675E-09,3.87972156630181E-05,1.90403175772635E-07,2.75877308521199E-05,0.000142981084191 14 | 
12,59.5,3011.25,1.7,7.96251684592367E-07,7.12445831121348E-05,9.07121405568088E-07,1.3636066711407E-05,0.000150575807944 15 | 13,69,0,1.7,1.49265327403358E-05,9.88690244415683E-05,2.01340259501913E-06,8.28861591879921E-06,0.000153276312875 16 | 14,22,0,1.8,2.90373875406948E-06,3.30913019608103E-05,4.77490315744394E-07,0.000121758621764,0.000120539544125 17 | 15,25.5,1277.5,1.8,1.16012736983286E-06,3.52881874632362E-05,4.59893268685747E-07,0.000100175469562,0.000127054363219 18 | 16,26.5,912.5,1.8,1.7327977866186E-06,3.74997135846502E-05,5.29825016639736E-07,9.47339573003523E-05,0.000128198386755 19 | 17,33,4745,1.8,7.13603005947803E-08,3.63836420446526E-05,3.12466586610326E-07,6.59608160877924E-05,0.000134213270087 20 | 18,41.5,10037.5,1.8,7.62128808328029E-10,3.24376701791896E-05,1.30322533184758E-07,4.12345752265645E-05,0.000140363154609 21 | 19,47,4015,1.8,2.34107824534528E-07,5.35110960085349E-05,5.70492650681161E-07,3.04507778332328E-05,0.000143916276119 22 | 20,52,2828.75,1.8,7.83136922917532E-07,6.34056189350734E-05,8.22049647800648E-07,2.32157392917567E-05,0.000146817626605 23 | 21,62.5,8212.5,1.8,6.87804608714419E-09,5.3582261986737E-05,2.99610775104532E-07,1.33723153739579E-05,0.000152502738238 24 | 22,30,1916.25,1.9,8.36280351381586E-07,3.97094102572973E-05,5.07645566666362E-07,8.68630440901851E-05,0.000132844577617 25 | 23,38,1277.5,1.9,2.15337518561199E-06,5.19298470601829E-05,8.09901114158842E-07,5.61107891156539E-05,0.000139108882434 26 | 24,53.5,8942.5,1.9,2.92233194107228E-09,4.42348036618602E-05,2.19836844358774E-07,2.43914997367572E-05,0.000148412808665 27 | 25,60.5,1186.25,1.9,4.22002555451605E-06,8.10200157713558E-05,1.37100182269368E-06,1.68026687660663E-05,0.000152071751921 28 | 26,33.5,10950,2,2.37102513185971E-10,2.49113875906117E-05,8.0074474374429E-08,7.99719871297953E-05,0.000136470519707 29 | 27,42,1277.5,2,2.4254257067081E-06,5.7064627799565E-05,9.10682513286478E-07,5.06086453634115E-05,0.000142603154497 30 | 
28,43.5,1368.75,2,2.35387900927969E-06,5.86867070024351E-05,9.32409364204074E-07,4.67023160056911E-05,0.0001435108385 31 | 29,54,6387.5,2,3.09576495075891E-08,5.24380012777716E-05,3.90032393618273E-07,2.67856778131325E-05,0.000149329758953 32 | 30,55,2646.25,2,9.93495740299208E-07,6.76086861284335E-05,9.08677692101019E-07,2.53699250898907E-05,0.000149887448471 33 | 31,56,1277.5,2,3.59022668275634E-06,7.49948614715264E-05,1.25146635182496E-06,2.40576105262813E-05,0.000150428448657 34 | 32,63,4562.5,2,1.98674557633134E-07,6.79636755148268E-05,6.7676922005073E-07,1.67574308214044E-05,0.000153936298314 35 | 33,67,1095,2,5.22728817639055E-06,8.96687736058673E-05,1.5426931459686E-06,1.36173719692919E-05,0.000155893642228 36 | 34,21.5,2828.75,2.1,1.97723968104349E-07,2.70118094679826E-05,2.36070042227702E-07,0.000165212293199,0.000124203940056 37 | 35,49,638.75,2.1,5.46270267176982E-06,6.88667800104983E-05,1.25401431277852E-06,3.90510848236718E-05,0.000147326554534 38 | 36,63,10585,2.1,7.7649021566376E-10,4.64176135745528E-05,1.79067180800253E-07,1.89690560657233E-05,0.000154413095565 39 | 37,67,3650,2.1,4.96884096553506E-07,7.62635593290236E-05,8.78491802953668E-07,1.54066192028162E-05,0.000156327077237 40 | 38,20,9125,2.2,5.32430172169919E-10,1.69145080529553E-05,5.11924529435906E-08,0.000193533922559,0.000122354486384 41 | 39,21.5,2737.5,2.2,2.11540310587198E-07,2.71566327246085E-05,2.3764135565168E-07,0.000179539138945,0.000125019319375 42 | 40,43,5840,2.2,3.72326055268189E-08,4.37040245355853E-05,3.41242111108583E-07,5.97032949624967E-05,0.000144986683356 43 | 41,70,2098.75,2.2,2.18141611682241E-06,8.7616458296196E-05,1.28793217955485E-06,1.49238711868889E-05,0.000158437242276 44 | 42,23.5,4562.5,2.3,4.57767290248913E-08,2.6398125325377E-05,1.83820327773495E-07,0.000176265679368,0.000128897548561 45 | 43,29.5,5110,2.3,4.10918341716166E-08,3.18617907626002E-05,2.36760756475201E-07,0.000131024932382,0.000136109650376 46 | 
44,29.5,5475,2.3,2.93672016721176E-08,3.11345434891859E-05,2.18422247414788E-07,0.000131028842674,0.000136106117348 47 | 45,44,182.5,2.3,6.98711992995184E-06,6.38838727806421E-05,1.2201243792048E-06,6.27576594604642E-05,0.000146561015337 48 | 46,49,10767.5,2.3,4.82357609660259E-10,3.62589235826303E-05,1.33816824290292E-07,4.86787975916935E-05,0.000149177297326 49 | 47,25.5,1916.25,2.4,5.96883185594255E-07,3.38012430953463E-05,3.75683842906509E-07,0.000173083460019,0.000132596749798 50 | 48,33,6022.5,2.4,2.0674265174375E-08,3.34729559038151E-05,2.27545834357507E-07,0.000120165750448,0.000140296505555 51 | 49,42,7665,2.4,6.56987453770483E-09,3.80366689459262E-05,2.19952111029734E-07,7.67501009789035E-05,0.000146380458078 52 | 50,43,4197.5,2.4,1.65933464504733E-07,4.84526606907299E-05,4.87587934085048E-07,7.29430193370778E-05,0.000146982123714 53 | 51,49.5,9855,2.4,1.12604669687437E-09,3.87745412763824E-05,1.65443253781577E-07,5.26055972965375E-05,0.000150442032886 54 | 52,62,4562.5,2.4,1.91740969715746E-07,6.68613788194208E-05,6.67874776913479E-07,2.79772465073893E-05,0.000156186304692 55 | 53,21.5,8212.5,2.5,1.31783309179537E-09,1.91848747122469E-05,6.84622393270823E-08,0.000225119126276,0.000127151621839 56 | 54,35.5,3285,2.5,2.87216546287636E-07,4.27140929279224E-05,4.61837743435703E-07,0.000115921927652,0.000143428237943 57 | 55,39,1825,2.5,1.28334477030796E-06,5.13140131423784E-05,7.26285325954644E-07,9.76820554493864E-05,0.000145817577478 58 | 56,52,8212.5,2.5,5.32038798564765E-09,4.49918266769322E-05,2.49938625879728E-07,5.13167662681124E-05,0.000152613774364 59 | 57,62,273.75,2.5,9.90200500807536E-06,8.7673305735931E-05,1.72268710251402E-06,3.10924614929168E-05,0.000156953339519 60 | 58,53.5,182.5,2.6,8.94277818391889E-06,7.68016411031811E-05,1.52059576421386E-06,5.24400570978479E-05,0.000154187483512 61 | 59,59,6022.5,2.6,4.76355617012128E-08,5.82461515549754E-05,4.63678286227929E-07,4.00358183001655E-05,0.000156506990044 62 | 
60,70,1368.75,2.6,4.23219712970039E-06,9.16517448464663E-05,1.51861511803921E-06,2.32840071608402E-05,0.000161088014568 63 | 61,32.5,8577.5,2.7,1.8569238760323E-09,2.80264166849705E-05,1.24107891534068E-07,0.000157211169067,0.000143133452792 64 | 62,40.5,1368.75,2.7,1.98111263854068E-06,5.46337539847379E-05,8.33352339779185E-07,0.000108066806193,0.000148922689771 65 | 63,32.5,6387.5,2.8,1.37931166221163E-08,3.21800488637227E-05,2.00207231077099E-07,0.000169259124846,0.000144131205246 66 | 64,67.5,912.5,2.8,6.10542277137965E-06,9.11285117423774E-05,1.63206094620404E-06,3.22085217825989E-05,0.000161495430874 67 | 65,33.5,2463.75,2.9,5.31989361174768E-07,4.24672749908237E-05,4.96893733293116E-07,0.000174015840376,0.00014625125964 68 | 66,48,3650,2.9,3.13676834351377E-07,5.56552517674718E-05,6.26694760020223E-07,8.98301044523825E-05,0.000155042502487 69 | 67,61.5,5110,2.9,1.12173503162396E-07,6.3996485149028E-05,5.89601513091981E-07,4.73947200701079E-05,0.000160171235532 70 | 68,62,9672.5,2.9,1.7033436722549E-09,4.83206444166339E-05,2.17103004825399E-07,4.63343759456962E-05,0.000160307978863 71 | 69,65,6752.5,3,2.66634058662711E-08,6.07534396881419E-05,4.34772880691482E-07,4.40724741498069E-05,0.000162462226848 72 | 70,45,1368.75,3.1,2.27672954486003E-06,6.03971774209416E-05,9.55322919229101E-07,0.000120872224047,0.000156080408219 73 | 71,47.5,6205,3.1,2.89554639346418E-08,4.68477355614104E-05,3.51382346017484E-07,0.000108092876596,0.000157207820402 74 | 72,54.5,3467.5,3.1,4.33379363422111E-07,6.34028140839783E-05,7.52915453031601E-07,7.85086498851637E-05,0.000159957606697 75 | 73,55,3650,3.1,3.7145463759155E-07,6.32221363090696E-05,7.30642229565115E-07,7.67231033448639E-05,0.000160150257777 76 | 74,36.5,5657.5,3.2,3.16725917231739E-08,3.76782164184727E-05,2.75972036706425E-07,0.000187847154834,0.000152121558456 77 | 75,54.5,1733.75,3.2,2.12475064095556E-06,7.07308883473186E-05,1.10371475424681E-06,8.52021748692741E-05,0.000161078859401 78 | 
76,64.5,2920,3.2,8.89478177759677E-07,7.68182522700119E-05,1.00798141630531E-06,5.38670635702221E-05,0.000164093664495 79 | 77,67.5,10950,3.2,5.84688353024502E-10,4.82405658810163E-05,1.78827477733339E-07,4.6982033924468E-05,0.000165045623022 80 | 78,22.5,9125,3.3,5.63250461108869E-10,1.8892410969189E-05,5.58744084045619E-08,0.000348235378678,0.000134706251734 81 | 79,35,1368.75,3.3,1.51256445753264E-06,4.74357516460313E-05,6.64911951285053E-07,0.000213148728791,0.000152099200556 82 | 80,47,3650,3.3,2.94175587791291E-07,5.44772278223077E-05,6.0888407965754E-07,0.000128512366211,0.000159461318315 83 | 81,49,8030,3.3,5.57715018432749E-09,4.29751537589923E-05,2.4366003244946E-07,0.000117827647799,0.000160300825473 84 | 82,57,182.5,3.3,9.41744967395323E-06,8.1398376793143E-05,1.63717932414911E-06,8.24897647148494E-05,0.000163098483309 85 | 83,20.5,9307.5,3.4,3.95571617312328E-10,1.70361854640802E-05,4.43140201939785E-08,0.000393902320102,0.000130537567242 86 | 84,23.5,4197.5,3.4,5.62552691641991E-08,2.69225173325624E-05,1.78242019946569E-07,0.000352777581404,0.000137453533567 87 | 85,28,8395,3.4,1.61759100829611E-09,2.45098288816846E-05,9.70591950446074E-08,0.000297917752529,0.000145274816682 88 | 86,38,1277.5,3.4,1.87457933162703E-06,5.16554354127345E-05,7.66609364901576E-07,0.000200817950708,0.000155780336611 89 | 87,39.5,2646.25,3.4,5.67248315749234E-07,4.917750018283E-05,5.99543570378547E-07,0.000188912063257,0.000156780418361 90 | 88,41.5,7117.5,3.4,9.79259948912521E-09,3.87988607540754E-05,2.38298430114336E-07,0.000174004246317,0.000157888024816 91 | 89,34,0,3.5,4.97844398442882E-06,5.02624187265034E-05,8.50801363868222E-07,0.000249870673361,0.000151693492283 92 | 90,47,3285,3.5,4.05633982223382E-07,5.57183792712714E-05,6.58172178865455E-07,0.00014779893824,0.000161878976038 93 | 91,50.5,8395,3.5,4.01240853299933E-09,4.31160661267432E-05,2.31444632448452E-07,0.000127550767974,0.000163176115337 94 | 
92,56.5,1003.75,3.5,4.31045142038704E-06,7.65919611745065E-05,1.35301311788902E-06,9.83883569661425E-05,0.000165248472908 95 | 93,39.5,1095,3.6,2.32263545422418E-06,5.42183429047053E-05,8.4025588691569E-07,0.000213675207593,0.000159072706347 96 | 94,50.5,182.5,3.6,7.62722444735789E-06,7.24679791647171E-05,1.41939150414643E-06,0.000136613451681,0.000164603292982 97 | 95,53,10585,3.6,5.73686720558575E-10,3.93012009645533E-05,1.51527682317434E-07,0.000123098264348,0.000165344713927 98 | 96,36,4927.5,3.7,5.78485108683733E-08,3.88822870909328E-05,3.11350097635092E-07,0.00025909489375,0.000157554310919 99 | 97,42.5,4197.5,3.7,1.45792973357398E-07,4.77166617290667E-05,4.67439001289781E-07,0.000201756899975,0.000162284481186 100 | 98,46.5,1368.75,3.7,2.28890163259763E-06,6.22244327944638E-05,9.89521520863111E-07,0.000172070885289,0.000164385514731 101 | 99,52,2737.5,3.7,7.56229496788469E-07,6.33860318926623E-05,8.38039721613337E-07,0.000137481552692,0.000166473100679 102 | 100,58,3467.5,3.7,4.58586252211486E-07,6.71531173161345E-05,8.10994334174174E-07,0.000106890994825,0.000168290296925 103 | 101,30,9490,3.8,6.40331965546492E-10,2.44387625081422E-05,8.36724601961178E-08,0.000339652048512,0.000151621342621 104 | 102,33,1551.25,3.8,1.10164064910971E-06,4.42282414374551E-05,5.66746050974128E-07,0.000305010281416,0.000155508277133 105 | 103,45.5,3285,3.8,3.75973074242247E-07,5.39685726426877E-05,6.28366018920651E-07,0.00019040988815,0.00016522953801 106 | 104,52,4745,3.8,1.18455229417992E-07,5.58122417678851E-05,5.37496144921828E-07,0.000146866530874,0.000167746310583 107 | 105,68,9490,3.8,2.20174366907928E-09,5.31935091462065E-05,2.51802812756313E-07,7.49729835500201E-05,0.000171720756448 108 | 106,27.5,10585,3.9,1.98703993458076E-10,2.09420936906045E-05,5.58738660582861E-08,0.000388158328344,0.00014848314924 109 | 107,47,10037.5,3.9,7.86127359841421E-10,3.63108965047742E-05,1.47481313410847E-07,0.000190803887459,0.000167082015773 110 | 
108,60.5,6935,3.9,1.92850765041233E-08,5.59891085472045E-05,3.93051616997789E-07,0.000110729974217,0.000171258290747 111 | 109,64,5475,3.9,8.0642267065254E-08,6.47582519470785E-05,5.76690804674461E-07,9.56176459225205E-05,0.000172070995873 112 | 110,66.5,1825,3.9,2.45602897465434E-06,8.45741500868411E-05,1.34315926830512E-06,8.59741530735528E-05,0.000172679334672 113 | 111,48,2007.5,4,1.30754374086722E-06,6.15605407925548E-05,8.92687865768183E-07,0.000194569651108,0.000168825823392 114 | 112,50.5,2190,4,1.16779927832354E-06,6.37605417151054E-05,9.10400036152838E-07,0.000176649595062,0.000169814707897 115 | 113,53.5,730,4,4.91935939958526E-06,7.38922152696555E-05,1.35287990509034E-06,0.000156986745215,0.000170882804304 116 | 114,20,2190,4.1,2.53193427262266E-07,2.60961685726902E-05,1.90525524401579E-07,0.000537311016215,0.000132059850636 117 | 115,40,3285,4.1,3.02537861233079E-07,4.77229922786362E-05,5.20040145250274E-07,0.000276400956748,0.000165404769447 118 | 116,47.5,1460,4.1,2.11167305946489E-06,6.30790286226831E-05,9.92002455140442E-07,0.000209989289025,0.000169836087151 119 | 117,57,1825,4.1,1.97091994001427E-06,7.32136924948565E-05,1.1474202041492E-06,0.00014546390347,0.000173034296308 120 | 118,48,7665,4.2,7.0530181779051E-09,4.30198736212015E-05,2.55341740177435E-07,0.000217923769424,0.00017131597745 121 | 119,61,0,4.2,1.13445611606013E-05,8.74382067715301E-05,1.84406246342814E-06,0.000132112212817,0.000173367045266 122 | 120,61,3193.75,4.2,6.00100917165137E-07,7.144289602227E-05,9.10779426409667E-07,0.00013215626642,0.00017523138502 123 | 121,23,1095,4.3,8.6159081699018E-07,3.20172921065982E-05,3.13197171041991E-07,0.000527837263678,0.000141301108312 124 | 122,30,9855,4.3,4.37811170667107E-10,2.38557482404698E-05,7.47403426493968E-08,0.000424794490921,0.000155809192702 125 | 123,31.5,4927.5,4.3,4.33800062313869E-08,3.41036463797568E-05,2.40672274830482E-07,0.000404805567203,0.000158130368674 126 | 
124,48.5,2646.25,4.3,7.21668021195263E-07,5.96706126346351E-05,7.84676815662326E-07,0.000225984474308,0.000173019350982 127 | 125,49,2098.75,4.3,1.21452492505462E-06,6.23826596397898E-05,8.97821608234253E-07,0.000221900304447,0.000173245095635 128 | 126,26.5,6022.5,4.4,1.18656722746171E-08,2.69288249267324E-05,1.37586388271734E-07,0.000492664044451,0.000149841659476 129 | 127,28,1368.75,4.4,9.48521714532006E-07,3.81447654147683E-05,4.26384666162163E-07,0.000470496924228,0.000152925407662 130 | 128,30.5,8395,4.4,1.66906452249052E-09,2.65335121105305E-05,1.04949728586575E-07,0.000435436634922,0.000157211508954 131 | 129,48,4562.5,4.4,1.20806560057035E-07,5.23253185104261E-05,5.05679141764975E-07,0.000242395441286,0.000173946972912 132 | 130,53.5,9125,4.4,2.10893312001623E-09,4.3407006453399E-05,2.12001611943617E-07,0.000198622668704,0.000176048708847 133 | 131,57.5,1642.5,4.4,2.31784923237589E-06,7.4628267699939E-05,1.21013075981255E-06,0.000170952163928,0.000177244105302 134 | 132,65.5,9307.5,4.4,2.38727694426232E-09,5.18769101012059E-05,2.56008209045958E-07,0.000125591046909,0.000178709955652 135 | 133,21.5,1551.25,4.5,4.90923196475934E-07,2.9102497554807E-05,2.43080910773364E-07,0.0005922166796,0.000137791550454 136 | 134,29.5,6935,4.5,6.1378450671227E-09,2.82190898491947E-05,1.36287980437162E-07,0.000467222403179,0.000156376779669 137 | 135,31.5,1186.25,4.5,1.33431975843804E-06,4.31937270893179E-05,5.42845704184384E-07,0.000439296532111,0.000159733133613 138 | 136,32.5,8760,4.5,1.32560385065279E-09,2.75779550076423E-05,1.07800903381432E-07,0.000425944071653,0.000161170349981 139 | 137,41.5,5840,4.5,2.89478475192148E-08,4.19540982073863E-05,3.07142377068495E-07,0.000318574775265,0.00017101880159 140 | 138,56,912.5,4.5,4.32055707816737E-06,7.62012593502799E-05,1.3768286730741E-06,0.000191335836553,0.000178145768364 141 | 139,57,3467.5,4.5,4.23254161958962E-07,6.59277932283289E-05,8.00435760623841E-07,0.000184425828159,0.000178394775013 142 | 
140,69.5,1368.75,4.5,3.85987111971363E-06,9.06040113593491E-05,1.57121366953204E-06,0.000114134612242,0.000180679431678 143 | 141,20,456.25,4.6,1.19388807447226E-06,2.90984685411028E-05,2.66417441784073E-07,0.000638722132659,0.000133376783508 144 | 142,34,2190,4.6,6.01814421261134E-07,4.3654656555647E-05,4.96574888437103E-07,0.000423142967891,0.000164291650993 145 | 143,35.5,8030,4.6,3.00483897926296E-09,3.14618164924499E-05,1.47329872510756E-07,0.000403921308235,0.000166177335752 146 | 144,62,3467.5,4.6,4.66654095776323E-07,7.1240644469803E-05,8.78151099699867E-07,0.000161971045475,0.000180817374224 147 | 145,68,0,4.6,1.31034028597544E-05,9.67776491110723E-05,2.08626692405056E-06,0.000128796117512,0.000179840801425 148 | 146,69,1277.5,4.6,4.13141924936205E-06,9.04947486558296E-05,1.59566114797963E-06,0.000123909185096,0.000181876547105 149 | 147,22,1460,4.7,5.47972185663642E-07,2.99310160658316E-05,2.55153292322911E-07,0.000623951168786,0.000139936881015 150 | 148,29,6022.5,4.7,1.35565618713115E-08,2.93916868276575E-05,1.59577997496372E-07,0.000511111560613,0.000156944125156 151 | 149,44.5,9490,4.7,1.11555875851811E-09,3.56056741210485E-05,1.52037634052887E-07,0.000316157896917,0.000175668073016 152 | 150,58,7847.5,4.7,7.61615602764121E-09,5.07827632347685E-05,3.11220225349629E-07,0.00019850936878,0.000181373528443 153 | 151,59.5,1003.75,4.7,4.31153641656711E-06,8.02115760706824E-05,1.45283744632622E-06,0.000187918214041,0.000181801799911 154 | 152,33,8577.5,4.8,1.57085942711193E-09,2.8300458309801E-05,1.13477824549112E-07,0.000471502344469,0.0001643796442 155 | 153,38.5,547.5,4.8,3.33040706191584E-06,5.46125900810378E-05,8.73249671536257E-07,0.000399305413092,0.000171655811867 156 | 154,44,91.25,4.8,6.20702363810397E-06,6.38162090967527E-05,1.18880799001349E-06,0.000335827327081,0.000176511725026 157 | 155,49,273.75,4.8,6.27800331433849E-06,6.99434360119171E-05,1.33676850147627E-06,0.000285136931536,0.000179616900623 158 | 
156,25,365,4.9,1.8633539808157E-06,3.6353181205431E-05,4.13339358579954E-07,0.000612673686487,0.000148968193773 159 | 157,45,8030,4.9,4.29653252763622E-09,3.94573809497852E-05,2.12110653122401E-07,0.000339676671363,0.000178214078695 160 | 158,59,6935,4.9,1.78747615188695E-08,5.46564954648793E-05,3.88870453183095E-07,0.00021261717434,0.000184301139232 161 | 159,21,5657.5,5.1,1.02007643855556E-08,2.19137084858016E-05,8.87595359564826E-08,0.000724107631595,0.000137783978028 162 | 160,23.5,4562.5,5.1,3.44518189096089E-08,2.62284859066288E-05,1.42176643661872E-07,0.00067869149593,0.000145582151695 163 | 161,55,9855,5.1,1.07366447414235E-09,4.24931535374513E-05,1.87008243427609E-07,0.000268759293348,0.000185890569049 164 | 162,68,6752.5,5.1,2.54520882646995E-08,6.30691102336121E-05,4.74544954413735E-07,0.000171599771892,0.000188689972871 165 | 163,21.5,2372.5,5.2,2.17512766732052E-07,2.76029346645149E-05,1.90738447587297E-07,0.000735528881152,0.000139756076879 166 | 164,26.5,3376.25,5.2,1.26994621430226E-07,3.17969274763896E-05,2.3285504633767E-07,0.000646696062247,0.000154135208938 167 | 165,44,456.25,5.2,4.30907201760011E-06,6.23147665875661E-05,1.08451939019097E-06,0.000396227226394,0.000181085797798 168 | 166,62.5,7847.5,5.2,8.09243414090313E-09,5.43459007721383E-05,3.39694510071525E-07,0.000219038064312,0.000189093883813 169 | 167,21,6205,5.3,6.06976182378544E-09,2.11625721747778E-05,7.73458024772474E-08,0.00076588082163,0.000138104573455 170 | 168,26.5,2190,5.3,3.75471498377032E-07,3.42702135139823E-05,3.00603380575311E-07,0.000666450030361,0.000154485339736 171 | 169,27,7482.5,5.3,2.97626361668341E-09,2.49715948233781E-05,9.68531875544561E-08,0.000657978585458,0.000155658516717 172 | 170,28,2555,5.3,2.96228269204825E-07,3.53375811648085E-05,3.08439926058218E-07,0.000641078913408,0.000158108590841 173 | 171,34,10402.5,5.3,2.97412259870175E-10,2.59361144536243E-05,7.80728860798333E-08,0.000546571217363,0.000169712765823 174 | 
172,46.5,6752.5,5.3,1.4298039849228E-08,4.41144672885905E-05,2.93199915928956E-07,0.0003824428,0.000183892545536 175 | 173,49,1368.75,5.3,2.21283793799348E-06,6.52019005384815E-05,1.04159320140915E-06,0.000354514484622,0.000185638918096 176 | 174,61.5,2372.5,5.3,1.20757076124977E-06,7.56620609989457E-05,1.11576546387201E-06,0.000237640485428,0.000190045813873 177 | 175,53.5,9125,5.4,1.96848600584309E-09,4.33217959102557E-05,2.11434806370186E-07,0.000321789842407,0.000189042990405 178 | 176,66,2190,5.4,1.59076826212051E-06,8.17968630358488E-05,1.2642652194713E-06,0.000214283800177,0.000192363941075 179 | 177,66.5,8395,5.4,5.32738223742902E-09,5.5634942988013E-05,3.23905647330256E-07,0.000210765092405,0.000192350292514 180 | 178,31,2555,5.5,3.40615902724639E-07,3.89376740086019E-05,3.6756703598401E-07,0.000630498744899,0.000165867180771 181 | 179,32.5,7482.5,5.5,3.97774799133893E-09,2.9851639929216E-05,1.35393859337921E-07,0.000606596236117,0.000168735720129 182 | 180,42.5,8030,5.5,3.73094531693998E-09,3.7308444241546E-05,1.91015653325763E-07,0.000463137968867,0.000182800935576 183 | 181,43.5,3650,5.5,2.18775192334382E-07,5.03339421583874E-05,5.22162531865527E-07,0.00045021980584,0.000183817269152 184 | 182,31.5,4015,5.6,9.08037974550098E-08,3.60549574349836E-05,2.72697981908349E-07,0.000641386230522,0.000167168339403 185 | 183,35,2098.75,5.6,6.39130560107712E-07,4.5088162141988E-05,5.07011251091202E-07,0.000586159031619,0.000173620351684 186 | 184,52.5,2007.5,5.6,1.31664562891036E-06,6.67244438908396E-05,9.89455607577136E-07,0.000359804426455,0.000191230741628 187 | 185,61.5,10585,5.6,6.18413370097652E-10,4.49802752515297E-05,1.82316015669672E-07,0.000272732981288,0.000194292130607 188 | 186,65,6205,5.6,3.80997009891195E-08,6.25223068770416E-05,5.13334516126856E-07,0.000243543269718,0.000195008440139 189 | 187,46,3285,5.7,3.32371427757252E-07,5.43328072340943E-05,6.152094651117E-07,0.000450713878079,0.000188016463733 190 | 
188,39.5,182.5,5.8,4.53281636847046E-06,5.71980844418222E-05,9.5100353762247E-07,0.000555333312912,0.000182030925926 191 | 189,44.5,1460,5.8,1.67322377082537E-06,5.9063513318847E-05,8.70570735206198E-07,0.000486223955951,0.00018765900733 192 | 190,51,2463.75,5.8,8.12118343712406E-07,6.30373548783621E-05,8.55542169831879E-07,0.000405874826594,0.000192770734464 193 | 191,69.5,4927.5,5.8,1.35479860007801E-07,7.21568022398813E-05,7.35771357585528E-07,0.000230555149196,0.000198407914838 194 | 192,35.5,2828.75,5.9,3.27517267393713E-07,4.36318724172379E-05,4.35873030329728E-07,0.000633947762216,0.000176568220118 195 | 193,39,1277.5,5.9,1.60664330175007E-06,5.27109136403554E-05,7.26425052165191E-07,0.000580356641421,0.000182112845599 196 | 194,41,10220,5.9,4.54134034528331E-10,3.1357828313774E-05,1.09147618378452E-07,0.000551350211943,0.000184554873875 197 | 195,63,4197.5,5.9,2.25870663356088E-07,6.88924620861157E-05,7.7248119641614E-07,0.000295784831154,0.000198612011975 198 | 196,24.5,5657.5,6,1.27528266937624E-08,2.54665834255973E-05,1.13577395090974E-07,0.000845540227973,0.000151200338751 199 | 197,46,730,6,3.42186828376995E-06,6.38355974393622E-05,1.07336615509102E-06,0.000499336878165,0.000191230615976 200 | 198,49.5,7847.5,6,5.52315857717899E-09,4.36511548177852E-05,2.49743661989427E-07,0.000454579917402,0.000194033925007 201 | 199,50.5,6022.5,6,2.9802432324916E-08,4.98430591925984E-05,3.82509655798467E-07,0.000442275314365,0.000194694090096 202 | -------------------------------------------------------------------------------- /src/datamancer/formulaExp.nim: -------------------------------------------------------------------------------- 1 | import macros, sequtils, strformat, options, sets, tables, algorithm, strutils 2 | import formulaNameMacro 3 | 4 | import column, value, df_types 5 | 6 | type 7 | AssignKind* = enum 8 | byIndex, byTensor 9 | ## replace occurence of nnkAccQuote, nnkCallStrLit, nnkBracketExpr(df) by: 10 | ReplaceByKind = enum 11 | rbIndex ## by 
call to tensor index, `tT[idx]`, in a `for` loop
    rbElement ## by single element, `tIdx` in a `forEach` call
    rbColumn  ## by full tensor (df column), `colT` in `<<` formula
  ## `impure` in the context of `FormulaNode` refers to evaluation requiring
  ## a data frame. Pure formulas represent raw expressions that evaluate to
  ## a simple `Value`
  FormulaKind* = enum
    fkNone = "none" ## default value for uninitialized formula / no formula kind at CT yet
    fkVariable = "" ## Nim variable as `Value`, pure
    fkAssign = "<-" ## assignment op, pure
    fkVector = "~" ## map operation, impure
    fkScalar = "<<" ## reduce operation, impure
  ## either: `t in df["foo", int]`
  ## or: `t = df["foo", int]`
  Assign* = object
    ## A single column access parsed from a formula, describing how the
    ## occurrence is replaced in the generated code.
    asgnKind*: AssignKind
    node*: NimNode ## the exact node that will be replaced by this `Assign` instance
    ## TODO: rename / change `ReplaceByKind` as it's currently a bit unclear, in particular after
    ## `get` and `delete` was added!
    rbKind*: ReplaceByKind ## stores how this should be inserted
    element*: NimNode # e.g. `t`
    tensor*: NimNode # either `t` or `t_T` if `element` used (typo fix: was `elmenent`)
    col*: NimNode # name of the column
    colType*: NimNode # e.g. `float`
    resType*: NimNode # the resulting type of the computation `Assign` is ``involved`` in!
  Preface* = object
    ## the collection of `Assign`s a formula declares (cf. `parsePreface` below)
    args*: seq[Assign]
  FormulaCT* = object
    ## Compile-time representation of a full formula.
    funcKind*: FormulaKind
    preface*: Preface
    typeHint*: TypeHint # the actual type hint given in the formula
    resType*: NimNode # e.g. `ident"float"`
    name*: NimNode # name of the formula -> refers to new column / assignment name
    rawName*: string # name of the formula body as lisp
    loop*: NimNode # loop needs to be patched to remove accented quotes etc
  ## `Lift` stores a node which needs to be lifted out of a for loop, because it performs a
  ## reducing operation on a full DF column.
It will be replaced by `liftedNode` in the loop
  ## body.
  Lift* = object
    toLift*: NimNode     # the reducing expression to hoist out of the loop
    liftedNode*: NimNode # the replacement node inserted in the loop body

  ## The `TypeHint` and `HeuristicType` are only used for the shortform of the
  ## `Formula` syntax using `f{}`.
  ##
  ## In the general shortform `Formula` syntax `f{}` a `TypeHint` is of the form
  ## `f{float -> int: }`, where the first value is the type we use to read the
  ## DF tensors and the second the output datatype of the operation.
  ##
  ## If no `TypeHint` is found in a formula, we do attempt to heuristically determine
  ## sensible data types. (doc fix: previously said "If a `TypeHint` is found",
  ## contradicting the `HeuristicType` doc below.)
  TypeHint* = object
    inputType*: Option[NimNode]
    resType*: Option[NimNode]

  ## `HeuristicType` stores the input and output types of a formula constructed using the
  ## `f{}` syntax based on simple heuristic rules about the presence of certain operators
  ## and typed symbols (e.g. proc calls, literals). They are only used if no `TypeHint`
  ## is supplied.
  HeuristicType* = TypeHint

  ## `FormulaTypes` finally is the type used for input and output of the formula
  ## construction.
Here the types *must* be set, otherwise it's a CT error (which happens
  ## if we cannot deduce a type and no hints are given)
  FormulaTypes* = object
    inputType*: NimNode
    resType*: NimNode

## Idents used for the generated code
const
  InIdent = "in"
  ResIdent = "res"
  ResultIdent = "result"
  RIdent = "r"
  DFIdent = "df"
  IdxIdent = "idx"
  ColIdent = "Column"
  ValueIdent = "Value"

## Data types supported for column access in formulas; `DtypesAll` additionally
## includes the explicit 64-bit aliases.
const Dtypes* = ["float", "int", "string", "bool", "Value"]
const DtypesAll* = ["float", "float64", "int", "int64", "string", "bool", "Value"]

## Priority map used by `sortTypes` below to order candidate types; judging by
## the `bool` note, larger values are the preferred types — TODO confirm at the
## call sites of `sortTypes`.
const DtypeOrderMap* = {
  "Value" : 1,
  "Tensor[Value]" : 2,
  "Tensor[T]" : 3,
  "T" : 4,
  "Tensor[string]" : 5,
  "string" : 6,
  "Tensor[int]" : 7,
  "int" : 8,
  "Tensor[int64]" : 9,
  "int64" : 10,
  "Tensor[float]" : 11,
  "float" : 12,
  "Tensor[float64]" : 13,
  "float64" : 14,
  "Tensor[bool]" : 15,
  "bool" : 16 # if something can be done with `bool`, take that
}.toTable()
const DtypeOrderMapKeys = toSeq(DtypeOrderMap.keys())

proc toStrType*(n: NimNode): NimNode =
  ## Maps a literal / identifier node to the ident of its corresponding
  ## type name (`int`, `float`, `string`, `bool`); anything else is passed
  ## through as an ident of its own repr.
  case n.kind
  of nnkIntLit .. nnkUInt64Lit: result = ident "int"
  of nnkFloatLit ..
nnkFloat128Lit: result = ident "float"
  of nnkStrLit: result = ident "string"
  of nnkIdent, nnkSym:
    if n.strVal in ["true", "false"]: result = ident "bool"
    else: result = ident(n.repr)
  else: result = ident(n.repr)

proc isValidType*(n: NimNode): bool = n.strVal in DtypeOrderMapKeys

proc sortTypes*(s: seq[string]): seq[string] =
  ## sorts the types according to our own "priority list"
  ## (ascending by `DtypeOrderMap` value; types not in the map keep id 0 and
  ## therefore sort first)
  var ids = newSeq[int](s.len)
  for i, el in s:
    if el in DtypeOrderMap:
      ids[i] = DtypeOrderMap[el]
  result = zip(s, ids).sortedByIt(it[1]).mapIt(it[0])

proc sortTypes*(s: seq[NimNode]): seq[string] =
  ## overload: filters to valid type nodes, then sorts their string names
  result = s.filterIt(it.isValidType).mapIt(it.strVal).sortTypes()

proc isColumnType*(n: NimNode): bool =
  ## `true` if `n` denotes a `Tensor[...]` type (bracket expression or a
  ## symbol/ident whose name starts with "Tensor")
  case n.kind
  of nnkBracketExpr:
    if n[0].kind in {nnkSym, nnkIdent} and n[0].strVal == "Tensor":
      result = true
  of nnkSym, nnkIdent:
    if n.strVal.startsWith("Tensor"):
      result = true
  else: discard

proc checkIdent(n: NimNode, s: string): bool =
  ## `true` if `n`'s first child is the ident `s`
  result = n.len > 0 and n[0].kind == nnkIdent and n[0].strVal == s

proc extractCall(stmts: NimNode, id: string): NimNode =
  ## returns the first `nnkCall` in `stmts` named `id`; a nil node if absent
  expectKind(stmts, nnkStmtList)
  for ch in stmts:
    case ch.kind
    of nnkCall:
      if checkIdent(ch, id):
        return ch
    else: continue

proc parsePreface(n: NimNode): Preface =
  ## Parses a `preface:` block into a `Preface`. Two statement forms are
  ## accepted: `t in df[<col>, <type>]` (by-index access) and
  ## `t = df[<col>, <type>]` (by-tensor access).
  proc addInfixAssign(ch: NimNode): Assign =
    doAssert checkIdent(ch, "in")
    doAssert ch[1].kind == nnkIdent, "First element before `in` needs to be an ident!"
    doAssert ch[2].kind == nnkBracketExpr, "`in` must refer to a `df[, ]`!"
    doAssert ch[2][0].strVal == "df", "`in` must refer to a `df[, ]`!"
    let elId = ch[1].strVal
    let dtype = ch[2][2].strVal
    doAssert dtype in Dtypes, "Column dtype " & $dtype & " not in " & $Dtypes & "!"
167 | result = Assign(asgnKind: byIndex, 168 | node: ch, 169 | element: ident(elId), 170 | tensor: ident(elId & "T"), 171 | col: ch[2][1], 172 | colType: ident(dtype)) 173 | proc addAsgnAssign(ch: NimNode): Assign = 174 | doAssert ch[0].kind == nnkIdent, "First element before `=` needs to be an ident!" 175 | doAssert ch[1].kind == nnkBracketExpr, "`=` must assign from a `df[, ]`!" 176 | doAssert ch[1][0].strVal == "df", "`=` must assign from a `df[, ]`!" 177 | let tId = ch[0].strVal 178 | let dtype = ch[1][2].strVal 179 | doAssert dtype in Dtypes, "Column dtype " & $dtype & " not in " & $Dtypes & "!" 180 | result = Assign(asgnKind: byTensor, 181 | node: ch, 182 | element: ident(tId & "Idx"), 183 | tensor: ident(tId), 184 | col: ch[1][1], 185 | colType: ident(dtype)) 186 | 187 | expectKind(n, nnkCall) 188 | expectKind(n[1], nnkStmtList) 189 | for ch in n[1]: 190 | case ch.kind 191 | of nnkInfix: result.args.add addInfixAssign(ch) 192 | of nnkAsgn: result.args.add addAsgnAssign(ch) 193 | else: error("Invalid node kind " & $ch.kind & " in `preface`: " & (ch.repr)) 194 | 195 | proc parseSingle(n: NimNode): NimNode = 196 | expectKind(n[1], nnkStmtList) 197 | result = n[1][0] 198 | 199 | proc parseLoop(n: NimNode): NimNode = 200 | expectKind(n[1], nnkStmtList) 201 | result = n[1] 202 | 203 | func removeCallAcc(n: NimNode): NimNode = 204 | result = if n.kind == nnkAccQuoted: newLit(n[0].strVal) 205 | elif n.kind == nnkCallStrLit: n[1] 206 | else: n 207 | 208 | proc convertPreface(p: Preface): NimNode = 209 | ## TODO: 210 | ## anything that contains a type of `Tensor[T]` needs to be handled differently. 
211 | ## Instead of generating a `let colT = df["col", dType]` we need to just call 212 | ## the function that 213 | proc toLet(a: Assign): NimNode = 214 | result = nnkIdentDefs.newTree( 215 | a.tensor, 216 | newEmptyNode(), 217 | nnkBracketExpr.newTree(ident(DfIdent), a.col.removeCallAcc(), 218 | ident(a.colType.strVal)) #convert nnkSym to nnkIdent 219 | ) 220 | result = nnkLetSection.newTree() 221 | var seenTensors = initHashSet[string]() 222 | for arg in p.args: 223 | if arg.tensor.strVal notin seenTensors: 224 | result.add toLet(arg) 225 | seenTensors.incl arg.tensor.strVal 226 | 227 | proc convertDtype(d: NimNode): NimNode = 228 | result = nnkVarSection.newTree( 229 | nnkIdentDefs.newTree( 230 | ident(ResIdent), 231 | newEmptyNode(), 232 | nnkCall.newTree( 233 | nnkBracketExpr.newTree(ident"newTensorUninit", 234 | d), 235 | nnkDotExpr.newTree(ident(DfIdent), 236 | ident"len")) 237 | ) 238 | ) 239 | 240 | proc `$`*(p: Preface): string = 241 | result = "Preface(" 242 | for i, ch in p.args: 243 | result.add &"Assign(element: {ch.element.strVal}, " 244 | result.add &"asgnKind: {ch.asgnKind}, " 245 | result.add &"node: {ch.node.repr}, " 246 | result.add &"tensor: {ch.tensor.strVal}, " 247 | result.add &"col: {buildName(ch.col)}, " 248 | result.add &"rbKind: {ch.rbKind}, " 249 | result.add &"colType: {buildName(ch.colType)}, " 250 | result.add &"resType: {buildName(ch.resType)})" 251 | if i < p.args.high: 252 | result.add ", " 253 | result.add ")" 254 | 255 | proc contains(p: Preface, n: NimNode): bool = 256 | for arg in p.args: 257 | if arg.node == n: 258 | return true 259 | 260 | proc `[]`(p: Preface, n: NimNode): Assign = 261 | for arg in p.args: 262 | if arg.node == n: 263 | return arg 264 | error("Could not find " & n.repr & " in preface containing " & $p) 265 | 266 | proc delete(p: var Preface, n: NimNode) = 267 | var idx = 0 268 | while idx < p.args.len: 269 | if p.args[idx].node == n: 270 | p.args.delete(idx) 271 | # deleted so return 272 | ## TODO: we don't 
depend on removing all "duplicates" (same column ref), right?
      return
    inc idx

proc nodeIsDf*(n: NimNode): bool =
  ## `true` if `n` accesses a *full column*: `df["col"]`, `col("col")`,
  ## an accented quote or a call string literal.
  if n.kind == nnkBracketExpr:
    result = n[0].kind == nnkIdent and n[0].strVal == "df"
  elif n.kind == nnkCall:
    result = n[0].kind == nnkIdent and n[0].strVal == "col"
  elif n.kind in {nnkCallStrLit, nnkAccQuoted}:
    result = true

proc nodeIsDfIdx*(n: NimNode): bool =
  ## `true` if `n` accesses the *current element* of a column:
  ## `df["col"][idx]`, `idx("col")`, accented quote or call string literal.
  if n.kind == nnkBracketExpr:
    result = n[0].kind == nnkBracketExpr and n[0][0].kind == nnkIdent and
      n[0][0].strVal == "df" and n[1].kind == nnkIdent and n[1].strVal == "idx"
  elif n.kind == nnkCall:
    result = n[0].kind == nnkIdent and n[0].strVal == "idx"
  elif n.kind in {nnkCallStrLit, nnkAccQuoted}:
    result = true

proc hasExplicitTypeHint*(n: NimNode): bool =
  ## `true` if a column access carries an explicit dtype as its third
  ## argument, e.g. `idx("col", float)`.
  result = (n.nodeIsDf or n.nodeIsDfIdx) and
    n.kind == nnkCall and
    n.len == 3 and
    n[2].kind in {nnkIdent, nnkSym} and
    n[2].strVal in DtypesAll

proc get(p: var Preface, node: NimNode, useIdx: bool): NimNode =
  ## Returns the replacement node for `node` and removes it from the preface.
  ## For `byIndex` assignments either the element ident (`t`) or the indexed
  ## tensor (`tT[idx]`) is produced, depending on `useIdx`; `byTensor`
  ## assignments always yield the tensor ident.
  let n = p[node]
  p.delete(node)
  result = if n.asgnKind == byIndex:
             if useIdx:
               nnkBracketExpr.newTree(
                 n.tensor,
                 ident(IdxIdent)
               )
             else:
               n.element
           else:
             n.tensor

proc replaceByIdx(n: NimNode, preface: var Preface): NimNode =
  ## recurses the node `n` and replaces all occurences by `t[idx]` for each
  ## tensor in the loop
  # first check if an ident that is in preface we have to replace or if
  # an `nnkBracketExpr` which contains an ident from `preface`. In those cases
  # return early
  case n.kind
  of nnkIdent, nnkSym:
    if n in preface: return preface.get(n, useIdx = true)
    else: return n
  of nnkAccQuoted:
    return preface.get(n, useIdx = true)
  of nnkCallStrLit:
    return preface.get(n, useIdx = true)
  of nnkBracketExpr:
    # NOTE(review): unlike `replaceByElement`, this branch returns `n`
    # *unchanged* for a plain `ident[...]` found in the preface — confirm this
    # asymmetry is intentional (the node may already be in indexed form here).
    if n[0].kind == nnkIdent and n in preface:
      return n
    # if `df["someCol"]` replace by full tensor (e.g. in a call taking tensor)
    if nodeIsDf(n) and n in preface:
      return preface.get(n, useIdx = true)
    if nodeIsDfIdx(n) and n in preface:
      return preface.get(n, useIdx = true)
  of nnkCall:
    if (nodeIsDf(n) or nodeIsDfIdx(n)) and n in preface:
      return preface.get(n, useIdx = true)
  else: result = n
  if n.len > 0:
    result = newTree(n.kind)
    for ch in n:
      result.add replaceByIdx(ch, preface)

proc replaceByElement(n: NimNode, preface: var Preface): NimNode =
  ## recurses the node `n` and replaces all occurences by `t` for each
  ## tensor in the loop
  # first check if an ident that is in preface we have to replace or if
  # an `nnkBracketExpr` which contains an ident from `preface`. In those cases
  # return early
  case n.kind
  of nnkIdent, nnkSym:
    if n in preface: return preface.get(n, useIdx = false)
    else: return n
  of nnkAccQuoted:
    return preface.get(n, useIdx = false)
  of nnkCallStrLit:
    return preface.get(n, useIdx = false)
  of nnkBracketExpr:
    if n[0].kind == nnkIdent and n in preface:
      return preface.get(n, useIdx = false)
    # for `df["someCol"]` replace by full tensor, e.g. for call taking tensor
    if nodeIsDf(n) and n in preface:
      return preface.get(n, useIdx = false)
    if nodeIsDfIdx(n) and n in preface:
      return preface.get(n, useIdx = false)
  of nnkCall:
    if (nodeIsDf(n) or nodeIsDfIdx(n)) and n in preface:
      return preface.get(n, useIdx = false)
  else: result = n
  if n.len > 0:
    result = newTree(n.kind)
    for ch in n:
      result.add replaceByElement(ch, preface)

proc replaceByColumn(n: NimNode, preface: var Preface): NimNode =
  ## recurses the node `n` and replaces all occurences by full `col` (i.e. field `tensor`) for each
  ## tensor in the loop
  case n.kind
  of nnkIdent, nnkSym:
    if n in preface: return preface[n].tensor
    else: return n
  of nnkAccQuoted:
    return preface[n].tensor
  of nnkCallStrLit:
    return preface[n].tensor
  of nnkBracketExpr:
    if n[0].kind == nnkIdent and n in preface:
      return preface[n].tensor
    # for `df["someCol"]` replace by full tensor, e.g. for call taking tensor
    if nodeIsDf(n) and n in preface:
      return preface[n].tensor
    if nodeIsDfIdx(n) and n in preface:
      # reducing formulas operate on whole tensors; per-element access is wrong
      error("Invalid usage of `idx` in a reducing formula! Access: " & $(n.repr))
  of nnkCall:
    if (nodeIsDf(n) or nodeIsDfIdx(n)) and n in preface:
      return preface[n].tensor
  else: result = n
  if n.len > 0:
    result = newTree(n.kind)
    for ch in n:
      result.add replaceByColumn(ch, preface)

proc fixupTensorIndices(loopStmts: NimNode, preface: var Preface,
                        rbKind: ReplaceByKind): NimNode =
  ## If `toElements` is true, we rewrite everything by `t` (where `t` is an
  ## element of `tT` (Tensor).
This includes
  expectKind(loopStmts, nnkStmtList)
  case rbKind
  of rbIndex:
    let loop = loopStmts[0].replaceByIdx(preface)
    case loop.kind
    of nnkAsgn:
      doAssert loop[0].kind == nnkBracketExpr and
        loop[0][0].kind == nnkIdent and loop[0][0].strVal == "r" and
        loop[0][1].kind == nnkIdent and loop[0][1].strVal == "idx"
      ## TODO: make this prettier / fix this
      # NOTE(review): `result` is left unset (nil) in this branch — confirm
      # callers never reach it with a plain `r[idx] = ...` assignment loop.
    else:
      # turn this into an nnkAsgn node with `res` as LHS and `nnkAsgn` on RHS
      result = nnkAsgn.newTree(
        nnkBracketExpr.newTree(ident(ResIdent), ident(IdxIdent)),
        loop)
  of rbElement:
    let loop = loopStmts[0].replaceByElement(preface)
    case loop.kind
    of nnkAsgn: doAssert loop[0].kind == nnkIdent and loop[0].strVal == RIdent
    else:
      # turn this into an nnkAsgn node with `res` as LHS and `nnkAsgn` on RHS
      result = nnkAsgn.newTree(ident(RIdent), loop)
  of rbColumn:
    let loop = loopStmts[0].replaceByColumn(preface)
    case loop.kind
    of nnkAsgn: doAssert loop[0].kind == nnkIdent and loop[0].strVal == ResIdent
    else:
      # turn this into an `nnkVarSection` node with `res` as LHS and `loop` as RHS
      result = nnkVarSection.newTree(
        nnkIdentDefs.newTree(
          ident(ResIdent),
          newEmptyNode(),
          loop)
      )

proc convertLoop(p: Preface, dtype, loop: NimNode,
                 fnKind: FormulaKind): NimNode =
  ## Generates the loop of the formula closure: a plain `for idx` loop, a
  ## zipped `forEach` (if all involved types are mem-copyable) or — for
  ## reducing formulas — a single expression over the full tensors.
  let memCopyable = ["float", "int", "bool"]
  let isMemCopyable = dtype.strVal in memCopyable and
    p.args.allIt(it.colType.strVal in memCopyable)
  proc genForLoop(p: Preface, loop: NimNode): NimNode =
    # index based `for idx in 0 ..< df.len` loop
    var mpreface = p
    let loopIndexed = fixupTensorIndices(loop, mpreface, rbKind = rbIndex)
    let idx = ident(IdxIdent)
    let df = ident(DfIdent)
    var loop = quote do:
      for `idx` in 0 ..< `df`.len:
        `loopIndexed`
    result = newStmtList(loop)

  proc genForEach(p: Preface, loop: NimNode): NimNode =
    # element-wise `forEach` (arraymancer) over result and all input tensors
    var mpreface = p
    let loopElements = fixupTensorIndices(loop, mpreface, rbKind = rbElement)
    var forEach = nnkCommand.newTree(ident"forEach")
    forEach.add nnkInfix.newTree(ident(InIdent), ident(RIdent), ident(ResIdent))
    for arg in p.args:
      forEach.add nnkInfix.newTree(ident(InIdent), arg.element, arg.tensor)
    forEach.add nnkStmtList.newTree(loopElements)
    result = newStmtList(forEach)

  proc addResultVector(): NimNode =
    # converts the local `res` tensor into the resulting `Column`
    let resId = ident(ResIdent)
    let resultId = ident(ResultIdent)
    result = quote do:
      `resultId` = toColumn `resId`

  case fnKind
  of fkVector:
    if not isMemCopyable:
      result = genForLoop(p, loop)
      result.add addResultVector()
    else:
      result = genForEach(p, loop)
      result.add addResultVector()
  of fkScalar:
    let resultId = ident(ResultIdent)
    var mpreface = p
    let loopElements = fixupTensorIndices(loop, mpreface, rbKind = rbColumn)
    let resId = ident(ResIdent)
    result = quote do:
      `loopElements`
      `resultId` = %~ `resId`
  else:
    error("Invalid FormulaKind `" & $(fnKind.repr) & "` in `convertLoop`. Already handled " &
      "in `compileFormula`!")

proc parseFormulaCT(stmts: NimNode): FormulaCT =
  ## Parses the explicit formula DSL sections into a `FormulaCT`.
  let preface = parsePreface(extractCall(stmts, "preface"))
  ## TODO: if `dtype` not given: auto determine
  let dtype = parseSingle(extractCall(stmts, "dtype"))
  let name = parseSingle(extractCall(stmts, "name"))
  let loop = parseLoop(extractCall(stmts, "loop"))
  result = FormulaCT(preface: preface,
                     resType: dtype,
                     name: name,
                     loop: loop)

proc generateClosure*(fct: FormulaCT): NimNode =
  ## Assembles the closure `proc (df: DataFrame): Column` (vector) or
  ## `proc (df: DataFrame): Value` (scalar) for the given formula.
  var procBody = newStmtList()
  procBody.add convertPreface(fct.preface)
  if fct.funcKind == fkVector:
    procBody.add convertDtype(fct.resType)
  procBody.add convertLoop(fct.preface, fct.resType, fct.loop, fct.funcKind)
  result = procBody
  var params: array[2, NimNode]
  case fct.funcKind
  of fkVector:
    params = [ident(ColIdent),
              nnkIdentDefs.newTree(ident(DfIdent),
                                   ident"DataFrame",
                                   newEmptyNode())]
  of fkScalar:
    when (NimMajor, NimMinor, NimPatch) < (1, 5, 0):
      let valueId = ident(ValueIdent)
    else:
      let valueId = nnkDotExpr.newTree(ident"value", ident(ValueIdent))
    # to avoid clashes with other `Value` objects, fully clarify we mean ours
    params = [valueId,
              nnkIdentDefs.newTree(ident(DfIdent),
                                   ident"DataFrame",
                                   newEmptyNode())]
  else:
    error("Invalid FormulaKind `" & $(fct.funcKind.repr) & "` in `convertLoop`. Already handled " &
      "in `compileFormula`!")
  result = newProc(newEmptyNode(),
                   params = params,
                   body = procBody,
                   procType = nnkLambda)

proc compileFormula(stmts: NimNode): NimNode =
  ## Full pipeline: parse the DSL, then emit the closure.
  let fct = parseFormulaCT(stmts)
  result = generateClosure(fct)

macro formula(y: untyped): untyped =
  ## TODO: add some ability to explicitly create formulas of
  ## different kinds more easily! Essentially force the type without
  ## a check to avoid having to rely on heuristics.
  ## Use
  ## - `<-` for assignment
  ## - `<<` for reduce operations, i.e. scalar proc?
  ## - `~` for vector like proc
  ## - formula without any of the above will be considered:
  ##   - `fkVariable` if no column involved
  ##   - `fkVector` else
  ## - `: `: simple type hint for tensors in closure
  ## - ` -> : `: full type for closure.
  ##   `` is the dtype used for tensors, `` the resulting type
  ## - `df[]`: to access columns using identifiers / symbols
  ##   defined in the scope
  ## - `idx`: can be used to access the loop iteration index
  result = compileFormula(y)

when false: # isMainModule:
  import math
  import arraymancer / laser / strided_iteration / foreach
  let f1 = formula:
    preface:
      t in df["foo", int] # t refers to each element of `foo` in the loop
      u in df["bar", float]
      v = df["baz", int] # v refers to the ``Tensor`` `baz`
    dtype: float
    name: "fooBar"
    loop:
      t.float * u + v[idx].float

  let f2 = f{ parseInt(`t`) > 5 }


  #let f2 = fn:
  #  preface:
  #    t in df["foo", int] # t refers to each element of `foo` in the loop
  #    u in df["bar", float]
  #    v = df["baz", int] # v refers to the ``Tensor`` `baz`
  #    #r in result
  #  dtype: bool
  #  name: "filterme"
  #  loop:
  #    t.float > u and v[idx].float < 2.2
  #
  #let f3 = fn:
  #  preface:
  #    t in df["foo", float] # t refers to each element of `foo` in the loop
  #  dtype: bool
  #  name: "noNan"
  #  loop:
  #    not (classify(t) == fcNan)
--------------------------------------------------------------------------------
/src/datamancer/io.nim:
--------------------------------------------------------------------------------
import dataframe, value, column

import memfiles, streams,
strutils, tables, parsecsv, sequtils
# for `showBrowser`
import browsers, strformat, os

# no-op for backward compatibility with `toDf(readCsv(...))`
proc toDf*(df: DataFrame): DataFrame {.deprecated: "`toDf` is not required anymore, because `readCsv` " &
    "already returns an actual `DataFrame` nowadays. Feel free to remove the `toDf` call."} =
  df

proc countLines(s: var FileStream): int =
  ## quickly counts the number of lines and then resets stream to beginning
  ## of file
  var buf = newString(500)
  while s.readLine(buf):
    inc result
  s.setPosition(0)

proc checkHeader(s: Stream, fname, header: string, colNames: seq[string]): bool =
  ## checks whether the given file contains the header `header`
  result = true
  if header.len > 0:
    var headerBuf: string
    if s.peekLine(headerBuf):
      result = headerBuf.startsWith(header)
    else:
      raise newException(IOError, "The input file " & $fname & " seems to be empty.")
  elif colNames.len > 0:
    # given some column names and a "header" without a symbol means we assume
    # there is no real header. If there is a real header in addition, user has
    # to use `skipLines = N` to skip it.
    result = false

proc readCsv*(s: Stream,
              sep = ',',
              header = "",
              skipLines = 0,
              colNames: seq[string] = @[],
              fname = ""): OrderedTable[string, seq[string]] =
  ## returns a `Stream` with CSV like data as a table of `header` keys vs. `seq[string]`
  ## values, where idx 0 corresponds to the first data value
  ## The `header` field can be used to designate the symbol used to
  ## differentiate the `header`. By default `#`.
  ## `colNames` can be used to provide custom names for the columns.
  ## If any are given and a header is present with a character indiciating
  ## the header, it is automatically skipped. ``However``, if custom names are
  ## desired and there is a real header without any starting symbol (i.e.
  ## `header.len == 0`), please use `skipLines = N` to skip it manually!
  # first check if the file even has a header of type `header`
  let hasHeader = checkHeader(s, fname, header, colNames)

  var parser: CsvParser
  open(parser, s, fname, separator = sep, skipInitialSpace = true)
  try:
    if colNames.len > 0:
      # if `colNames` available, use as header
      parser.headers = colNames
      if hasHeader:
        # and skip the real header
        discard parser.readRow()
    elif hasHeader:
      # read the header and use it
      parser.readHeaderRow()
    else:
      # file has no header nor user gave column names, raise
      raise newException(IOError, "Input neither has header starting with " &
        $header & " nor were column names provided!")

    result = initOrderedTable[string, seq[string]]()
    # filter out the header, delimiter, if any
    parser.headers.keepItIf(it != header)

    # possibly strip the headers and create the result table of columns
    var colHeaders: seq[string]
    for colUnstripped in items(parser.headers):
      let col = colUnstripped.strip
      colHeaders.add col
      result[col] = newSeqOfCap[string](5000) # start with a reasonable default cap

    # parse the actual file using the headers
    var lnCount = 0
    while readRow(parser):
      if lnCount < skipLines:
        inc lnCount
        continue
      for i, col in parser.headers:
        parser.rowEntry(col).removePrefix({' '})
        parser.rowEntry(col).removeSuffix({' '})
        result[colHeaders[i]].add parser.rowEntry(col)
  finally:
    # bugfix: previously `parser.close()` only ran on the success path; an
    # exception while parsing leaked the parser / its stream state
    parser.close()

template copyBuf(data: ptr UncheckedArray[char], buf: var string,
                 idx, colStart: int): untyped =
  ## Copies `data[colStart ..< idx]` into `buf`; clears `buf` for empty spans.
  let nIdx = idx - colStart
  if nIdx > 0:
    ## TODO: can we keep the buffer the same length and only copy the actual length?
    buf = newString(nIdx)
    copyMem(buf[0].addr, data[colStart].addr, nIdx)
    buf.setLen(nIdx)
  else:
    buf.setLen(0)

template parseHeaderCol(data: ptr UncheckedArray[char], buf: var string,
                        colNames: var seq[string],
                        header: string, sep, quote: char,
                        idx, colStart: int): untyped =
  ## Extracts one header column name from `data`, handling the optional
  ## `header` symbol on the first column and stripping whitespace / quotes.
  copyBuf(data, buf, idx, colStart)
  if col == 0:
    if not buf.startsWith(header):
      raise newException(IOError, "Unexpected column name at column 0, missing " &
        "expected header `" & header & "`. Found " & buf)
    else:
      buf.removePrefix(header)
      # and remove possible whitespace
      buf = buf.strip(chars = Whitespace + {quote})
  let bufStripped = buf.strip(chars = Whitespace + {quote})
  if bufStripped.len == 0 and sep in {' ', '\t'}:
    # don't add any name because we are dealing with a space before the
    # first column. We don't care about the `col` being off while parsing headers as
    # we do not use it to access data.
    # This is required over the `if` in the `parseLine` separator, because of possible
    # files using header symbols e.g.
'#'
    discard
  elif bufStripped.len == 0:
    # in case a column does not have a name, we use `Unnamed` similar to pandas
    let numUnknown = colNames.filterIt(it.startsWith("Unnamed"))
    colNames.add("Unnamed" & $numUnknown.len)
  else:
    colNames.add bufStripped

template guessType(data: ptr UncheckedArray[char], buf: var string,
                   colTypes: var seq[ColKind],
                   col, idx, colStart, numCols: untyped): untyped =
  ## Classifies the current field into int / float / bool / string and stores
  ## the guess in `colTypes[col]`.
  # only determine types for as many cols as in header
  if col < numCols:
    copyBuf(data, buf, idx, colStart)
    if buf.isInt:
      colTypes[col] = colInt
    elif buf.isNumber:
      colTypes[col] = colFloat
    elif buf.isBool:
      colTypes[col] = colBool
    else:
      colTypes[col] = colString

proc i64(c: char): int {.inline.} =
  ## ASCII digit -> integer value.
  int(ord(c) - ord('0'))

proc pow10(e: int): float {.inline.} =
  ## Fast power of ten: table lookup for the common small exponents,
  ## square-and-multiply fallback otherwise.
  const p10 = [1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14,
               1e-13, 1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05,
               1e-4, 1e-3, 1e-2, 1e-1, 1.0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
               1e8, 1e9] # 4*64B cache lines = 32 slots
  if -22 <= e and e <= 9:
    return p10[e + 22] # common case=small table lookup
  result = 1.0
  var base = 10.0
  var e = e
  if e < 0:
    e = -e
    base = 0.1
  while e != 0:
    if (e and 1) != 0:
      result *= base
    e = e shr 1
    base *= base

type
  RetType = enum
    rtInt, rtFloat, rtNaN, rtError

proc parseNumber(data: ptr UncheckedArray[char],
                 sep: char, # if this sep is found parsing ends
                 idxIn: int,
                 intVal: var int, floatVal: var float): RetType {.inline, noInit.} =
  ## this code is taken and adapted from @c-blake's code in Nim PR #16055.
  # Parse/validate/classify all at once, returning the type we parsed into
  # and if not `rtError` the `intVal/floatVal` will store the parsed number
  const Sign = {'+', '-'} # NOTE: `parseFloat` can generalize this to INF/NAN.
  var idx = idxIn
  var noDot = false
  var exp = 0
  var p10 = 0
  var pnt = -1 # find '.' (point); do digits
  var nD = 0
  var giant = false
  intVal = 0 # build intVal up from zero..
  if data[idx] in Sign:
    idx.inc # skip optional sign
  while data[idx] != '\0': # ..and track scale/pow10.
    if data[idx] notin Digits:
      if data[idx] != '.' or pnt >= 0:
        break # a second '.' is forbidden
      pnt = nD # save location of '.' (point)
      nD.dec # undo loop's nD.inc
    elif nD < 18: # 2**63==9.2e18 => 18 digits ok
      intVal = 10 * intVal + data[idx].i64 # core ASCII->binary transform
    else: # 20+ digits before decimal
      giant = true #XXX condition should be more precise than "18 digits"
      p10.inc # any digit moves implicit '.'
    idx.inc
    nD.inc
  if data[idxIn] == '-':
    intVal = -intVal # adjust sign

  if pnt < 0: # never saw '.'
    if nD == 0 and data[idx] == sep: # empty field in CSV
      return rtNaN
    pnt = nD; noDot = true # so set to number of digits
  elif nD == 1:
    return rtError # ONLY "[+-]*\.*"

  # `\0` is necessary to support parsing until the end of the file in case of no line break
  if data[idx] notin {'\0', sep, '\n', '\r', '\l', 'e', 'E'}: ## TODO: generalize this?
    return rtError

  if data[idx] in {'E', 'e'}: # optional exponent
    idx.inc
    let i0 = idx
    if data[idx] in Sign:
      idx.inc # skip optional sign
    while data[idx] in Digits: # build exponent
      exp = 10 * exp + data[idx].i64
      idx.inc
    if data[i0] == '-':
      exp = -exp # adjust sign
  elif noDot: # and intVal < (1'i64 shl 53'i64) ? # No '.' & No [Ee]xponent
    ## TODO: handle giant?
    #if giant:
    #  return rtError
    #  #copyBuf(data, strVal, idx, idxIn)
    return rtInt # mark as integer
  exp += pnt - nD + p10 # combine explicit&implicit exp
  floatVal = intVal.float * pow10(exp) # has round-off vs. 80-bit
  ## TODO: handle giant?
  #if giant:
  #  return rtError
  #  #copyBuf(data, strVal, idx, idxIn)
  result = rtFloat # mark as float

template parseCol(data: ptr UncheckedArray[char], buf: var string,
                  col: var Column,
                  sep: char,
                  colTypes: seq[ColKind], colIdx, idx, colStart, row, numCols: int,
                  intVal: var int, floatVal: var float, rtType: var RetType): untyped =
  ## Parses one field into `col[row]` based on the guessed column type,
  ## promoting the column (int -> float -> object) when a value does not fit.
  # NOTE(review): the parameter is named `rtType` but the body uses `retType`,
  # which binds at the instantiation site — works because all callers pass a
  # variable literally named `retType`; confirm before renaming.
  ## if there are more `,` in a row than in the header, skip it
  if likely(colIdx < numCols):
    case colTypes[colIdx]
    of colInt:
      retType = parseNumber(data, sep, colStart, intVal, floatVal)
      case retType
      of rtInt: col.iCol[row] = intVal
      of rtFloat, rtNaN:
        # before we copy everything check if can be parsed to float, this branch will only
        # be called a single time
        col = toColumn col.iCol.asType(float)
        if retType != rtNaN:
          col.fCol[row] = floatVal
        else:
          col.fCol[row] = NaN
        colTypes[colIdx] = colFloat
      of rtError:
        # object column
        copyBuf(data, buf, idx, colStart)
        col = toObjectColumn col
        colTypes[colIdx] = colObject
        col.oCol[row] = %~ buf
    of colFloat:
      retType = parseNumber(data, sep, colStart, intVal, floatVal)
      case retType
      of rtInt: col.fCol[row] = intVal.float
      of rtFloat: col.fCol[row] = floatVal
      of rtNaN: col.fCol[row] = NaN
      of rtError:
        # object column
        copyBuf(data, buf, idx, colStart)
        col = toObjectColumn col
        colTypes[colIdx] = colObject
        col.oCol[row] = %~ buf
    of colBool:
      copyBuf(data, buf, idx, colStart)
      try:
        col.bCol[row] = parseBool buf
except ValueError: 285 | # object column 286 | col = toObjectColumn col 287 | colTypes[colIdx] = colObject 288 | col.oCol[row] = %~ buf 289 | of colString: 290 | copyBuf(data, buf, idx, colStart) 291 | col.sCol[row] = buf 292 | of colObject: 293 | # try to parse as number 294 | retType = parseNumber(data, sep, colStart, intVal, floatVal) 295 | case retType 296 | of rtInt: col.oCol[row] = %~ intVal 297 | of rtFloat: col.oCol[row] = %~ floatVal 298 | of rtNaN: col.oCol[row] = Value(kind: VNull) 299 | of rtError: 300 | copyBuf(data, buf, idx, colStart) 301 | col.oCol[row] = %~ buf 302 | of colConstant: discard # already set 303 | of colNone: 304 | raise newException(IOError, "Invalid column type to parse into: `colNone`. " & 305 | "This shouldn't have happened! row = " & $row & ", col = " & $col) 306 | 307 | template parseLine(data: ptr UncheckedArray[char], buf: var string, 308 | sep: char, 309 | quote: char, 310 | col, idx, colStart, row: var int, 311 | lastWasSep, inQuote: var bool, 312 | toBreak: static bool, 313 | fnToCall: untyped): untyped = 314 | if unlikely(data[idx] == quote): 315 | inQuote = not inQuote 316 | elif unlikely(inQuote): 317 | inc idx 318 | # skip ahead in case we start quote 319 | continue 320 | elif unlikely(data[idx] in {'\n', '\r', '\l'}): 321 | fnToCall 322 | inc row 323 | col = 0 324 | if data[idx] == '\r' and data[idx + 1] == '\l': 325 | inc idx 326 | colStart = idx + 1 327 | lastWasSep = false 328 | when toBreak: 329 | inc idx 330 | break 331 | elif unlikely(skipInitialSpace and lastWasSep and data[idx] == ' '): 332 | colStart = idx + 1 333 | elif unlikely(data[idx] == sep): 334 | # convert last column to data 335 | if (idx - colStart > 0 or col > 0 or sep notin {' ', '\t'}): 336 | # only parse if: we have characters to parse, unless we are not in the first 337 | # column and unless our separator is not "spaces" like. Idea is only ignore 338 | # empty (only spaces) first columns iff we are dealing with space separated files. 
339 | # For a proper separator like ',' empty inputs are allowed at the beginning. 340 | fnToCall 341 | inc col 342 | colStart = idx + 1 343 | lastWasSep = true 344 | elif unlikely(data[idx] in toSkip): 345 | colStart = idx + 1 346 | lastWasSep = false 347 | elif unlikely(lastWasSep): 348 | lastWasSep = false 349 | else: 350 | discard 351 | inc idx 352 | 353 | proc readCsvTypedImpl(data: ptr UncheckedArray[char], 354 | size: int, 355 | lineCnt: int, 356 | sep: char = ',', 357 | header: string = "", 358 | skipLines = 0, 359 | toSkip: set[char] = {}, 360 | colNamesIn: seq[string] = @[], 361 | skipInitialSpace = true, 362 | quote = '"'): DataFrame = 363 | ## Implementation of the CSV parser that works on a data array of chars. 364 | result = newDataFrame() 365 | var 366 | idx = 0 367 | row = 0 368 | col = 0 369 | colStart = 0 370 | lastWasSep = false 371 | inQuote = false 372 | buf = newStringOfCap(80) 373 | 374 | # 1. first parse the header 375 | var colNames: seq[string] 376 | while idx < size: 377 | parseLine(data, buf, sep, quote, col, idx, colStart, row, lastWasSep, inQuote, toBreak = true): 378 | parseHeaderCol(data, buf, colNames, header, sep, quote, idx, colStart) 379 | 380 | if colNamesIn.len > 0 and colNamesIn.len != colNames.len: 381 | raise newException(IOError, "Input data contains " & $colNames.len & " columns, but " & 382 | "given " & $colNamesIn.len & " column names given: " & $colNamesIn) 383 | elif colNamesIn.len > 0: 384 | colNames = colNamesIn 385 | # reset index and row back to 0 386 | row = 0 387 | idx = 0 388 | 389 | # 1a. if `header` is set, skip all additional lines starting with header 390 | if header.len > 0: 391 | while idx < size: 392 | parseLine(data, buf, sep, quote, col, idx, colStart, row, lastWasSep, inQuote, toBreak = false): 393 | if col == 0 and data[colStart] != header[0]: 394 | break 395 | 396 | let numCols = colNames.len 397 | # 1b. 
skip `skipLines` 398 | let rowStart = row 399 | while idx < size: 400 | parseLine(data, buf, sep, quote, col, idx, colStart, row, lastWasSep, inQuote, toBreak = false): 401 | if row - rowStart == skipLines: 402 | break 403 | # compute the number of skipped lines in total 404 | let skippedLines = row 405 | # reset row to 0 406 | row = 0 407 | 408 | # 2. peek the first line to determine the data types 409 | var colTypes = newSeq[ColKind](numCols) 410 | var lastIdx = idx 411 | var lastColStart = colStart 412 | var dataColsIdx = 0 413 | while idx < size: 414 | parseLine(data, buf, sep, quote, col, idx, colStart, row, lastWasSep, inQuote, toBreak = true): 415 | guessType(data, buf, colTypes, col, idx, colStart, numCols) 416 | # if we see the end of the line, store the current column number 417 | if data[idx] in {'\n', '\r', '\l'}: 418 | dataColsIdx = col 419 | 420 | if dataColsIdx + 1 != numCols: 421 | raise newException(IOError, "Input data contains " & $(dataColsIdx + 1) & " in the data portion, but " & 422 | $numCols & " columns in the header.") 423 | # 2a. revert the indices (make it a peek) 424 | idx = lastIdx 425 | colStart = lastColStart 426 | dec row 427 | # 3. create the starting columns 428 | var cols = newSeq[Column](numCols) 429 | let dataLines = lineCnt - skippedLines 430 | for i in 0 ..< colTypes.len: 431 | # create column of length: 432 | # lines in file - header - skipLines 433 | cols[i] = newColumn(colTypes[i], dataLines) 434 | # 4. 
parse the actual data 435 | doAssert row >= 0, "Parsing the header failed" 436 | var 437 | retType: RetType 438 | intVal: int 439 | floatVal: float 440 | while idx < size: 441 | parseLine(data, buf, sep, quote, col, idx, colStart, row, lastWasSep, inQuote, toBreak = false): 442 | parseCol(data, buf, cols[col], sep, colTypes, col, idx, colStart, row, numCols, 443 | intVal, floatVal, retType) 444 | if row + skippedLines < lineCnt: 445 | # missing linebreak at end of last line 446 | doAssert row + skippedLines == lineCnt - 1, "Bad file. Please report an issue." 447 | parseCol(data, buf, cols[col], sep, colTypes, col, idx, colStart, row, numCols, 448 | intVal, floatVal, retType) 449 | for i, col in colNames: 450 | result[col] = cols[i] 451 | result.len = dataLines 452 | 453 | proc readCsv*(fname: string, 454 | sep: char = ',', 455 | header: string = "", 456 | skipLines = 0, 457 | toSkip: set[char] = {}, 458 | colNames: seq[string] = @[], 459 | skipInitialSpace = true, 460 | quote = '"', 461 | ): DataFrame = 462 | ## Reads a DF from a CSV file using the separator character `sep`. 463 | ## 464 | ## `toSkip` can be used to skip optional characters that may be present 465 | ## in the data. For instance if a CSV file is separated by `,`, but contains 466 | ## additional whitespace (`5, 10, 8` instead of `5,10,8`) this can be 467 | ## parsed correctly by setting `toSkip = {' '}`. 468 | ## 469 | ## `header` designates the symbol that defines the header of the CSV file. 470 | ## By default it's empty meaning that the first line will be treated as 471 | ## the header. If a header is given, e.g. `"#"`, this means we will determine 472 | ## the column names from the first line (which has to start with `#`) and 473 | ## skip every line until the first line starting without `#`. 474 | ## 475 | ## `skipLines` is used to skip `N` number of lines at the beginning of the 476 | ## file. 
477 | result = newDataFrame() 478 | var ff = memfiles.open(fname) 479 | var lineCnt = 0 480 | for slice in memSlices(ff): 481 | if slice.size > 0: 482 | inc lineCnt 483 | 484 | ## we're dealing with ASCII files, thus each byte can be interpreted as a char 485 | var data = cast[ptr UncheckedArray[char]](ff.mem) 486 | result = readCsvTypedImpl(data, ff.size, lineCnt, sep, header, skipLines, toSkip, colNames) 487 | ff.close() 488 | 489 | proc parseCsvString*(csvData: string, 490 | sep: char = ',', 491 | header: string = "", 492 | skipLines = 0, 493 | toSkip: set[char] = {}, 494 | colNames: seq[string] = @[]): DataFrame = 495 | ## Parses a `DataFrame` from a string containing CSV data. 496 | ## 497 | ## `toSkip` can be used to skip optional characters that may be present 498 | ## in the data. For instance if a CSV file is separated by `,`, but contains 499 | ## additional whitespace (`5, 10, 8` instead of `5,10,8`) this can be 500 | ## parsed correctly by setting `toSkip = {' '}`. 501 | ## 502 | ## `header` designates the symbol that defines the header of the CSV file. 503 | ## By default it's empty meaning that the first line will be treated as 504 | ## the header. If a header is given, e.g. `"#"`, this means we will determine 505 | ## the column names from the first line (which has to start with `#`) and 506 | ## skip every line until the first line starting without `#`. 507 | ## 508 | ## `skipLines` is used to skip `N` number of lines at the beginning of the 509 | ## file. 
510 | result = newDataFrame() 511 | 512 | ## we're dealing with ASCII files, thus each byte can be interpreted as a char 513 | var data = cast[ptr UncheckedArray[char]](csvData[0].unsafeAddr) 514 | result = readCsvTypedImpl(data, csvData.len, countLines(csvData), sep, header, skipLines, toSkip, colNames) 515 | 516 | proc readCsvAlt*(fname: string, 517 | sep = ',', 518 | header = "", 519 | skipLines = 0, 520 | colNames: seq[string] = @[]): OrderedTable[string, seq[string]] = 521 | ## returns a CSV file as a table of `header` keys vs. `seq[string]` 522 | ## values, where idx 0 corresponds to the first data value 523 | ## The `header` field can be used to designate the symbol used to 524 | ## differentiate the `header`. By default `#`. 525 | ## `colNames` can be used to provide custom names for the columns. 526 | ## If any are given and a header is present with a character indiciating 527 | ## the header, it is automatically skipped. ``However``, if custom names are 528 | ## desired and there is a real header without any starting symbol (i.e. 529 | ## `header.len == 0`), please use `skipLines = N` to skip it manually! 530 | var s = newFileStream(fname, fmRead) 531 | if s == nil: 532 | raise newException(IOError, "Input file " & $fname & " does not exist! " & 533 | "`readCsv` failed.") 534 | result = s.readCsv(sep, header, skipLines, colNames, fname = fname) 535 | s.close() 536 | 537 | proc writeCsv*(df: DataFrame, filename: string, sep = ',', header = "", 538 | precision = 4) = 539 | ## writes a DataFrame to a "CSV" (separator can be changed) file. 540 | ## `sep` is the actual separator to be used. `header` indicates a potential 541 | ## symbol marking the header line, e.g. 
`#` 542 | var data = newStringOfCap(df.len * 8) # for some reserved space 543 | # add header symbol to first line 544 | data.add header 545 | let keys = getKeys(df) 546 | data.add join(keys, $sep) & "\n" 547 | var idx = 0 548 | for row in df: 549 | idx = 0 550 | for x in row: 551 | if idx > 0: 552 | data.add $sep 553 | data.add pretty(x, precision = precision) 554 | inc idx 555 | data.add "\n" 556 | writeFile(filename, data) 557 | 558 | proc showBrowser*(df: DataFrame, fname = "df.html", path = getTempDir(), toRemove = false) = 559 | ## Displays the given DataFrame as a table in the default browser. 560 | ## 561 | ## Note: the HTML generation is not written for speed at this time. For very large 562 | ## dataframes expect bad performance. 563 | const tmpl = """ 564 | 565 | 566 | 567 | 584 | 585 | 586 | 587 | 588 | $# 589 |
590 | 591 | 592 | 593 | """ 594 | var 595 | header: string 596 | body: string 597 | header = "\n" 598 | for k in df.getKeys: 599 | header.add &" {k}

{df[k].kind.toNimType} " 600 | header.add "\n" 601 | body = "" 602 | for row in df: 603 | body.add "\n" 604 | for x in row: 605 | body.add &"{pretty(x)}" 606 | body.add "\n" 607 | body.add "" 608 | let fname = path / fname 609 | writeFile(fname, tmpl % [header & body]) 610 | openDefaultBrowser(fname) 611 | if toRemove: 612 | # opening browsers may be slow, so wait a long time before we delete (file still needs to 613 | # be there when the browser is finally open. Thus default is to keep the file 614 | sleep(1000) 615 | removeFile(fname) 616 | -------------------------------------------------------------------------------- /src/datamancer/column.nim: -------------------------------------------------------------------------------- 1 | import arraymancer 2 | import value, sugar, math, strformat 3 | from sequtils import allIt 4 | 5 | type 6 | ColKind* = enum 7 | colNone, colFloat, colInt, colBool, colString, colObject, colConstant 8 | Column* = ref object 9 | len*: int 10 | case kind*: ColKind 11 | of colFloat: fCol*: Tensor[float] 12 | of colInt: iCol*: Tensor[int] 13 | of colBool: bCol*: Tensor[bool] 14 | of colString: sCol*: Tensor[string] 15 | of colObject: oCol*: Tensor[Value] 16 | of colConstant: cCol*: Value 17 | of colNone: discard 18 | 19 | template `%~`*(v: Value): Value = v 20 | proc pretty*(c: Column): string 21 | proc compatibleColumns*(c1, c2: Column): bool {.inline.} 22 | # just a no-op 23 | template toColumn*(c: Column): Column = c 24 | 25 | func high*(c: Column): int = c.len - 1 26 | 27 | func isConstant*(c: Column): bool = c.kind == colConstant 28 | 29 | proc toColumn*[T: SomeFloat | SomeInteger | string | bool | Value](t: Tensor[T]): Column = 30 | when T is SomeInteger: 31 | result = Column(kind: colInt, 32 | iCol: t.asType(int), 33 | len: t.size) 34 | elif T is SomeFloat: 35 | result = Column(kind: colFloat, 36 | fCol: t.asType(float), 37 | len: t.size) 38 | elif T is bool: 39 | result = Column(kind: colBool, 40 | bCol: t, 41 | len: t.size) 42 | 
elif T is string: 43 | result = Column(kind: colString, 44 | sCol: t, 45 | len: t.size) 46 | elif T is Value: 47 | result = Column(kind: colObject, 48 | oCol: t, 49 | len: t.size) 50 | 51 | proc constantColumn*[T](val: T, len: int): Column = 52 | ## creates a constant column based on `val` and its type 53 | result = Column(len: len, kind: colConstant, cCol: %~ val) 54 | 55 | proc constantToFull*(c: Column): Column = 56 | ## creates a real constant full tensor column based on a constant column 57 | if c.kind != colConstant: return c 58 | withNative(c.cCol, val): 59 | result = toColumn newTensorWith[type(val)](c.len, val) 60 | 61 | proc `[]`*(c: Column, slice: Slice[int]): Column = 62 | case c.kind 63 | of colInt: result = toColumn c.iCol[slice.a .. slice.b] 64 | of colFloat: result = toColumn c.fCol[slice.a .. slice.b] 65 | of colString: result = toColumn c.sCol[slice.a .. slice.b] 66 | of colBool: result = toColumn c.bCol[slice.a .. slice.b] 67 | of colObject: result = toColumn c.oCol[slice.a .. 
slice.b] 68 | of colConstant: 69 | # for constant keep column, only adjust the length to the slice 70 | result = c 71 | result.len = slice.b - slice.a + 1 72 | of colNone: raise newException(IndexError, "Accessed column is empty!") 73 | 74 | proc newColumn*(kind = colNone, length = 0): Column = 75 | case kind 76 | of colFloat: result = toColumn newTensor[float](length) 77 | of colInt: result = toColumn newTensor[int](length) 78 | of colString: result = toColumn newTensor[string](length) 79 | of colBool: result = toColumn newTensor[bool](length) 80 | of colObject: result = toColumn newTensor[Value](length) 81 | of colConstant: result = constantColumn(Value(kind: VNull), length) 82 | of colNone: result = Column(kind: colNone, len: 0) 83 | 84 | 85 | proc toColKind*[T](dtype: typedesc[T]): ColKind = 86 | when T is SomeFloat: 87 | result = colFloat 88 | elif T is SomeInteger: 89 | result = colInt 90 | elif T is bool: 91 | result = colBool 92 | elif T is string: 93 | result = colString 94 | elif T is Value: 95 | result = colObject 96 | 97 | proc toColKind*(vKind: ValueKind): ColKind = 98 | case vKind 99 | of VFloat: result = colFloat 100 | of VInt: result = colInt 101 | of VString: result = colString 102 | of VBool: result = colBool 103 | of VObject: result = colObject 104 | of VNull: result = colObject 105 | 106 | proc toValueKind*(colKind: ColKind): ValueKind = 107 | case colKind 108 | of colFloat: result = VFloat 109 | of colInt: result = VInt 110 | of colString: result = VString 111 | of colBool: result = VBool 112 | of colObject: result = VObject 113 | of colConstant: result = VObject 114 | of colNone: result = VNull 115 | 116 | proc toNimType*(colKind: ColKind): string = 117 | ## returns the string name of the underlying data type of the column kind 118 | case colKind 119 | of colFloat: result = "float" 120 | of colInt: result = "int" 121 | of colString: result = "string" 122 | of colBool: result = "bool" 123 | of colObject: result = "object" 124 | of colConstant: 
result = "constant" 125 | of colNone: result = "null" 126 | 127 | template withNativeTensor*(c: Column, 128 | valName: untyped, 129 | body: untyped): untyped = 130 | case c.kind 131 | of colInt: 132 | let `valName` {.inject.} = c.iCol 133 | body 134 | of colFloat: 135 | let `valName` {.inject.} = c.fCol 136 | body 137 | of colString: 138 | let `valName` {.inject.} = c.sCol 139 | body 140 | of colBool: 141 | let `valName` {.inject.} = c.bCol 142 | body 143 | of colObject: 144 | let `valName` {.inject.} = c.oCol 145 | body 146 | of colConstant: 147 | withNative(c.cCol, realVal): 148 | let `valName` {.inject.} = newTensorWith(c.len, realVal) 149 | body 150 | of colNone: raise newException(ValueError, "Accessed column is empty!") 151 | 152 | proc combinedColKind*(c: seq[ColKind]): ColKind = 153 | if c.allIt(it == c[0]): 154 | # all the same, take any 155 | result = c[0] 156 | elif c.allIt(it in {colInt, colFloat}): 157 | # int and float can be combined to float, since we're lenient like that 158 | result = colFloat 159 | else: 160 | # the rest can only be merged via object columns of `Values`. 
    result = colObject

template withNative*(c: Column, idx: int,
                     valName: untyped,
                     body: untyped): untyped =
  ## Injects element `idx` of `c`, read with its native type, as `valName`
  ## and runs `body`. Object and constant columns yield a `Value`.
  case c.kind
  of colInt:
    let `valName` {.inject.} = c[idx, int]
    body
  of colFloat:
    let `valName` {.inject.} = c[idx, float]
    body
  of colString:
    let `valName` {.inject.} = c[idx, string]
    body
  of colBool:
    let `valName` {.inject.} = c[idx, bool]
    body
  of colObject:
    let `valName` {.inject.} = c[idx, Value]
    body
  of colConstant:
    let `valName` {.inject.} = c[idx, Value]
    body
  of colNone: raise newException(ValueError, "Accessed column is empty!")

template withNativeDtype*(c: Column, body: untyped): untyped =
  ## Injects `dtype` as the native element type of `c` and runs `body`.
  case c.kind
  of colInt:
    type dtype {.inject.} = int
    body
  of colFloat:
    type dtype {.inject.} = float
    body
  of colString:
    type dtype {.inject.} = string
    body
  of colBool:
    type dtype {.inject.} = bool
    body
  of colObject, colConstant:
    type dtype {.inject.} = Value
    body
  of colNone: raise newException(ValueError, "Accessed column is empty!")

template withDtypeByColKind*(colKind: ColKind, body: untyped): untyped =
  ## Injects `dtype` for a bare `ColKind` (no column instance needed) and runs `body`.
  case colKind
  of colInt:
    type dtype {.inject.} = int
    body
  of colFloat:
    type dtype {.inject.} = float
    body
  of colString:
    type dtype {.inject.} = string
    body
  of colBool:
    type dtype {.inject.} = bool
    body
  of colObject, colConstant:
    type dtype {.inject.} = Value
    body
  of colNone: raise newException(ValueError, "Invalid column kind!")

proc asValue*[T](t: Tensor[T]): Tensor[Value] {.noInit.} =
  ## Apply type conversion on the whole tensor
  result = t.map(x => (%~ x))

proc valueTo*[T](t: Tensor[Value], dtype: typedesc[T],
                 dropNulls: static bool = false): Tensor[T] =
  ## Converts an object (`Value`) tensor to a native tensor of `dtype`.
  ## If `dropNulls` is true, `VNull` elements are filtered out first, so the
  ## result may be shorter than the input.
  when not dropNulls:
    when T is string:
      result = t.map(x => x.toStr)
    elif T is float:
      result = t.map(x => x.toFloat)
    elif T is int:
      result = t.map(x => x.toInt)
    elif T is bool:
      result = t.map(x => x.toBool)
    elif T is Value:
      result = t
  else:
    # filter tensor to non Null values
    var outputIdx = newSeqOfCap[int](t.size)
    for idx, x in t:
      if x.kind != VNull:
        outputIdx.add idx[0]
    result = newTensor[T](outputIdx.len)
    when T is string:
      for i, idx in outputIdx:
        result[i] = t[idx].toStr
    elif T is float:
      for i, idx in outputIdx:
        result[i] = t[idx].toFloat
    elif T is int:
      for i, idx in outputIdx:
        result[i] = t[idx].toInt
    elif T is bool:
      for i, idx in outputIdx:
        result[i] = t[idx].toBool
    elif T is Value:
      for i, idx in outputIdx:
        result[i] = t[idx]

proc toTensor*[T](c: Column, dtype: typedesc[T],
                  dropNulls: static bool = false): Tensor[T] =
  ## `dropNulls` only has an effect on `colObject` columns. It allows to
  ## drop Null values to get (hopefully) a valid raw Tensor
  case c.kind
  of colInt:
    when T is int:
      result = c.iCol
    elif T is SomeNumber:
      result = c.iCol.asType(T)
    elif T is Value:
      result = c.iCol.asValue
    elif T is string:
      result = c.iCol.map_inline($x)
    else:
      raise newException(ValueError, "Invalid conversion of int column to " & $T & "!")
  of colFloat:
    when T is float:
      result = c.fCol
    elif T is SomeNumber:
      result = c.fCol.asType(T)
    elif T is Value:
      result = c.fCol.asValue
    elif T is string:
      result = c.fCol.map_inline($x)
    else:
      raise newException(ValueError, "Invalid conversion of float column to " & $T & "!")
  of colString:
    when T is string:
      result = c.sCol
    elif T is Value:
      result = c.sCol.asValue
    else:
      raise newException(ValueError, "Invalid conversion of string column to " & $T & "!")
  of colBool:
    when T is bool:
      result = c.bCol
    elif T is Value:
      result = c.bCol.asValue
    else:
      raise newException(ValueError, "Invalid conversion of bool column to " & $T & "!")
  of colObject:
    result = c.oCol.valueTo(T, dropNulls = dropNulls)
  of colConstant:
    # materialize the constant into a full tensor, then convert
    result = c.constantToFull.toTensor(dtype, dropNulls)
  of colNone: raise newException(ValueError, "Accessed column is empty!")

proc toTensor*[T](c: Column, slice: Slice[int], dtype: typedesc[T]): Tensor[T] =
  ## Converts the slice `slice` of `c` to a `Tensor[T]`.
  ## NOTE(review): unlike the full `toTensor` above, an unsupported `T` for
  ## the column kind here yields a default(empty) tensor instead of raising —
  ## confirm whether that asymmetry is intended.
  case c.kind
  of colInt:
    when T is int:
      result = c.iCol[slice.a .. slice.b]
    elif T is SomeNumber:
      result = c.iCol[slice.a .. slice.b].asType(T)
  of colFloat:
    when T is float:
      result = c.fCol[slice.a .. slice.b]
    elif T is SomeNumber:
      result = c.fCol[slice.a .. slice.b].asType(T)
  of colString:
    when T is string:
      result = c.sCol[slice.a .. slice.b]
  of colBool:
    when T is bool:
      result = c.bCol[slice.a .. slice.b]
  of colObject:
    result = c.oCol[slice.a .. slice.b].valueTo(T)
  of colConstant:
    result = newTensorWith[T](slice.b - slice.a + 1, c.cCol.to(T))
  of colNone: raise newException(ValueError, "Accessed column is empty!")

proc `[]`*[T](c: Column, idx: int, dtype: typedesc[T]): T =
  ## Returns element `idx` of `c` converted to `T`.
  when T isnot Value:
    case c.kind
    of colInt:
      when T is int:
        result = c.iCol[idx]
      elif T is SomeNumber:
        result = c.iCol[idx].T
      elif T is string:
        result = $c.iCol[idx]
    of colFloat:
      when T is float:
        result = c.fCol[idx]
      elif T is SomeNumber:
        result = c.fCol[idx].T
      elif T is string:
        # convert to Value and then string so that we use one single
        # formatting function. This is slow anyways
        result = pretty(%~ c.fCol[idx])
    of colString:
      when T is string:
        result = c.sCol[idx]
    of colBool:
      when T is bool:
        result = c.bCol[idx]
    of colObject:
      when T is string:
        result = c.oCol[idx].toStr
      elif T is float:
        result = c.oCol[idx].toFloat
      elif T is int:
        result = c.oCol[idx].toInt
      elif T is bool:
        result = c.oCol[idx].toBool
    of colConstant:
      # a constant column ignores `idx`; every element is `cCol`
      when T is string:
        result = c.cCol.toStr
      elif T is float:
        result = c.cCol.toFloat
      elif T is int:
        result = c.cCol.toInt
      elif T is bool:
        result = c.cCol.toBool
    of colNone: raise newException(ValueError, "Accessed column is empty!")
  else:
    case c.kind
    of colInt: result = %~ c.iCol[idx]
    of colFloat: result = %~ c.fCol[idx]
    of colString: result = %~ c.sCol[idx]
    of colBool: result = %~ c.bCol[idx]
    of colObject: result = c.oCol[idx]
    of colConstant: result = c.cCol
    of colNone: raise newException(ValueError, "Accessed column is empty!")
proc toObjectColumn*(c: Column): Column =
  ## returns `c` as an object column
  var res = newTensor[Value](c.len)
  withNativeTensor(c, t):
    for idx in 0 ..< c.len:
      res[idx] = %~ (t[idx])
  result = toColumn res

proc `[]=`*[T](c: var Column, idx: int, val: T) =
  ## assign `val` to column `c` at index `idx`
  ## If the types match, it just calls `[]=` on the tensor.
  ## If they are compatible, `val` is converted to c's type.
  ## If they are incompatible, `c` will be rewritten to an object
  ## column.
  var rewriteAsValue = false
  case c.kind
  of colFloat:
    when T is float:
      c.fCol[idx] = val
    elif T is SomeNumber:
      c.fCol[idx] = val.float
    # NOTE(review): a non-numeric `T` is silently ignored here — there is no
    # `rewriteAsValue = true` fallback as in the other branches. Confirm
    # whether that is intended.
  of colInt:
    when T is int:
      c.iCol[idx] = val
    else:
      rewriteAsValue = true
  of colString:
    when T is string:
      c.sCol[idx] = val
    else:
      rewriteAsValue = true
  of colBool:
    when T is bool:
      c.bCol[idx] = val
    else:
      rewriteAsValue = true
  of colObject:
    c.oCol[idx] = %~ val
  of colConstant:
    if c.cCol == %~ val: discard # do nothing
    elif c.cCol.kind == VNull:
      # turn into constant column of `val`
      c.cCol = %~ val
    else:
      # need to replace constant column by non constant with changed value at
      # specified index
      c = c.constantToFull()
      c[idx] = val
  of colNone: raise newException(ValueError, "Accessed column is empty!")
  if rewriteAsValue:
    # rewrite as an object column
    c = c.toObjectColumn()
    c.oCol[idx] = %~ val

proc `[]=`*[T](c: var Column, slice: Slice[int], t: Tensor[T]) =
  ## Assigns the tensor `t` to the slice `slice`. The slice length must match
  ## the tensor length exactly and must be smaller than the column length.
  ##
  ## If the type of `t` does not match the column kind, we reallocate to an object column.
  let length = slice.b - slice.a + 1
  let sa = slice.a
  let sb = slice.b
  if length != t.size:
    raise newException(ValueError, "Given tensor of size " & $t.size & " does not match slice " &
      $slice & " with length: " & $length & ".")
  elif length > c.len:
    raise newException(ValueError, "Given slice " & $slice & " of length " & $length &
      " is larger than column length of " & $c.len & ".")
  case c.kind
  of colInt:
    when T is int:
      c.iCol[sa .. sb] = t
    else:
      c = c.toObjectColumn()
      c.oCol[sa .. sb] = t.asValue()
  of colFloat:
    when T is float:
      c.fCol[sa .. sb] = t
    elif T is int:
      # int data assigned into a float column is widened, not boxed
      c.fCol[sa .. sb] = t.asType(float)
    else:
      c = c.toObjectColumn()
      c.oCol[sa .. sb] = t.asValue()
  of colString:
    when T is string:
      c.sCol[sa .. sb] = t
    else:
      c = c.toObjectColumn()
      c.oCol[sa .. sb] = t.asValue()
  of colBool:
    when T is bool:
      c.bCol[sa .. sb] = t
    else:
      c = c.toObjectColumn()
      c.oCol[sa .. sb] = t.asValue()
  of colConstant:
    ## if we are handed a Tensor to slice assign, we have to convert to a full column
    ## Then try again with the full tensor (possibly convert to object column then)
    c = c.constantToFull()
    c[sa .. sb] = t
  of colObject:
    when T is Value:
      c.oCol[sa .. sb] = t
    else:
      c.oCol[sa .. sb] = t.asValue()
  of colNone:
    raise newException(ValueError, "Cannot assign a tensor to an empty column.")

proc `[]=`*(c: var Column, slice: Slice[int], col: Column) =
  ## Assigns column `col` into the slice `slice` of `c`, converting `c` to an
  ## object column if the two kinds are incompatible.
  let sa = slice.a.int
  let sb = slice.b.int
  if c.compatibleColumns(col) and c.kind != colConstant:
    withNativeDtype(c):
      c[slice] = col.toTensor(dtype)
  elif c.kind == colConstant and col.kind == colConstant:
    if c.cCol == col.cCol: return # nothing to do
    else:
      c = c.constantToFull()
      let c2 = col.constantToFull()
      c[slice] = c2
  else:
    c = c.toObjectColumn()
    c.oCol[sa .. sb] = col.toTensor(Value)

template withNative2*(c1, c2: Column, idx1, idx2: int,
                      valName1, valName2: untyped,
                      body: untyped): untyped =
  ## Injects elements `idx1` of `c1` and `idx2` of `c2` (which must share the
  ## same kind) with their native type and runs `body`.
  assert c1.kind == c2.kind
  case c1.kind
  of colInt:
    let `valName1` {.inject.} = c1[idx1, int]
    let `valName2` {.inject.} = c2[idx2, int]
    body
  of colFloat:
    let `valName1` {.inject.} = c1[idx1, float]
    let `valName2` {.inject.} = c2[idx2, float]
    body
  of colString:
    let `valName1` {.inject.} = c1[idx1, string]
    let `valName2` {.inject.} = c2[idx2, string]
    body
  of colBool:
    let `valName1` {.inject.} = c1[idx1, bool]
    let `valName2` {.inject.} = c2[idx2, bool]
    body
  of colObject:
    let `valName1` {.inject.} = c1[idx1, Value]
    let `valName2` {.inject.} = c2[idx2, Value]
    body
  of colConstant: raise newException(ValueError, "Accessed column is constant!")
  of colNone: raise newException(ValueError, "Accessed column is empty!")

proc compatibleColumns*(c1, c2: Column): bool {.inline.} =
  ## Columns are compatible if their kinds match or both are numeric (int/float).
  if c1.kind == c2.kind: result = true
  elif c1.kind in {colInt, colFloat} and
       c2.kind in {colInt, colFloat}:
    result = true
  else: result = false

proc equal*(c1: Column, idx1: int, c2: Column, idx2: int): bool =
  ## checks if the value in `c1` at `idx1` is equal to the
  ## value in `c2` at `idx2`
  if not compatibleColumns(c1, c2): return false
  elif c1.kind == c2.kind:
    withNativeDtype(c1):
      result = c1[idx1, dtype] == c2[idx2, dtype]
  else:
    # need to get the enveloping kind and read the data using that corresponding
    # data type
    let kind = combinedColKind(@[c1.kind, c2.kind])
    withDtypeByColKind(kind):
      result = c1[idx1, dtype] == c2[idx2, dtype]

proc toObject*(c: Column): Column {.inline.} =
  ## Converts `c` to an object (`Value`) column.
  case c.kind
  of colObject: result = c
  of colInt: result = toColumn c.iCol.asValue
  of colFloat: result = toColumn c.fCol.asValue
  of colString: result = toColumn c.sCol.asValue
  of colBool: result = toColumn c.bCol.asValue
  of colConstant: raise newException(ValueError, "Accessed column is constant!")
  of colNone: raise newException(ValueError, "Accessed column is empty!")

proc add*(c1, c2: Column): Column =
  ## adds column `c2` to `c1`. Uses `concat` internally.
  if c1.isNil: return c2 # allows to add to an uninitialized column
  if c2.len == 0: return c1
  elif c1.len == 0: return c2
  if c1.kind == c2.kind:
    # just concat directly
    case c1.kind
    of colInt: result = toColumn concat(c1.iCol, c2.iCol, axis = 0)
    of colFloat: result = toColumn concat(c1.fCol, c2.fCol, axis = 0)
    of colBool: result = toColumn concat(c1.bCol, c2.bCol, axis = 0)
    of colString: result = toColumn concat(c1.sCol, c2.sCol, axis = 0)
    of colObject: result = toColumn concat(c1.oCol, c2.oCol, axis = 0)
    of colConstant:
      if c1.cCol == c2.cCol: result = c1 # does not matter which to return
      else: result = add(c1.constantToFull, c2.constantToFull)
    of colNone: doAssert false, "Both columns are empty!"
  elif compatibleColumns(c1, c2):
    # convert both to float
    case c1.kind
    of colInt:
      # c1 is int, c2 is float
      assert c2.kind == colFloat
      result = toColumn concat(c1.iCol.asType(float), c2.fCol, axis = 0)
    of colFloat:
      # c1 is float, c2 is int
      assert c2.kind == colInt
      result = toColumn concat(c1.fCol, c2.iCol.asType(float), axis = 0)
    else: doAssert false, "cannot happen, since not compatible!"
  elif c1.kind == colConstant or c2.kind == colConstant:
    result = add(c1.constantToFull, c2.constantToFull)
  else:
    # convert both columns to Value
    result = toColumn concat(c1.toObject.oCol, c2.toObject.oCol, axis = 0)
  result.len = c1.len + c2.len

proc toColumn*[T: SomeFloat | SomeInteger | string | bool | Value](s: openArray[T]): Column =
  ## Creates a column from the given open array of native values.
  var vals = newTensor[T](s.len)
  for i, x in s:
    vals[i] = x
  result = toColumn(vals)

proc toColumn*[T: SomeFloat | SomeInteger | string | bool | Value](x: T): Column =
  # also possible to create single row column, but inefficient
  # for `summarize` though there's no way around
  let vals = newTensorWith[T](1, x)
  result = toColumn(vals)

proc toNativeColumn*(s: openArray[Value]): Column =
  ## given input as `Value`, will attempt to return the column as native
  ## data type.
  ## NOTE: this is unsafe and assumes the values are indeed all one type!
  ## NOTE(review): an empty `s` falls through and returns a nil `Column` —
  ## confirm callers handle that.
  if s.len > 0:
    withNativeConversion(s[0].kind, get):
      var data = newTensor[dtype](s.len)
      for i, x in s:
        data[i] = get(x)
      result = toColumn data

proc toNativeColumn*(c: Column, failIfImpossible: static bool = true): Column =
  ## attempts to convert the given column from `colObject` to its
  ## native type, if possible. This is mainly useful after removal
  ## of null values. If it fails (i.e. floats and strings in one
  ## col) the result stays a colObject.
  ##
  ## In the default case `failIfImpossible = true` this procedure will
  ## fail with an `AssertionDefect` if a column contains multiple datatypes.
  ## This can be disabled so that at worst the input is returned as an
  ## object type column.
  if c.kind != colObject: return c
  # assuming the column ``can`` be converted to native type, the
  # first element contains all information we need, namely the
  # value kind of ``all`` elements in the column
  # exception: first element is int, but mixed with float
  let vKind = c[0, Value].kind
  ## TODO: this can fail...
  withNativeConversion(vKind, get):
    var data = newTensor[dtype](c.len)
    let cValue = c.toTensor(Value)
    for i in 0 ..< c.len:
      when failIfImpossible:
        doAssert cValue[i].kind == vKind, "Column contains actual multiple datatypes! " &
          $vKind & " and " & $cValue[i].kind & "!"
      else:
        if cValue[i].kind != vKind:
          # not possible to convert, return input
          return c
      data[i] = get cValue[i]
    result = toColumn data

proc nullColumn*(num: int): Column =
  ## returns an object `Column` with `N` values, which are
  ## all `VNull`
  var nullseq = newSeq[Value](num)
  for i in 0 ..< num:
    nullseq[i] = Value(kind: VNull)
  result = toColumn(nullseq)

#proc `*`[T: SomeNumber]*(c: Column, x: T)
proc contains*[T: float | string | int | bool | Value](c: Column, val: T): bool =
  ## Returns true if `val` occurs in `c` (linear scan over the converted tensor).
  let t = toTensor(c, T)
  result = false
  for x in t:
    if val == x:
      return true

template liftScalarToColumn*(name: untyped): untyped =
  ## Lifts a tensor proc returning a scalar (e.g. `max`) to work on a `Column`,
  ## returning the scalar boxed as a `Value`.
  proc `name`*(c: Column): Value =
    withNativeDtype(c):
      result = %~ `name`(c.toTensor(dtype))
liftScalarToColumn(max)

proc pretty*(c: Column): string =
  ## pretty prints a Column
  result = &"Column of type: {toNimType(c.kind)} with length: {c.len}\n"
  withNativeTensor(c, t):
    result.add &"  contained Tensor: {t}"
template `$`*(c: Column): string = pretty(c)

proc clone*(c: Column): Column =
  ## clones the given column by cloning the Tensor
  result = Column(kind: c.kind, len: c.len)
  case result.kind
  of colInt: result.iCol = c.iCol.clone()
  of colFloat: result.fCol = c.fCol.clone()
  of colString: result.sCol = c.sCol.clone()
  of colBool: result.bCol = c.bCol.clone()
  of colObject: result.oCol = c.oCol.clone()
  of colConstant: result.cCol = c.cCol # just a `Value`
  of colNone: discard

proc map*[T; U](c: Column, fn: (T -> U)): Column =
  ## Maps a given column given `fn` to a new column.
  ## Because `Column` is a variant type, an untyped mapping function
  ## won't compile.
  ##
  ## See the `map_inline` template below, which attempts to work around this
  ## limitation by compiling all map function bodies, which are valid for `c`.
  ##
  ## .. code-block:: nim
  ##   c.map((x: int) => x * 5)
  ##
  ## Using this is not really recommended. Use `df["x", int].map(x => x * 5)` instead!
  result = toColumn c.toTensor(T).map_inline(fn(x))

template map_inline*(c: Column, body: untyped): Column =
  ## This is a helper template, which attempts to work around this
  ## limitation by compiling all map function bodies, which are valid for `c`.
  ## However, be careful: by using the template you throw out possible compile
  ## time checking and replace it by possible exceptions in your code!
  ##
  ## .. code-block:: nim
  ##   c.map_inline(x * 5)
  ##
  ## This example will throw a runtime exception, if `* 5` is invalid for the
  ## column type that `c` actually is at runtime!
  ## Using this is not really recommended. Use `df["x", int].map_inline(x * 5)` instead!
  withNativeDtype(c):
    # `dtype` is injected by `withNativeDtype`; the `when compiles` guard
    # only instantiates `map` for element types the body is valid for
    var res: Column
    when compiles((map(c, (x: dtype) => body))):
      res = toColumn map(c, (x: dtype) => body)
    else:
      ## Cannot raise a CT error unfortunately I think, because this branch will always be compiled
      ## for one of the column types
      raise newException(Exception, "Column is of invalid type for map body `" & $(astToStr(body)) &
        "` for dtype of column: " & $(c.kind.toNimType))
    res