├── tests ├── config.nims ├── test_issue20.nim ├── test_issue28.nim ├── tests.nim └── testsFormula.nim ├── src ├── datamancer.nim └── datamancer │ ├── df_types.nim │ ├── formulaNameMacro.nim │ ├── value.nim │ ├── formulaExp.nim │ ├── io.nim │ └── column.nim ├── LICENSE ├── datamancer.nimble ├── data ├── fishdata_sparse.csv ├── mpg.csv └── 03-sample_hugo.csv ├── .github └── workflows │ └── ci.yml ├── changelog.org ├── docs ├── docs.nim └── datamancer.org └── README.org /tests/config.nims: -------------------------------------------------------------------------------- 1 | switch("path", "$projectDir/../src") -------------------------------------------------------------------------------- /src/datamancer.nim: -------------------------------------------------------------------------------- 1 | ## .. include:: ./docs/datamancer.rst 2 | 3 | import datamancer / [dataframe, io] 4 | export dataframe, io 5 | -------------------------------------------------------------------------------- /tests/test_issue20.nim: -------------------------------------------------------------------------------- 1 | import datamancer 2 | import unittest 3 | 4 | test "Issue #20 - `isDigit` was removed": 5 | let n1 = %~ "1.1" 6 | let n2 = %~ "1.3e5" 7 | let n3 = %~ "aba" 8 | let n4 = %~ "1..1" 9 | let n5 = %~ "123" 10 | let n6 = %~ "100_000" 11 | let n7 = %~ "_100_000_" 12 | check not n1.isInt 13 | check not n2.isInt 14 | check not n3.isInt 15 | check not n4.isInt 16 | check n5.isInt 17 | check n6.isInt 18 | check n7.isInt # this is a little unintuitive, but a downside of our simple def. 
19 | -------------------------------------------------------------------------------- /src/datamancer/df_types.nim: -------------------------------------------------------------------------------- 1 | import tables, sets 2 | import column, value 3 | 4 | type 5 | DataFrameKind* = enum 6 | dfNormal, dfGrouped 7 | 8 | # where value is as usual 9 | # then 10 | DataFrame* = ref object 11 | len*: int 12 | data*: OrderedTable[string, Column] 13 | case kind*: DataFrameKind 14 | of dfGrouped: 15 | # a grouped data frame stores the keys of the groups and maps them to 16 | # a set of the categories 17 | groupMap*: OrderedTable[string, HashSet[Value]] 18 | else: discard 19 | -------------------------------------------------------------------------------- /tests/test_issue28.nim: -------------------------------------------------------------------------------- 1 | import datamancer 2 | import json 3 | 4 | template accept(x) = 5 | static: assert(compiles(x)) 6 | 7 | template reject(x) = 8 | static: assert(not compiles(x)) 9 | 10 | accept: 11 | let f = fn {"Channel" == "Ch 0"} 12 | accept: 13 | let f2 = fn({"Channel" == "Ch 0"}) 14 | accept: 15 | let f3 = fn: 16 | {"Channel" == "Ch 0"} 17 | 18 | # this should fail, because `json` provides a `{}` proc, which 19 | # means we do not resolve our untyped `{}` macro 20 | # But for some reason on Github Actions it passes?! 
21 | #reject: 22 | # let f = f{"Channel" == "Ch 0"} 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 SciNim team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /datamancer.nimble: -------------------------------------------------------------------------------- 1 | # Package 2 | 3 | version = "0.1.8" 4 | author = "Vindaar" 5 | description = "A dataframe library with a dplyr like API" 6 | license = "MIT" 7 | srcDir = "src" 8 | 9 | 10 | # Dependencies 11 | 12 | requires "nim >= 1.2.0" 13 | requires "https://github.com/Vindaar/seqmath >= 0.1.11" 14 | requires "arraymancer >= 0.7.1" 15 | 16 | task test, "Run standard tests": 17 | exec "nim c -r tests/testDf.nim" 18 | exec "nim c -r tests/tests.nim" 19 | exec "nim c -r tests/test_issue20.nim" 20 | exec "nim c -r tests/test_issue28.nim" 21 | exec "nim c -r tests/testsFormula.nim" 22 | 23 | import os, strutils, strformat 24 | const 25 | pkgName = "datamancer" 26 | orgFile = "docs" / (pkgName & ".org") 27 | rstFile = "docs" / (pkgName & ".rst") 28 | rstFileAuto = "docs" / (pkgName & "_autogen.rst") 29 | 30 | template canImport(x: untyped): untyped = 31 | compiles: 32 | import x 33 | 34 | when canImport(docs / docs): 35 | # can define the `gen_docs` task (docs already imported now) 36 | # this is to hack around weird nimble + nimscript behavior. 37 | # when overwriting an install nimble will try to parse the generated 38 | # nimscript file and for some reason then it won't be able to import 39 | # the module (even if it's put into `src/`). 
40 | task gen_docs, "Generate datamancer documentation": 41 | # build the actual docs and the index 42 | exec "pandoc " & orgFile & " -o " & rstFile 43 | buildDocs( 44 | "src/", "docs/", 45 | defaultFlags = "--hints:off --warnings:off" 46 | ) 47 | -------------------------------------------------------------------------------- /data/fishdata_sparse.csv: -------------------------------------------------------------------------------- 1 | fish,station,seen 2 | 4842,"Release",1 3 | 4843,"Release",1 4 | 4844,"Release",1 5 | 4845,"Release",1 6 | 4847,"Release",1 7 | 4848,"Release",1 8 | 4849,"Release",1 9 | 4850,"Release",1 10 | 4851,"Release",1 11 | 4854,"Release",1 12 | 4855,"Release",1 13 | 4857,"Release",1 14 | 4858,"Release",1 15 | 4859,"Release",1 16 | 4861,"Release",1 17 | 4862,"Release",1 18 | 4863,"Release",1 19 | 4864,"Release",1 20 | 4865,"Release",1 21 | 4842,"I80_1",1 22 | 4843,"I80_1",1 23 | 4844,"I80_1",1 24 | 4845,"I80_1",1 25 | 4847,"I80_1",1 26 | 4848,"I80_1",1 27 | 4849,"I80_1",1 28 | 4850,"I80_1",1 29 | 4851,"I80_1",1 30 | 4854,"I80_1",1 31 | 4855,"I80_1",1 32 | 4857,"I80_1",1 33 | 4858,"I80_1",1 34 | 4859,"I80_1",1 35 | 4861,"I80_1",1 36 | 4862,"I80_1",1 37 | 4863,"I80_1",1 38 | 4864,"I80_1",1 39 | 4865,"I80_1",1 40 | 4842,"Lisbon",1 41 | 4843,"Lisbon",1 42 | 4844,"Lisbon",1 43 | 4845,"Lisbon",1 44 | 4847,"Lisbon",1 45 | 4848,"Lisbon",1 46 | 4855,"Lisbon",1 47 | 4857,"Lisbon",1 48 | 4858,"Lisbon",1 49 | 4859,"Lisbon",1 50 | 4861,"Lisbon",1 51 | 4862,"Lisbon",1 52 | 4865,"Lisbon",1 53 | 4842,"Rstr",1 54 | 4843,"Rstr",1 55 | 4844,"Rstr",1 56 | 4845,"Rstr",1 57 | 4848,"Rstr",1 58 | 4850,"Rstr",1 59 | 4855,"Rstr",1 60 | 4857,"Rstr",1 61 | 4858,"Rstr",1 62 | 4859,"Rstr",1 63 | 4861,"Rstr",1 64 | 4862,"Rstr",1 65 | 4842,"Base_TD",1 66 | 4843,"Base_TD",1 67 | 4844,"Base_TD",1 68 | 4845,"Base_TD",1 69 | 4850,"Base_TD",1 70 | 4855,"Base_TD",1 71 | 4857,"Base_TD",1 72 | 4858,"Base_TD",1 73 | 4859,"Base_TD",1 74 | 4861,"Base_TD",1 75 | 4862,"Base_TD",1 76 | 
4842,"BCE",1 77 | 4843,"BCE",1 78 | 4844,"BCE",1 79 | 4850,"BCE",1 80 | 4857,"BCE",1 81 | 4858,"BCE",1 82 | 4861,"BCE",1 83 | 4862,"BCE",1 84 | 4842,"BCW",1 85 | 4843,"BCW",1 86 | 4844,"BCW",1 87 | 4850,"BCW",1 88 | 4857,"BCW",1 89 | 4858,"BCW",1 90 | 4861,"BCW",1 91 | 4862,"BCW",1 92 | 4842,"BCE2",1 93 | 4843,"BCE2",1 94 | 4844,"BCE2",1 95 | 4857,"BCE2",1 96 | 4858,"BCE2",1 97 | 4861,"BCE2",1 98 | 4862,"BCE2",1 99 | 4842,"BCW2",1 100 | 4843,"BCW2",1 101 | 4844,"BCW2",1 102 | 4857,"BCW2",1 103 | 4858,"BCW2",1 104 | 4861,"BCW2",1 105 | 4862,"BCW2",1 106 | 4842,"MAE",1 107 | 4843,"MAE",1 108 | 4844,"MAE",1 109 | 4858,"MAE",1 110 | 4861,"MAE",1 111 | 4842,"MAW",1 112 | 4843,"MAW",1 113 | 4844,"MAW",1 114 | 4858,"MAW",1 115 | 4861,"MAW",1 116 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: datamancer CI 2 | on: 3 | push: 4 | paths: 5 | - 'tests/**' 6 | - 'src/**' 7 | - 'docs/**' 8 | - 'datamancer.nimble' 9 | - '.github/workflows/ci.yml' 10 | pull_request: 11 | paths: 12 | - 'tests/**' 13 | - 'src/**' 14 | - 'docs/**' 15 | - 'datamancer.nimble' 16 | - '.github/workflows/ci.yml' 17 | 18 | jobs: 19 | build: 20 | strategy: 21 | fail-fast: false 22 | matrix: 23 | branch: [version-1-4, devel] 24 | target: [linux, macos, windows] 25 | include: 26 | - target: linux 27 | builder: ubuntu-18.04 28 | - target: macos 29 | builder: macos-10.15 30 | - target: windows 31 | builder: windows-2019 32 | name: '${{ matrix.target }} (${{ matrix.branch }})' 33 | runs-on: ${{ matrix.builder }} 34 | steps: 35 | - name: Checkout 36 | uses: actions/checkout@v2 37 | with: 38 | path: datamancer 39 | 40 | - name: Setup Nim 41 | uses: alaviss/setup-nim@0.1.1 42 | with: 43 | path: nim 44 | version: ${{ matrix.branch }} 45 | 46 | - name: Install dependencies (Ubuntu) 47 | if: ${{matrix.target == 'linux'}} 48 | run: | 49 | sudo apt-get update 50 | sudo 
apt-get install pandoc 51 | 52 | - name: Setup nimble & deps 53 | shell: bash 54 | run: | 55 | cd datamancer 56 | nimble refresh -y 57 | nimble install -y 58 | 59 | - name: Run tests 60 | shell: bash 61 | run: | 62 | cd datamancer 63 | nimble -y test 64 | 65 | - name: Build docs 66 | if: > 67 | github.event_name == 'push' && github.ref == 'refs/heads/master' && 68 | matrix.target == 'linux' && matrix.branch == 'devel' 69 | shell: bash 70 | run: | 71 | cd datamancer 72 | # **HAVE** to call `develop`, cuz we're getting screwed by 73 | # logic otherwise 74 | nimble develop -y 75 | nimble gen_docs 76 | # TODO: fix this, need to iterate over all files, do similar to arraymancer docs 77 | # Ignore failures for older Nim 78 | cp docs/{the,}index.html || true 79 | 80 | - name: Publish docs 81 | if: > 82 | github.event_name == 'push' && github.ref == 'refs/heads/master' && 83 | matrix.target == 'linux' && matrix.branch == 'devel' 84 | uses: crazy-max/ghaction-github-pages@v1 85 | with: 86 | build_dir: datamancer/docs 87 | env: 88 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 89 | -------------------------------------------------------------------------------- /changelog.org: -------------------------------------------------------------------------------- 1 | * v0.1.9 2 | - add basic implementation of =spread= (inverse of =gather=; similar 3 | to dplyr =pivot_wider=). The current implementation is rather basic 4 | and performance may be suboptimal for very large data frames. 5 | - add =null= helper to create a =VNull Value= 6 | - significantly improve the docs of the =dataframe.nim= module. 7 | - fixes an issue where unique column reference names were combined 8 | into the same column due to a bad name generation algorithm 9 | - significantly improves performance in applications in which 10 | allocation of memory is a bottleneck (tensors were zero 11 | initialized). 12 | - disable formula output at CT by default. Compile with 13 | =-d:echoFormulas= to see the output. 
14 | - remove CT warnings for unrelated stuff (node kinds) 15 | * v0.1.8 16 | - avoid some object conversions in column operations (ref #11) 17 | - add ~[]=~ overloads for columns for slice assignments 18 | - *significantly* improve performance of =mutate/transmute= operations 19 | for grouped dataframes (O(150,000) groups in < 0.5 s possible now) 20 | - fixes #12 by avoiding hashing of columns. Some performance 21 | regression in =innerJoin=, =setDiff= (~2x slower in bad cases). 22 | * v0.1.7 23 | - allow assignment of constants in =seqsToDf= 24 | - allow assignment of scalars to DF as column directly 25 | - add filename argument to =showBrowser= 26 | - make =compileFormulaImpl= actually typed to make formulas work 27 | correctly inside of generics (ref =ggplotnim= 28 | https://github.com/Vindaar/ggplotnim/issues/116 29 | - change internal macro type logic to use strings 30 | 31 | * v0.1.6 32 | - fix slicing of constant columns 33 | 34 | * v0.1.5 35 | - fully qualify =Value= on scalar formula construction 36 | 37 | * v0.1.4 38 | - fix formulas (and type deduction) for certain use cases involving 39 | =nnkBracketExpr= that are *not* references to columns 40 | 41 | * v0.1.3 42 | - improve type deduction capabilities for infix nodes 43 | - add overload for =drop= that doesn't just work on a mutable data 44 | frame 45 | - fix reference semantics issues if DF is modified and visible in 46 | result (only data is shared, but columns should be respected) 47 | - =arrange= now also takes a =varargs[string]= instead of a 48 | =seq=. While there is still a bug of not properly being able to use 49 | varargs, at least an array is possible (and hopefully at some point 50 | proper varargs). 51 | 52 | * v0.1.2 53 | - CSV parser is more robust, can handle unnammed columns 54 | - explicit types in =idx=, =col= column reference finally works 55 | (e.g. 
=idx("foo", float)= accesses the column "foo" as a float 56 | tensor overwriting type deductions and type hints) 57 | 58 | * v0.1.1 59 | - allow =nnkMacroDef= in =findType= 60 | - add development notes and ideas about rewrite of formula macro in =notes/formula_dev_notes.org= 61 | 62 | * v0.1.0 63 | 64 | - initial version of Datamancer based on =ggplotnim= data frame with 65 | major formula macro rewrite 66 | -------------------------------------------------------------------------------- /src/datamancer/formulaNameMacro.nim: -------------------------------------------------------------------------------- 1 | import strformat, macros, strutils 2 | 3 | proc build(n: NimNode): string 4 | proc buildArgs(n: NimNode, head = ""): string = 5 | if result.len == 0 and head.len > 0: 6 | result = &"{head}" 7 | for i in 0 ..< n.len: 8 | if result.len == 0: 9 | result = &"({build(n[i])}" 10 | else: 11 | result.add &" {build(n[i])}" 12 | result.add ")" 13 | 14 | proc build(n: NimNode): string = 15 | # convert to lisp representation 16 | case n.kind 17 | of nnkInfix: 18 | result = &"({n[0].strVal} {build(n[1])} {build(n[2])})" 19 | of nnkIntLit .. 
nnkFloat64Lit: 20 | result = n.repr 21 | of nnkStrLit, nnkRStrLit: 22 | result = n.strVal 23 | of nnkIdent, nnkSym: 24 | # should correspond to a known identifier in the calling scope 25 | result = n.strVal 26 | of nnkPar, nnkCall, nnkCommand: 27 | result = buildArgs(n) 28 | of nnkDotExpr, nnkBracketExpr: 29 | result = n.repr 30 | of nnkPrefix: 31 | when (NimMajor, NimMinor, NimPatch) < (1, 5, 0): 32 | if n[0].strVal == "-": 33 | result = &"-{build(n[1])}" 34 | else: 35 | result = &"({n[0].strVal} {build(n[1])})" 36 | else: 37 | result = &"({n[0].strVal} {build(n[1])})" 38 | of nnkAccQuoted: 39 | result = build(n[0]) 40 | of nnkCallStrLit: 41 | result = n[1].strVal 42 | of nnkCurly: 43 | result = "({}" 44 | for ch in n: 45 | result.add &" {build(ch)}" 46 | result.add ")" 47 | of nnkBracket: 48 | result = "([]" 49 | for ch in n: 50 | result.add &" {build(ch)}" 51 | result.add ")" 52 | of nnkIfExpr: 53 | result = "(if" 54 | for arg in n: 55 | result.add &" {build(arg)}" 56 | result.add ")" 57 | of nnkElifExpr: 58 | result = buildArgs(n, head = "(elif") 59 | of nnkElseExpr: 60 | result = buildArgs(n, head = "(else") 61 | of nnkStmtList: 62 | for ch in n: 63 | if result.len == 0 and n.len > 1: 64 | result = &"({buildArgs(ch)}" 65 | elif result.len == 0: 66 | result = &"{buildArgs(ch)}" 67 | else: 68 | result.add &" {buildArgs(ch)}" 69 | if n.len > 1: 70 | result.add ")" 71 | of nnkOpenSymChoice, nnkClosedSymChoice: 72 | result = n[0].strVal # take first symbol name 73 | of nnkCheckedFieldExpr: 74 | ## TODO: check if this is reasonable. It seems that this node contains 75 | ## the original node as [0] and then the "environment" as [1]?? 76 | result = build(n[0]) 77 | else: 78 | result = n.repr 79 | warning("Node kind " & $n.kind & " not implemented " & 80 | "for FormulaNode string representation. Node is:\n" & $(n.treeRepr)) 81 | 82 | proc buildName*(n: NimNode): string = 83 | ## Builds the formula name in a lisp like representation. 
Only for debugging 84 | ## and printing purposes. 85 | result = build(n) 86 | 87 | proc buildResultColName*(n: NimNode): NimNode = 88 | ## Builds the name of the resulting column name of a formula. Mainly it simply uses the node 89 | ## as is, except for column references via accented quotes and call string literals, in which 90 | ## case we simply use the string values underlying. 91 | ## We need to be able to use symbols from the local scope (or possible proc calls) to determine 92 | ## the resulting column name at runtime. 93 | case n.kind 94 | of nnkAccQuoted: result = newLit(n[0].strVal) 95 | of nnkCallStrLit: result = newLit(n[1].strVal) 96 | else: result = n 97 | -------------------------------------------------------------------------------- /docs/docs.nim: -------------------------------------------------------------------------------- 1 | import macros, strformat, strutils, sequtils, sets, tables, algorithm 2 | 3 | from os import parentDir, getCurrentCompilerExe, DirSep, extractFilename, `/`, setCurrentDir 4 | 5 | # NOTE: 6 | # for some time on devel 1.3.x `paramCount` and `paramStr` had to be imported 7 | # os, because they were removed for nimscript. This was reverted in: 8 | # https://github.com/nim-lang/Nim/pull/14658 9 | # For `nimdoc` we still have to import those from `os`! 
10 | when defined(nimdoc): 11 | from os import getCurrentDir, paramCount, paramStr 12 | 13 | #[ 14 | This file is a slightly modified version of the same file of `nimterop`: 15 | https://github.com/nimterop/nimterop/blob/master/nimterop/docs.nim 16 | ]# 17 | 18 | 19 | proc getNimRootDir(): string = 20 | #[ 21 | hack, but works 22 | alternatively (but more complex), use (from a nim file, not nims otherwise 23 | you get Error: ambiguous call; both system.fileExists): 24 | import "$nim/testament/lib/stdtest/specialpaths.nim" 25 | nimRootDir 26 | ]# 27 | fmt"{currentSourcePath}".parentDir.parentDir.parentDir 28 | 29 | const 30 | DirSep = when defined(windows): '\\' else: '/' 31 | 32 | proc execAction(cmd: string): string = 33 | var 34 | ccmd = "" 35 | ret = 0 36 | when defined(Windows): 37 | ccmd = "cmd /c " & cmd 38 | elif defined(posix): 39 | ccmd = cmd 40 | else: 41 | doAssert false 42 | 43 | (result, ret) = gorgeEx(ccmd) 44 | doAssert ret == 0, "Command failed: " & $ret & "\ncmd: " & ccmd & "\nresult:\n" & result 45 | 46 | template genRemove(name: untyped): untyped = 47 | proc `name`(s, toRemove: string): string = 48 | result = s 49 | result.`name`(toRemove) 50 | genRemove(removePrefix) 51 | genRemove(removeSuffix) 52 | 53 | proc getFiles*(path: string): seq[string] = 54 | # Add files and dirs here, which should be skipped. 
55 | #const excludeDirs = [] 56 | #let ExcludeDirSet = toSet(excludeDirs) 57 | #if path.extractFilename in ExcludeDirSet: return 58 | # The files below are not valid by themselves, they are only included 59 | # from other files 60 | #const excludeFiles = [] 61 | #let ExcludeFileSet = toSet(excludeFiles) 62 | 63 | for file in listFiles(path): 64 | if file.endsWith(".nim"): # and file.extractFilename notin ExcludeFileSet: 65 | result.add file 66 | for dir in listDirs(path): 67 | result.add getFiles(dir) 68 | 69 | proc buildDocs*(path: string, docPath: string, 70 | defaultFlags = "", 71 | masterBranch = "master", 72 | defines: openArray[string] = @[]) = 73 | ## Generate docs for all nim files in `path` and output all HTML files to the 74 | ## `docPath` in a flattened form (subdirectories are removed). 75 | ## 76 | ## If duplicate filenames are detected, they will be printed at the end. 77 | ## 78 | ## WARNING: not in use! `baseDir` is the project path by default and `files` and `path` are relative 79 | ## to that directory. Set to "" if using absolute paths. 80 | ## 81 | ## `masterBranch` is the name of the default branch to which the docs should link 82 | ## when clicking the `Source` button below a procedure etc. 83 | ## 84 | ## `defines` is a list of `-d:xxx` define flags (the `xxx` part) that should be passed 85 | ## to `nim doc` so that `getHeader()` is invoked correctly. 86 | ## 87 | ## Use the `--publish` flag with nimble to publish docs contained in 88 | ## `path` to Github in the `gh-pages` branch. This requires the ghp-import 89 | ## package for Python: `pip install ghp-import` 90 | ## 91 | ## WARNING: `--publish` will destroy any existing content in this branch. 92 | ## 93 | ## NOTE: `buildDocs()` only works correctly on Windows with Nim 1.0+ since 94 | ## https://github.com/nim-lang/Nim/pull/11814 is required. 
95 | ## 96 | ## 97 | const gitUrl = "https://github.com/Vindaar/datamancer" 98 | ## WARNING: this means `gen_docs` *only* works if you use `nimble develop` on 99 | ## the repository. Nimble cannot deal with ****. This is frustrating. Thanks. 100 | let baseDir = execAction("nimble path datamancer").parentDir & $DirSep 101 | when defined(windows) and (NimMajor, NimMinor, NimPatch) < (1, 0, 0): 102 | echo "buildDocs() unsupported on Windows for Nim < 1.0 - requires PR #11814" 103 | else: 104 | let 105 | docPath = baseDir & docPath 106 | path = baseDir & path 107 | defStr = block: 108 | var defStr = " " & defaultFlags 109 | for def in defines: 110 | defStr &= " -d:" & def 111 | defStr 112 | nim = getCurrentCompilerExe() 113 | 114 | # now we walk the whole `path` and build the documentation for each `.nim` file. 115 | # While doing that we flatten the directory structure for the generated HTML files. 116 | # `src/foo/bar/baz.nim` just becomes 117 | # `docPath/baz.html`. 118 | # This allows for all files to be in the `docPath` directory, which means each 119 | # file will be able to find the `dochack.js` file, which will be put into 120 | # the `docPath` directory, too (the inclusion of the `dochack.js` is done statically 121 | # via our generated nimdoc.cfg file and is fixed for each generated HTML). 
122 | let files = getFiles(path) 123 | var idx = 0 124 | var fileSet = initHashSet[string]() 125 | var duplSet = initHashSet[string]() 126 | for file in files: 127 | let baseName = file.extractFilename() 128 | let relPath = file.removePrefix(path).removeSuffix(baseName) 129 | let prefix = relPath.strip(chars = {'/'}) # remove possible trailing `/` 130 | .split('/') # split path parts 131 | .join(".") # concat by `.` instead 132 | var outfile = baseName.replace(".nim", ".html") 133 | if outfile in fileSet: 134 | duplSet.incl outfile 135 | else: 136 | fileSet.incl outfile 137 | outfile = docPath / outfile 138 | echo "Processing: ", outfile, " [", idx, "/", files.len, "]" 139 | # NOTE: Changing the current working directory to the project path is required in order for 140 | # `git.commit:` to work! Otherwise we sit in `docs` and for some reason the relative path 141 | # will eat one piece of the resulting `source` links and thereby removing the actual branch 142 | # and we end up with a broken link! 143 | echo execAction(&"cd {baseDir} && {nim} doc {defStr} --git.url:{gitUrl} --git.commit:{masterBranch} --git.devel:{masterBranch} -o:{outfile} --index:on {file}") 144 | inc idx 145 | ## now build the index 146 | echo execAction(&"{nim} buildIndex -o:{docPath}/theindex.html {docPath}") 147 | when declared(getNimRootDir): 148 | #[ 149 | NOTE: running it locally doesn't work anymore on modern chromium browser, 150 | because they block "access from origin 'null' due to CORS policy". 
151 | this enables doc search, works at least locally with: 152 | cd {docPath} && python -m SimpleHTTPServer 9009 153 | ]# 154 | echo execAction(&"{nim} js -o:{docPath}/dochack.js {getNimRootDir()}/tools/dochack/dochack.nim") 155 | 156 | # echo "Processed files: ", fileSet 157 | if duplSet.card > 0: 158 | echo "WARNING: Duplicate filenames detected: ", duplSet 159 | -------------------------------------------------------------------------------- /tests/tests.nim: -------------------------------------------------------------------------------- 1 | import unittest 2 | import datamancer 3 | 4 | import tables, sets 5 | import sequtils, seqmath 6 | import math 7 | 8 | proc almostEq(a, b: float, epsilon = 1e-8): bool = 9 | ## version of `almostEqual` for testing, which prints the values, if 10 | ## they mismatch 11 | result = almostEqual(a, b, epsilon) 12 | if not result: 13 | echo "Comparison failed: a = ", a, ", b = ", b 14 | 15 | suite "Value": 16 | let 17 | v1 = %~ 1 18 | v2 = %~ 1.5 19 | v3 = %~ true 20 | v4 = %~ 'a' 21 | # `v5` itself is already a test, whether we can hash `Value` 22 | v5 = %~ { "test" : v1, 23 | "some" : v2, 24 | "heterogeneous" : v3, 25 | "fields" : v4 }.toOrderedTable 26 | v6 = Value(kind: VNull) 27 | 28 | test "Storing in sets": 29 | var valueSet = initHashSet[Value]() 30 | valueSet.incl v1 31 | valueSet.incl v2 32 | valueSet.incl v3 33 | valueSet.incl v4 34 | valueSet.incl v5 35 | valueSet.incl v6 36 | check v1 in valueSet 37 | check v2 in valueSet 38 | check v3 in valueSet 39 | check v4 in valueSet 40 | check v5 in valueSet 41 | check v6 in valueSet 42 | check valueSet.card == 6 43 | 44 | test "Storing in tables": 45 | var tab = initTable[string, Value]() 46 | tab["v1"] = v1 47 | tab["v2"] = v2 48 | tab["v3"] = v3 49 | tab["v4"] = v4 # is converted to string! 
50 | tab["v5"] = v5 51 | tab["v6"] = v6 52 | check tab.len == 6 53 | check tab["v1"] == v1 54 | check tab["v2"] == v2 55 | check tab["v3"] == v3 56 | check tab["v4"] == v4 57 | check tab["v5"] == v5 58 | check tab["v6"] == v6 59 | 60 | test "Extracting values": 61 | check v1.toInt == 1 62 | check v2.toFloat == 1.5 63 | check v3.toBool == true 64 | check v4.toStr == "a" 65 | check v1.toStr == "1" 66 | check v2.toStr == "1.5" 67 | check v3.toStr == "true" 68 | expect(ValueError): 69 | discard v5.toStr 70 | expect(ValueError): 71 | discard v6.toStr 72 | 73 | test "Direct `isNumber` check": 74 | # Note: this test checks basically whether the content of a `Value` 75 | # to be echoed is recognized as a number (in which case it's engulfed 76 | # by literal ``"``) or a normal string (no ``"``) 77 | let n1 = "1.1" 78 | let n2 = "1.3e5" 79 | let n3 = "aba" 80 | let n4 = "1..1" 81 | let n5 = "1.123" 82 | let n6 = "1.5e5E5" 83 | let n7 = "e" 84 | let n8 = "E" 85 | let n9 = "." 86 | let n10 = "1e" 87 | let n11 = "1E" 88 | let n12 = "1." 
89 | let n13 = "e1" 90 | let n14 = "E1" 91 | let n15 = ".1" 92 | # and some actually valid floats 93 | let n16 = "6.084E+01" 94 | let n17 = "1.676E+01" 95 | let n18 = "6.863E+00" 96 | let n19 = "2.007E+00" 97 | let n20 = "9.329E-01" 98 | let n21 = "2.441E-04" 99 | let n22 = "-2.441E-04" 100 | let n23 = "--2.441" 101 | let n24 = "-6.836E-04 " 102 | let n25 = "2.930E-04 " 103 | let n26 = "2.930E-04 d " 104 | check n1.isNumber 105 | check n2.isNumber 106 | check not n3.isNumber 107 | check not n4.isNumber 108 | check n5.isNumber 109 | check not n6.isNumber 110 | check not n7.isNumber 111 | check not n8.isNumber 112 | check not n9.isNumber 113 | check not n10.isNumber 114 | check not n11.isNumber 115 | check n12.isNumber 116 | check not n13.isNumber 117 | check not n14.isNumber 118 | check not n15.isNumber 119 | check n16.isNumber 120 | check n17.isNumber 121 | check n18.isNumber 122 | check n19.isNumber 123 | check n20.isNumber 124 | check n21.isNumber 125 | check n22.isNumber 126 | check not n23.isNumber 127 | check n24.isNumber 128 | check n25.isNumber 129 | check not n26.isNumber 130 | 131 | test "String conversion": 132 | # Note: this test checks basically whether the content of a `Value` 133 | # to be echoed is recognized as a number (in which case it's engulfed 134 | # by literal ``"``) or a normal string (no ``"``) 135 | # This uses `isNumber` internally. 136 | let n1 = %~ "1.1" 137 | let n2 = %~ "1.3e5" 138 | let n3 = %~ "aba" 139 | let n4 = %~ "1..1" 140 | let n5 = %~ "1.123" 141 | let n6 = %~ "1.5e5E5" 142 | let n7 = %~ "e" 143 | let n8 = %~ "E" 144 | let n9 = %~ "." 145 | let n10 = %~ "1e" 146 | let n11 = %~ "1E" 147 | let n12 = %~ "1." 
148 | let n13 = %~ "e1" 149 | let n14 = %~ "E1" 150 | let n15 = %~ ".1" 151 | # and some actually valid floats 152 | let n16 = %~ "6.084E+01" 153 | let n17 = %~ "1.676E+01" 154 | let n18 = %~ "6.863E+00" 155 | let n19 = %~ "2.007E+00" 156 | let n20 = %~ "9.329E-01" 157 | let n21 = %~ "2.441E-04" 158 | let n22 = %~ "-2.441E-04" 159 | check $n1 == "\"1.1\"" 160 | check $n2 == "\"1.3e5\"" 161 | check $n3 == "aba" 162 | check $n4 == "1..1" 163 | check $n5 == "\"1.123\"" 164 | check $n6 == "1.5e5E5" 165 | check $n7 == "e" 166 | check $n8 == "E" 167 | check $n9 == "." 168 | check $n10 == "1e" 169 | check $n11 == "1E" 170 | check $n12 == "\"1.\"" 171 | check $n13 == "e1" 172 | check $n14 == "E1" 173 | check $n15 == ".1" 174 | check $n16 == "\"6.084E+01\"" 175 | check $n17 == "\"1.676E+01\"" 176 | check $n18 == "\"6.863E+00\"" 177 | check $n19 == "\"2.007E+00\"" 178 | check $n20 == "\"9.329E-01\"" 179 | check $n21 == "\"2.441E-04\"" 180 | check $n22 == "\"-2.441E-04\"" 181 | 182 | # check that `emphStrNumber` can be disabled 183 | check n16.pretty(emphStrNumber = false) == "6.084E+01" 184 | check n17.pretty(emphStrNumber = false) == "1.676E+01" 185 | check n18.pretty(emphStrNumber = false) == "6.863E+00" 186 | check n19.pretty(emphStrNumber = false) == "2.007E+00" 187 | check n20.pretty(emphStrNumber = false) == "9.329E-01" 188 | check n21.pretty(emphStrNumber = false) == "2.441E-04" 189 | check n22.pretty(emphStrNumber = false) == "-2.441E-04" 190 | 191 | 192 | test "Math with Values": 193 | check (v1 * v2).kind == VFloat 194 | check (v1 + v1).kind == VFloat 195 | check (v1 + v1) == %~ 2 196 | check (v1 * v1).kind == VFloat 197 | check almostEq((v1 * v2).toFloat, 1.5) 198 | check almostEq((v1 / v2).toFloat, 2.0 / 3.0) 199 | check v1 * v6 == Value(kind: VNull) 200 | 201 | suite "Formula": 202 | test "Testing ~ formula creation using f{} macro": 203 | let f = f{"meanCty" ~ (c"hwy" + c"cty")} 204 | # manual parens still appear in `name`! 
205 | check f.name == "(~ meanCty ((+ hwy cty)))" 206 | when defined(defaultBackend): 207 | let g = meanCty ~ hwy + cty 208 | check $f == $g 209 | # TODO: Add more tests here... 210 | # create with `.` access 211 | let tup = (a: 5.5, b: "ok") 212 | let h = f{%~ tup.a == %~ tup.b} 213 | check h.kind == fkVariable 214 | check h.val == %~ false 215 | check h.name == "(== (%~ tup.a) (%~ tup.b))" 216 | 217 | let f2 = f{float: "min" << min(c"runTimes")} 218 | check $f2 == "min" # LHS of formula 219 | check f2.name == "(<< min (min runTimes))" 220 | 221 | 222 | test "Evaluate raw formula (no DF column dependency)": 223 | # arithmetic works 224 | check evaluate(f{1 + 2}) == %~ 3 225 | # parens work in arithmetic 226 | check evaluate(f{2 * (5 - 3)}) == %~ 4 227 | check evaluate(f{10 / 10}) == %~ 1 228 | # strings are evaluated to themseles 229 | check evaluate(f{"hwy"}) == %~ "hwy" 230 | 231 | test "Formula, literal on RHS": 232 | let f = f{"from" ~ 0} 233 | check f.name == "(~ from 0)" 234 | 235 | test "Test formula creation of type `fkVariable`": 236 | let f1 = f{"Test"} 237 | let f2 = f{1.1} 238 | let f3 = f{4} 239 | let f4 = f{true} 240 | check f1.kind == fkVariable 241 | check f2.kind == fkVariable 242 | check f3.kind == fkVariable 243 | check f4.kind == fkVariable 244 | check $f1 == "Test" 245 | check $f2 == "1.1" 246 | check $f3 == "4" 247 | check $f4 == "true" 248 | -------------------------------------------------------------------------------- /README.org: -------------------------------------------------------------------------------- 1 | 2 | * Datamancer 3 | [[https://github.com/SciNim/datamancer/workflows/datamancer%20CI/badge.svg]] 4 | 5 | #+ATTR_HTML: title="Join the chat at https://gitter.im/SciNim/Community" 6 | [[https://gitter.im/SciNim/Community][file:https://badges.gitter.im/SciNim/Community.svg]] 7 | 8 | ** Comparison to other dataframe libraries 9 | 10 | Check out the following gist for a comparison of this library with 11 | dplyr (R) and pandas 
(Python): 12 | 13 | https://gist.github.com/Vindaar/6908c038707c7d8293049edb3d204f84 14 | 15 | 16 | ** Documentation 17 | 18 | The documentation is found at: 19 | 20 | https://scinim.github.io/Datamancer/ 21 | 22 | with a short introduction under: 23 | 24 | https://scinim.github.io/Datamancer/datamancer.html 25 | 26 | ** Installation & dependencies 27 | 28 | Installation should be just a 29 | #+BEGIN_SRC sh 30 | nimble install datamancer 31 | #+END_SRC 32 | away. 33 | 34 | ** Features and formulas 35 | 36 | The data frame provides the "5 verbs" of [[https://dplyr.tidyverse.org/][dplyr]] and more. Main implemented functions: 37 | - =filter= 38 | - =mutate=, =transmute= 39 | - =select=, =rename= 40 | - =arrange= 41 | - =summarize= 42 | - =group_by= 43 | - =arrange= 44 | - =inner_join= 45 | - =set_diff= 46 | - =count= 47 | - =bind_rows= 48 | - =gather= 49 | - =unique=, 50 | which are all based on the =FormulaNode= object. Basically they all 51 | receive =varargs[FormulaNode]=, which is evaluated in context of the 52 | given dataframe. 53 | 54 | ** A few words on the =f{}= macro to create formulas 55 | 56 | Use: 57 | - no infix symbol and only code, which does not involve a column in 58 | the sense defined below in [[Column access]]: 59 | #+BEGIN_SRC nim 60 | f{1 + 2} 61 | f{"foo"} 62 | f{true} 63 | #+END_SRC 64 | a =FormulaNode= of kind =fkVariable=. Stores the values as a =Value= 65 | variant object. 66 | - =<-= for assignment 67 | #+BEGIN_SRC nim 68 | f{"newName" <- "oldName"} 69 | #+END_SRC 70 | a =FormulaNode= of kind =fkAssign=. 71 | This does not involve a closure and is just a simple object storing 72 | a LHS as a string and the RHS as a =Value= (to also support constant 73 | columns via =f{"constantCol" <- 5}=). 74 | Typically used for =rename= or as an argument for =transmute= and 75 | =mutate= to just rename a column or to assign a constant column. 
76 | - =<<= for reduce operations 77 | #+BEGIN_SRC nim 78 | f{"meanHwy" << mean(`hwy`)} 79 | #+END_SRC 80 | a =FormulaNode= of kind =fkScalar=. 81 | Used only for =summarize= and means we reduce a full column to a 82 | single =Value=. This generates a closure, which computes the RHS and 83 | assigns it to a result variable of type =Value=. Type hints are 84 | required (for now) if only a single proc call is involved on the 85 | RHS to tell the macro as what to read the column "hwy" and what the 86 | result variable is. 87 | - =~= for vector like proc 88 | #+BEGIN_SRC nim 89 | f{"xSquared" ~ `x` * `x`} 90 | #+END_SRC 91 | a =FormulaNode= of kind =fkVector=. 92 | Used in =mutate=, =transmute= to calculate a full column. This also 93 | generates a closure as the reduce operations =<<= does, except here 94 | we loop over the length of the DF and access each read tensor via =[idx]=. 95 | - a formula without any infix symbols will be considered: 96 | - =fkVariable= if no column involved 97 | - =fkVector= else 98 | 99 | *** Column access 100 | To access columns in the context of formula, the biggest change 101 | occured. In the old formula system, a literal string was attempted to 102 | be resolved as a DF column dynamically. Since the new formulas are 103 | compiled to closures, this would involve overhead and is thus avoided 104 | for clearer separation between columns and real strings. This also 105 | helps readers of a formula. This means: 106 | - =`columnName`=: accented quotes refer to a DF column. Be careful to 107 | only use this for simple letters (no non letter characters or spaces). 108 | - =c"columnName"= : call string literals (by convention use a =c= 109 | before the string) are interpreted as a column in the same way as 110 | accented quotes, but allow for column names with spaces / non letter 111 | characters. 
112 | - =idx("columnName"), idx(`columnName`), idx(nimExpressionReturningString)=: 113 | refers to a specific element of the referred column 114 | - =col("columnName"), col(`columnName`), col(nimExpressionReturningString)=: 115 | refers to the full tensor of the referred column 116 | - or directly via: =df[nimExpressionReturningString] / 117 | df[nimExpressionReturningString][idx]=: to access columns / indices using 118 | identifiers / symbols / general expressions that return a string 119 | (accented quotes, call string literals or just string literals). This is 120 | equivalent to =idx= / =col=, so the latter are preferred. 121 | 122 | The closures take a data frame as an argument, which is named 123 | =df=. The =df["columnName"]= refers to that argument, although not 124 | literally (it is gen'symmed and =df["columnName"]= refers to a 125 | =Column=). From that column we get the underlying =Tensor=. 126 | 127 | In the context of calling procedures, e.g.: 128 | #+BEGIN_SRC nim 129 | f{someProc(`columnName`)} 130 | #+END_SRC 131 | it may not be clear whether the procedure is supposed to take the 132 | whole tensor as an argument or hand each element of the tensor in a 133 | loop. Internally the macro tries to determine a suitable call for 134 | either a scalar or tensor argument. If the called procedure is unique 135 | this will likely succeed. In case of heavily overloaded symbols 136 | (e.g. =max=) it also tries to determine a match from (if any) 137 | additional arguments given to that procedure (and uses their types if 138 | they are not column references). 139 | 140 | In case it cannot be resolved, you will get an error at compile time 141 | to specify =idx= (per index access) or =col= (full column access) of the column. 142 | 143 | So for example: 144 | #+BEGIN_SRC nim 145 | f{"asFloat" ~ parseFloat(idx("colName"))} 146 | #+END_SRC 147 | where =parseFloat= acts on each element individually. 
If there is only 148 | a single overload (as in case of =parseFloat=), the input and output 149 | types are inferred automatically to be: 150 | - read tensor =colName= as a =string= 151 | - result type is =float= 152 | 153 | *** Type hints 154 | Type hints are required if the formula macro cannot determine the type 155 | required, either input or output. This is usually the case for 156 | ambiguous operations (overloaded procedures, only a single column 157 | without any operations, etc.). They are of 158 | the form: 159 | - =<dtype>: <formula>=: simple type hint for the type of the 160 | underlying tensor of the columns involved in the formula. 161 | - =<dtype> -> <outDtype>: <formula>=: full type for closure. 162 | =<dtype>= is the dtype used for input tensors, =<outDtype>= the resulting 163 | type. 164 | For example: 165 | #+begin_src nim 166 | f{int -> int: `x` * `y`} 167 | # ^--- type of the tensors involved on the RHS. Will be read as integers 168 | # ^--- type of the resulting tensor 169 | #+end_src 170 | In this case the type would be determined to be float by the macro, so 171 | type hints are required in case we need them to be integers. 172 | 173 | *NOTE:* it is not possible to include tensors of different data types 174 | in a single formula using type hints. However, if they appear in 175 | different branches of the formula AST and the types are determined 176 | automatically, this is possible. All input tensors of a computation will be read 177 | either by the automatically deduced data type or the =<dtype>= argument 178 | mentioned here. If an underlying tensor is not actually of the given 179 | data type, it will be converted via =T(val)=, where =T= is the type or 180 | if the conversion is not possible a runtime exception will be thrown. 181 | 182 | In addition to looking at symbols in the scope, there is a step 183 | involving some simple heuristic rules, e.g. if =*=, =/= is involved, it's 184 | assumed that the input tensors are floats and the output as well. 
If 185 | =&= or =$= is involved, it's assumed to be strings. 186 | Finally if =and= and other logic keywords are used, the result is 187 | assumed to be =bool= (not the input though!). 188 | 189 | #+BEGIN_SRC nim 190 | const floatSet = toSet(@["+", "-", "*", "/", "mod"]) 191 | const stringSet = toSet(@["&", "$"]) 192 | const boolSet = toSet(@["and", "or", "xor", ">", "<", ">=", "<=", "==", "!=", 193 | "true", "false", "in", "notin"]) 194 | #+END_SRC 195 | 196 | *** Notes on formula macro internals 197 | 198 | For an insight into the implementation details, ideas and development 199 | notes, check out the following document: 200 | 201 | https://github.com/SciNim/Datamancer/blob/master/notes/formula_dev_notes.org 202 | -------------------------------------------------------------------------------- /tests/testsFormula.nim: -------------------------------------------------------------------------------- 1 | import datamancer, unittest, sequtils, math, strutils, streams, sugar 2 | import seqmath 3 | 4 | type 5 | Foo = object 6 | fd: float 7 | 8 | suite "Formulas": 9 | let a = [1, 2, 3] 10 | let b = [3, 4, 5] 11 | let c = [4, 5, 6] 12 | let d = [8, 9, 10] 13 | let e = [11, 12, 13] 14 | let f = [false, true, false] 15 | let g = ["hello", "world", "foo"] 16 | let h = [2.5, 7.5, NaN] 17 | let i = ["5", "6", "7"] 18 | let df = seqsToDf(a, b, c, d, e, f, g, h, i) 19 | test "Basic `idx` tests with automatic type deduction from context": 20 | block: 21 | # - infix, "a" read as integer automatically 22 | let fn = f{ idx("a") == 5 } 23 | check fn.evaluate(df).bCol == [false, false, false].toTensor 24 | block: 25 | # - infix, a read as float automatically 26 | let fn = f{ idx("a") == 5.5 } 27 | check fn.evaluate(df).bCol == [false, false, false].toTensor 28 | block: 29 | # - infix involving `in`, type conversion on `idx` and set 30 | let fn = f{ idx("a").int8 in {1'i8, 3, 5, 7} } 31 | check fn.evaluate(df).bCol == [true, false, true].toTensor 32 | block: 33 | # - infix of `>` 
works 34 | # - type determined automatically 35 | let fn = f{ 5 > idx("a") } 36 | check fn.evaluate(df).bCol == [true, true, true].toTensor 37 | block: 38 | # - infix of `>` works w/ order switched around 39 | # - type determined automatically 40 | let fn = f{ idx("a") > 5 } 41 | check fn.evaluate(df).bCol == [false, false, false].toTensor 42 | block: 43 | # - type deduction on one side works with `Value` 44 | let fn = f{ idx("a") >= %~ 5.5 } 45 | check fn.evaluate(df).bCol == [false, false, false].toTensor 46 | block: 47 | # - reads data as `bool` 48 | # - runtime error due to a, b being int 49 | ## TODO: decide if this should become a CT error due to ambiguity. 50 | ## Probably yes, requires change to `assignType` I suppose (not to use 51 | ## default type info here) 52 | expect(ValueError): 53 | let fn = f{ idx("a") > idx("b") } 54 | discard fn.evaluate(df) 55 | block: 56 | # - RHS is float, infix means LHS will be read as float 57 | let fn = f{idx("a") < idx("b").float } 58 | check fn.evaluate(df).bCol == [true, true, true].toTensor 59 | block: 60 | # - above works with `==` too 61 | let fn = f{ idx("a") == idx("b").float } 62 | check fn.evaluate(df).bCol == [false, false, false].toTensor 63 | block: 64 | var fm = Foo(fd: 5.2) 65 | let fn = f{ idx("a") > fm.fd } 66 | check fn.evaluate(df).bCol == [false, false, false].toTensor 67 | 68 | block: 69 | # - prefix, automatic type deduction 70 | let fn = f{ not idx("f") } 71 | check fn.evaluate(df).bCol == [true, false, true].toTensor 72 | block: 73 | let fn = f{ idx("x") >= max(col("x")) * 0.5 } 74 | 75 | block: 76 | let fn = f{ parseInt(idx("a")) > 2 } 77 | 78 | test "Basic `col` test with type deduction from context": 79 | block: 80 | ## the following fails at CT, because type of output is ambiguous (max is overloaded) 81 | # let fn = f{ col("a").max } 82 | ## This one should always work 83 | let fn2 = f{float: col("a").max } 84 | check fn2.reduce(df).toInt == 3 85 | 86 | block: 87 | # - accessing column length 
works 88 | let fn = f{float: col("a").len } 89 | check fn.reduce(df).toInt == 3 90 | 91 | block: 92 | # - accessing tensor elments with bracket 93 | let fn = f{float: col("a")[1] } 94 | check fn.reduce(df).toInt == 2 95 | 96 | test "Automatic type deduction based on nnkDotExpr w/ a (non ambiguous) proc call": 97 | block: 98 | # - examples of determining type from unique procedure in a case where 99 | # heuristic type extraction fails 100 | proc uniqueProcWithType(x: int): int = 101 | x + 5 102 | let fn = f{ idx("a").uniqueProcWithType } 103 | check fn.evaluate(df).iCol == [6, 7, 8].toTensor 104 | 105 | test "Automatic type deduction based on `idx` in argument of a call overloaded proc call": 106 | block: 107 | # - type deduction based on `idx` in specific argument of a typically overloaded 108 | # symbol. Can be deduced due to only single overload matching the arguments 109 | proc someInt(): int = 2 110 | proc max(x: int, y: string, z: float, b: int): int = 111 | result = 5 112 | let fn = f{ max(idx("a"), "hello", 5.5, someInt()) } 113 | check fn.evaluate(df).iCol == [5, 5, 5].toTensor 114 | 115 | block: 116 | # - automatically determines that `a` should be read as `int` 117 | # - formula is mapping 118 | let fn = f{ max(idx("a"), 2) } 119 | check fn.evaluate(df).iCol == [2, 2, 3].toTensor 120 | 121 | test "Formula with an if expression accessing multiple columns": 122 | block: 123 | # - formula with an if expression accessing multiple columns 124 | let fn = f{int -> int: if `a` < 2: 125 | `b` 126 | else: 127 | `c` } 128 | check fn.evaluate(df).iCol == [3, 5, 6].toTensor 129 | 130 | when (NimMajor, NimMinor, NimPatch) >= (1, 4, 0): 131 | block: 132 | ## TODO: 1. we need the parenthesis (otherwise lexer error) 133 | ## 2. return type is deduced to be bool. It should be taken from 134 | ## the if expression! `nnkIfExpr` not implemented yet. 
135 | let fn = f{float -> float: "h" ~ (if classify(idx("h")) == fcNaN: 136 | -1.0 137 | else: 138 | `h`)} 139 | check fn.evaluate(df).fCol == [2.5, 7.5, -1.0].toTensor 140 | 141 | test "Dot expression requiring `Value` input works automatically": 142 | block: 143 | # - dot call requiring `Value` argument, output is object column (because 144 | # `isNull` returns a boolean as a `Value` 145 | let fn = f{ idx("a").isNull } 146 | check fn.evaluate(df).oCol == [%~ false, %~ false, %~ false].toTensor 147 | 148 | test "Infix with `notin` and local array": 149 | block: 150 | # - `notin` works and determines `g` 151 | let existKeys = ["hello"] 152 | let fn = f{string: `g` notin existKeys} 153 | check fn.evaluate(df).bCol == [false, true, true].toTensor 154 | 155 | test "`ggplotnim` formula accessing (proc) field of an object": 156 | block: 157 | type 158 | MS = object 159 | trans: proc(x: float): float 160 | let col = %~ "a" 161 | let ms = MS(trans: (proc(x: float): float = 5.5)) 162 | let colStr = "log10(x4)" 163 | let fn = f{float: colStr ~ ms.trans( df[col.toStr][idx] ) } 164 | check fn.evaluate(df).fCol == [5.5, 5.5, 5.5].toTensor 165 | 166 | test "`max` overload is resolved in context of infix with float": 167 | block: 168 | let fn = f{ `a` >= max(`a`) * 0.5 } 169 | check fn.evaluate(df).bCol == [false, true, true].toTensor 170 | 171 | block: 172 | ## TODO: this is technically broken, because from `*` we take `float` 173 | ## as result and from the integer `-1` we determine the infix to be 174 | ## integer 175 | #let fn = f{ -1 * c"hwy"} 176 | 177 | test "Reducing formula with boolean return value": 178 | block: 179 | let df2 = seqsToDf({"var1" : toSeq(0 ..< 10)}) 180 | let fn = f{ sum(`var1`) > 20000 } 181 | check fn.reduce(df2).toBool == false 182 | 183 | test "Example of no `idx` but reducing proc (mean) as a mapping": 184 | block: 185 | ## example of a formula that contradicts our assumption that we should error in 186 | ## case the determined formula kind and the 
given one mismatch. 187 | ## In this case we might *want* to assign something + the mean for each element in 188 | ## the DF (in the context of a `group_by` call this makes sense! 189 | ## We'll turn it into a warning. 190 | ## Also: keep in mind that if the user writes something, same as with type hints, we 191 | ## should value that decision. 192 | # here we only check it compiles (no CT error anymore) 193 | let fn = f{float -> float: "subMeanHwy" ~ 0.0 + mean(col("hwy"))} 194 | 195 | test "Name test": 196 | let f = f{"meanCty" ~ (c"hwy" + c"cty")} 197 | # name is the full name. Manual parens (nnkPar) are included in representation. 198 | check f.name == "(~ meanCty ((+ hwy cty)))" 199 | 200 | test "Constant mapping of integer": 201 | let countCol = "count" 202 | let fn = f{int: countCol ~ 0} 203 | check fn.evaluate(df).iCol == [0, 0, 0].toTensor 204 | 205 | test "Name of long formula": 206 | const cut_rms_trans_low = 0.1 207 | const cut_rms_trans_high = 1.5 208 | proc inRegion(x, y: float, r: string): bool = 209 | discard 210 | 211 | let fn = f{float -> bool: 212 | `rmsTransverse` >= cut_rms_trans_low and 213 | `rmsTransverse` <= cut_rms_trans_high and 214 | inRegion(df["centerX"][idx], df["centerY"][idx], "crSilver") and 215 | `hits` < 500} 216 | 217 | check $fn == """(and (and (and (>= rmsTransverse cut_rms_trans_low) (<= rmsTransverse cut_rms_trans_high)) (inRegion df["centerX"][idx] df["centerY"][idx] crSilver)) (< hits 500))""" 218 | 219 | test "Explicit types in `col`, `idx`": 220 | block: 221 | # explicit types work 222 | let fn = f{ idx("a", int) } 223 | check fn.evaluate(df).iCol == [1, 2, 3].toTensor 224 | 225 | block: 226 | # mixing explicit types work 227 | let fn = f{ idx("a", int) + idx("i", string).parseInt} 228 | check fn.evaluate(df).iCol == [6, 8, 10].toTensor 229 | 230 | block: 231 | # type hints do ``not`` overwrite explicit types 232 | let fn = f{string -> int: ( 233 | if `g` == "hello": 234 | idx("a", int) 235 | else: 236 | idx("b", int)) } 
237 | check fn.evaluate(df).iCol == [1, 4, 5].toTensor 238 | 239 | test "Add with integer should produce integer": 240 | let fn = f{"a+5" ~ `a` + 5 } 241 | check fn.evaluate(df).kind == colInt 242 | check fn.evaluate(df).iCol == [6, 7, 8].toTensor 243 | 244 | test "Add with float should produce float": 245 | let fn = f{"a+5.0" ~ `a` + 5.0 } 246 | check fn.evaluate(df).kind == colFloat 247 | check fn.evaluate(df).fCol == [6.0, 7.0, 8.0].toTensor 248 | 249 | test "Complex reduction with multiple types and type deduction of `mean`": 250 | # this was broken up to `v0.1.3` 251 | let df = seqsToDf({ "x" : @[1, 2, 3, 4, 5], "y" : @["a", "b", "c", "d", "e"] }) 252 | block: 253 | let fn = f{"mean+ord" << mean(`x`) + ord(max(col(`y`, string))[0]).float } 254 | check fn.reduce(df).kind == VFloat 255 | check fn.reduce(df).toFloat == 104.0 256 | block: 257 | let fn = f{"mean+ord" << mean(`x`) + col(`y`, string).max[0].ord.float } 258 | check fn.reduce(df).kind == VFloat 259 | check fn.reduce(df).toFloat == 104.0 260 | 261 | test "Formula variable name generation": 262 | # this was broken up to `v0.1.8`, as all variables were turned into `colT` 263 | # (we just *removed* the part that made each column unique) 264 | let df = seqsToDf({"0" : [1,1,1], "1" : [2,2,2], "2" : [3,3,3]}) 265 | let fn = f{idx("0") + idx("1") + idx("2")} 266 | check fn.evaluate(df).toTensor(int) == toTensor [6,6,6] 267 | -------------------------------------------------------------------------------- /data/mpg.csv: -------------------------------------------------------------------------------- 1 | manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class 2 | audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact 3 | audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact 4 | audi,a4,2,2008,4,manual(m6),f,20,31,p,compact 5 | audi,a4,2,2008,4,auto(av),f,21,30,p,compact 6 | audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact 7 | audi,a4,2.8,1999,6,manual(m5),f,18,26,p,compact 8 | audi,a4,3.1,2008,6,auto(av),f,18,27,p,compact 9 
| audi,a4 quattro,1.8,1999,4,manual(m5),4,18,26,p,compact 10 | audi,a4 quattro,1.8,1999,4,auto(l5),4,16,25,p,compact 11 | audi,a4 quattro,2,2008,4,manual(m6),4,20,28,p,compact 12 | audi,a4 quattro,2,2008,4,auto(s6),4,19,27,p,compact 13 | audi,a4 quattro,2.8,1999,6,auto(l5),4,15,25,p,compact 14 | audi,a4 quattro,2.8,1999,6,manual(m5),4,17,25,p,compact 15 | audi,a4 quattro,3.1,2008,6,auto(s6),4,17,25,p,compact 16 | audi,a4 quattro,3.1,2008,6,manual(m6),4,15,25,p,compact 17 | audi,a6 quattro,2.8,1999,6,auto(l5),4,15,24,p,midsize 18 | audi,a6 quattro,3.1,2008,6,auto(s6),4,17,25,p,midsize 19 | audi,a6 quattro,4.2,2008,8,auto(s6),4,16,23,p,midsize 20 | chevrolet,c1500 suburban 2wd,5.3,2008,8,auto(l4),r,14,20,r,suv 21 | chevrolet,c1500 suburban 2wd,5.3,2008,8,auto(l4),r,11,15,e,suv 22 | chevrolet,c1500 suburban 2wd,5.3,2008,8,auto(l4),r,14,20,r,suv 23 | chevrolet,c1500 suburban 2wd,5.7,1999,8,auto(l4),r,13,17,r,suv 24 | chevrolet,c1500 suburban 2wd,6,2008,8,auto(l4),r,12,17,r,suv 25 | chevrolet,corvette,5.7,1999,8,manual(m6),r,16,26,p,2seater 26 | chevrolet,corvette,5.7,1999,8,auto(l4),r,15,23,p,2seater 27 | chevrolet,corvette,6.2,2008,8,manual(m6),r,16,26,p,2seater 28 | chevrolet,corvette,6.2,2008,8,auto(s6),r,15,25,p,2seater 29 | chevrolet,corvette,7,2008,8,manual(m6),r,15,24,p,2seater 30 | chevrolet,k1500 tahoe 4wd,5.3,2008,8,auto(l4),4,14,19,r,suv 31 | chevrolet,k1500 tahoe 4wd,5.3,2008,8,auto(l4),4,11,14,e,suv 32 | chevrolet,k1500 tahoe 4wd,5.7,1999,8,auto(l4),4,11,15,r,suv 33 | chevrolet,k1500 tahoe 4wd,6.5,1999,8,auto(l4),4,14,17,d,suv 34 | chevrolet,malibu,2.4,1999,4,auto(l4),f,19,27,r,midsize 35 | chevrolet,malibu,2.4,2008,4,auto(l4),f,22,30,r,midsize 36 | chevrolet,malibu,3.1,1999,6,auto(l4),f,18,26,r,midsize 37 | chevrolet,malibu,3.5,2008,6,auto(l4),f,18,29,r,midsize 38 | chevrolet,malibu,3.6,2008,6,auto(s6),f,17,26,r,midsize 39 | dodge,caravan 2wd,2.4,1999,4,auto(l3),f,18,24,r,minivan 40 | dodge,caravan 2wd,3,1999,6,auto(l4),f,17,24,r,minivan 41 | 
dodge,caravan 2wd,3.3,1999,6,auto(l4),f,16,22,r,minivan 42 | dodge,caravan 2wd,3.3,1999,6,auto(l4),f,16,22,r,minivan 43 | dodge,caravan 2wd,3.3,2008,6,auto(l4),f,17,24,r,minivan 44 | dodge,caravan 2wd,3.3,2008,6,auto(l4),f,17,24,r,minivan 45 | dodge,caravan 2wd,3.3,2008,6,auto(l4),f,11,17,e,minivan 46 | dodge,caravan 2wd,3.8,1999,6,auto(l4),f,15,22,r,minivan 47 | dodge,caravan 2wd,3.8,1999,6,auto(l4),f,15,21,r,minivan 48 | dodge,caravan 2wd,3.8,2008,6,auto(l6),f,16,23,r,minivan 49 | dodge,caravan 2wd,4,2008,6,auto(l6),f,16,23,r,minivan 50 | dodge,dakota pickup 4wd,3.7,2008,6,manual(m6),4,15,19,r,pickup 51 | dodge,dakota pickup 4wd,3.7,2008,6,auto(l4),4,14,18,r,pickup 52 | dodge,dakota pickup 4wd,3.9,1999,6,auto(l4),4,13,17,r,pickup 53 | dodge,dakota pickup 4wd,3.9,1999,6,manual(m5),4,14,17,r,pickup 54 | dodge,dakota pickup 4wd,4.7,2008,8,auto(l5),4,14,19,r,pickup 55 | dodge,dakota pickup 4wd,4.7,2008,8,auto(l5),4,14,19,r,pickup 56 | dodge,dakota pickup 4wd,4.7,2008,8,auto(l5),4,9,12,e,pickup 57 | dodge,dakota pickup 4wd,5.2,1999,8,manual(m5),4,11,17,r,pickup 58 | dodge,dakota pickup 4wd,5.2,1999,8,auto(l4),4,11,15,r,pickup 59 | dodge,durango 4wd,3.9,1999,6,auto(l4),4,13,17,r,suv 60 | dodge,durango 4wd,4.7,2008,8,auto(l5),4,13,17,r,suv 61 | dodge,durango 4wd,4.7,2008,8,auto(l5),4,9,12,e,suv 62 | dodge,durango 4wd,4.7,2008,8,auto(l5),4,13,17,r,suv 63 | dodge,durango 4wd,5.2,1999,8,auto(l4),4,11,16,r,suv 64 | dodge,durango 4wd,5.7,2008,8,auto(l5),4,13,18,r,suv 65 | dodge,durango 4wd,5.9,1999,8,auto(l4),4,11,15,r,suv 66 | dodge,ram 1500 pickup 4wd,4.7,2008,8,manual(m6),4,12,16,r,pickup 67 | dodge,ram 1500 pickup 4wd,4.7,2008,8,auto(l5),4,9,12,e,pickup 68 | dodge,ram 1500 pickup 4wd,4.7,2008,8,auto(l5),4,13,17,r,pickup 69 | dodge,ram 1500 pickup 4wd,4.7,2008,8,auto(l5),4,13,17,r,pickup 70 | dodge,ram 1500 pickup 4wd,4.7,2008,8,manual(m6),4,12,16,r,pickup 71 | dodge,ram 1500 pickup 4wd,4.7,2008,8,manual(m6),4,9,12,e,pickup 72 | dodge,ram 1500 pickup 
4wd,5.2,1999,8,auto(l4),4,11,15,r,pickup 73 | dodge,ram 1500 pickup 4wd,5.2,1999,8,manual(m5),4,11,16,r,pickup 74 | dodge,ram 1500 pickup 4wd,5.7,2008,8,auto(l5),4,13,17,r,pickup 75 | dodge,ram 1500 pickup 4wd,5.9,1999,8,auto(l4),4,11,15,r,pickup 76 | ford,expedition 2wd,4.6,1999,8,auto(l4),r,11,17,r,suv 77 | ford,expedition 2wd,5.4,1999,8,auto(l4),r,11,17,r,suv 78 | ford,expedition 2wd,5.4,2008,8,auto(l6),r,12,18,r,suv 79 | ford,explorer 4wd,4,1999,6,auto(l5),4,14,17,r,suv 80 | ford,explorer 4wd,4,1999,6,manual(m5),4,15,19,r,suv 81 | ford,explorer 4wd,4,1999,6,auto(l5),4,14,17,r,suv 82 | ford,explorer 4wd,4,2008,6,auto(l5),4,13,19,r,suv 83 | ford,explorer 4wd,4.6,2008,8,auto(l6),4,13,19,r,suv 84 | ford,explorer 4wd,5,1999,8,auto(l4),4,13,17,r,suv 85 | ford,f150 pickup 4wd,4.2,1999,6,auto(l4),4,14,17,r,pickup 86 | ford,f150 pickup 4wd,4.2,1999,6,manual(m5),4,14,17,r,pickup 87 | ford,f150 pickup 4wd,4.6,1999,8,manual(m5),4,13,16,r,pickup 88 | ford,f150 pickup 4wd,4.6,1999,8,auto(l4),4,13,16,r,pickup 89 | ford,f150 pickup 4wd,4.6,2008,8,auto(l4),4,13,17,r,pickup 90 | ford,f150 pickup 4wd,5.4,1999,8,auto(l4),4,11,15,r,pickup 91 | ford,f150 pickup 4wd,5.4,2008,8,auto(l4),4,13,17,r,pickup 92 | ford,mustang,3.8,1999,6,manual(m5),r,18,26,r,subcompact 93 | ford,mustang,3.8,1999,6,auto(l4),r,18,25,r,subcompact 94 | ford,mustang,4,2008,6,manual(m5),r,17,26,r,subcompact 95 | ford,mustang,4,2008,6,auto(l5),r,16,24,r,subcompact 96 | ford,mustang,4.6,1999,8,auto(l4),r,15,21,r,subcompact 97 | ford,mustang,4.6,1999,8,manual(m5),r,15,22,r,subcompact 98 | ford,mustang,4.6,2008,8,manual(m5),r,15,23,r,subcompact 99 | ford,mustang,4.6,2008,8,auto(l5),r,15,22,r,subcompact 100 | ford,mustang,5.4,2008,8,manual(m6),r,14,20,p,subcompact 101 | honda,civic,1.6,1999,4,manual(m5),f,28,33,r,subcompact 102 | honda,civic,1.6,1999,4,auto(l4),f,24,32,r,subcompact 103 | honda,civic,1.6,1999,4,manual(m5),f,25,32,r,subcompact 104 | honda,civic,1.6,1999,4,manual(m5),f,23,29,p,subcompact 105 | 
honda,civic,1.6,1999,4,auto(l4),f,24,32,r,subcompact 106 | honda,civic,1.8,2008,4,manual(m5),f,26,34,r,subcompact 107 | honda,civic,1.8,2008,4,auto(l5),f,25,36,r,subcompact 108 | honda,civic,1.8,2008,4,auto(l5),f,24,36,c,subcompact 109 | honda,civic,2,2008,4,manual(m6),f,21,29,p,subcompact 110 | hyundai,sonata,2.4,1999,4,auto(l4),f,18,26,r,midsize 111 | hyundai,sonata,2.4,1999,4,manual(m5),f,18,27,r,midsize 112 | hyundai,sonata,2.4,2008,4,auto(l4),f,21,30,r,midsize 113 | hyundai,sonata,2.4,2008,4,manual(m5),f,21,31,r,midsize 114 | hyundai,sonata,2.5,1999,6,auto(l4),f,18,26,r,midsize 115 | hyundai,sonata,2.5,1999,6,manual(m5),f,18,26,r,midsize 116 | hyundai,sonata,3.3,2008,6,auto(l5),f,19,28,r,midsize 117 | hyundai,tiburon,2,1999,4,auto(l4),f,19,26,r,subcompact 118 | hyundai,tiburon,2,1999,4,manual(m5),f,19,29,r,subcompact 119 | hyundai,tiburon,2,2008,4,manual(m5),f,20,28,r,subcompact 120 | hyundai,tiburon,2,2008,4,auto(l4),f,20,27,r,subcompact 121 | hyundai,tiburon,2.7,2008,6,auto(l4),f,17,24,r,subcompact 122 | hyundai,tiburon,2.7,2008,6,manual(m6),f,16,24,r,subcompact 123 | hyundai,tiburon,2.7,2008,6,manual(m5),f,17,24,r,subcompact 124 | jeep,grand cherokee 4wd,3,2008,6,auto(l5),4,17,22,d,suv 125 | jeep,grand cherokee 4wd,3.7,2008,6,auto(l5),4,15,19,r,suv 126 | jeep,grand cherokee 4wd,4,1999,6,auto(l4),4,15,20,r,suv 127 | jeep,grand cherokee 4wd,4.7,1999,8,auto(l4),4,14,17,r,suv 128 | jeep,grand cherokee 4wd,4.7,2008,8,auto(l5),4,9,12,e,suv 129 | jeep,grand cherokee 4wd,4.7,2008,8,auto(l5),4,14,19,r,suv 130 | jeep,grand cherokee 4wd,5.7,2008,8,auto(l5),4,13,18,r,suv 131 | jeep,grand cherokee 4wd,6.1,2008,8,auto(l5),4,11,14,p,suv 132 | land rover,range rover,4,1999,8,auto(l4),4,11,15,p,suv 133 | land rover,range rover,4.2,2008,8,auto(s6),4,12,18,r,suv 134 | land rover,range rover,4.4,2008,8,auto(s6),4,12,18,r,suv 135 | land rover,range rover,4.6,1999,8,auto(l4),4,11,15,p,suv 136 | lincoln,navigator 2wd,5.4,1999,8,auto(l4),r,11,17,r,suv 137 | lincoln,navigator 
2wd,5.4,1999,8,auto(l4),r,11,16,p,suv 138 | lincoln,navigator 2wd,5.4,2008,8,auto(l6),r,12,18,r,suv 139 | mercury,mountaineer 4wd,4,1999,6,auto(l5),4,14,17,r,suv 140 | mercury,mountaineer 4wd,4,2008,6,auto(l5),4,13,19,r,suv 141 | mercury,mountaineer 4wd,4.6,2008,8,auto(l6),4,13,19,r,suv 142 | mercury,mountaineer 4wd,5,1999,8,auto(l4),4,13,17,r,suv 143 | nissan,altima,2.4,1999,4,manual(m5),f,21,29,r,compact 144 | nissan,altima,2.4,1999,4,auto(l4),f,19,27,r,compact 145 | nissan,altima,2.5,2008,4,auto(av),f,23,31,r,midsize 146 | nissan,altima,2.5,2008,4,manual(m6),f,23,32,r,midsize 147 | nissan,altima,3.5,2008,6,manual(m6),f,19,27,p,midsize 148 | nissan,altima,3.5,2008,6,auto(av),f,19,26,p,midsize 149 | nissan,maxima,3,1999,6,auto(l4),f,18,26,r,midsize 150 | nissan,maxima,3,1999,6,manual(m5),f,19,25,r,midsize 151 | nissan,maxima,3.5,2008,6,auto(av),f,19,25,p,midsize 152 | nissan,pathfinder 4wd,3.3,1999,6,auto(l4),4,14,17,r,suv 153 | nissan,pathfinder 4wd,3.3,1999,6,manual(m5),4,15,17,r,suv 154 | nissan,pathfinder 4wd,4,2008,6,auto(l5),4,14,20,p,suv 155 | nissan,pathfinder 4wd,5.6,2008,8,auto(s5),4,12,18,p,suv 156 | pontiac,grand prix,3.1,1999,6,auto(l4),f,18,26,r,midsize 157 | pontiac,grand prix,3.8,1999,6,auto(l4),f,16,26,p,midsize 158 | pontiac,grand prix,3.8,1999,6,auto(l4),f,17,27,r,midsize 159 | pontiac,grand prix,3.8,2008,6,auto(l4),f,18,28,r,midsize 160 | pontiac,grand prix,5.3,2008,8,auto(s4),f,16,25,p,midsize 161 | subaru,forester awd,2.5,1999,4,manual(m5),4,18,25,r,suv 162 | subaru,forester awd,2.5,1999,4,auto(l4),4,18,24,r,suv 163 | subaru,forester awd,2.5,2008,4,manual(m5),4,20,27,r,suv 164 | subaru,forester awd,2.5,2008,4,manual(m5),4,19,25,p,suv 165 | subaru,forester awd,2.5,2008,4,auto(l4),4,20,26,r,suv 166 | subaru,forester awd,2.5,2008,4,auto(l4),4,18,23,p,suv 167 | subaru,impreza awd,2.2,1999,4,auto(l4),4,21,26,r,subcompact 168 | subaru,impreza awd,2.2,1999,4,manual(m5),4,19,26,r,subcompact 169 | subaru,impreza 
awd,2.5,1999,4,manual(m5),4,19,26,r,subcompact 170 | subaru,impreza awd,2.5,1999,4,auto(l4),4,19,26,r,subcompact 171 | subaru,impreza awd,2.5,2008,4,auto(s4),4,20,25,p,compact 172 | subaru,impreza awd,2.5,2008,4,auto(s4),4,20,27,r,compact 173 | subaru,impreza awd,2.5,2008,4,manual(m5),4,19,25,p,compact 174 | subaru,impreza awd,2.5,2008,4,manual(m5),4,20,27,r,compact 175 | toyota,4runner 4wd,2.7,1999,4,manual(m5),4,15,20,r,suv 176 | toyota,4runner 4wd,2.7,1999,4,auto(l4),4,16,20,r,suv 177 | toyota,4runner 4wd,3.4,1999,6,auto(l4),4,15,19,r,suv 178 | toyota,4runner 4wd,3.4,1999,6,manual(m5),4,15,17,r,suv 179 | toyota,4runner 4wd,4,2008,6,auto(l5),4,16,20,r,suv 180 | toyota,4runner 4wd,4.7,2008,8,auto(l5),4,14,17,r,suv 181 | toyota,camry,2.2,1999,4,manual(m5),f,21,29,r,midsize 182 | toyota,camry,2.2,1999,4,auto(l4),f,21,27,r,midsize 183 | toyota,camry,2.4,2008,4,manual(m5),f,21,31,r,midsize 184 | toyota,camry,2.4,2008,4,auto(l5),f,21,31,r,midsize 185 | toyota,camry,3,1999,6,auto(l4),f,18,26,r,midsize 186 | toyota,camry,3,1999,6,manual(m5),f,18,26,r,midsize 187 | toyota,camry,3.5,2008,6,auto(s6),f,19,28,r,midsize 188 | toyota,camry solara,2.2,1999,4,auto(l4),f,21,27,r,compact 189 | toyota,camry solara,2.2,1999,4,manual(m5),f,21,29,r,compact 190 | toyota,camry solara,2.4,2008,4,manual(m5),f,21,31,r,compact 191 | toyota,camry solara,2.4,2008,4,auto(s5),f,22,31,r,compact 192 | toyota,camry solara,3,1999,6,auto(l4),f,18,26,r,compact 193 | toyota,camry solara,3,1999,6,manual(m5),f,18,26,r,compact 194 | toyota,camry solara,3.3,2008,6,auto(s5),f,18,27,r,compact 195 | toyota,corolla,1.8,1999,4,auto(l3),f,24,30,r,compact 196 | toyota,corolla,1.8,1999,4,auto(l4),f,24,33,r,compact 197 | toyota,corolla,1.8,1999,4,manual(m5),f,26,35,r,compact 198 | toyota,corolla,1.8,2008,4,manual(m5),f,28,37,r,compact 199 | toyota,corolla,1.8,2008,4,auto(l4),f,26,35,r,compact 200 | toyota,land cruiser wagon 4wd,4.7,1999,8,auto(l4),4,11,15,r,suv 201 | toyota,land cruiser wagon 
4wd,5.7,2008,8,auto(s6),4,13,18,r,suv 202 | toyota,toyota tacoma 4wd,2.7,1999,4,manual(m5),4,15,20,r,pickup 203 | toyota,toyota tacoma 4wd,2.7,1999,4,auto(l4),4,16,20,r,pickup 204 | toyota,toyota tacoma 4wd,2.7,2008,4,manual(m5),4,17,22,r,pickup 205 | toyota,toyota tacoma 4wd,3.4,1999,6,manual(m5),4,15,17,r,pickup 206 | toyota,toyota tacoma 4wd,3.4,1999,6,auto(l4),4,15,19,r,pickup 207 | toyota,toyota tacoma 4wd,4,2008,6,manual(m6),4,15,18,r,pickup 208 | toyota,toyota tacoma 4wd,4,2008,6,auto(l5),4,16,20,r,pickup 209 | volkswagen,gti,2,1999,4,manual(m5),f,21,29,r,compact 210 | volkswagen,gti,2,1999,4,auto(l4),f,19,26,r,compact 211 | volkswagen,gti,2,2008,4,manual(m6),f,21,29,p,compact 212 | volkswagen,gti,2,2008,4,auto(s6),f,22,29,p,compact 213 | volkswagen,gti,2.8,1999,6,manual(m5),f,17,24,r,compact 214 | volkswagen,jetta,1.9,1999,4,manual(m5),f,33,44,d,compact 215 | volkswagen,jetta,2,1999,4,manual(m5),f,21,29,r,compact 216 | volkswagen,jetta,2,1999,4,auto(l4),f,19,26,r,compact 217 | volkswagen,jetta,2,2008,4,auto(s6),f,22,29,p,compact 218 | volkswagen,jetta,2,2008,4,manual(m6),f,21,29,p,compact 219 | volkswagen,jetta,2.5,2008,5,auto(s6),f,21,29,r,compact 220 | volkswagen,jetta,2.5,2008,5,manual(m5),f,21,29,r,compact 221 | volkswagen,jetta,2.8,1999,6,auto(l4),f,16,23,r,compact 222 | volkswagen,jetta,2.8,1999,6,manual(m5),f,17,24,r,compact 223 | volkswagen,new beetle,1.9,1999,4,manual(m5),f,35,44,d,subcompact 224 | volkswagen,new beetle,1.9,1999,4,auto(l4),f,29,41,d,subcompact 225 | volkswagen,new beetle,2,1999,4,manual(m5),f,21,29,r,subcompact 226 | volkswagen,new beetle,2,1999,4,auto(l4),f,19,26,r,subcompact 227 | volkswagen,new beetle,2.5,2008,5,manual(m5),f,20,28,r,subcompact 228 | volkswagen,new beetle,2.5,2008,5,auto(s6),f,20,29,r,subcompact 229 | volkswagen,passat,1.8,1999,4,manual(m5),f,21,29,p,midsize 230 | volkswagen,passat,1.8,1999,4,auto(l5),f,18,29,p,midsize 231 | volkswagen,passat,2,2008,4,auto(s6),f,19,28,p,midsize 232 | 
volkswagen,passat,2,2008,4,manual(m6),f,21,29,p,midsize 233 | volkswagen,passat,2.8,1999,6,auto(l5),f,16,26,p,midsize 234 | volkswagen,passat,2.8,1999,6,manual(m5),f,18,26,p,midsize 235 | volkswagen,passat,3.6,2008,6,auto(s6),f,17,26,p,midsize 236 | -------------------------------------------------------------------------------- /docs/datamancer.org: -------------------------------------------------------------------------------- 1 | * Datamancer overview 2 | 3 | The =datamancer= packages is a dataframe library, which (as a 4 | companion to [[https://github.com/Vindaar/ggplotnim][ggplotnim]]) is also heavily inspired by a specific R 5 | library. In this case [[https://dplyr.tidyverse.org/][dplyr]], a dataframe library of the tidyverse. 6 | 7 | What follows is a basic introduction covering all fundamentals. We 8 | won't cover the full API in this document, but rather aim to give the 9 | understanding so that using the full library becomes easy. 10 | 11 | ** What is a dataframe? 12 | 13 | A dataframe is a data structure that consists of multiple 1 14 | dimensional datasets of equal lengths - but possibly different types - 15 | that have names associated to them. 16 | 17 | From an abstract perspective it is a set of heterogeneous arrays (array not 18 | specifically in terms of CT sized Nim arrays, but as general flat, 19 | contiguous data structures) stored in a hash table. Each entry in the 20 | table is called a column and the keys represent the column names. 21 | 22 | In a sense then they are a data structure similar to what is 23 | represented by a spreadsheet or naturally in a CSV file. 24 | 25 | One useful distinction about the nature of individual columns of a 26 | dataframe is whether the data described in it is continuous 27 | (possibly N different values for a dataframe of length N) or discrete 28 | (small number of N different values compared to a possibly much larger 29 | M number of elements). 30 | 31 | ** What is the point of a dataframe? 
32 | 33 | A huge amount of data about the real world or most physical systems 34 | can be described by individual numbers (scalars) or sets of such. For 35 | one system or topic of interest, it is often natural (or slightly less 36 | natural, but possible) to express the state of the system by a set of 37 | numbers. Let's call that a "record". From this follows that possibly: 38 | - a set of such systems 39 | - the time evolution of such a system 40 | - ... 41 | can be described by a (possibly ordered) list of such records. Such a 42 | list directly represents a dataframe. 43 | 44 | What this implies is that as long as we can write down a number of 45 | operations we can perform on or with a dataframe, we can apply such 46 | operations to a large number of possible systems. 47 | 48 | Therefore, dataframes are a very powerful datastructure. 49 | 50 | The library inspiring datamancer defines a small set of (base) 51 | operations to perform with dataframes. A small number of operations 52 | makes it easy to reason about and combine them to produce complex 53 | operations. 54 | 55 | The five base operations ("the verbs of dplyr") are: 56 | - =mutate=: modify an existing column or add a new one 57 | - =select=: select a subset of columns 58 | - =filter=: filter out a subset of records based on one or more conditions 59 | - =summarize=: reduce one or more columns to a single scalar 60 | - =arrange=: sort the dataframe according to one or more columns 61 | 62 | For discrete columns in a dataframe one more procedure is almost as 63 | basic, namely =group_by=. It allows one to iterate over all subsets of 64 | a dataframe that have the same value 'x' in a column 'A'. 65 | 66 | The five verbs above naturally combine with =groub_by=. This means if 67 | one of these operations is performed on a grouped dataframe, the 68 | operation will be performed of each subgroup instead of the full 69 | dataframe (which may or may not produce a different result). 
70 | 71 | ** Creating a dataframe 72 | 73 | With some understanding of why we might want to bother with 74 | dataframes, we can now ask ourselves how to create one. 75 | 76 | Usage typically starts with one of the following cases: 77 | 1. data already available in =seq[T]/Tensor[T]= or some Nim object from which such 78 | can be created 79 | 2. some CSV / TSV like ascii file or a =string= representing such a thing 80 | 3. some binary file like HDF5 81 | 4. some database 82 | 83 | Note about 3 and 4: simple access (without manually reading into a 84 | =seq[T]/Tensor[T]=) is not supported for these two yet. These can be 85 | added easily (code for HDF5 exists, but not as part of this 86 | repository) if there is demand. 87 | 88 | *** Supported datatypes and internal representation 89 | 90 | The datamancer dataframes currently support the following types: 91 | - =int= 92 | - =float= 93 | - =string= 94 | - =bool= 95 | - =Value= 96 | where =Value= is a variant object that can store either of the above 97 | datatypes. That type is used in case a single column stores multiple 98 | data types. 99 | 100 | At this moment there is no direct support for =DateTime= or =Time= 101 | objects. That could be added if desired. It's a bit of work, but 102 | manageable. It's mainly missing, because so far I personally didn't 103 | really need it. 104 | 105 | Internally, one column is stored in a =Column= object. This object is 106 | a variant object, with different possible =ColumnKinds= (one kind for 107 | each of the native data types). The column stores an [[https://github.com/mratsim/Arraymancer][Arraymancer]] 108 | =Tensor[T]= for the respective type of the column. 109 | 110 | The usage of variant objects allows for a fully dynamic, runtime 111 | mutable design. 
This is a trade-off between safety and convenience, 112 | which is placed more towards convenience for the simple reason that 113 | for often recurring computations of the same kind it is highly 114 | recommended to make use of a custom datatype that allows for 115 | optimizations applicable to the specific domain. 116 | 117 | *** From =seq[T]/Tensor[T]= 118 | 119 | For the case of having the data as =seq[T]=, we just use the 120 | =seqsToDf= template to create a DF from it. The template does not care 121 | whether the input is of type =seq[T]= or =Tensor[T]=. In the future 122 | support for pointer + length pairs can be added as well. 123 | 124 | There are two ways to use =seqsToDf=. Assuming we have three sequences of possibly different types: 125 | #+BEGIN_SRC nim 126 | let s1: seq[int] = @[22, 54, 34] 127 | let s2: seq[float] = @[1.87, 1.75, 1.78] 128 | let s3: seq[string] = @["Mike", "Laura", "Sue"] 129 | #+END_SRC 130 | we can either create a DF and let the library automatically deduce the 131 | column names from the Nim identifiers of the given variables: 132 | #+BEGIN_SRC nim 133 | let dfAutoNamed = seqsToDf(s1, s2, s3) 134 | #+END_SRC 135 | which will give us a DF with column names: 136 | #+BEGIN_SRC nim 137 | "s1", "s2", "s3" 138 | #+END_SRC 139 | In many cases one might rather like a different name. In this case use the following 140 | syntax: 141 | #+BEGIN_SRC nim 142 | let df = seqsToDf({ "Age" : s1, 143 | "Height" : s2, 144 | "Name" : s3 }) 145 | #+END_SRC 146 | which will then use the given strings for the column names. 
147 | 148 | If we print this dataframe we get the following output: 149 | #+begin_src 150 | Dataframe with 3 columns and 3 rows: 151 | Idx Age Height Name 152 | dtype: int float string 153 | 0 22 1.87 Mike 154 | 1 54 1.75 Laura 155 | 2 34 1.78 Sue 156 | #+end_src 157 | 158 | We see that we get information about: 159 | - the number of columns in the dataframe 160 | - the number of rows in the dataframe 161 | - the names of each column 162 | - the data types of each column 163 | - their values with one record per row 164 | - and an additional index column 165 | 166 | *** From a CSV / TSV file 167 | 168 | The second supported case is a CSV like file. For these the library 169 | provides a generalized =readCsv= proc. Strictly speaking it can also 170 | read TSV (or any delimited ASCII file) and provides a number of 171 | different options to make it suitable to a large variety of 172 | differently organized CSV files (what a euphemism): 173 | #+BEGIN_SRC nim 174 | proc readCsv*(fname: string, 175 | sep: char = ',', 176 | header: string = "", 177 | skipLines = 0, 178 | toSkip: set[char] = {}, 179 | colNames: seq[string] = @[], 180 | skipInitialSpace = true, 181 | quote = '"', 182 | ): DataFrame 183 | #+END_SRC 184 | For a regular CSV file (comma separated) with a header line storing 185 | the column names, all we need is a filename. 186 | 187 | In addition to that the separator can be changed with =sep= and the 188 | header can have a designation (e.g. =#= indicating header lines that 189 | will be skipped automatically after the first). 190 | 191 | Further a specific number of lines can be skipped after the 192 | header. =toSkip= allows to skip any set of characters. These will be 193 | completely ignored outside of columns. 194 | 195 | If the file does not have a header =colNames= can be used to give 196 | names to the columns. 197 | 198 | =skipInitialSpace= is useful to remove whitespace that might appear in 199 | addition to a separator (e.g. 
a comma separated file that also has a 200 | space after every comma). If it is false such spaces will be parsed 201 | into the data fields. 202 | 203 | Finally, =quote= allows to ignore all characters that would otherwise 204 | act as separators, line breaks, ... within these. If ="= is used as 205 | quote and spaces to separate, there may be spaces within individual 206 | fields as long as these are within quotes. 207 | 208 | Let's use this procedure to read the supplied =mpg= dataset: 209 | #+BEGIN_SRC nim :results raw 210 | import datamancer 211 | let df = readCsv("../data/mpg.csv") 212 | #+END_SRC 213 | 214 | 215 | 216 | We're now proud owner of dataframe that's a bit more exciting than the 217 | 3 column / 3 row one from before. We'll explore it in the next section. 218 | 219 | ** Manipulating a DF 220 | 221 | Now we have a DF. What then? 222 | 223 | First of all we can look at it. Echoing a DF calls the =pretty= 224 | proc. For the DF introduced above, this looks like: 225 | #+BEGIN_SRC nim 226 | echo df 227 | #+END_SRC 228 | gives for the =mpg= dataset: 229 | #+BEGIN_SRC sh 230 | #+RESULTS: 231 | Dataframe with 11 columns and 234 rows: 232 | Idx manufacturer model displ year cyl ... drv cty hwy fl class 233 | dtype: string string float int int ... string int int string string 234 | 0 audi a4 1.8 1999 4 ... f 18 29 p compact 235 | 1 audi a4 1.8 1999 4 ... f 21 29 p compact 236 | 2 audi a4 2 2008 4 ... f 20 31 p compact 237 | 3 audi a4 2 2008 4 ... f 21 30 p compact 238 | 4 audi a4 2.8 1999 6 ... f 16 26 p compact 239 | 5 audi a4 2.8 1999 6 ... f 18 26 p compact 240 | 6 audi a4 3.1 2008 6 ... f 18 27 p compact 241 | 7 audi a4 quattro 1.8 1999 4 ... "4" 18 26 p compact 242 | 8 audi a4 quattro 1.8 1999 4 ... "4" 16 25 p compact 243 | 9 audi a4 quattro 2 2008 4 ... "4" 20 28 p compact 244 | 10 audi a4 quattro 2 2008 4 ... "4" 19 27 p compact 245 | 11 audi a4 quattro 2.8 1999 6 ... "4" 15 25 p compact 246 | 12 audi a4 quattro 2.8 1999 6 ... 
"4" 17 25 p compact 247 | 13 audi a4 quattro 3.1 2008 6 ... "4" 17 25 p compact 248 | 14 audi a4 quattro 3.1 2008 6 ... "4" 15 25 p compact 249 | 15 audi a6 quattro 2.8 1999 6 ... "4" 15 24 p midsize 250 | 16 audi a6 quattro 3.1 2008 6 ... "4" 17 25 p midsize 251 | 17 audi a6 quattro 4.2 2008 8 ... "4" 16 23 p midsize 252 | 18 chevrolet c1500 suburb... 5.3 2008 8 ... r 14 20 r suv 253 | 19 chevrolet c1500 suburb... 5.3 2008 8 ... r 11 15 e suv 254 | #+END_SRC 255 | (NOTE: I shortened the output for the docs here) 256 | Notice how in the =drv= column the 4WD entries are echoed as "4" 257 | instead of just 4. That is to highlight that those values are actually 258 | stored as strings to not confuse them with numbers. 259 | 260 | By default only the first 20 entries will be shown. For more/less 261 | elements, call =pretty= directly: 262 | #+BEGIN_SRC nim 263 | echo df.pretty(100) 264 | #+END_SRC 265 | 266 | =pretty= also takes a =precision= argument. This is given to the 267 | string conversion for =float= values to set the number of digits 268 | printed after the decimal point. However, it can also be used to 269 | change the width of the columns more generally. Note however the 270 | precision is added to a width of =6= by default. Also the column is at 271 | least as wide as the longest DF key. 272 | 273 | Let's now check which cars in the dataset have the highest and lowest 274 | city fuel economy. For that we can simply arrange the dataframe 275 | according to the =cty= column and take the tail or head of the 276 | result. 277 | #+BEGIN_SRC nim :results raw 278 | echo df.arrange("cty").head(5) 279 | #+END_SRC 280 | results in: 281 | #+BEGIN_SRC sh 282 | Dataframe with 11 columns and 5 rows: 283 | Idx manufacturer model displ ... cyl ... drv cty hwy fl class 284 | dtype: string string float int string int int string string 285 | 0 dodge dakota picku... 4.7 ... 8 ... "4" 9 12 e pickup 286 | 1 dodge durango 4wd 4.7 ... 8 ... 
"4" 9 12 e suv 287 | 2 dodge ram 1500 pic... 4.7 ... 8 ... "4" 9 12 e pickup 288 | 3 dodge ram 1500 pic... 4.7 ... 8 ... "4" 9 12 e pickup 289 | 4 jeep grand cherok... 4.7 ... 8 ... "4" 9 12 e suv 290 | #+END_SRC 291 | and looking at the tail instead: 292 | #+BEGIN_SRC nim 293 | echo df.arrange("cty").tail(5) 294 | #+END_SRC 295 | will tell us that a new beetle is the most efficient car in the dataset: 296 | #+BEGIN_SRC sh 297 | Dataframe with 11 columns and 5 rows: 298 | Idx manufacturer model displ ... cyl ... drv cty hwy fl class 299 | dtype: string string float int string int int string string 300 | 0 honda civic 1.6 ... 4 ... f 28 33 r subcompact 301 | 1 toyota corolla 1.8 ... 4 ... f 28 37 r compact 302 | 2 volkswagen new beetle 1.9 ... 4 ... f 29 41 d subcompact 303 | 3 volkswagen jetta 1.9 ... 4 ... f 33 44 d compact 304 | 4 volkswagen new beetle 1.9 ... 4 ... f 35 44 d subcompact 305 | #+END_SRC 306 | (=arrange= also takes an order argument, using the Nim stdlib's 307 | =SortOrder= enum). 308 | 309 | As another example here to showcase the usage of =FormulaNodes=, let's 310 | find some cars with an engine displacement of more than 5 L and which 311 | are 2 seaters (I wonder what car might show up...): 312 | #+BEGIN_SRC nim 313 | echo df.filter(f{`displ` > 5.0 and `class` == "2seater"}) 314 | #+END_SRC 315 | #+BEGIN_SRC sh 316 | Dataframe with 11 columns and 5 rows: 317 | Idx manufacturer model displ ... cyl ... drv cty hwy fl class 318 | dtype: string string float int string int int string string 319 | 0 chevrolet corvette 5.7 ... 8 ... r 16 26 p 2seater 320 | 1 chevrolet corvette 5.7 ... 8 ... r 15 23 p 2seater 321 | 2 chevrolet corvette 6.2 ... 8 ... r 16 26 p 2seater 322 | 3 chevrolet corvette 6.2 ... 8 ... r 15 25 p 2seater 323 | 4 chevrolet corvette 7 ... 8 ... r 15 24 p 2seater 324 | #+END_SRC 325 | Surprise, surprise we found ourselves a bunch of corvettes! 
326 | 327 | (Note: for an explanation of this mythical =f{}= thing and those 328 | accented quotes, see the =Formula= section below). 329 | 330 | Finally, let's make use of a formula, which takes an assignment. Let's 331 | say we want to convert the city fuel economy of the cars from MPG to 332 | L/100 km as is the standard in Germany. We'll do this with 333 | =mutate=. =mutate= will add an additional column to the dataframe. 334 | (well, if only it was clear whether the =mpg= given are US gallon or 335 | imperial gallon?) 336 | #+BEGIN_SRC nim :results raw 337 | import datamancer 338 | let df = readCsv("../data/mpg.csv") 339 | 340 | let dfl100km = df.filter(f{`displ` > 5.0 and `class` == "2seater"}) 341 | .mutate(f{"cty / L/100km" ~ 235 / `cty`}) 342 | echo dfl100km.pretty(5) 343 | #+END_SRC 344 | shows us: 345 | #+BEGIN_SRC sh 346 | Dataframe with 12 columns and 5 rows: 347 | Idx manufacturer model displ ... trans ... cty ... cty / L/100km 348 | dtype: string string float ... string ... int ... float 349 | 0 chevrolet corvette 5.7 ... manual(m6) ... 16 ... 14.69 350 | 1 chevrolet corvette 5.7 ... auto(l4) ... 15 ... 15.67 351 | 2 chevrolet corvette 6.2 ... manual(m6) ... 16 ... 14.69 352 | 3 chevrolet corvette 6.2 ... auto(s6) ... 15 ... 15.67 353 | 4 chevrolet corvette 7 ... manual(m6) ... 15 ... 15.67 354 | #+END_SRC 355 | where I removed a couple of columns for better visibility again. 356 | 357 | I used the chaining of =filter= and =mutate= above mainly to showcase 358 | that this works reliably. 359 | 360 | When looking at the formula above note that as in ggplot2 the tilde ~ 361 | is used to indicate a dependency or in other words a mapping of 362 | something like Tensor to Tensor. 363 | 364 | ** Formula 365 | 366 | Here will go parts of what's in the README. 
-------------------------------------------------------------------------------- /src/datamancer/value.nim: --------------------------------------------------------------------------------
import tables, strutils, math, fenv, parseutils, strformat, hashes

type
  ValueKind* = enum
    VNull,
    VBool,
    VInt,
    VFloat,
    VString,
    VObject

  Value* = object
    ## Variant type storing one of the native data types supported by the
    ## data frame, or an ordered string -> `Value` mapping (`VObject`),
    ## which represents a single row of a `DataFrame`.
    case kind*: ValueKind
    of VString:
      str*: string
    of VInt:
      num*: int #BiggestInt
    of VFloat:
      fnum*: float
    of VBool:
      bval*: bool
    of VObject:
      fields*: OrderedTable[string, Value] # alternative: `seq[(string, Value)]` pairs?
    of VNull:
      discard

# forward declaration; implemented further down in this module
proc pretty*(v: Value, precision = 4, emphStrNumber = true): string

func toValKind*[T](dtype: typedesc[T]): ValueKind =
  ## Maps a native Nim type to the corresponding `ValueKind`.
  ## NOTE: for any type other than the listed ones no branch matches and
  ## the default value `VNull` is returned.
  when T is float:
    result = VFloat
  elif T is int:
    result = VInt
  elif T is bool:
    result = VBool
  elif T is string:
    result = VString
  elif T is Value:
    result = VObject

iterator items*(row: Value): Value =
  ## Yields all values stored in the `VObject` `row`.
  doAssert row.kind == VObject
  for v in values(row.fields):
    yield v

iterator keys*(row: Value): string =
  ## Yields all keys of the `VObject` `row`.
  doAssert row.kind == VObject
  for k in keys(row.fields):
    yield k

iterator pairs*(row: Value): tuple[key: string, val: Value] =
  ## Iterator for the elements of `row`. `row` has to be a `VObject`
  ## representing a row of a `DataFrame`
  # NOTE: was `assert` and said "JObject" (copy-paste from the json module);
  # use `doAssert` for consistency with the sibling iterators above
  doAssert row.kind == VObject
  for key, val in pairs(row.fields):
    yield (key, val)

proc contains*(v: Value, key: string): bool =
  ## Returns true if the `VObject` `v` has a field named `key`.
  doAssert v.kind == VObject
  result = v.fields.hasKey(key)

proc `[]`*(v: Value, key: string): Value {.inline.} =
  ## Accesses the field `key` of the `VObject` `v`. Raises `KeyError` if absent.
  doAssert v.kind == VObject
  result = v.fields[key]

proc `[]=`*(v: var Value, key: string, val: Value) {.inline.} =
  ## Assigns `val` to the field `key` of the `VObject` `v`.
  doAssert v.kind == VObject
  v.fields[key] = val

proc `%~`*(c: char): Value =
  ## we convert a `char` to a `string`!
  result = Value(kind: VString, str: $c)

proc `%~`*(v: string): Value =
  result = Value(kind: VString, str: v)

proc `%~`*(v: SomeFloat): Value =
  result = Value(kind: VFloat, fnum: v.float)

proc `%~`*(v: SomeInteger): Value =
  result = Value(kind: VInt, num: v.int)

proc `%~`*(v: bool): Value =
  result = Value(kind: VBool, bval: v)

#proc `%~`*(v: Table[string, Value]): Value =
#  result = Value(kind: VObject, fields: v.toOrderedTable)

proc `%~`*(v: OrderedTable[string, Value]): Value =
  result = Value(kind: VObject, fields: v)

proc null*(): Value =
  ## Constructs a `VNull` value.
  Value(kind: VNull)

proc newVObject*(length = 8): Value =
  ## Constructs an empty `VObject` with its table sized for about
  ## `length` fields (rounded up to the next power of two).
  result = Value(kind: VObject)
  result.fields = initOrderedTable[string, Value](nextPowerOfTwo(length))

proc `%~`*[T: not Value](s: openArray[T]): seq[Value] =
  ## converts a `seq[T]` to a `seq[Value]`
  result = newSeq[Value](s.len)
  for i, x in s:
    result[i] = %~ x

template `%~`*(s: openArray[Value]): seq[Value] = @s

proc toObject*(s: seq[(string, Value)]): Value =
  ## converts the given sequence to an object
  ## This is only used to store the result of the `groups` iterator as a
  ## `Value`.
112 | result = Value(kind: VObject) 113 | result.fields = initOrderedTable[string, Value]() 114 | for (key, val) in s: 115 | result.fields[key] = val 116 | 117 | proc toObject*(s: (string, Value)): Value = toObject(@[s]) 118 | 119 | func isNumber*(s: string): bool = 120 | ## returns true, if `s` is a number according to our rules: 121 | ## - starts with {0..9} 122 | ## - ends with {0..9} 123 | ## - may contain a single `.` 124 | ## - may contain a single `e`, `E` 125 | ## - may contain one minus, one plus at beginning and one for exponent 126 | ## - else may only contain {0..9} 127 | ## - `e`, `+`, `-`, `.` may not appear one after another 128 | ## - may contain space before and after the number 129 | ## It is only used to decide whether the stringifaction of `s` 130 | ## will be surrounded by `"`. 131 | var idx = skipWhile(s, toSkip = {' '}) 132 | template next(checkFor: untyped): untyped = 133 | if idx < s.len - 1: 134 | s[idx + 1] in checkFor 135 | else: 136 | false 137 | var 138 | negMinus = false 139 | posPlus = false 140 | expMinus = false 141 | expPlus = false 142 | numBeforeDot = false 143 | numBeforeExp = false 144 | dot = false 145 | expE = false 146 | sinceLastSpace = -1 147 | while idx < s.len: 148 | case s[idx] 149 | of '-': 150 | if next({'-', '+', '.'}): 151 | # another `-+.` after `-` 152 | return false 153 | elif not negMinus: 154 | negMinus = true 155 | elif not expMinus: 156 | expMinus = true 157 | else: 158 | # apparently has 3 minus 159 | return false 160 | of '+': 161 | if next({'+', '-'}): 162 | # another `-+.` after `-` 163 | return false 164 | elif not posPlus: 165 | posPlus = true 166 | elif not expPlus: 167 | posPlus = true 168 | else: 169 | # apparently has 3 plus 170 | return false 171 | of '0' .. 
'9': 172 | if not dot: 173 | numBeforeDot = true 174 | if not expE: 175 | numBeforeExp = true 176 | inc idx 177 | continue 178 | of '.': 179 | if next({'e', 'E'}): 180 | # `e` after `.` 181 | return false 182 | elif not dot and numBeforeDot: 183 | dot = true 184 | else: 185 | # multiple dots or number before `dot` 186 | return false 187 | of 'e', 'E': 188 | if not next({'0'..'9', '-', '+'}): 189 | # apparently ends with an 'e', 'E' 190 | return false 191 | if not expE and numBeforeExp: 192 | expE = true 193 | else: 194 | # multiple `e` or no number before `e` 195 | return false 196 | of ' ': 197 | if sinceLastSpace == -1 or sinceLastSpace == 1: 198 | # when we encounter a space, set our `spaceCounter` to 0 to start 199 | # increasing it every itereation in main loop 200 | sinceLastSpace = 0 201 | elif sinceLastSpace > 1: 202 | # apparently something between last space and this space 203 | return false 204 | else: return false # something not part of a number 205 | inc idx 206 | if sinceLastSpace >= 0: 207 | # last iter found a space, so count spaces 208 | inc sinceLastSpace 209 | return true 210 | 211 | func isNumber*(v: Value): bool = 212 | doAssert v.kind == VString 213 | result = v.str.isNumber 214 | 215 | func isInt*(s: string): bool = 216 | ## simple "most likely int" check. If the string only contains digits and 217 | ## `_` we consider it an Int 218 | s.allCharsInSet({'0' .. '9', '_'}) 219 | 220 | func isBool*(s: string): bool = 221 | s == "true" or s == "false" 222 | 223 | func isInt*(v: Value): bool = 224 | ## checks whether the string contained in `Value` is likely an integer 225 | ## For an `isFloat` equivalent see `isNumber`. 
226 | doAssert v.kind == VString 227 | result = v.str.isInt 228 | 229 | proc toFloat*(v: Value, allowNull: static bool = false): float = 230 | when not allowNull: 231 | doAssert v.kind in {VInt, VFloat} 232 | else: 233 | doAssert v.kind in {VInt, VFloat, VNull} 234 | case v.kind 235 | of VInt: result = v.num.float 236 | of VFloat: result = v.fnum 237 | of VNull: 238 | # This branch is forbidden for `allowNull = false` due to `doAssert` at top! 239 | result = 0.0 240 | else: discard 241 | 242 | proc toInt*(v: Value): int = #BiggestInt = 243 | ## Converts a numeric value to an int. If the value is a float 244 | ## we round and convert to int 245 | doAssert v.kind in {VInt, VFloat} 246 | case v.kind 247 | of VInt: result = v.num 248 | of VFloat: result = v.fnum.round.int 249 | else: discard 250 | 251 | proc toBool*(v: Value): bool = 252 | ## Checks if the value is a bool and returns its value 253 | doAssert v.kind == VBool 254 | result = v.bval 255 | 256 | proc toStr*(v: Value): string = 257 | ## Returns the value `v` as a string. If the value is of kind `VString`, 258 | ## no conversion is required. 259 | ## This however will fail, if the input is of type 260 | ## - VNull 261 | ## - VObject 262 | ## if you want string representations of those value types, use `$` 263 | case v.kind 264 | of VInt, VFloat, VBool: result = pretty(v) 265 | of VString: result = v.str 266 | else: 267 | raise newException(ValueError, "Will not convert a Value of kind " & 268 | $v.kind & " to string! Use `$` for that!") 269 | 270 | proc to*[T: int | float | string | bool](v: Value, dtype: typedesc[T]): T = 271 | when T is int: 272 | result = v.toInt 273 | elif T is float: 274 | result = v.toFloat 275 | elif T is string: 276 | result = v.toStr 277 | elif T is bool: 278 | result = v.toBool 279 | else: 280 | doAssert false, "Impossible branch!" 

template withNative*(v: Value,
                     valName: untyped,
                     body: untyped): untyped =
  ## Injects the native value stored in `v` under the name `valName` and
  ## runs `body` with it in scope. For `VNull` the `Value` itself is injected.
  case v.kind
  of VInt:
    let `valName` {.inject.} = v.num
    body
  of VFloat:
    let `valName` {.inject.} = v.fnum
    body
  of VString:
    let `valName` {.inject.} = v.str
    body
  of VBool:
    let `valName` {.inject.} = v.bval
    body
  of VNull:
    # a null value is just a null value
    let `valName` {.inject.} = v
    body
  of VObject:
    doAssert false, "not implemented / makes no sense for current usage"

template withNativeConversion*(kind: ValueKind,
                               procName: untyped,
                               body: untyped): untyped =
  ## generates an environment, in which the correct `to*` proc
  ## is named `procName` for `kind`; also injects the matching native
  ## type as `dtype`.
  case kind
  of VInt:
    template `procName`(v: Value): untyped = v.toInt
    type dtype {.inject.} = int
    body
  of VFloat:
    template `procName`(v: Value): untyped = v.toFloat
    type dtype {.inject.} = float
    body
  of VString:
    template `procName`(v: Value): untyped = v.toStr
    type dtype {.inject.} = string
    body
  of VBool:
    template `procName`(v: Value): untyped = v.toBool
    type dtype {.inject.} = bool
    body
  of VObject, VNull:
    doAssert false, "not implemented / makes no sense for current usage"


func isNull*(v: Value): Value =
  ## returns whether `v` is a `VNull` value as a `VBool`
  result = %~ (v.kind == VNull)

func almostEqual*(a, b: float, epsilon = 1e-8): bool =
  ## Relative-error float comparison, robust near zero and for infinities.
  # taken from
  # https://floating-point-gui.de/errors/comparison/
  let
    absA = abs(a)
    absB = abs(b)
    diff = abs(a - b)
  if a == b: # shortcut, handles infinities
    result = true
  elif a == 0 or b == 0 or (absA + absB) < minimumPositiveValue(float64):
    # a or b is zero or both are extremely close to it
    # relative error is less meaningful here
    result = diff < (epsilon * minimumPositiveValue(float64))
  else:
    # use relative error
    result = diff / min(absA + absB, maximumPositiveValue(float64)) < epsilon

proc `==`*(v, w: Value): bool =
  ## checks whether the values are equal.
  ## Note: if both values are numbers of different kind (`VInt` and `VFloat`) the
  ## values are both compared as floats!
  ## The float comparison happens with a floating point comparison with relatively
  ## large epsilon (`1e-8`).
  if v.kind != w.kind and
     v.kind in {VInt, VFloat} and
     w.kind in {VInt, VFloat}:
    result = almostEqual(v.toFloat, w.toFloat)
  elif v.kind != w.kind:
    result = false
  else:
    case v.kind
    of VString:
      result = v.str == w.str
    of VInt:
      result = v.num == w.num
    of VFloat:
      result = almostEqual(v.fnum, w.fnum)
    of VBool:
      result = v.bval == w.bval
    of VObject:
      # NOTE: taken from json module
      # we cannot use OrderedTable's equality here as
      # the order does not matter for equality here.
      if v.fields.len != w.fields.len: return false
      for key, val in v.fields:
        if not w.fields.hasKey(key): return false
        if w.fields[key] != val: return false
      result = true
    of VNull:
      result = true

proc `<`*(v, w: Value): bool =
  ## checks whether the `v` is smaller than `w`
  ## Note: this is only defined for a subset of the possible types!
  ## Note2: if both are numbers of different kind (`VInt` and `VFloat`) the
  ## values are compared as a float! For very large values this would be problematic,
  ## but here we are lenient and assume the user uses `Value` for small calculations!
392 | if v.kind != w.kind and 393 | v.kind in {VFloat, VInt} and 394 | w.kind in {VFloat, VInt}: 395 | result = v.toFloat < w.toFloat 396 | elif v.kind != w.kind and 397 | v.kind in {VFloat, VInt, VString} and 398 | w.kind in {VFloat, VInt, VString}: 399 | # compare as strings 400 | result = $v < $w 401 | elif v.kind == w.kind: 402 | case v.kind 403 | of VString: 404 | result = v.str < w.str 405 | of VInt: 406 | result = v.num < w.num 407 | of VFloat: 408 | result = v.fnum < w.fnum 409 | of VBool: 410 | result = v.bval < v.bval 411 | of VObject: 412 | # checks if objects have the same field, and if so whether the 413 | # fields of `v` are smaller than those of `w` 414 | result = true 415 | for k in keys(v): 416 | if k notin w: 417 | return false 418 | if v[k] < w[k]: 419 | return true 420 | elif v[k] > w[k]: 421 | return false 422 | # else v[k] is equal to w[k], continue 423 | else: 424 | raise newException(Exception, "Comparison `<` does not make sense for " & 425 | "Value kind " & $v.kind & "!") 426 | 427 | proc `<=`*(v, w: Value): bool = 428 | ## checks whether `v` is smaller or equal than `w` 429 | if v == w: 430 | result = true 431 | elif v < w: 432 | result = true 433 | 434 | proc smallerOrFalse*(v: Value, f: float): bool {.inline.} = 435 | ## extension of `<` for `Value` to return `false` if `v` is 436 | ## not a valid VInt/VFloat. 437 | case v.kind 438 | of VInt, VFloat: result = v.toFloat < f 439 | else: result = false 440 | 441 | proc largerOrFalse*(v: Value, f: float): bool {.inline.} = 442 | ## extension of `<` for `Value` to return `false` if `v` is 443 | ## not a valid VInt/VFloat. 444 | case v.kind 445 | of VInt, VFloat: result = v.toFloat > f 446 | else: result = false 447 | 448 | template makeMath(op: untyped): untyped = 449 | proc `op`*(v, w: Value): Value = 450 | ## Adds two Values together, if they are addeable. 451 | ## These operations only work for `VInt` and `VFloat`. `VInt` is converted 452 | ## to floats for the calculation. 
The result is always a `VFloat`! 453 | if v.kind in {VFloat, VInt} and 454 | w.kind in {VFloat, VInt}: 455 | result = Value(kind: VFloat, fnum: `op`(v.toFloat, w.toFloat)) 456 | elif v.kind == VNull or w.kind == VNull: 457 | result = Value(kind: VNull) 458 | else: 459 | raise newException(Exception, "Math operation does not make sense for " & 460 | "Value kind " & $v.kind & "!") 461 | 462 | makeMath(`+`) 463 | makeMath(`-`) 464 | makeMath(`*`) 465 | makeMath(`/`) 466 | 467 | proc formatFloatValue(v: Value, precision: int): string = 468 | ## Performs the formatting of a value of kind `VFloat` to string. 469 | ## If the values are smaller < 1e-5 or > 1e5 scientific notation is 470 | ## used. 471 | doAssert v.kind == VFloat 472 | let f = v.fnum 473 | if almostEqual(abs(f), 0.0): 474 | # to make sure zero is not formatted in scientific 475 | result = f.formatBiggestFloat(format = ffDefault, 476 | precision = precision) 477 | elif abs(f) >= 1e5 or abs(f) <= 1e-5: 478 | result = f.formatBiggestFloat(format = ffScientific, 479 | precision = precision) 480 | else: 481 | result = f.formatBiggestFloat(format = ffDefault, 482 | precision = precision) 483 | result.trimZeros() 484 | 485 | proc pretty*(v: Value, precision = 4, emphStrNumber = true): string = 486 | ## converts the given value to its value as a string. For `VFloat` the 487 | ## precision can be given. 488 | ## If `emphStrNumber` is true, a number stored as a string will be emphasized 489 | ## by enclosing it with explicit `"`. This is mainly for printing DFs to show 490 | ## the user if a number is a number or a string. 
# (body of `pretty` — signature and docstring directly above)
  case v.kind
  of VInt:
    result = $v.num
  of VFloat:
    result = formatFloatValue(v, precision = precision)
  of VBool:
    result = $v.bval
  of VString:
    let vstr = v.str
    # a string that *looks* like a number (or is empty) is quoted when
    # `emphStrNumber` is set, so it cannot be mistaken for a numeric column
    if emphStrNumber and (vstr.len == 0 or vstr.isNumber):
      result = "\"" & vstr & "\""
    else:
      result = vstr
  of VObject:
    result.add "{"
    var idx = 0
    for k, x in v.fields:
      # FIX: forward `precision` and `emphStrNumber` to nested values.
      # Previously nested fields were always rendered with the defaults,
      # ignoring the caller's arguments.
      if idx == v.fields.len - 1:
        result.add (&"{k}: {pretty(x, precision, emphStrNumber)}")
      else:
        result.add (&"{k}: {pretty(x, precision, emphStrNumber)}, ")
      inc idx
    result.add "}"
  of VNull:
    result = "null"

template `$`*(v: Value): string = pretty(v)

proc hash*(x: Value): Hash =
  ## Computes a hash of `x` based on its kind and payload. `VObject` folds
  ## all keys and values into the hash; `VNull` contributes a fixed `0`.
  case x.kind
  of VInt:
    result = hash(x.num)
  of VFloat:
    result = hash(x.fnum)
  of VString:
    result = hash(x.str)
  of VBool:
    result = hash(x.bval)
  of VObject:
    for k, v in x.fields:
      result = result !& hash(k)
      result = result !& hash(v)
  of VNull:
    result = 0
  result = !$result

proc contains*(v: Value, has: Value): bool =
  ## checks whether `has` is a subset of `v` if both are `VObject`.
  ## A subset means that all keys of `has` are in `v` and their values match.
540 | ## There may be more fields in `v` than in `has` 541 | doAssert v.kind == VObject 542 | doAssert has.kind == VObject 543 | result = true 544 | for key, val in has: 545 | if key in v: result = result and val == v[key] 546 | else: result = false 547 | -------------------------------------------------------------------------------- /data/03-sample_hugo.csv: -------------------------------------------------------------------------------- 1 | ,BU,CT,IE,Cs134,Cs137,Eu154,U235,Pu239 2 | 0,23,3102.5,1.5,1.88424083219819E-07,2.84566917879002E-05,2.69679813433217E-07,8.22522171634031E-05,0.000121476067501 3 | 1,30,7482.5,1.5,5.20168249685682E-09,2.79799245808655E-05,1.52985963215638E-07,5.42325572566305E-05,0.00012902660029 4 | 2,37,4197.5,1.5,1.45199693687987E-07,4.21499771616296E-05,4.17562528731865E-07,3.59841349105039E-05,0.000134992514145 5 | 3,52.5,6752.5,1.5,2.16965292708179E-08,4.9979226189458E-05,3.50484414413579E-07,1.50402501420245E-05,0.000145445609543 6 | 4,65.5,3102.5,1.5,8.08478210919765E-07,7.74388092788155E-05,9.6942317979626E-07,7.48810453458104E-06,0.000152838359631 7 | 5,30,91.25,1.6,4.63841694081471E-06,4.46352992585013E-05,7.77532100448014E-07,6.15403234287554E-05,0.000129928345245 8 | 6,34.5,8395,1.6,2.71624118536392E-09,3.01868876171812E-05,1.5011569350238E-07,4.75259853623482E-05,0.000133654186214 9 | 7,57,1368.75,1.6,3.42660361651213E-06,7.59309237451187E-05,1.25129574849322E-06,1.35365167615274E-05,0.000148671041607 10 | 8,57,5840,1.6,5.59338990804771E-08,5.7224184909055E-05,4.66071542695002E-07,1.35888359894115E-05,0.000148643804069 11 | 9,27.5,365,1.7,3.08790706111218E-06,4.02748433202162E-05,6.37972051142385E-07,8.0098420206549E-05,0.000128426787842 12 | 10,35,7847.5,1.7,4.55574059261689E-09,3.16746996946991E-05,1.7181072908398E-07,5.23717817097577E-05,0.000134991510437 13 | 11,46.5,8942.5,1.7,2.48735805627675E-09,3.87972156630181E-05,1.90403175772635E-07,2.75877308521199E-05,0.000142981084191 14 | 
12,59.5,3011.25,1.7,7.96251684592367E-07,7.12445831121348E-05,9.07121405568088E-07,1.3636066711407E-05,0.000150575807944 15 | 13,69,0,1.7,1.49265327403358E-05,9.88690244415683E-05,2.01340259501913E-06,8.28861591879921E-06,0.000153276312875 16 | 14,22,0,1.8,2.90373875406948E-06,3.30913019608103E-05,4.77490315744394E-07,0.000121758621764,0.000120539544125 17 | 15,25.5,1277.5,1.8,1.16012736983286E-06,3.52881874632362E-05,4.59893268685747E-07,0.000100175469562,0.000127054363219 18 | 16,26.5,912.5,1.8,1.7327977866186E-06,3.74997135846502E-05,5.29825016639736E-07,9.47339573003523E-05,0.000128198386755 19 | 17,33,4745,1.8,7.13603005947803E-08,3.63836420446526E-05,3.12466586610326E-07,6.59608160877924E-05,0.000134213270087 20 | 18,41.5,10037.5,1.8,7.62128808328029E-10,3.24376701791896E-05,1.30322533184758E-07,4.12345752265645E-05,0.000140363154609 21 | 19,47,4015,1.8,2.34107824534528E-07,5.35110960085349E-05,5.70492650681161E-07,3.04507778332328E-05,0.000143916276119 22 | 20,52,2828.75,1.8,7.83136922917532E-07,6.34056189350734E-05,8.22049647800648E-07,2.32157392917567E-05,0.000146817626605 23 | 21,62.5,8212.5,1.8,6.87804608714419E-09,5.3582261986737E-05,2.99610775104532E-07,1.33723153739579E-05,0.000152502738238 24 | 22,30,1916.25,1.9,8.36280351381586E-07,3.97094102572973E-05,5.07645566666362E-07,8.68630440901851E-05,0.000132844577617 25 | 23,38,1277.5,1.9,2.15337518561199E-06,5.19298470601829E-05,8.09901114158842E-07,5.61107891156539E-05,0.000139108882434 26 | 24,53.5,8942.5,1.9,2.92233194107228E-09,4.42348036618602E-05,2.19836844358774E-07,2.43914997367572E-05,0.000148412808665 27 | 25,60.5,1186.25,1.9,4.22002555451605E-06,8.10200157713558E-05,1.37100182269368E-06,1.68026687660663E-05,0.000152071751921 28 | 26,33.5,10950,2,2.37102513185971E-10,2.49113875906117E-05,8.0074474374429E-08,7.99719871297953E-05,0.000136470519707 29 | 27,42,1277.5,2,2.4254257067081E-06,5.7064627799565E-05,9.10682513286478E-07,5.06086453634115E-05,0.000142603154497 30 | 
28,43.5,1368.75,2,2.35387900927969E-06,5.86867070024351E-05,9.32409364204074E-07,4.67023160056911E-05,0.0001435108385 31 | 29,54,6387.5,2,3.09576495075891E-08,5.24380012777716E-05,3.90032393618273E-07,2.67856778131325E-05,0.000149329758953 32 | 30,55,2646.25,2,9.93495740299208E-07,6.76086861284335E-05,9.08677692101019E-07,2.53699250898907E-05,0.000149887448471 33 | 31,56,1277.5,2,3.59022668275634E-06,7.49948614715264E-05,1.25146635182496E-06,2.40576105262813E-05,0.000150428448657 34 | 32,63,4562.5,2,1.98674557633134E-07,6.79636755148268E-05,6.7676922005073E-07,1.67574308214044E-05,0.000153936298314 35 | 33,67,1095,2,5.22728817639055E-06,8.96687736058673E-05,1.5426931459686E-06,1.36173719692919E-05,0.000155893642228 36 | 34,21.5,2828.75,2.1,1.97723968104349E-07,2.70118094679826E-05,2.36070042227702E-07,0.000165212293199,0.000124203940056 37 | 35,49,638.75,2.1,5.46270267176982E-06,6.88667800104983E-05,1.25401431277852E-06,3.90510848236718E-05,0.000147326554534 38 | 36,63,10585,2.1,7.7649021566376E-10,4.64176135745528E-05,1.79067180800253E-07,1.89690560657233E-05,0.000154413095565 39 | 37,67,3650,2.1,4.96884096553506E-07,7.62635593290236E-05,8.78491802953668E-07,1.54066192028162E-05,0.000156327077237 40 | 38,20,9125,2.2,5.32430172169919E-10,1.69145080529553E-05,5.11924529435906E-08,0.000193533922559,0.000122354486384 41 | 39,21.5,2737.5,2.2,2.11540310587198E-07,2.71566327246085E-05,2.3764135565168E-07,0.000179539138945,0.000125019319375 42 | 40,43,5840,2.2,3.72326055268189E-08,4.37040245355853E-05,3.41242111108583E-07,5.97032949624967E-05,0.000144986683356 43 | 41,70,2098.75,2.2,2.18141611682241E-06,8.7616458296196E-05,1.28793217955485E-06,1.49238711868889E-05,0.000158437242276 44 | 42,23.5,4562.5,2.3,4.57767290248913E-08,2.6398125325377E-05,1.83820327773495E-07,0.000176265679368,0.000128897548561 45 | 43,29.5,5110,2.3,4.10918341716166E-08,3.18617907626002E-05,2.36760756475201E-07,0.000131024932382,0.000136109650376 46 | 
44,29.5,5475,2.3,2.93672016721176E-08,3.11345434891859E-05,2.18422247414788E-07,0.000131028842674,0.000136106117348 47 | 45,44,182.5,2.3,6.98711992995184E-06,6.38838727806421E-05,1.2201243792048E-06,6.27576594604642E-05,0.000146561015337 48 | 46,49,10767.5,2.3,4.82357609660259E-10,3.62589235826303E-05,1.33816824290292E-07,4.86787975916935E-05,0.000149177297326 49 | 47,25.5,1916.25,2.4,5.96883185594255E-07,3.38012430953463E-05,3.75683842906509E-07,0.000173083460019,0.000132596749798 50 | 48,33,6022.5,2.4,2.0674265174375E-08,3.34729559038151E-05,2.27545834357507E-07,0.000120165750448,0.000140296505555 51 | 49,42,7665,2.4,6.56987453770483E-09,3.80366689459262E-05,2.19952111029734E-07,7.67501009789035E-05,0.000146380458078 52 | 50,43,4197.5,2.4,1.65933464504733E-07,4.84526606907299E-05,4.87587934085048E-07,7.29430193370778E-05,0.000146982123714 53 | 51,49.5,9855,2.4,1.12604669687437E-09,3.87745412763824E-05,1.65443253781577E-07,5.26055972965375E-05,0.000150442032886 54 | 52,62,4562.5,2.4,1.91740969715746E-07,6.68613788194208E-05,6.67874776913479E-07,2.79772465073893E-05,0.000156186304692 55 | 53,21.5,8212.5,2.5,1.31783309179537E-09,1.91848747122469E-05,6.84622393270823E-08,0.000225119126276,0.000127151621839 56 | 54,35.5,3285,2.5,2.87216546287636E-07,4.27140929279224E-05,4.61837743435703E-07,0.000115921927652,0.000143428237943 57 | 55,39,1825,2.5,1.28334477030796E-06,5.13140131423784E-05,7.26285325954644E-07,9.76820554493864E-05,0.000145817577478 58 | 56,52,8212.5,2.5,5.32038798564765E-09,4.49918266769322E-05,2.49938625879728E-07,5.13167662681124E-05,0.000152613774364 59 | 57,62,273.75,2.5,9.90200500807536E-06,8.7673305735931E-05,1.72268710251402E-06,3.10924614929168E-05,0.000156953339519 60 | 58,53.5,182.5,2.6,8.94277818391889E-06,7.68016411031811E-05,1.52059576421386E-06,5.24400570978479E-05,0.000154187483512 61 | 59,59,6022.5,2.6,4.76355617012128E-08,5.82461515549754E-05,4.63678286227929E-07,4.00358183001655E-05,0.000156506990044 62 | 
60,70,1368.75,2.6,4.23219712970039E-06,9.16517448464663E-05,1.51861511803921E-06,2.32840071608402E-05,0.000161088014568 63 | 61,32.5,8577.5,2.7,1.8569238760323E-09,2.80264166849705E-05,1.24107891534068E-07,0.000157211169067,0.000143133452792 64 | 62,40.5,1368.75,2.7,1.98111263854068E-06,5.46337539847379E-05,8.33352339779185E-07,0.000108066806193,0.000148922689771 65 | 63,32.5,6387.5,2.8,1.37931166221163E-08,3.21800488637227E-05,2.00207231077099E-07,0.000169259124846,0.000144131205246 66 | 64,67.5,912.5,2.8,6.10542277137965E-06,9.11285117423774E-05,1.63206094620404E-06,3.22085217825989E-05,0.000161495430874 67 | 65,33.5,2463.75,2.9,5.31989361174768E-07,4.24672749908237E-05,4.96893733293116E-07,0.000174015840376,0.00014625125964 68 | 66,48,3650,2.9,3.13676834351377E-07,5.56552517674718E-05,6.26694760020223E-07,8.98301044523825E-05,0.000155042502487 69 | 67,61.5,5110,2.9,1.12173503162396E-07,6.3996485149028E-05,5.89601513091981E-07,4.73947200701079E-05,0.000160171235532 70 | 68,62,9672.5,2.9,1.7033436722549E-09,4.83206444166339E-05,2.17103004825399E-07,4.63343759456962E-05,0.000160307978863 71 | 69,65,6752.5,3,2.66634058662711E-08,6.07534396881419E-05,4.34772880691482E-07,4.40724741498069E-05,0.000162462226848 72 | 70,45,1368.75,3.1,2.27672954486003E-06,6.03971774209416E-05,9.55322919229101E-07,0.000120872224047,0.000156080408219 73 | 71,47.5,6205,3.1,2.89554639346418E-08,4.68477355614104E-05,3.51382346017484E-07,0.000108092876596,0.000157207820402 74 | 72,54.5,3467.5,3.1,4.33379363422111E-07,6.34028140839783E-05,7.52915453031601E-07,7.85086498851637E-05,0.000159957606697 75 | 73,55,3650,3.1,3.7145463759155E-07,6.32221363090696E-05,7.30642229565115E-07,7.67231033448639E-05,0.000160150257777 76 | 74,36.5,5657.5,3.2,3.16725917231739E-08,3.76782164184727E-05,2.75972036706425E-07,0.000187847154834,0.000152121558456 77 | 75,54.5,1733.75,3.2,2.12475064095556E-06,7.07308883473186E-05,1.10371475424681E-06,8.52021748692741E-05,0.000161078859401 78 | 
76,64.5,2920,3.2,8.89478177759677E-07,7.68182522700119E-05,1.00798141630531E-06,5.38670635702221E-05,0.000164093664495 79 | 77,67.5,10950,3.2,5.84688353024502E-10,4.82405658810163E-05,1.78827477733339E-07,4.6982033924468E-05,0.000165045623022 80 | 78,22.5,9125,3.3,5.63250461108869E-10,1.8892410969189E-05,5.58744084045619E-08,0.000348235378678,0.000134706251734 81 | 79,35,1368.75,3.3,1.51256445753264E-06,4.74357516460313E-05,6.64911951285053E-07,0.000213148728791,0.000152099200556 82 | 80,47,3650,3.3,2.94175587791291E-07,5.44772278223077E-05,6.0888407965754E-07,0.000128512366211,0.000159461318315 83 | 81,49,8030,3.3,5.57715018432749E-09,4.29751537589923E-05,2.4366003244946E-07,0.000117827647799,0.000160300825473 84 | 82,57,182.5,3.3,9.41744967395323E-06,8.1398376793143E-05,1.63717932414911E-06,8.24897647148494E-05,0.000163098483309 85 | 83,20.5,9307.5,3.4,3.95571617312328E-10,1.70361854640802E-05,4.43140201939785E-08,0.000393902320102,0.000130537567242 86 | 84,23.5,4197.5,3.4,5.62552691641991E-08,2.69225173325624E-05,1.78242019946569E-07,0.000352777581404,0.000137453533567 87 | 85,28,8395,3.4,1.61759100829611E-09,2.45098288816846E-05,9.70591950446074E-08,0.000297917752529,0.000145274816682 88 | 86,38,1277.5,3.4,1.87457933162703E-06,5.16554354127345E-05,7.66609364901576E-07,0.000200817950708,0.000155780336611 89 | 87,39.5,2646.25,3.4,5.67248315749234E-07,4.917750018283E-05,5.99543570378547E-07,0.000188912063257,0.000156780418361 90 | 88,41.5,7117.5,3.4,9.79259948912521E-09,3.87988607540754E-05,2.38298430114336E-07,0.000174004246317,0.000157888024816 91 | 89,34,0,3.5,4.97844398442882E-06,5.02624187265034E-05,8.50801363868222E-07,0.000249870673361,0.000151693492283 92 | 90,47,3285,3.5,4.05633982223382E-07,5.57183792712714E-05,6.58172178865455E-07,0.00014779893824,0.000161878976038 93 | 91,50.5,8395,3.5,4.01240853299933E-09,4.31160661267432E-05,2.31444632448452E-07,0.000127550767974,0.000163176115337 94 | 
92,56.5,1003.75,3.5,4.31045142038704E-06,7.65919611745065E-05,1.35301311788902E-06,9.83883569661425E-05,0.000165248472908 95 | 93,39.5,1095,3.6,2.32263545422418E-06,5.42183429047053E-05,8.4025588691569E-07,0.000213675207593,0.000159072706347 96 | 94,50.5,182.5,3.6,7.62722444735789E-06,7.24679791647171E-05,1.41939150414643E-06,0.000136613451681,0.000164603292982 97 | 95,53,10585,3.6,5.73686720558575E-10,3.93012009645533E-05,1.51527682317434E-07,0.000123098264348,0.000165344713927 98 | 96,36,4927.5,3.7,5.78485108683733E-08,3.88822870909328E-05,3.11350097635092E-07,0.00025909489375,0.000157554310919 99 | 97,42.5,4197.5,3.7,1.45792973357398E-07,4.77166617290667E-05,4.67439001289781E-07,0.000201756899975,0.000162284481186 100 | 98,46.5,1368.75,3.7,2.28890163259763E-06,6.22244327944638E-05,9.89521520863111E-07,0.000172070885289,0.000164385514731 101 | 99,52,2737.5,3.7,7.56229496788469E-07,6.33860318926623E-05,8.38039721613337E-07,0.000137481552692,0.000166473100679 102 | 100,58,3467.5,3.7,4.58586252211486E-07,6.71531173161345E-05,8.10994334174174E-07,0.000106890994825,0.000168290296925 103 | 101,30,9490,3.8,6.40331965546492E-10,2.44387625081422E-05,8.36724601961178E-08,0.000339652048512,0.000151621342621 104 | 102,33,1551.25,3.8,1.10164064910971E-06,4.42282414374551E-05,5.66746050974128E-07,0.000305010281416,0.000155508277133 105 | 103,45.5,3285,3.8,3.75973074242247E-07,5.39685726426877E-05,6.28366018920651E-07,0.00019040988815,0.00016522953801 106 | 104,52,4745,3.8,1.18455229417992E-07,5.58122417678851E-05,5.37496144921828E-07,0.000146866530874,0.000167746310583 107 | 105,68,9490,3.8,2.20174366907928E-09,5.31935091462065E-05,2.51802812756313E-07,7.49729835500201E-05,0.000171720756448 108 | 106,27.5,10585,3.9,1.98703993458076E-10,2.09420936906045E-05,5.58738660582861E-08,0.000388158328344,0.00014848314924 109 | 107,47,10037.5,3.9,7.86127359841421E-10,3.63108965047742E-05,1.47481313410847E-07,0.000190803887459,0.000167082015773 110 | 
108,60.5,6935,3.9,1.92850765041233E-08,5.59891085472045E-05,3.93051616997789E-07,0.000110729974217,0.000171258290747 111 | 109,64,5475,3.9,8.0642267065254E-08,6.47582519470785E-05,5.76690804674461E-07,9.56176459225205E-05,0.000172070995873 112 | 110,66.5,1825,3.9,2.45602897465434E-06,8.45741500868411E-05,1.34315926830512E-06,8.59741530735528E-05,0.000172679334672 113 | 111,48,2007.5,4,1.30754374086722E-06,6.15605407925548E-05,8.92687865768183E-07,0.000194569651108,0.000168825823392 114 | 112,50.5,2190,4,1.16779927832354E-06,6.37605417151054E-05,9.10400036152838E-07,0.000176649595062,0.000169814707897 115 | 113,53.5,730,4,4.91935939958526E-06,7.38922152696555E-05,1.35287990509034E-06,0.000156986745215,0.000170882804304 116 | 114,20,2190,4.1,2.53193427262266E-07,2.60961685726902E-05,1.90525524401579E-07,0.000537311016215,0.000132059850636 117 | 115,40,3285,4.1,3.02537861233079E-07,4.77229922786362E-05,5.20040145250274E-07,0.000276400956748,0.000165404769447 118 | 116,47.5,1460,4.1,2.11167305946489E-06,6.30790286226831E-05,9.92002455140442E-07,0.000209989289025,0.000169836087151 119 | 117,57,1825,4.1,1.97091994001427E-06,7.32136924948565E-05,1.1474202041492E-06,0.00014546390347,0.000173034296308 120 | 118,48,7665,4.2,7.0530181779051E-09,4.30198736212015E-05,2.55341740177435E-07,0.000217923769424,0.00017131597745 121 | 119,61,0,4.2,1.13445611606013E-05,8.74382067715301E-05,1.84406246342814E-06,0.000132112212817,0.000173367045266 122 | 120,61,3193.75,4.2,6.00100917165137E-07,7.144289602227E-05,9.10779426409667E-07,0.00013215626642,0.00017523138502 123 | 121,23,1095,4.3,8.6159081699018E-07,3.20172921065982E-05,3.13197171041991E-07,0.000527837263678,0.000141301108312 124 | 122,30,9855,4.3,4.37811170667107E-10,2.38557482404698E-05,7.47403426493968E-08,0.000424794490921,0.000155809192702 125 | 123,31.5,4927.5,4.3,4.33800062313869E-08,3.41036463797568E-05,2.40672274830482E-07,0.000404805567203,0.000158130368674 126 | 
124,48.5,2646.25,4.3,7.21668021195263E-07,5.96706126346351E-05,7.84676815662326E-07,0.000225984474308,0.000173019350982 127 | 125,49,2098.75,4.3,1.21452492505462E-06,6.23826596397898E-05,8.97821608234253E-07,0.000221900304447,0.000173245095635 128 | 126,26.5,6022.5,4.4,1.18656722746171E-08,2.69288249267324E-05,1.37586388271734E-07,0.000492664044451,0.000149841659476 129 | 127,28,1368.75,4.4,9.48521714532006E-07,3.81447654147683E-05,4.26384666162163E-07,0.000470496924228,0.000152925407662 130 | 128,30.5,8395,4.4,1.66906452249052E-09,2.65335121105305E-05,1.04949728586575E-07,0.000435436634922,0.000157211508954 131 | 129,48,4562.5,4.4,1.20806560057035E-07,5.23253185104261E-05,5.05679141764975E-07,0.000242395441286,0.000173946972912 132 | 130,53.5,9125,4.4,2.10893312001623E-09,4.3407006453399E-05,2.12001611943617E-07,0.000198622668704,0.000176048708847 133 | 131,57.5,1642.5,4.4,2.31784923237589E-06,7.4628267699939E-05,1.21013075981255E-06,0.000170952163928,0.000177244105302 134 | 132,65.5,9307.5,4.4,2.38727694426232E-09,5.18769101012059E-05,2.56008209045958E-07,0.000125591046909,0.000178709955652 135 | 133,21.5,1551.25,4.5,4.90923196475934E-07,2.9102497554807E-05,2.43080910773364E-07,0.0005922166796,0.000137791550454 136 | 134,29.5,6935,4.5,6.1378450671227E-09,2.82190898491947E-05,1.36287980437162E-07,0.000467222403179,0.000156376779669 137 | 135,31.5,1186.25,4.5,1.33431975843804E-06,4.31937270893179E-05,5.42845704184384E-07,0.000439296532111,0.000159733133613 138 | 136,32.5,8760,4.5,1.32560385065279E-09,2.75779550076423E-05,1.07800903381432E-07,0.000425944071653,0.000161170349981 139 | 137,41.5,5840,4.5,2.89478475192148E-08,4.19540982073863E-05,3.07142377068495E-07,0.000318574775265,0.00017101880159 140 | 138,56,912.5,4.5,4.32055707816737E-06,7.62012593502799E-05,1.3768286730741E-06,0.000191335836553,0.000178145768364 141 | 139,57,3467.5,4.5,4.23254161958962E-07,6.59277932283289E-05,8.00435760623841E-07,0.000184425828159,0.000178394775013 142 | 
140,69.5,1368.75,4.5,3.85987111971363E-06,9.06040113593491E-05,1.57121366953204E-06,0.000114134612242,0.000180679431678 143 | 141,20,456.25,4.6,1.19388807447226E-06,2.90984685411028E-05,2.66417441784073E-07,0.000638722132659,0.000133376783508 144 | 142,34,2190,4.6,6.01814421261134E-07,4.3654656555647E-05,4.96574888437103E-07,0.000423142967891,0.000164291650993 145 | 143,35.5,8030,4.6,3.00483897926296E-09,3.14618164924499E-05,1.47329872510756E-07,0.000403921308235,0.000166177335752 146 | 144,62,3467.5,4.6,4.66654095776323E-07,7.1240644469803E-05,8.78151099699867E-07,0.000161971045475,0.000180817374224 147 | 145,68,0,4.6,1.31034028597544E-05,9.67776491110723E-05,2.08626692405056E-06,0.000128796117512,0.000179840801425 148 | 146,69,1277.5,4.6,4.13141924936205E-06,9.04947486558296E-05,1.59566114797963E-06,0.000123909185096,0.000181876547105 149 | 147,22,1460,4.7,5.47972185663642E-07,2.99310160658316E-05,2.55153292322911E-07,0.000623951168786,0.000139936881015 150 | 148,29,6022.5,4.7,1.35565618713115E-08,2.93916868276575E-05,1.59577997496372E-07,0.000511111560613,0.000156944125156 151 | 149,44.5,9490,4.7,1.11555875851811E-09,3.56056741210485E-05,1.52037634052887E-07,0.000316157896917,0.000175668073016 152 | 150,58,7847.5,4.7,7.61615602764121E-09,5.07827632347685E-05,3.11220225349629E-07,0.00019850936878,0.000181373528443 153 | 151,59.5,1003.75,4.7,4.31153641656711E-06,8.02115760706824E-05,1.45283744632622E-06,0.000187918214041,0.000181801799911 154 | 152,33,8577.5,4.8,1.57085942711193E-09,2.8300458309801E-05,1.13477824549112E-07,0.000471502344469,0.0001643796442 155 | 153,38.5,547.5,4.8,3.33040706191584E-06,5.46125900810378E-05,8.73249671536257E-07,0.000399305413092,0.000171655811867 156 | 154,44,91.25,4.8,6.20702363810397E-06,6.38162090967527E-05,1.18880799001349E-06,0.000335827327081,0.000176511725026 157 | 155,49,273.75,4.8,6.27800331433849E-06,6.99434360119171E-05,1.33676850147627E-06,0.000285136931536,0.000179616900623 158 | 
156,25,365,4.9,1.8633539808157E-06,3.6353181205431E-05,4.13339358579954E-07,0.000612673686487,0.000148968193773 159 | 157,45,8030,4.9,4.29653252763622E-09,3.94573809497852E-05,2.12110653122401E-07,0.000339676671363,0.000178214078695 160 | 158,59,6935,4.9,1.78747615188695E-08,5.46564954648793E-05,3.88870453183095E-07,0.00021261717434,0.000184301139232 161 | 159,21,5657.5,5.1,1.02007643855556E-08,2.19137084858016E-05,8.87595359564826E-08,0.000724107631595,0.000137783978028 162 | 160,23.5,4562.5,5.1,3.44518189096089E-08,2.62284859066288E-05,1.42176643661872E-07,0.00067869149593,0.000145582151695 163 | 161,55,9855,5.1,1.07366447414235E-09,4.24931535374513E-05,1.87008243427609E-07,0.000268759293348,0.000185890569049 164 | 162,68,6752.5,5.1,2.54520882646995E-08,6.30691102336121E-05,4.74544954413735E-07,0.000171599771892,0.000188689972871 165 | 163,21.5,2372.5,5.2,2.17512766732052E-07,2.76029346645149E-05,1.90738447587297E-07,0.000735528881152,0.000139756076879 166 | 164,26.5,3376.25,5.2,1.26994621430226E-07,3.17969274763896E-05,2.3285504633767E-07,0.000646696062247,0.000154135208938 167 | 165,44,456.25,5.2,4.30907201760011E-06,6.23147665875661E-05,1.08451939019097E-06,0.000396227226394,0.000181085797798 168 | 166,62.5,7847.5,5.2,8.09243414090313E-09,5.43459007721383E-05,3.39694510071525E-07,0.000219038064312,0.000189093883813 169 | 167,21,6205,5.3,6.06976182378544E-09,2.11625721747778E-05,7.73458024772474E-08,0.00076588082163,0.000138104573455 170 | 168,26.5,2190,5.3,3.75471498377032E-07,3.42702135139823E-05,3.00603380575311E-07,0.000666450030361,0.000154485339736 171 | 169,27,7482.5,5.3,2.97626361668341E-09,2.49715948233781E-05,9.68531875544561E-08,0.000657978585458,0.000155658516717 172 | 170,28,2555,5.3,2.96228269204825E-07,3.53375811648085E-05,3.08439926058218E-07,0.000641078913408,0.000158108590841 173 | 171,34,10402.5,5.3,2.97412259870175E-10,2.59361144536243E-05,7.80728860798333E-08,0.000546571217363,0.000169712765823 174 | 
172,46.5,6752.5,5.3,1.4298039849228E-08,4.41144672885905E-05,2.93199915928956E-07,0.0003824428,0.000183892545536 175 | 173,49,1368.75,5.3,2.21283793799348E-06,6.52019005384815E-05,1.04159320140915E-06,0.000354514484622,0.000185638918096 176 | 174,61.5,2372.5,5.3,1.20757076124977E-06,7.56620609989457E-05,1.11576546387201E-06,0.000237640485428,0.000190045813873 177 | 175,53.5,9125,5.4,1.96848600584309E-09,4.33217959102557E-05,2.11434806370186E-07,0.000321789842407,0.000189042990405 178 | 176,66,2190,5.4,1.59076826212051E-06,8.17968630358488E-05,1.2642652194713E-06,0.000214283800177,0.000192363941075 179 | 177,66.5,8395,5.4,5.32738223742902E-09,5.5634942988013E-05,3.23905647330256E-07,0.000210765092405,0.000192350292514 180 | 178,31,2555,5.5,3.40615902724639E-07,3.89376740086019E-05,3.6756703598401E-07,0.000630498744899,0.000165867180771 181 | 179,32.5,7482.5,5.5,3.97774799133893E-09,2.9851639929216E-05,1.35393859337921E-07,0.000606596236117,0.000168735720129 182 | 180,42.5,8030,5.5,3.73094531693998E-09,3.7308444241546E-05,1.91015653325763E-07,0.000463137968867,0.000182800935576 183 | 181,43.5,3650,5.5,2.18775192334382E-07,5.03339421583874E-05,5.22162531865527E-07,0.00045021980584,0.000183817269152 184 | 182,31.5,4015,5.6,9.08037974550098E-08,3.60549574349836E-05,2.72697981908349E-07,0.000641386230522,0.000167168339403 185 | 183,35,2098.75,5.6,6.39130560107712E-07,4.5088162141988E-05,5.07011251091202E-07,0.000586159031619,0.000173620351684 186 | 184,52.5,2007.5,5.6,1.31664562891036E-06,6.67244438908396E-05,9.89455607577136E-07,0.000359804426455,0.000191230741628 187 | 185,61.5,10585,5.6,6.18413370097652E-10,4.49802752515297E-05,1.82316015669672E-07,0.000272732981288,0.000194292130607 188 | 186,65,6205,5.6,3.80997009891195E-08,6.25223068770416E-05,5.13334516126856E-07,0.000243543269718,0.000195008440139 189 | 187,46,3285,5.7,3.32371427757252E-07,5.43328072340943E-05,6.152094651117E-07,0.000450713878079,0.000188016463733 190 | 
188,39.5,182.5,5.8,4.53281636847046E-06,5.71980844418222E-05,9.5100353762247E-07,0.000555333312912,0.000182030925926 191 | 189,44.5,1460,5.8,1.67322377082537E-06,5.9063513318847E-05,8.70570735206198E-07,0.000486223955951,0.00018765900733 192 | 190,51,2463.75,5.8,8.12118343712406E-07,6.30373548783621E-05,8.55542169831879E-07,0.000405874826594,0.000192770734464 193 | 191,69.5,4927.5,5.8,1.35479860007801E-07,7.21568022398813E-05,7.35771357585528E-07,0.000230555149196,0.000198407914838 194 | 192,35.5,2828.75,5.9,3.27517267393713E-07,4.36318724172379E-05,4.35873030329728E-07,0.000633947762216,0.000176568220118 195 | 193,39,1277.5,5.9,1.60664330175007E-06,5.27109136403554E-05,7.26425052165191E-07,0.000580356641421,0.000182112845599 196 | 194,41,10220,5.9,4.54134034528331E-10,3.1357828313774E-05,1.09147618378452E-07,0.000551350211943,0.000184554873875 197 | 195,63,4197.5,5.9,2.25870663356088E-07,6.88924620861157E-05,7.7248119641614E-07,0.000295784831154,0.000198612011975 198 | 196,24.5,5657.5,6,1.27528266937624E-08,2.54665834255973E-05,1.13577395090974E-07,0.000845540227973,0.000151200338751 199 | 197,46,730,6,3.42186828376995E-06,6.38355974393622E-05,1.07336615509102E-06,0.000499336878165,0.000191230615976 200 | 198,49.5,7847.5,6,5.52315857717899E-09,4.36511548177852E-05,2.49743661989427E-07,0.000454579917402,0.000194033925007 201 | 199,50.5,6022.5,6,2.9802432324916E-08,4.98430591925984E-05,3.82509655798467E-07,0.000442275314365,0.000194694090096 202 | -------------------------------------------------------------------------------- /src/datamancer/formulaExp.nim: -------------------------------------------------------------------------------- 1 | import macros, sequtils, strformat, options, sets, tables, algorithm, strutils 2 | import formulaNameMacro 3 | 4 | import column, value, df_types 5 | 6 | type 7 | AssignKind* = enum 8 | byIndex, byTensor 9 | ## replace occurence of nnkAccQuote, nnkCallStrLit, nnkBracketExpr(df) by: 10 | ReplaceByKind = enum 11 | rbIndex ## by 
call to tensor index, `tT[idx]`, in a `for` loop
    rbElement ## by single element, `tIdx` in a `forEach` call
    rbColumn  ## by full tensor (df column), `colT` in `<<` formula
  ## `impure` in the context of `FormulaNode` refers to evaluation requiring
  ## a data frame. Pure formulas represent raw expressions that evaluate to
  ## a simple `Value`
  FormulaKind* = enum
    fkNone = "none" ## default value for uninitialized formula / no formula kind at CT yet
    fkVariable = "" ## Nim variable as `Value`, pure
    fkAssign = "<-" ## assignment op, pure
    fkVector = "~" ## map operation, impure
    fkScalar = "<<" ## reduce operation, impure
  ## either: `t in df["foo", int]`
  ## or: `t = df["foo", int]`
  Assign* = object
    ## A single column access parsed from a formula, describing how the
    ## occurrence is replaced in the generated code.
    asgnKind*: AssignKind
    node*: NimNode ## the exact node that will be replaced by this `Assign` instance
    ## TODO: rename / change `ReplaceByKind` as it's currently a bit unclear, in particular after
    ## `get` and `delete` was added!
    rbKind*: ReplaceByKind ## stores how this should be inserted
    element*: NimNode # e.g. `t`
    tensor*: NimNode # either `t` or `t_T` if `element` used (typo fix: was `elmenent`)
    col*: NimNode # name of the column
    colType*: NimNode # e.g. `float`
    resType*: NimNode # the resulting type of the computation `Assign` is ``involved`` in!
  Preface* = object
    ## the collection of `Assign`s a formula declares (cf. `parsePreface` below)
    args*: seq[Assign]
  FormulaCT* = object
    ## Compile-time representation of a full formula.
    funcKind*: FormulaKind
    preface*: Preface
    typeHint*: TypeHint # the actual type hint given in the formula
    resType*: NimNode # e.g. `ident"float"`
    name*: NimNode # name of the formula -> refers to new column / assignment name
    rawName*: string # name of the formula body as lisp
    loop*: NimNode # loop needs to be patched to remove accented quotes etc
  ## `Lift` stores a node which needs to be lifted out of a for loop, because it performs a
  ## reducing operation on a full DF column.
It will be replaced by `liftedNode` in the loop
  ## body.
  Lift* = object
    toLift*: NimNode     # the reducing expression to hoist out of the loop
    liftedNode*: NimNode # the replacement node inserted in the loop body

  ## The `TypeHint` and `HeuristicType` are only used for the shortform of the
  ## `Formula` syntax using `f{}`.
  ##
  ## In the general shortform `Formula` syntax `f{}` a `TypeHint` is of the form
  ## `f{float -> int: }`, where the first value is the type we use to read the
  ## DF tensors and the second the output datatype of the operation.
  ##
  ## If no `TypeHint` is found in a formula, we do attempt to heuristically determine
  ## sensible data types. (doc fix: previously said "If a `TypeHint` is found",
  ## contradicting the `HeuristicType` doc below.)
  TypeHint* = object
    inputType*: Option[NimNode]
    resType*: Option[NimNode]

  ## `HeuristicType` stores the input and output types of a formula constructed using the
  ## `f{}` syntax based on simple heuristic rules about the presence of certain operators
  ## and typed symbols (e.g. proc calls, literals). They are only used if no `TypeHint`
  ## is supplied.
  HeuristicType* = TypeHint

  ## `FormulaTypes` finally is the type used for input and output of the formula
  ## construction.
Here the types *must* be set, otherwise it's a CT error (which happens
  ## if we cannot deduce a type and no hints are given)
  FormulaTypes* = object
    inputType*: NimNode
    resType*: NimNode

## Idents used for the generated code
const
  InIdent = "in"
  ResIdent = "res"
  ResultIdent = "result"
  RIdent = "r"
  DFIdent = "df"
  IdxIdent = "idx"
  ColIdent = "Column"
  ValueIdent = "Value"

## Data types supported for column access in formulas; `DtypesAll` additionally
## includes the explicit 64-bit aliases.
const Dtypes* = ["float", "int", "string", "bool", "Value"]
const DtypesAll* = ["float", "float64", "int", "int64", "string", "bool", "Value"]

## Priority map used by `sortTypes` below to order candidate types; judging by
## the `bool` note, larger values are the preferred types — TODO confirm at the
## call sites of `sortTypes`.
const DtypeOrderMap* = {
  "Value" : 1,
  "Tensor[Value]" : 2,
  "Tensor[T]" : 3,
  "T" : 4,
  "Tensor[string]" : 5,
  "string" : 6,
  "Tensor[int]" : 7,
  "int" : 8,
  "Tensor[int64]" : 9,
  "int64" : 10,
  "Tensor[float]" : 11,
  "float" : 12,
  "Tensor[float64]" : 13,
  "float64" : 14,
  "Tensor[bool]" : 15,
  "bool" : 16 # if something can be done with `bool`, take that
}.toTable()
const DtypeOrderMapKeys = toSeq(DtypeOrderMap.keys())

proc toStrType*(n: NimNode): NimNode =
  ## Maps a literal / identifier node to the ident of its corresponding
  ## type name (`int`, `float`, `string`, `bool`); anything else is passed
  ## through as an ident of its own repr.
  case n.kind
  of nnkIntLit .. nnkUInt64Lit: result = ident "int"
  of nnkFloatLit ..
nnkFloat128Lit: result = ident "float"
  of nnkStrLit: result = ident "string"
  of nnkIdent, nnkSym:
    if n.strVal in ["true", "false"]: result = ident "bool"
    else: result = ident(n.repr)
  else: result = ident(n.repr)

proc isValidType*(n: NimNode): bool = n.strVal in DtypeOrderMapKeys

proc sortTypes*(s: seq[string]): seq[string] =
  ## sorts the types according to our own "priority list"
  ## (ascending by `DtypeOrderMap` value; types not in the map keep id 0 and
  ## therefore sort first)
  var ids = newSeq[int](s.len)
  for i, el in s:
    if el in DtypeOrderMap:
      ids[i] = DtypeOrderMap[el]
  result = zip(s, ids).sortedByIt(it[1]).mapIt(it[0])

proc sortTypes*(s: seq[NimNode]): seq[string] =
  ## overload: filters to valid type nodes, then sorts their string names
  result = s.filterIt(it.isValidType).mapIt(it.strVal).sortTypes()

proc isColumnType*(n: NimNode): bool =
  ## `true` if `n` denotes a `Tensor[...]` type (bracket expression or a
  ## symbol/ident whose name starts with "Tensor")
  case n.kind
  of nnkBracketExpr:
    if n[0].kind in {nnkSym, nnkIdent} and n[0].strVal == "Tensor":
      result = true
  of nnkSym, nnkIdent:
    if n.strVal.startsWith("Tensor"):
      result = true
  else: discard

proc checkIdent(n: NimNode, s: string): bool =
  ## `true` if `n`'s first child is the ident `s`
  result = n.len > 0 and n[0].kind == nnkIdent and n[0].strVal == s

proc extractCall(stmts: NimNode, id: string): NimNode =
  ## returns the first `nnkCall` in `stmts` named `id`; a nil node if absent
  expectKind(stmts, nnkStmtList)
  for ch in stmts:
    case ch.kind
    of nnkCall:
      if checkIdent(ch, id):
        return ch
    else: continue

proc parsePreface(n: NimNode): Preface =
  ## Parses a `preface:` block into a `Preface`. Two statement forms are
  ## accepted: `t in df[<col>, <type>]` (by-index access) and
  ## `t = df[<col>, <type>]` (by-tensor access).
  proc addInfixAssign(ch: NimNode): Assign =
    doAssert checkIdent(ch, "in")
    doAssert ch[1].kind == nnkIdent, "First element before `in` needs to be an ident!"
    doAssert ch[2].kind == nnkBracketExpr, "`in` must refer to a `df[, ]`!"
    doAssert ch[2][0].strVal == "df", "`in` must refer to a `df[, ]`!"
    let elId = ch[1].strVal
    let dtype = ch[2][2].strVal
    doAssert dtype in Dtypes, "Column dtype " & $dtype & " not in " & $Dtypes & "!"
167 | result = Assign(asgnKind: byIndex, 168 | node: ch, 169 | element: ident(elId), 170 | tensor: ident(elId & "T"), 171 | col: ch[2][1], 172 | colType: ident(dtype)) 173 | proc addAsgnAssign(ch: NimNode): Assign = 174 | doAssert ch[0].kind == nnkIdent, "First element before `=` needs to be an ident!" 175 | doAssert ch[1].kind == nnkBracketExpr, "`=` must assign from a `df[, ]`!" 176 | doAssert ch[1][0].strVal == "df", "`=` must assign from a `df[, ]`!" 177 | let tId = ch[0].strVal 178 | let dtype = ch[1][2].strVal 179 | doAssert dtype in Dtypes, "Column dtype " & $dtype & " not in " & $Dtypes & "!" 180 | result = Assign(asgnKind: byTensor, 181 | node: ch, 182 | element: ident(tId & "Idx"), 183 | tensor: ident(tId), 184 | col: ch[1][1], 185 | colType: ident(dtype)) 186 | 187 | expectKind(n, nnkCall) 188 | expectKind(n[1], nnkStmtList) 189 | for ch in n[1]: 190 | case ch.kind 191 | of nnkInfix: result.args.add addInfixAssign(ch) 192 | of nnkAsgn: result.args.add addAsgnAssign(ch) 193 | else: error("Invalid node kind " & $ch.kind & " in `preface`: " & (ch.repr)) 194 | 195 | proc parseSingle(n: NimNode): NimNode = 196 | expectKind(n[1], nnkStmtList) 197 | result = n[1][0] 198 | 199 | proc parseLoop(n: NimNode): NimNode = 200 | expectKind(n[1], nnkStmtList) 201 | result = n[1] 202 | 203 | func removeCallAcc(n: NimNode): NimNode = 204 | result = if n.kind == nnkAccQuoted: newLit(n[0].strVal) 205 | elif n.kind == nnkCallStrLit: n[1] 206 | else: n 207 | 208 | proc convertPreface(p: Preface): NimNode = 209 | ## TODO: 210 | ## anything that contains a type of `Tensor[T]` needs to be handled differently. 
211 | ## Instead of generating a `let colT = df["col", dType]` we need to just call 212 | ## the function that 213 | proc toLet(a: Assign): NimNode = 214 | result = nnkIdentDefs.newTree( 215 | a.tensor, 216 | newEmptyNode(), 217 | nnkBracketExpr.newTree(ident(DfIdent), a.col.removeCallAcc(), 218 | ident(a.colType.strVal)) #convert nnkSym to nnkIdent 219 | ) 220 | result = nnkLetSection.newTree() 221 | var seenTensors = initHashSet[string]() 222 | for arg in p.args: 223 | if arg.tensor.strVal notin seenTensors: 224 | result.add toLet(arg) 225 | seenTensors.incl arg.tensor.strVal 226 | 227 | proc convertDtype(d: NimNode): NimNode = 228 | result = nnkVarSection.newTree( 229 | nnkIdentDefs.newTree( 230 | ident(ResIdent), 231 | newEmptyNode(), 232 | nnkCall.newTree( 233 | nnkBracketExpr.newTree(ident"newTensorUninit", 234 | d), 235 | nnkDotExpr.newTree(ident(DfIdent), 236 | ident"len")) 237 | ) 238 | ) 239 | 240 | proc `$`*(p: Preface): string = 241 | result = "Preface(" 242 | for i, ch in p.args: 243 | result.add &"Assign(element: {ch.element.strVal}, " 244 | result.add &"asgnKind: {ch.asgnKind}, " 245 | result.add &"node: {ch.node.repr}, " 246 | result.add &"tensor: {ch.tensor.strVal}, " 247 | result.add &"col: {buildName(ch.col)}, " 248 | result.add &"rbKind: {ch.rbKind}, " 249 | result.add &"colType: {buildName(ch.colType)}, " 250 | result.add &"resType: {buildName(ch.resType)})" 251 | if i < p.args.high: 252 | result.add ", " 253 | result.add ")" 254 | 255 | proc contains(p: Preface, n: NimNode): bool = 256 | for arg in p.args: 257 | if arg.node == n: 258 | return true 259 | 260 | proc `[]`(p: Preface, n: NimNode): Assign = 261 | for arg in p.args: 262 | if arg.node == n: 263 | return arg 264 | error("Could not find " & n.repr & " in preface containing " & $p) 265 | 266 | proc delete(p: var Preface, n: NimNode) = 267 | var idx = 0 268 | while idx < p.args.len: 269 | if p.args[idx].node == n: 270 | p.args.delete(idx) 271 | # deleted so return 272 | ## TODO: we don't 
depend on removing all "duplicates" (same column ref), right?
      return
    inc idx

proc nodeIsDf*(n: NimNode): bool =
  ## `true` if `n` accesses a *full column*: `df["col"]`, `col("col")`,
  ## an accented quote or a call string literal.
  if n.kind == nnkBracketExpr:
    result = n[0].kind == nnkIdent and n[0].strVal == "df"
  elif n.kind == nnkCall:
    result = n[0].kind == nnkIdent and n[0].strVal == "col"
  elif n.kind in {nnkCallStrLit, nnkAccQuoted}:
    result = true

proc nodeIsDfIdx*(n: NimNode): bool =
  ## `true` if `n` accesses the *current element* of a column:
  ## `df["col"][idx]`, `idx("col")`, accented quote or call string literal.
  if n.kind == nnkBracketExpr:
    result = n[0].kind == nnkBracketExpr and n[0][0].kind == nnkIdent and
      n[0][0].strVal == "df" and n[1].kind == nnkIdent and n[1].strVal == "idx"
  elif n.kind == nnkCall:
    result = n[0].kind == nnkIdent and n[0].strVal == "idx"
  elif n.kind in {nnkCallStrLit, nnkAccQuoted}:
    result = true

proc hasExplicitTypeHint*(n: NimNode): bool =
  ## `true` if a column access carries an explicit dtype as its third
  ## argument, e.g. `idx("col", float)`.
  result = (n.nodeIsDf or n.nodeIsDfIdx) and
    n.kind == nnkCall and
    n.len == 3 and
    n[2].kind in {nnkIdent, nnkSym} and
    n[2].strVal in DtypesAll

proc get(p: var Preface, node: NimNode, useIdx: bool): NimNode =
  ## Returns the replacement node for `node` and removes it from the preface.
  ## For `byIndex` assignments either the element ident (`t`) or the indexed
  ## tensor (`tT[idx]`) is produced, depending on `useIdx`; `byTensor`
  ## assignments always yield the tensor ident.
  let n = p[node]
  p.delete(node)
  result = if n.asgnKind == byIndex:
             if useIdx:
               nnkBracketExpr.newTree(
                 n.tensor,
                 ident(IdxIdent)
               )
             else:
               n.element
           else:
             n.tensor

proc replaceByIdx(n: NimNode, preface: var Preface): NimNode =
  ## recurses the node `n` and replaces all occurences by `t[idx]` for each
  ## tensor in the loop
  # first check if an ident that is in preface we have to replace or if
  # an `nnkBracketExpr` which contains an ident from `preface`. In those cases
  # return early
  case n.kind
  of nnkIdent, nnkSym:
    if n in preface: return preface.get(n, useIdx = true)
    else: return n
  of nnkAccQuoted:
    return preface.get(n, useIdx = true)
  of nnkCallStrLit:
    return preface.get(n, useIdx = true)
  of nnkBracketExpr:
    # NOTE(review): unlike `replaceByElement`, this branch returns `n`
    # *unchanged* for a plain `ident[...]` found in the preface — confirm this
    # asymmetry is intentional (the node may already be in indexed form here).
    if n[0].kind == nnkIdent and n in preface:
      return n
    # if `df["someCol"]` replace by full tensor (e.g. in a call taking tensor)
    if nodeIsDf(n) and n in preface:
      return preface.get(n, useIdx = true)
    if nodeIsDfIdx(n) and n in preface:
      return preface.get(n, useIdx = true)
  of nnkCall:
    if (nodeIsDf(n) or nodeIsDfIdx(n)) and n in preface:
      return preface.get(n, useIdx = true)
  else: result = n
  if n.len > 0:
    result = newTree(n.kind)
    for ch in n:
      result.add replaceByIdx(ch, preface)

proc replaceByElement(n: NimNode, preface: var Preface): NimNode =
  ## recurses the node `n` and replaces all occurences by `t` for each
  ## tensor in the loop
  # first check if an ident that is in preface we have to replace or if
  # an `nnkBracketExpr` which contains an ident from `preface`. In those cases
  # return early
  case n.kind
  of nnkIdent, nnkSym:
    if n in preface: return preface.get(n, useIdx = false)
    else: return n
  of nnkAccQuoted:
    return preface.get(n, useIdx = false)
  of nnkCallStrLit:
    return preface.get(n, useIdx = false)
  of nnkBracketExpr:
    if n[0].kind == nnkIdent and n in preface:
      return preface.get(n, useIdx = false)
    # for `df["someCol"]` replace by full tensor, e.g. for call taking tensor
    if nodeIsDf(n) and n in preface:
      return preface.get(n, useIdx = false)
    if nodeIsDfIdx(n) and n in preface:
      return preface.get(n, useIdx = false)
  of nnkCall:
    if (nodeIsDf(n) or nodeIsDfIdx(n)) and n in preface:
      return preface.get(n, useIdx = false)
  else: result = n
  if n.len > 0:
    result = newTree(n.kind)
    for ch in n:
      result.add replaceByElement(ch, preface)

proc replaceByColumn(n: NimNode, preface: var Preface): NimNode =
  ## recurses the node `n` and replaces all occurences by full `col` (i.e. field `tensor`) for each
  ## tensor in the loop
  case n.kind
  of nnkIdent, nnkSym:
    if n in preface: return preface[n].tensor
    else: return n
  of nnkAccQuoted:
    return preface[n].tensor
  of nnkCallStrLit:
    return preface[n].tensor
  of nnkBracketExpr:
    if n[0].kind == nnkIdent and n in preface:
      return preface[n].tensor
    # for `df["someCol"]` replace by full tensor, e.g. for call taking tensor
    if nodeIsDf(n) and n in preface:
      return preface[n].tensor
    if nodeIsDfIdx(n) and n in preface:
      # reducing formulas operate on whole tensors; per-element access is wrong
      error("Invalid usage of `idx` in a reducing formula! Access: " & $(n.repr))
  of nnkCall:
    if (nodeIsDf(n) or nodeIsDfIdx(n)) and n in preface:
      return preface[n].tensor
  else: result = n
  if n.len > 0:
    result = newTree(n.kind)
    for ch in n:
      result.add replaceByColumn(ch, preface)

proc fixupTensorIndices(loopStmts: NimNode, preface: var Preface,
                        rbKind: ReplaceByKind): NimNode =
  ## If `toElements` is true, we rewrite everything by `t` (where `t` is an
  ## element of `tT` (Tensor).
This includes
  expectKind(loopStmts, nnkStmtList)
  case rbKind
  of rbIndex:
    let loop = loopStmts[0].replaceByIdx(preface)
    case loop.kind
    of nnkAsgn:
      doAssert loop[0].kind == nnkBracketExpr and
        loop[0][0].kind == nnkIdent and loop[0][0].strVal == "r" and
        loop[0][1].kind == nnkIdent and loop[0][1].strVal == "idx"
      ## TODO: make this prettier / fix this
      # NOTE(review): `result` is left unset (nil) in this branch — confirm
      # callers never reach it with a plain `r[idx] = ...` assignment loop.
    else:
      # turn this into an nnkAsgn node with `res` as LHS and `nnkAsgn` on RHS
      result = nnkAsgn.newTree(
        nnkBracketExpr.newTree(ident(ResIdent), ident(IdxIdent)),
        loop)
  of rbElement:
    let loop = loopStmts[0].replaceByElement(preface)
    case loop.kind
    of nnkAsgn: doAssert loop[0].kind == nnkIdent and loop[0].strVal == RIdent
    else:
      # turn this into an nnkAsgn node with `res` as LHS and `nnkAsgn` on RHS
      result = nnkAsgn.newTree(ident(RIdent), loop)
  of rbColumn:
    let loop = loopStmts[0].replaceByColumn(preface)
    case loop.kind
    of nnkAsgn: doAssert loop[0].kind == nnkIdent and loop[0].strVal == ResIdent
    else:
      # turn this into an `nnkVarSection` node with `res` as LHS and `loop` as RHS
      result = nnkVarSection.newTree(
        nnkIdentDefs.newTree(
          ident(ResIdent),
          newEmptyNode(),
          loop)
      )

proc convertLoop(p: Preface, dtype, loop: NimNode,
                 fnKind: FormulaKind): NimNode =
  ## Generates the loop of the formula closure: a plain `for idx` loop, a
  ## zipped `forEach` (if all involved types are mem-copyable) or — for
  ## reducing formulas — a single expression over the full tensors.
  let memCopyable = ["float", "int", "bool"]
  let isMemCopyable = dtype.strVal in memCopyable and
    p.args.allIt(it.colType.strVal in memCopyable)
  proc genForLoop(p: Preface, loop: NimNode): NimNode =
    # index based `for idx in 0 ..< df.len` loop
    var mpreface = p
    let loopIndexed = fixupTensorIndices(loop, mpreface, rbKind = rbIndex)
    let idx = ident(IdxIdent)
    let df = ident(DfIdent)
    var loop = quote do:
      for `idx` in 0 ..< `df`.len:
        `loopIndexed`
    result = newStmtList(loop)

  proc genForEach(p: Preface, loop: NimNode): NimNode =
    # element-wise `forEach` (arraymancer) over result and all input tensors
    var mpreface = p
    let loopElements = fixupTensorIndices(loop, mpreface, rbKind = rbElement)
    var forEach = nnkCommand.newTree(ident"forEach")
    forEach.add nnkInfix.newTree(ident(InIdent), ident(RIdent), ident(ResIdent))
    for arg in p.args:
      forEach.add nnkInfix.newTree(ident(InIdent), arg.element, arg.tensor)
    forEach.add nnkStmtList.newTree(loopElements)
    result = newStmtList(forEach)

  proc addResultVector(): NimNode =
    # converts the local `res` tensor into the resulting `Column`
    let resId = ident(ResIdent)
    let resultId = ident(ResultIdent)
    result = quote do:
      `resultId` = toColumn `resId`

  case fnKind
  of fkVector:
    if not isMemCopyable:
      result = genForLoop(p, loop)
      result.add addResultVector()
    else:
      result = genForEach(p, loop)
      result.add addResultVector()
  of fkScalar:
    let resultId = ident(ResultIdent)
    var mpreface = p
    let loopElements = fixupTensorIndices(loop, mpreface, rbKind = rbColumn)
    let resId = ident(ResIdent)
    result = quote do:
      `loopElements`
      `resultId` = %~ `resId`
  else:
    error("Invalid FormulaKind `" & $(fnKind.repr) & "` in `convertLoop`. Already handled " &
      "in `compileFormula`!")

proc parseFormulaCT(stmts: NimNode): FormulaCT =
  ## Parses the explicit formula DSL sections into a `FormulaCT`.
  let preface = parsePreface(extractCall(stmts, "preface"))
  ## TODO: if `dtype` not given: auto determine
  let dtype = parseSingle(extractCall(stmts, "dtype"))
  let name = parseSingle(extractCall(stmts, "name"))
  let loop = parseLoop(extractCall(stmts, "loop"))
  result = FormulaCT(preface: preface,
                     resType: dtype,
                     name: name,
                     loop: loop)

proc generateClosure*(fct: FormulaCT): NimNode =
  ## Assembles the closure `proc (df: DataFrame): Column` (vector) or
  ## `proc (df: DataFrame): Value` (scalar) for the given formula.
  var procBody = newStmtList()
  procBody.add convertPreface(fct.preface)
  if fct.funcKind == fkVector:
    procBody.add convertDtype(fct.resType)
  procBody.add convertLoop(fct.preface, fct.resType, fct.loop, fct.funcKind)
  result = procBody
  var params: array[2, NimNode]
  case fct.funcKind
  of fkVector:
    params = [ident(ColIdent),
              nnkIdentDefs.newTree(ident(DfIdent),
                                   ident"DataFrame",
                                   newEmptyNode())]
  of fkScalar:
    when (NimMajor, NimMinor, NimPatch) < (1, 5, 0):
      let valueId = ident(ValueIdent)
    else:
      let valueId = nnkDotExpr.newTree(ident"value", ident(ValueIdent))
    # to avoid clashes with other `Value` objects, fully clarify we mean ours
    params = [valueId,
              nnkIdentDefs.newTree(ident(DfIdent),
                                   ident"DataFrame",
                                   newEmptyNode())]
  else:
    error("Invalid FormulaKind `" & $(fct.funcKind.repr) & "` in `convertLoop`. Already handled " &
      "in `compileFormula`!")
  result = newProc(newEmptyNode(),
                   params = params,
                   body = procBody,
                   procType = nnkLambda)

proc compileFormula(stmts: NimNode): NimNode =
  ## Full pipeline: parse the DSL, then emit the closure.
  let fct = parseFormulaCT(stmts)
  result = generateClosure(fct)

macro formula(y: untyped): untyped =
  ## TODO: add some ability to explicitly create formulas of
  ## different kinds more easily! Essentially force the type without
  ## a check to avoid having to rely on heuristics.
  ## Use
  ## - `<-` for assignment
  ## - `<<` for reduce operations, i.e. scalar proc?
  ## - `~` for vector like proc
  ## - formula without any of the above will be considered:
  ##   - `fkVariable` if no column involved
  ##   - `fkVector` else
  ## - `: `: simple type hint for tensors in closure
  ## - ` -> : `: full type for closure.
  ##   `` is the dtype used for tensors, `` the resulting type
  ## - `df[]`: to access columns using identifiers / symbols
  ##   defined in the scope
  ## - `idx`: can be used to access the loop iteration index
  result = compileFormula(y)

when false: # isMainModule:
  import math
  import arraymancer / laser / strided_iteration / foreach
  let f1 = formula:
    preface:
      t in df["foo", int] # t refers to each element of `foo` in the loop
      u in df["bar", float]
      v = df["baz", int] # v refers to the ``Tensor`` `baz`
    dtype: float
    name: "fooBar"
    loop:
      t.float * u + v[idx].float

  let f2 = f{ parseInt(`t`) > 5 }


  #let f2 = fn:
  #  preface:
  #    t in df["foo", int] # t refers to each element of `foo` in the loop
  #    u in df["bar", float]
  #    v = df["baz", int] # v refers to the ``Tensor`` `baz`
  #    #r in result
  #  dtype: bool
  #  name: "filterme"
  #  loop:
  #    t.float > u and v[idx].float < 2.2
  #
  #let f3 = fn:
  #  preface:
  #    t in df["foo", float] # t refers to each element of `foo` in the loop
  #  dtype: bool
  #  name: "noNan"
  #  loop:
  #    not (classify(t) == fcNan)
--------------------------------------------------------------------------------
/src/datamancer/io.nim:
--------------------------------------------------------------------------------
import dataframe, value, column

import memfiles, streams,
strutils, tables, parsecsv, sequtils
# for `showBrowser`
import browsers, strformat, os

# no-op for backward compatibility with `toDf(readCsv(...))`
proc toDf*(df: DataFrame): DataFrame {.deprecated: "`toDf` is not required anymore, because `readCsv` " &
    "already returns an actual `DataFrame` nowadays. Feel free to remove the `toDf` call."} =
  df

proc countLines(s: var FileStream): int =
  ## quickly counts the number of lines and then resets stream to beginning
  ## of file
  var buf = newString(500)
  while s.readLine(buf):
    inc result
  s.setPosition(0)

proc checkHeader(s: Stream, fname, header: string, colNames: seq[string]): bool =
  ## checks whether the given file contains the header `header`
  result = true
  if header.len > 0:
    var headerBuf: string
    if s.peekLine(headerBuf):
      result = headerBuf.startsWith(header)
    else:
      raise newException(IOError, "The input file " & $fname & " seems to be empty.")
  elif colNames.len > 0:
    # given some column names and a "header" without a symbol means we assume
    # there is no real header. If there is a real header in addition, user has
    # to use `skipLines = N` to skip it.
    result = false

proc readCsv*(s: Stream,
              sep = ',',
              header = "",
              skipLines = 0,
              colNames: seq[string] = @[],
              fname = ""): OrderedTable[string, seq[string]] =
  ## returns a `Stream` with CSV like data as a table of `header` keys vs. `seq[string]`
  ## values, where idx 0 corresponds to the first data value
  ## The `header` field can be used to designate the symbol used to
  ## differentiate the `header`. By default `#`.
  ## `colNames` can be used to provide custom names for the columns.
  ## If any are given and a header is present with a character indiciating
  ## the header, it is automatically skipped. ``However``, if custom names are
  ## desired and there is a real header without any starting symbol (i.e.
  ## `header.len == 0`), please use `skipLines = N` to skip it manually!
  # first check if the file even has a header of type `header`
  let hasHeader = checkHeader(s, fname, header, colNames)

  var parser: CsvParser
  open(parser, s, fname, separator = sep, skipInitialSpace = true)
  try:
    if colNames.len > 0:
      # if `colNames` available, use as header
      parser.headers = colNames
      if hasHeader:
        # and skip the real header
        discard parser.readRow()
    elif hasHeader:
      # read the header and use it
      parser.readHeaderRow()
    else:
      # file has no header nor user gave column names, raise
      raise newException(IOError, "Input neither has header starting with " &
        $header & " nor were column names provided!")

    result = initOrderedTable[string, seq[string]]()
    # filter out the header, delimiter, if any
    parser.headers.keepItIf(it != header)

    # possibly strip the headers and create the result table of columns
    var colHeaders: seq[string]
    for colUnstripped in items(parser.headers):
      let col = colUnstripped.strip
      colHeaders.add col
      result[col] = newSeqOfCap[string](5000) # start with a reasonable default cap

    # parse the actual file using the headers
    var lnCount = 0
    while readRow(parser):
      if lnCount < skipLines:
        inc lnCount
        continue
      for i, col in parser.headers:
        parser.rowEntry(col).removePrefix({' '})
        parser.rowEntry(col).removeSuffix({' '})
        result[colHeaders[i]].add parser.rowEntry(col)
  finally:
    # bugfix: previously `parser.close()` only ran on the success path; an
    # exception while parsing leaked the parser / its stream state
    parser.close()

template copyBuf(data: ptr UncheckedArray[char], buf: var string,
                 idx, colStart: int): untyped =
  ## Copies `data[colStart ..< idx]` into `buf`; clears `buf` for empty spans.
  let nIdx = idx - colStart
  if nIdx > 0:
    ## TODO: can we keep the buffer the same length and only copy the actual length?
    buf = newString(nIdx)
    copyMem(buf[0].addr, data[colStart].addr, nIdx)
    buf.setLen(nIdx)
  else:
    buf.setLen(0)

template parseHeaderCol(data: ptr UncheckedArray[char], buf: var string,
                        colNames: var seq[string],
                        header: string, sep, quote: char,
                        idx, colStart: int): untyped =
  ## Extracts one header column name from `data`, handling the optional
  ## `header` symbol on the first column and stripping whitespace / quotes.
  copyBuf(data, buf, idx, colStart)
  if col == 0:
    if not buf.startsWith(header):
      raise newException(IOError, "Unexpected column name at column 0, missing " &
        "expected header `" & header & "`. Found " & buf)
    else:
      buf.removePrefix(header)
      # and remove possible whitespace
      buf = buf.strip(chars = Whitespace + {quote})
  let bufStripped = buf.strip(chars = Whitespace + {quote})
  if bufStripped.len == 0 and sep in {' ', '\t'}:
    # don't add any name because we are dealing with a space before the
    # first column. We don't care about the `col` being off while parsing headers as
    # we do not use it to access data.
    # This is required over the `if` in the `parseLine` separator, because of possible
    # files using header symbols e.g.
'#'
    discard
  elif bufStripped.len == 0:
    # in case a column does not have a name, we use `Unnamed` similar to pandas
    let numUnknown = colNames.filterIt(it.startsWith("Unnamed"))
    colNames.add("Unnamed" & $numUnknown.len)
  else:
    colNames.add bufStripped

template guessType(data: ptr UncheckedArray[char], buf: var string,
                   colTypes: var seq[ColKind],
                   col, idx, colStart, numCols: untyped): untyped =
  ## Classifies the current field into int / float / bool / string and stores
  ## the guess in `colTypes[col]`.
  # only determine types for as many cols as in header
  if col < numCols:
    copyBuf(data, buf, idx, colStart)
    if buf.isInt:
      colTypes[col] = colInt
    elif buf.isNumber:
      colTypes[col] = colFloat
    elif buf.isBool:
      colTypes[col] = colBool
    else:
      colTypes[col] = colString

proc i64(c: char): int {.inline.} =
  ## ASCII digit -> integer value.
  int(ord(c) - ord('0'))

proc pow10(e: int): float {.inline.} =
  ## Fast power of ten: table lookup for the common small exponents,
  ## square-and-multiply fallback otherwise.
  const p10 = [1e-22, 1e-21, 1e-20, 1e-19, 1e-18, 1e-17, 1e-16, 1e-15, 1e-14,
               1e-13, 1e-12, 1e-11, 1e-10, 1e-09, 1e-08, 1e-07, 1e-06, 1e-05,
               1e-4, 1e-3, 1e-2, 1e-1, 1.0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7,
               1e8, 1e9] # 4*64B cache lines = 32 slots
  if -22 <= e and e <= 9:
    return p10[e + 22] # common case=small table lookup
  result = 1.0
  var base = 10.0
  var e = e
  if e < 0:
    e = -e
    base = 0.1
  while e != 0:
    if (e and 1) != 0:
      result *= base
    e = e shr 1
    base *= base

type
  RetType = enum
    rtInt, rtFloat, rtNaN, rtError

proc parseNumber(data: ptr UncheckedArray[char],
                 sep: char, # if this sep is found parsing ends
                 idxIn: int,
                 intVal: var int, floatVal: var float): RetType {.inline, noInit.} =
  ## this code is taken and adapted from @c-blake's code in Nim PR #16055.
  # Parse/validate/classify all at once, returning the type we parsed into
  # and if not `rtError` the `intVal/floatVal` will store the parsed number
  const Sign = {'+', '-'} # NOTE: `parseFloat` can generalize this to INF/NAN.
  var idx = idxIn
  var noDot = false
  var exp = 0
  var p10 = 0
  var pnt = -1 # find '.' (point); do digits
  var nD = 0
  var giant = false
  intVal = 0 # build intVal up from zero..
  if data[idx] in Sign:
    idx.inc # skip optional sign
  while data[idx] != '\0': # ..and track scale/pow10.
    if data[idx] notin Digits:
      if data[idx] != '.' or pnt >= 0:
        break # a second '.' is forbidden
      pnt = nD # save location of '.' (point)
      nD.dec # undo loop's nD.inc
    elif nD < 18: # 2**63==9.2e18 => 18 digits ok
      intVal = 10 * intVal + data[idx].i64 # core ASCII->binary transform
    else: # 20+ digits before decimal
      giant = true #XXX condition should be more precise than "18 digits"
      p10.inc # any digit moves implicit '.'
    idx.inc
    nD.inc
  if data[idxIn] == '-':
    intVal = -intVal # adjust sign

  if pnt < 0: # never saw '.'
    if nD == 0 and data[idx] == sep: # empty field in CSV
      return rtNaN
    pnt = nD; noDot = true # so set to number of digits
  elif nD == 1:
    return rtError # ONLY "[+-]*\.*"

  # `\0` is necessary to support parsing until the end of the file in case of no line break
  if data[idx] notin {'\0', sep, '\n', '\r', '\l', 'e', 'E'}: ## TODO: generalize this?
    return rtError

  if data[idx] in {'E', 'e'}: # optional exponent
    idx.inc
    let i0 = idx
    if data[idx] in Sign:
      idx.inc # skip optional sign
    while data[idx] in Digits: # build exponent
      exp = 10 * exp + data[idx].i64
      idx.inc
    if data[i0] == '-':
      exp = -exp # adjust sign
  elif noDot: # and intVal < (1'i64 shl 53'i64) ? # No '.' & No [Ee]xponent
    ## TODO: handle giant?
    #if giant:
    #  return rtError
    #  #copyBuf(data, strVal, idx, idxIn)
    return rtInt # mark as integer
  exp += pnt - nD + p10 # combine explicit&implicit exp
  floatVal = intVal.float * pow10(exp) # has round-off vs. 80-bit
  ## TODO: handle giant?
  #if giant:
  #  return rtError
  #  #copyBuf(data, strVal, idx, idxIn)
  result = rtFloat # mark as float

template parseCol(data: ptr UncheckedArray[char], buf: var string,
                  col: var Column,
                  sep: char,
                  colTypes: seq[ColKind], colIdx, idx, colStart, row, numCols: int,
                  intVal: var int, floatVal: var float, rtType: var RetType): untyped =
  ## Parses one field into `col[row]` based on the guessed column type,
  ## promoting the column (int -> float -> object) when a value does not fit.
  # NOTE(review): the parameter is named `rtType` but the body uses `retType`,
  # which binds at the instantiation site — works because all callers pass a
  # variable literally named `retType`; confirm before renaming.
  ## if there are more `,` in a row than in the header, skip it
  if likely(colIdx < numCols):
    case colTypes[colIdx]
    of colInt:
      retType = parseNumber(data, sep, colStart, intVal, floatVal)
      case retType
      of rtInt: col.iCol[row] = intVal
      of rtFloat, rtNaN:
        # before we copy everything check if can be parsed to float, this branch will only
        # be called a single time
        col = toColumn col.iCol.asType(float)
        if retType != rtNaN:
          col.fCol[row] = floatVal
        else:
          col.fCol[row] = NaN
        colTypes[colIdx] = colFloat
      of rtError:
        # object column
        copyBuf(data, buf, idx, colStart)
        col = toObjectColumn col
        colTypes[colIdx] = colObject
        col.oCol[row] = %~ buf
    of colFloat:
      retType = parseNumber(data, sep, colStart, intVal, floatVal)
      case retType
      of rtInt: col.fCol[row] = intVal.float
      of rtFloat: col.fCol[row] = floatVal
      of rtNaN: col.fCol[row] = NaN
      of rtError:
        # object column
        copyBuf(data, buf, idx, colStart)
        col = toObjectColumn col
        colTypes[colIdx] = colObject
        col.oCol[row] = %~ buf
    of colBool:
      copyBuf(data, buf, idx, colStart)
      try:
        col.bCol[row] = parseBool buf
except ValueError: 285 | # object column 286 | col = toObjectColumn col 287 | colTypes[colIdx] = colObject 288 | col.oCol[row] = %~ buf 289 | of colString: 290 | copyBuf(data, buf, idx, colStart) 291 | col.sCol[row] = buf 292 | of colObject: 293 | # try to parse as number 294 | retType = parseNumber(data, sep, colStart, intVal, floatVal) 295 | case retType 296 | of rtInt: col.oCol[row] = %~ intVal 297 | of rtFloat: col.oCol[row] = %~ floatVal 298 | of rtNaN: col.oCol[row] = Value(kind: VNull) 299 | of rtError: 300 | copyBuf(data, buf, idx, colStart) 301 | col.oCol[row] = %~ buf 302 | of colConstant: discard # already set 303 | of colNone: 304 | raise newException(IOError, "Invalid column type to parse into: `colNone`. " & 305 | "This shouldn't have happened! row = " & $row & ", col = " & $col) 306 | 307 | template parseLine(data: ptr UncheckedArray[char], buf: var string, 308 | sep: char, 309 | quote: char, 310 | col, idx, colStart, row: var int, 311 | lastWasSep, inQuote: var bool, 312 | toBreak: static bool, 313 | fnToCall: untyped): untyped = 314 | if unlikely(data[idx] == quote): 315 | inQuote = not inQuote 316 | elif unlikely(inQuote): 317 | inc idx 318 | # skip ahead in case we start quote 319 | continue 320 | elif unlikely(data[idx] in {'\n', '\r', '\l'}): 321 | fnToCall 322 | inc row 323 | col = 0 324 | if data[idx] == '\r' and data[idx + 1] == '\l': 325 | inc idx 326 | colStart = idx + 1 327 | lastWasSep = false 328 | when toBreak: 329 | inc idx 330 | break 331 | elif unlikely(skipInitialSpace and lastWasSep and data[idx] == ' '): 332 | colStart = idx + 1 333 | elif unlikely(data[idx] == sep): 334 | # convert last column to data 335 | if (idx - colStart > 0 or col > 0 or sep notin {' ', '\t'}): 336 | # only parse if: we have characters to parse, unless we are not in the first 337 | # column and unless our separator is not "spaces" like. Idea is only ignore 338 | # empty (only spaces) first columns iff we are dealing with space separated files. 
339 | # For a proper separator like ',' empty inputs are allowed at the beginning. 340 | fnToCall 341 | inc col 342 | colStart = idx + 1 343 | lastWasSep = true 344 | elif unlikely(data[idx] in toSkip): 345 | colStart = idx + 1 346 | lastWasSep = false 347 | elif unlikely(lastWasSep): 348 | lastWasSep = false 349 | else: 350 | discard 351 | inc idx 352 | 353 | proc readCsvTypedImpl(data: ptr UncheckedArray[char], 354 | size: int, 355 | lineCnt: int, 356 | sep: char = ',', 357 | header: string = "", 358 | skipLines = 0, 359 | toSkip: set[char] = {}, 360 | colNamesIn: seq[string] = @[], 361 | skipInitialSpace = true, 362 | quote = '"'): DataFrame = 363 | ## Implementation of the CSV parser that works on a data array of chars. 364 | result = newDataFrame() 365 | var 366 | idx = 0 367 | row = 0 368 | col = 0 369 | colStart = 0 370 | lastWasSep = false 371 | inQuote = false 372 | buf = newStringOfCap(80) 373 | 374 | # 1. first parse the header 375 | var colNames: seq[string] 376 | while idx < size: 377 | parseLine(data, buf, sep, quote, col, idx, colStart, row, lastWasSep, inQuote, toBreak = true): 378 | parseHeaderCol(data, buf, colNames, header, sep, quote, idx, colStart) 379 | 380 | if colNamesIn.len > 0 and colNamesIn.len != colNames.len: 381 | raise newException(IOError, "Input data contains " & $colNames.len & " columns, but " & 382 | "given " & $colNamesIn.len & " column names given: " & $colNamesIn) 383 | elif colNamesIn.len > 0: 384 | colNames = colNamesIn 385 | # reset index and row back to 0 386 | row = 0 387 | idx = 0 388 | 389 | # 1a. if `header` is set, skip all additional lines starting with header 390 | if header.len > 0: 391 | while idx < size: 392 | parseLine(data, buf, sep, quote, col, idx, colStart, row, lastWasSep, inQuote, toBreak = false): 393 | if col == 0 and data[colStart] != header[0]: 394 | break 395 | 396 | let numCols = colNames.len 397 | # 1b. 
skip `skipLines` 398 | let rowStart = row 399 | while idx < size: 400 | parseLine(data, buf, sep, quote, col, idx, colStart, row, lastWasSep, inQuote, toBreak = false): 401 | if row - rowStart == skipLines: 402 | break 403 | # compute the number of skipped lines in total 404 | let skippedLines = row 405 | # reset row to 0 406 | row = 0 407 | 408 | # 2. peek the first line to determine the data types 409 | var colTypes = newSeq[ColKind](numCols) 410 | var lastIdx = idx 411 | var lastColStart = colStart 412 | var dataColsIdx = 0 413 | while idx < size: 414 | parseLine(data, buf, sep, quote, col, idx, colStart, row, lastWasSep, inQuote, toBreak = true): 415 | guessType(data, buf, colTypes, col, idx, colStart, numCols) 416 | # if we see the end of the line, store the current column number 417 | if data[idx] in {'\n', '\r', '\l'}: 418 | dataColsIdx = col 419 | 420 | if dataColsIdx + 1 != numCols: 421 | raise newException(IOError, "Input data contains " & $(dataColsIdx + 1) & " in the data portion, but " & 422 | $numCols & " columns in the header.") 423 | # 2a. revert the indices (make it a peek) 424 | idx = lastIdx 425 | colStart = lastColStart 426 | dec row 427 | # 3. create the starting columns 428 | var cols = newSeq[Column](numCols) 429 | let dataLines = lineCnt - skippedLines 430 | for i in 0 ..< colTypes.len: 431 | # create column of length: 432 | # lines in file - header - skipLines 433 | cols[i] = newColumn(colTypes[i], dataLines) 434 | # 4. 
parse the actual data 435 | doAssert row >= 0, "Parsing the header failed" 436 | var 437 | retType: RetType 438 | intVal: int 439 | floatVal: float 440 | while idx < size: 441 | parseLine(data, buf, sep, quote, col, idx, colStart, row, lastWasSep, inQuote, toBreak = false): 442 | parseCol(data, buf, cols[col], sep, colTypes, col, idx, colStart, row, numCols, 443 | intVal, floatVal, retType) 444 | if row + skippedLines < lineCnt: 445 | # missing linebreak at end of last line 446 | doAssert row + skippedLines == lineCnt - 1, "Bad file. Please report an issue." 447 | parseCol(data, buf, cols[col], sep, colTypes, col, idx, colStart, row, numCols, 448 | intVal, floatVal, retType) 449 | for i, col in colNames: 450 | result[col] = cols[i] 451 | result.len = dataLines 452 | 453 | proc readCsv*(fname: string, 454 | sep: char = ',', 455 | header: string = "", 456 | skipLines = 0, 457 | toSkip: set[char] = {}, 458 | colNames: seq[string] = @[], 459 | skipInitialSpace = true, 460 | quote = '"', 461 | ): DataFrame = 462 | ## Reads a DF from a CSV file using the separator character `sep`. 463 | ## 464 | ## `toSkip` can be used to skip optional characters that may be present 465 | ## in the data. For instance if a CSV file is separated by `,`, but contains 466 | ## additional whitespace (`5, 10, 8` instead of `5,10,8`) this can be 467 | ## parsed correctly by setting `toSkip = {' '}`. 468 | ## 469 | ## `header` designates the symbol that defines the header of the CSV file. 470 | ## By default it's empty meaning that the first line will be treated as 471 | ## the header. If a header is given, e.g. `"#"`, this means we will determine 472 | ## the column names from the first line (which has to start with `#`) and 473 | ## skip every line until the first line starting without `#`. 474 | ## 475 | ## `skipLines` is used to skip `N` number of lines at the beginning of the 476 | ## file. 
477 | result = newDataFrame() 478 | var ff = memfiles.open(fname) 479 | var lineCnt = 0 480 | for slice in memSlices(ff): 481 | if slice.size > 0: 482 | inc lineCnt 483 | 484 | ## we're dealing with ASCII files, thus each byte can be interpreted as a char 485 | var data = cast[ptr UncheckedArray[char]](ff.mem) 486 | result = readCsvTypedImpl(data, ff.size, lineCnt, sep, header, skipLines, toSkip, colNames) 487 | ff.close() 488 | 489 | proc parseCsvString*(csvData: string, 490 | sep: char = ',', 491 | header: string = "", 492 | skipLines = 0, 493 | toSkip: set[char] = {}, 494 | colNames: seq[string] = @[]): DataFrame = 495 | ## Parses a `DataFrame` from a string containing CSV data. 496 | ## 497 | ## `toSkip` can be used to skip optional characters that may be present 498 | ## in the data. For instance if a CSV file is separated by `,`, but contains 499 | ## additional whitespace (`5, 10, 8` instead of `5,10,8`) this can be 500 | ## parsed correctly by setting `toSkip = {' '}`. 501 | ## 502 | ## `header` designates the symbol that defines the header of the CSV file. 503 | ## By default it's empty meaning that the first line will be treated as 504 | ## the header. If a header is given, e.g. `"#"`, this means we will determine 505 | ## the column names from the first line (which has to start with `#`) and 506 | ## skip every line until the first line starting without `#`. 507 | ## 508 | ## `skipLines` is used to skip `N` number of lines at the beginning of the 509 | ## file. 
510 | result = newDataFrame() 511 | 512 | ## we're dealing with ASCII files, thus each byte can be interpreted as a char 513 | var data = cast[ptr UncheckedArray[char]](csvData[0].unsafeAddr) 514 | result = readCsvTypedImpl(data, csvData.len, countLines(csvData), sep, header, skipLines, toSkip, colNames) 515 | 516 | proc readCsvAlt*(fname: string, 517 | sep = ',', 518 | header = "", 519 | skipLines = 0, 520 | colNames: seq[string] = @[]): OrderedTable[string, seq[string]] = 521 | ## returns a CSV file as a table of `header` keys vs. `seq[string]` 522 | ## values, where idx 0 corresponds to the first data value 523 | ## The `header` field can be used to designate the symbol used to 524 | ## differentiate the `header`. By default `#`. 525 | ## `colNames` can be used to provide custom names for the columns. 526 | ## If any are given and a header is present with a character indiciating 527 | ## the header, it is automatically skipped. ``However``, if custom names are 528 | ## desired and there is a real header without any starting symbol (i.e. 529 | ## `header.len == 0`), please use `skipLines = N` to skip it manually! 530 | var s = newFileStream(fname, fmRead) 531 | if s == nil: 532 | raise newException(IOError, "Input file " & $fname & " does not exist! " & 533 | "`readCsv` failed.") 534 | result = s.readCsv(sep, header, skipLines, colNames, fname = fname) 535 | s.close() 536 | 537 | proc writeCsv*(df: DataFrame, filename: string, sep = ',', header = "", 538 | precision = 4) = 539 | ## writes a DataFrame to a "CSV" (separator can be changed) file. 540 | ## `sep` is the actual separator to be used. `header` indicates a potential 541 | ## symbol marking the header line, e.g. 
`#` 542 | var data = newStringOfCap(df.len * 8) # for some reserved space 543 | # add header symbol to first line 544 | data.add header 545 | let keys = getKeys(df) 546 | data.add join(keys, $sep) & "\n" 547 | var idx = 0 548 | for row in df: 549 | idx = 0 550 | for x in row: 551 | if idx > 0: 552 | data.add $sep 553 | data.add pretty(x, precision = precision) 554 | inc idx 555 | data.add "\n" 556 | writeFile(filename, data) 557 | 558 | proc showBrowser*(df: DataFrame, fname = "df.html", path = getTempDir(), toRemove = false) = 559 | ## Displays the given DataFrame as a table in the default browser. 560 | ## 561 | ## Note: the HTML generation is not written for speed at this time. For very large 562 | ## dataframes expect bad performance. 563 | const tmpl = """ 564 | 565 | 566 | 567 | 584 | 585 | 586 | 587 | 588 | $# 589 |
590 | 591 | 592 | 593 | """ 594 | var 595 | header: string 596 | body: string 597 | header = "\n" 598 | for k in df.getKeys: 599 | header.add &" {k}

{df[k].kind.toNimType} " 600 | header.add "\n" 601 | body = "" 602 | for row in df: 603 | body.add "\n" 604 | for x in row: 605 | body.add &"{pretty(x)}" 606 | body.add "\n" 607 | body.add "" 608 | let fname = path / fname 609 | writeFile(fname, tmpl % [header & body]) 610 | openDefaultBrowser(fname) 611 | if toRemove: 612 | # opening browsers may be slow, so wait a long time before we delete (file still needs to 613 | # be there when the browser is finally open. Thus default is to keep the file 614 | sleep(1000) 615 | removeFile(fname) 616 | -------------------------------------------------------------------------------- /src/datamancer/column.nim: -------------------------------------------------------------------------------- 1 | import arraymancer 2 | import value, sugar, math, strformat 3 | from sequtils import allIt 4 | 5 | type 6 | ColKind* = enum 7 | colNone, colFloat, colInt, colBool, colString, colObject, colConstant 8 | Column* = ref object 9 | len*: int 10 | case kind*: ColKind 11 | of colFloat: fCol*: Tensor[float] 12 | of colInt: iCol*: Tensor[int] 13 | of colBool: bCol*: Tensor[bool] 14 | of colString: sCol*: Tensor[string] 15 | of colObject: oCol*: Tensor[Value] 16 | of colConstant: cCol*: Value 17 | of colNone: discard 18 | 19 | template `%~`*(v: Value): Value = v 20 | proc pretty*(c: Column): string 21 | proc compatibleColumns*(c1, c2: Column): bool {.inline.} 22 | # just a no-op 23 | template toColumn*(c: Column): Column = c 24 | 25 | func high*(c: Column): int = c.len - 1 26 | 27 | func isConstant*(c: Column): bool = c.kind == colConstant 28 | 29 | proc toColumn*[T: SomeFloat | SomeInteger | string | bool | Value](t: Tensor[T]): Column = 30 | when T is SomeInteger: 31 | result = Column(kind: colInt, 32 | iCol: t.asType(int), 33 | len: t.size) 34 | elif T is SomeFloat: 35 | result = Column(kind: colFloat, 36 | fCol: t.asType(float), 37 | len: t.size) 38 | elif T is bool: 39 | result = Column(kind: colBool, 40 | bCol: t, 41 | len: t.size) 42 | 
elif T is string: 43 | result = Column(kind: colString, 44 | sCol: t, 45 | len: t.size) 46 | elif T is Value: 47 | result = Column(kind: colObject, 48 | oCol: t, 49 | len: t.size) 50 | 51 | proc constantColumn*[T](val: T, len: int): Column = 52 | ## creates a constant column based on `val` and its type 53 | result = Column(len: len, kind: colConstant, cCol: %~ val) 54 | 55 | proc constantToFull*(c: Column): Column = 56 | ## creates a real constant full tensor column based on a constant column 57 | if c.kind != colConstant: return c 58 | withNative(c.cCol, val): 59 | result = toColumn newTensorWith[type(val)](c.len, val) 60 | 61 | proc `[]`*(c: Column, slice: Slice[int]): Column = 62 | case c.kind 63 | of colInt: result = toColumn c.iCol[slice.a .. slice.b] 64 | of colFloat: result = toColumn c.fCol[slice.a .. slice.b] 65 | of colString: result = toColumn c.sCol[slice.a .. slice.b] 66 | of colBool: result = toColumn c.bCol[slice.a .. slice.b] 67 | of colObject: result = toColumn c.oCol[slice.a .. 
slice.b] 68 | of colConstant: 69 | # for constant keep column, only adjust the length to the slice 70 | result = c 71 | result.len = slice.b - slice.a + 1 72 | of colNone: raise newException(IndexError, "Accessed column is empty!") 73 | 74 | proc newColumn*(kind = colNone, length = 0): Column = 75 | case kind 76 | of colFloat: result = toColumn newTensor[float](length) 77 | of colInt: result = toColumn newTensor[int](length) 78 | of colString: result = toColumn newTensor[string](length) 79 | of colBool: result = toColumn newTensor[bool](length) 80 | of colObject: result = toColumn newTensor[Value](length) 81 | of colConstant: result = constantColumn(Value(kind: VNull), length) 82 | of colNone: result = Column(kind: colNone, len: 0) 83 | 84 | 85 | proc toColKind*[T](dtype: typedesc[T]): ColKind = 86 | when T is SomeFloat: 87 | result = colFloat 88 | elif T is SomeInteger: 89 | result = colInt 90 | elif T is bool: 91 | result = colBool 92 | elif T is string: 93 | result = colString 94 | elif T is Value: 95 | result = colObject 96 | 97 | proc toColKind*(vKind: ValueKind): ColKind = 98 | case vKind 99 | of VFloat: result = colFloat 100 | of VInt: result = colInt 101 | of VString: result = colString 102 | of VBool: result = colBool 103 | of VObject: result = colObject 104 | of VNull: result = colObject 105 | 106 | proc toValueKind*(colKind: ColKind): ValueKind = 107 | case colKind 108 | of colFloat: result = VFloat 109 | of colInt: result = VInt 110 | of colString: result = VString 111 | of colBool: result = VBool 112 | of colObject: result = VObject 113 | of colConstant: result = VObject 114 | of colNone: result = VNull 115 | 116 | proc toNimType*(colKind: ColKind): string = 117 | ## returns the string name of the underlying data type of the column kind 118 | case colKind 119 | of colFloat: result = "float" 120 | of colInt: result = "int" 121 | of colString: result = "string" 122 | of colBool: result = "bool" 123 | of colObject: result = "object" 124 | of colConstant: 
result = "constant" 125 | of colNone: result = "null" 126 | 127 | template withNativeTensor*(c: Column, 128 | valName: untyped, 129 | body: untyped): untyped = 130 | case c.kind 131 | of colInt: 132 | let `valName` {.inject.} = c.iCol 133 | body 134 | of colFloat: 135 | let `valName` {.inject.} = c.fCol 136 | body 137 | of colString: 138 | let `valName` {.inject.} = c.sCol 139 | body 140 | of colBool: 141 | let `valName` {.inject.} = c.bCol 142 | body 143 | of colObject: 144 | let `valName` {.inject.} = c.oCol 145 | body 146 | of colConstant: 147 | withNative(c.cCol, realVal): 148 | let `valName` {.inject.} = newTensorWith(c.len, realVal) 149 | body 150 | of colNone: raise newException(ValueError, "Accessed column is empty!") 151 | 152 | proc combinedColKind*(c: seq[ColKind]): ColKind = 153 | if c.allIt(it == c[0]): 154 | # all the same, take any 155 | result = c[0] 156 | elif c.allIt(it in {colInt, colFloat}): 157 | # int and float can be combined to float, since we're lenient like that 158 | result = colFloat 159 | else: 160 | # the rest can only be merged via object columns of `Values`. 
    result = colObject

template withNative*(c: Column, idx: int,
                     valName: untyped,
                     body: untyped): untyped =
  ## Injects element `idx` of `c`, read with its native type, as `valName`
  ## and runs `body`. Object and constant columns yield a `Value`.
  case c.kind
  of colInt:
    let `valName` {.inject.} = c[idx, int]
    body
  of colFloat:
    let `valName` {.inject.} = c[idx, float]
    body
  of colString:
    let `valName` {.inject.} = c[idx, string]
    body
  of colBool:
    let `valName` {.inject.} = c[idx, bool]
    body
  of colObject:
    let `valName` {.inject.} = c[idx, Value]
    body
  of colConstant:
    let `valName` {.inject.} = c[idx, Value]
    body
  of colNone: raise newException(ValueError, "Accessed column is empty!")

template withNativeDtype*(c: Column, body: untyped): untyped =
  ## Injects `dtype` as the native element type of `c` and runs `body`.
  case c.kind
  of colInt:
    type dtype {.inject.} = int
    body
  of colFloat:
    type dtype {.inject.} = float
    body
  of colString:
    type dtype {.inject.} = string
    body
  of colBool:
    type dtype {.inject.} = bool
    body
  of colObject, colConstant:
    type dtype {.inject.} = Value
    body
  of colNone: raise newException(ValueError, "Accessed column is empty!")

template withDtypeByColKind*(colKind: ColKind, body: untyped): untyped =
  ## Injects `dtype` for a bare `ColKind` (no column instance needed) and runs `body`.
  case colKind
  of colInt:
    type dtype {.inject.} = int
    body
  of colFloat:
    type dtype {.inject.} = float
    body
  of colString:
    type dtype {.inject.} = string
    body
  of colBool:
    type dtype {.inject.} = bool
    body
  of colObject, colConstant:
    type dtype {.inject.} = Value
    body
  of colNone: raise newException(ValueError, "Invalid column kind!")

proc asValue*[T](t: Tensor[T]): Tensor[Value] {.noInit.} =
  ## Apply type conversion on the whole tensor
  result = t.map(x => (%~ x))

proc valueTo*[T](t: Tensor[Value], dtype: typedesc[T],
                 dropNulls: static bool = false): Tensor[T] =
  ## Converts an object (`Value`) tensor to a native tensor of `dtype`.
  ## If `dropNulls` is true, `VNull` elements are filtered out first, so the
  ## result may be shorter than the input.
  when not dropNulls:
    when T is string:
      result = t.map(x => x.toStr)
    elif T is float:
      result = t.map(x => x.toFloat)
    elif T is int:
      result = t.map(x => x.toInt)
    elif T is bool:
      result = t.map(x => x.toBool)
    elif T is Value:
      result = t
  else:
    # filter tensor to non Null values
    var outputIdx = newSeqOfCap[int](t.size)
    for idx, x in t:
      if x.kind != VNull:
        outputIdx.add idx[0]
    result = newTensor[T](outputIdx.len)
    when T is string:
      for i, idx in outputIdx:
        result[i] = t[idx].toStr
    elif T is float:
      for i, idx in outputIdx:
        result[i] = t[idx].toFloat
    elif T is int:
      for i, idx in outputIdx:
        result[i] = t[idx].toInt
    elif T is bool:
      for i, idx in outputIdx:
        result[i] = t[idx].toBool
    elif T is Value:
      for i, idx in outputIdx:
        result[i] = t[idx]

proc toTensor*[T](c: Column, dtype: typedesc[T],
                  dropNulls: static bool = false): Tensor[T] =
  ## `dropNulls` only has an effect on `colObject` columns. It allows to
  ## drop Null values to get (hopefully) a valid raw Tensor
  case c.kind
  of colInt:
    when T is int:
      result = c.iCol
    elif T is SomeNumber:
      result = c.iCol.asType(T)
    elif T is Value:
      result = c.iCol.asValue
    elif T is string:
      result = c.iCol.map_inline($x)
    else:
      raise newException(ValueError, "Invalid conversion of int column to " & $T & "!")
  of colFloat:
    when T is float:
      result = c.fCol
    elif T is SomeNumber:
      result = c.fCol.asType(T)
    elif T is Value:
      result = c.fCol.asValue
    elif T is string:
      result = c.fCol.map_inline($x)
    else:
      raise newException(ValueError, "Invalid conversion of float column to " & $T & "!")
  of colString:
    when T is string:
      result = c.sCol
    elif T is Value:
      result = c.sCol.asValue
    else:
      raise newException(ValueError, "Invalid conversion of string column to " & $T & "!")
  of colBool:
    when T is bool:
      result = c.bCol
    elif T is Value:
      result = c.bCol.asValue
    else:
      raise newException(ValueError, "Invalid conversion of bool column to " & $T & "!")
  of colObject:
    result = c.oCol.valueTo(T, dropNulls = dropNulls)
  of colConstant:
    # materialize the constant into a full tensor, then convert
    result = c.constantToFull.toTensor(dtype, dropNulls)
  of colNone: raise newException(ValueError, "Accessed column is empty!")

proc toTensor*[T](c: Column, slice: Slice[int], dtype: typedesc[T]): Tensor[T] =
  ## Converts the slice `slice` of `c` to a `Tensor[T]`.
  ## NOTE(review): unlike the full `toTensor` above, an unsupported `T` for
  ## the column kind here yields a default(empty) tensor instead of raising —
  ## confirm whether that asymmetry is intended.
  case c.kind
  of colInt:
    when T is int:
      result = c.iCol[slice.a .. slice.b]
    elif T is SomeNumber:
      result = c.iCol[slice.a .. slice.b].asType(T)
  of colFloat:
    when T is float:
      result = c.fCol[slice.a .. slice.b]
    elif T is SomeNumber:
      result = c.fCol[slice.a .. slice.b].asType(T)
  of colString:
    when T is string:
      result = c.sCol[slice.a .. slice.b]
  of colBool:
    when T is bool:
      result = c.bCol[slice.a .. slice.b]
  of colObject:
    result = c.oCol[slice.a .. slice.b].valueTo(T)
  of colConstant:
    result = newTensorWith[T](slice.b - slice.a + 1, c.cCol.to(T))
  of colNone: raise newException(ValueError, "Accessed column is empty!")

proc `[]`*[T](c: Column, idx: int, dtype: typedesc[T]): T =
  ## Returns element `idx` of `c` converted to `T`.
  when T isnot Value:
    case c.kind
    of colInt:
      when T is int:
        result = c.iCol[idx]
      elif T is SomeNumber:
        result = c.iCol[idx].T
      elif T is string:
        result = $c.iCol[idx]
    of colFloat:
      when T is float:
        result = c.fCol[idx]
      elif T is SomeNumber:
        result = c.fCol[idx].T
      elif T is string:
        # convert to Value and then string so that we use one single
        # formatting function. This is slow anyways
        result = pretty(%~ c.fCol[idx])
    of colString:
      when T is string:
        result = c.sCol[idx]
    of colBool:
      when T is bool:
        result = c.bCol[idx]
    of colObject:
      when T is string:
        result = c.oCol[idx].toStr
      elif T is float:
        result = c.oCol[idx].toFloat
      elif T is int:
        result = c.oCol[idx].toInt
      elif T is bool:
        result = c.oCol[idx].toBool
    of colConstant:
      # a constant column ignores `idx`; every element is `cCol`
      when T is string:
        result = c.cCol.toStr
      elif T is float:
        result = c.cCol.toFloat
      elif T is int:
        result = c.cCol.toInt
      elif T is bool:
        result = c.cCol.toBool
    of colNone: raise newException(ValueError, "Accessed column is empty!")
  else:
    case c.kind
    of colInt: result = %~ c.iCol[idx]
    of colFloat: result = %~ c.fCol[idx]
    of colString: result = %~ c.sCol[idx]
    of colBool: result = %~ c.bCol[idx]
    of colObject: result = c.oCol[idx]
    of colConstant: result = c.cCol
    of colNone: raise newException(ValueError, "Accessed column is empty!")
proc toObjectColumn*(c: Column): Column =
  ## returns `c` as an object column
  var res = newTensor[Value](c.len)
  withNativeTensor(c, t):
    for idx in 0 ..< c.len:
      res[idx] = %~ (t[idx])
  result = toColumn res

proc `[]=`*[T](c: var Column, idx: int, val: T) =
  ## assign `val` to column `c` at index `idx`
  ## If the types match, it just calls `[]=` on the tensor.
  ## If they are compatible, `val` is converted to c's type.
  ## If they are incompatible, `c` will be rewritten to an object
  ## column.
  var rewriteAsValue = false
  case c.kind
  of colFloat:
    when T is float:
      c.fCol[idx] = val
    elif T is SomeNumber:
      c.fCol[idx] = val.float
    # NOTE(review): a non-numeric `T` is silently ignored here — there is no
    # `rewriteAsValue = true` fallback as in the other branches. Confirm
    # whether that is intended.
  of colInt:
    when T is int:
      c.iCol[idx] = val
    else:
      rewriteAsValue = true
  of colString:
    when T is string:
      c.sCol[idx] = val
    else:
      rewriteAsValue = true
  of colBool:
    when T is bool:
      c.bCol[idx] = val
    else:
      rewriteAsValue = true
  of colObject:
    c.oCol[idx] = %~ val
  of colConstant:
    if c.cCol == %~ val: discard # do nothing
    elif c.cCol.kind == VNull:
      # turn into constant column of `val`
      c.cCol = %~ val
    else:
      # need to replace constant column by non constant with changed value at
      # specified index
      c = c.constantToFull()
      c[idx] = val
  of colNone: raise newException(ValueError, "Accessed column is empty!")
  if rewriteAsValue:
    # rewrite as an object column
    c = c.toObjectColumn()
    c.oCol[idx] = %~ val

proc `[]=`*[T](c: var Column, slice: Slice[int], t: Tensor[T]) =
  ## Assigns the tensor `t` to the slice `slice`. The slice length must match
  ## the tensor length exactly and must be smaller than the column length.
  ##
  ## If the type of `t` does not match the column kind, we reallocate to an object column.
  let length = slice.b - slice.a + 1
  let sa = slice.a
  let sb = slice.b
  if length != t.size:
    raise newException(ValueError, "Given tensor of size " & $t.size & " does not match slice " &
      $slice & " with length: " & $length & ".")
  elif length > c.len:
    raise newException(ValueError, "Given slice " & $slice & " of length " & $length &
      " is larger than column length of " & $c.len & ".")
  case c.kind
  of colInt:
    when T is int:
      c.iCol[sa .. sb] = t
    else:
      c = c.toObjectColumn()
      c.oCol[sa .. sb] = t.asValue()
  of colFloat:
    when T is float:
      c.fCol[sa .. sb] = t
    elif T is int:
      # int data assigned into a float column is widened, not boxed
      c.fCol[sa .. sb] = t.asType(float)
    else:
      c = c.toObjectColumn()
      c.oCol[sa .. sb] = t.asValue()
  of colString:
    when T is string:
      c.sCol[sa .. sb] = t
    else:
      c = c.toObjectColumn()
      c.oCol[sa .. sb] = t.asValue()
  of colBool:
    when T is bool:
      c.bCol[sa .. sb] = t
    else:
      c = c.toObjectColumn()
      c.oCol[sa .. sb] = t.asValue()
  of colConstant:
    ## if we are handed a Tensor to slice assign, we have to convert to a full column
    ## Then try again with the full tensor (possibly convert to object column then)
    c = c.constantToFull()
    c[sa .. sb] = t
  of colObject:
    when T is Value:
      c.oCol[sa .. sb] = t
    else:
      c.oCol[sa .. sb] = t.asValue()
  of colNone:
    raise newException(ValueError, "Cannot assign a tensor to an empty column.")

proc `[]=`*(c: var Column, slice: Slice[int], col: Column) =
  ## Assigns column `col` into the slice `slice` of `c`, converting `c` to an
  ## object column if the two kinds are incompatible.
  let sa = slice.a.int
  let sb = slice.b.int
  if c.compatibleColumns(col) and c.kind != colConstant:
    withNativeDtype(c):
      c[slice] = col.toTensor(dtype)
  elif c.kind == colConstant and col.kind == colConstant:
    if c.cCol == col.cCol: return # nothing to do
    else:
      c = c.constantToFull()
      let c2 = col.constantToFull()
      c[slice] = c2
  else:
    c = c.toObjectColumn()
    c.oCol[sa .. sb] = col.toTensor(Value)

template withNative2*(c1, c2: Column, idx1, idx2: int,
                      valName1, valName2: untyped,
                      body: untyped): untyped =
  ## Injects elements `idx1` of `c1` and `idx2` of `c2` (which must share the
  ## same kind) with their native type and runs `body`.
  assert c1.kind == c2.kind
  case c1.kind
  of colInt:
    let `valName1` {.inject.} = c1[idx1, int]
    let `valName2` {.inject.} = c2[idx2, int]
    body
  of colFloat:
    let `valName1` {.inject.} = c1[idx1, float]
    let `valName2` {.inject.} = c2[idx2, float]
    body
  of colString:
    let `valName1` {.inject.} = c1[idx1, string]
    let `valName2` {.inject.} = c2[idx2, string]
    body
  of colBool:
    let `valName1` {.inject.} = c1[idx1, bool]
    let `valName2` {.inject.} = c2[idx2, bool]
    body
  of colObject:
    let `valName1` {.inject.} = c1[idx1, Value]
    let `valName2` {.inject.} = c2[idx2, Value]
    body
  of colConstant: raise newException(ValueError, "Accessed column is constant!")
  of colNone: raise newException(ValueError, "Accessed column is empty!")

proc compatibleColumns*(c1, c2: Column): bool {.inline.} =
  ## Columns are compatible if their kinds match or both are numeric (int/float).
  if c1.kind == c2.kind: result = true
  elif c1.kind in {colInt, colFloat} and
       c2.kind in {colInt, colFloat}:
    result = true
  else: result = false

proc equal*(c1: Column, idx1: int, c2: Column, idx2: int): bool =
  ## checks if the value in `c1` at `idx1` is equal to the
  ## value in `c2` at `idx2`
  if not compatibleColumns(c1, c2): return false
  elif c1.kind == c2.kind:
    withNativeDtype(c1):
      result = c1[idx1, dtype] == c2[idx2, dtype]
  else:
    # need to get the enveloping kind and read the data using that corresponding
    # data type
    let kind = combinedColKind(@[c1.kind, c2.kind])
    withDtypeByColKind(kind):
      result = c1[idx1, dtype] == c2[idx2, dtype]

proc toObject*(c: Column): Column {.inline.} =
  ## Converts `c` to an object (`Value`) column.
  case c.kind
  of colObject: result = c
  of colInt: result = toColumn c.iCol.asValue
  of colFloat: result = toColumn c.fCol.asValue
  of colString: result = toColumn c.sCol.asValue
  of colBool: result = toColumn c.bCol.asValue
  of colConstant: raise newException(ValueError, "Accessed column is constant!")
  of colNone: raise newException(ValueError, "Accessed column is empty!")

proc add*(c1, c2: Column): Column =
  ## adds column `c2` to `c1`. Uses `concat` internally.
  if c1.isNil: return c2 # allows to add to an uninitialized column
  if c2.len == 0: return c1
  elif c1.len == 0: return c2
  if c1.kind == c2.kind:
    # just concat directly
    case c1.kind
    of colInt: result = toColumn concat(c1.iCol, c2.iCol, axis = 0)
    of colFloat: result = toColumn concat(c1.fCol, c2.fCol, axis = 0)
    of colBool: result = toColumn concat(c1.bCol, c2.bCol, axis = 0)
    of colString: result = toColumn concat(c1.sCol, c2.sCol, axis = 0)
    of colObject: result = toColumn concat(c1.oCol, c2.oCol, axis = 0)
    of colConstant:
      if c1.cCol == c2.cCol: result = c1 # does not matter which to return
      else: result = add(c1.constantToFull, c2.constantToFull)
    of colNone: doAssert false, "Both columns are empty!"
  elif compatibleColumns(c1, c2):
    # convert both to float
    case c1.kind
    of colInt:
      # c1 is int, c2 is float
      assert c2.kind == colFloat
      result = toColumn concat(c1.iCol.asType(float), c2.fCol, axis = 0)
    of colFloat:
      # c1 is float, c2 is int
      assert c2.kind == colInt
      result = toColumn concat(c1.fCol, c2.iCol.asType(float), axis = 0)
    else: doAssert false, "cannot happen, since not compatible!"
  elif c1.kind == colConstant or c2.kind == colConstant:
    result = add(c1.constantToFull, c2.constantToFull)
  else:
    # convert both columns to Value
    result = toColumn concat(c1.toObject.oCol, c2.toObject.oCol, axis = 0)
  result.len = c1.len + c2.len

proc toColumn*[T: SomeFloat | SomeInteger | string | bool | Value](s: openArray[T]): Column =
  ## Creates a column from the given open array of native values.
  var vals = newTensor[T](s.len)
  for i, x in s:
    vals[i] = x
  result = toColumn(vals)

proc toColumn*[T: SomeFloat | SomeInteger | string | bool | Value](x: T): Column =
  # also possible to create single row column, but inefficient
  # for `summarize` though there's no way around
  let vals = newTensorWith[T](1, x)
  result = toColumn(vals)

proc toNativeColumn*(s: openArray[Value]): Column =
  ## given input as `Value`, will attempt to return the column as native
  ## data type.
  ## NOTE: this is unsafe and assumes the values are indeed all one type!
  ## NOTE(review): an empty `s` falls through and returns a nil `Column` —
  ## confirm callers handle that.
  if s.len > 0:
    withNativeConversion(s[0].kind, get):
      var data = newTensor[dtype](s.len)
      for i, x in s:
        data[i] = get(x)
      result = toColumn data

proc toNativeColumn*(c: Column, failIfImpossible: static bool = true): Column =
  ## attempts to convert the given column from `colObject` to its
  ## native type, if possible. This is mainly useful after removal
  ## of null values. If it fails (i.e. floats and strings in one
  ## col) the result stays a colObject.
  ##
  ## In the default case `failIfImpossible = true` this procedure will
  ## fail with an `AssertionDefect` if a column contains multiple datatypes.
  ## This can be disabled so that at worst the input is returned as an
  ## object type column.
  if c.kind != colObject: return c
  # assuming the column ``can`` be converted to native type, the
  # first element contains all information we need, namely the
  # value kind of ``all`` elements in the column
  # exception: first element is int, but mixed with float
  let vKind = c[0, Value].kind
  ## TODO: this can fail...
  withNativeConversion(vKind, get):
    var data = newTensor[dtype](c.len)
    let cValue = c.toTensor(Value)
    for i in 0 ..< c.len:
      when failIfImpossible:
        doAssert cValue[i].kind == vKind, "Column contains actual multiple datatypes! " &
          $vKind & " and " & $cValue[i].kind & "!"
      else:
        if cValue[i].kind != vKind:
          # not possible to convert, return input
          return c
      data[i] = get cValue[i]
    result = toColumn data

proc nullColumn*(num: int): Column =
  ## returns an object `Column` with `N` values, which are
  ## all `VNull`
  var nullseq = newSeq[Value](num)
  for i in 0 ..< num:
    nullseq[i] = Value(kind: VNull)
  result = toColumn(nullseq)

#proc `*`[T: SomeNumber]*(c: Column, x: T)
proc contains*[T: float | string | int | bool | Value](c: Column, val: T): bool =
  ## Returns true if `val` occurs in `c` (linear scan over the converted tensor).
  let t = toTensor(c, T)
  result = false
  for x in t:
    if val == x:
      return true

template liftScalarToColumn*(name: untyped): untyped =
  ## Lifts a tensor proc returning a scalar (e.g. `max`) to work on a `Column`,
  ## returning the scalar boxed as a `Value`.
  proc `name`*(c: Column): Value =
    withNativeDtype(c):
      result = %~ `name`(c.toTensor(dtype))
liftScalarToColumn(max)

proc pretty*(c: Column): string =
  ## pretty prints a Column
  result = &"Column of type: {toNimType(c.kind)} with length: {c.len}\n"
  withNativeTensor(c, t):
    result.add &"  contained Tensor: {t}"
template `$`*(c: Column): string = pretty(c)

proc clone*(c: Column): Column =
  ## clones the given column by cloning the Tensor
  result = Column(kind: c.kind, len: c.len)
  case result.kind
  of colInt: result.iCol = c.iCol.clone()
  of colFloat: result.fCol = c.fCol.clone()
  of colString: result.sCol = c.sCol.clone()
  of colBool: result.bCol = c.bCol.clone()
  of colObject: result.oCol = c.oCol.clone()
  of colConstant: result.cCol = c.cCol # just a `Value`
  of colNone: discard

proc map*[T; U](c: Column, fn: (T -> U)): Column =
  ## Maps a given column given `fn` to a new column.
  ## Because `Column` is a variant type, an untyped mapping function
  ## won't compile.
  ##
  ## See the `map_inline` template below, which attempts to work around this
  ## limitation by compiling all map function bodies, which are valid for `c`.
  ##
  ## .. code-block:: nim
  ##   c.map((x: int) => x * 5)
  ##
  ## Using this is not really recommended. Use `df["x", int].map(x => x * 5)` instead!
  result = toColumn c.toTensor(T).map_inline(fn(x))

template map_inline*(c: Column, body: untyped): Column =
  ## This is a helper template, which attempts to work around this
  ## limitation by compiling all map function bodies, which are valid for `c`.
  ## However, be careful: by using the template you throw out possible compile
  ## time checking and replace it by possible exceptions in your code!
  ##
  ## .. code-block:: nim
  ##   c.map_inline(x * 5)
  ##
  ## This example will throw a runtime exception, if `* 5` is invalid for the
  ## column type that `c` actually is at runtime!
  ## Using this is not really recommended. Use `df["x", int].map_inline(x * 5)` instead!
  withNativeDtype(c):
    # `dtype` is injected by `withNativeDtype`; the `when compiles` guard
    # only instantiates `map` for element types the body is valid for
    var res: Column
    when compiles((map(c, (x: dtype) => body))):
      res = toColumn map(c, (x: dtype) => body)
    else:
      ## Cannot raise a CT error unfortunately I think, because this branch will always be compiled
      ## for one of the column types
      raise newException(Exception, "Column is of invalid type for map body `" & $(astToStr(body)) &
        "` for dtype of column: " & $(c.kind.toNimType))
    res