├── .gitignore ├── .luacov ├── .travis.yml ├── CMakeLists.txt ├── CONTRIBUTING.md ├── LICENSE ├── NEWS.md ├── README.md ├── argcheck.lua ├── custom_assertions.lua ├── dataframe ├── categorical.lua ├── column.lua ├── export_data.lua ├── init.lua ├── load_data.lua ├── metatable.lua ├── missing_data.lua ├── output.lua ├── row.lua ├── select_set_update.lua ├── statistics.lua └── subsets_and_batches.lua ├── dataseries ├── categorical.lua ├── export.lua ├── init.lua ├── metatable.lua ├── sngl_elmnt_ops.lua └── statistics.lua ├── doc.lua ├── doc ├── README.md ├── core │ ├── README.md │ ├── categorical.md │ ├── column.md │ ├── export_data.md │ ├── init.md │ ├── load_data.md │ ├── metatable.md │ ├── missing_data.md │ ├── output.md │ ├── row.md │ ├── select_set_update.md │ ├── statistics.md │ └── subsets_and_batches.md ├── dataseries │ ├── README.md │ ├── categorical.md │ ├── export.md │ ├── init.md │ ├── metatable.md │ ├── sngl_elmnt_ops.md │ └── statistics.md ├── helper_classes │ ├── 10_iterator.md │ ├── 11_paralleliterator.md │ ├── 20_tbl.md │ ├── 21_dict.md │ ├── 22_array.md │ └── README.md ├── sub_classes │ ├── 01_subset.md │ ├── 10_batchframe.md │ └── README.md └── utils │ ├── README.md │ └── utils.md ├── examples ├── Facebook license │ ├── LICENSE │ └── PATENTS └── mnist_example.lua ├── helper_classes ├── 10_iterator.lua ├── 11_paralleliterator.lua ├── 20_tbl.lua ├── 21_dict.lua ├── 22_array.lua └── Facebok license ├── init.lua ├── rocks ├── torch-dataframe-1.0-0.rockspec ├── torch-dataframe-1.1-0.rockspec ├── torch-dataframe-1.5-0.rockspec ├── torch-dataframe-1.6-0.rockspec ├── torch-dataframe-1.6-1.rockspec ├── torch-dataframe-1.7-0.rockspec └── torch-dataframe-scm-1.rockspec ├── specs ├── coverage.sh ├── data │ ├── advanced_short.csv │ ├── full.csv │ ├── iris-label.csv │ ├── iris-no-header.csv │ ├── iris-no-label.csv │ ├── realistic_29_row_data.csv │ ├── sampler_csv_files │ │ ├── index.csv │ │ └── index3.csv │ └── simple_short.csv ├── dataframe │ ├── batchframe_spec.lua │ ├── categorical_spec.lua │ ├── column_order_spec.lua │ ├── column_spec.lua │ ├── export_data_spec.lua │ ├── load_data_spec.lua │ ├── main_spec.lua │ ├── metatable_spec.lua │ ├── missing_data_spec.lua │ ├── row_spec.lua │ ├── sampler_spec.lua │ ├── select_set_update_spec.lua │ ├── serialization_spec.lua │ ├── statistics_spec.lua │ └── subsets_and_batches_spec.lua ├── dataseries │ └── dataseries_spec.lua ├── helper_classes │ ├── df_array_spec.lua │ ├── df_dict_spec.lua │ └── df_tbl_spec.lua ├── linter.sh ├── output │ ├── Wiki-templates │ │ ├── Readme.md │ │ └── Where_update_and_set.ipynb │ ├── cli_output.lua │ └── itorch_notebook_df_test.ipynb ├── run_all.sh └── utils │ ├── ntwrk_implementation_spec.lua │ ├── test.lua │ └── utils_spec.lua ├── sub_classes ├── 01_subset.lua ├── 10_batchframe.lua └── subset_extensions │ └── samplers.lua └── utils ├── doc_helpers ├── get_anchors.lua ├── parse_file.lua └── write_doc.lua ├── loader.lua └── utils.lua /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb_checkpoints 2 | luacov.* 3 | build.* 4 | -------------------------------------------------------------------------------- /.luacov: -------------------------------------------------------------------------------- 1 | return { 2 | modules = { 3 | ["init"] = 'init.lua', 4 | ["argcheck"] = 'argcheck.lua', 5 | ["main"] = 'main.lua', 6 | 7 | ["utils.utils"] = 'utils/utils.lua', 8 | ["utils.loader"] = 'utils/loader.lua', 9 | ["utils.doc_helpers.get_anchors"] = 
'utils/doc_helpers/get_anchors.lua', 10 | ["utils.doc_helpers.parse_file"] = 'utils/doc_helpers/parse_file.lua', 11 | ["utils.doc_helpers.write_doc"] = 'utils/doc_helpers/write_doc.lua', 12 | 13 | ["sub_classes.01_subset"] = 'sub_classes/01_subset.lua', 14 | ["sub_classes.10_batchframe"] = 'sub_classes/10_batchframe.lua', 15 | ["sub_classes.subset_extensions.samplers"] = 'sub_classes/subset_extensions/samplers.lua', 16 | 17 | ["helper_classes.10_iterator"] = 'helper_classes/10_iterator.lua', 18 | ["helper_classes.11_paralleliterator"] = 'helper_classes/11_paralleliterator.lua', 19 | ["helper_classes.20_tbl"] = 'helper_classes/20_tbl.lua', 20 | ["helper_classes.21_dict"] = 'helper_classes/21_dict.lua', 21 | ["helper_classes.22_array"] = 'helper_classes/22_array.lua', 22 | 23 | ["dataseries.categorical"] = 'dataseries/categorical.lua', 24 | ["dataseries.export"] = 'dataseries/export.lua', 25 | ["dataseries.init"] = 'dataseries/init.lua', 26 | ["dataseries.metatable"] = 'dataseries/metatable.lua', 27 | ["dataseries.sngl_elmnt_ops"] = 'dataseries/sngl_elmnt_ops.lua', 28 | ["dataseries.statistics"] = 'dataseries/statistics.lua', 29 | 30 | ["dataframe.categorical"] = 'dataframe/categorical.lua', 31 | ["dataframe.column"] = 'dataframe/column.lua', 32 | ["dataframe.export_data"] = 'dataframe/export_data.lua', 33 | ["dataframe.init"] = 'dataframe/init.lua', 34 | ["dataframe.load_data"] = 'dataframe/load_data.lua', 35 | ["dataframe.metatable"] = 'dataframe/metatable.lua', 36 | ["dataframe.missing_data"] = 'dataframe/missing_data.lua', 37 | ["dataframe.output"] = 'dataframe/output.lua', 38 | ["dataframe.row"] = 'dataframe/row.lua', 39 | ["dataframe.select_set_update"] = 'dataframe/select_set_update.lua', 40 | ["dataframe.statistics"] = 'dataframe/statistics.lua', 41 | ["dataframe.subsets_and_batches"] = 'dataframe/subsets_and_batches.lua' 42 | 43 | 44 | } 45 | } 46 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | sudo: true 4 | 5 | branches: 6 | only: 7 | - master 8 | - develop 9 | env: 10 | global: 11 | - TORCH_SERVER=https://raw.githubusercontent.com/torch/rocks/master/ 12 | matrix: 13 | - LUA="LUA52" 14 | - LUA="LUA53" 15 | - LUA="LUAJIT20" 16 | - LUA="LUAJIT21" 17 | 18 | before_install: 19 | - if [[ ! -d torch ]]; then git clone https://github.com/torch/distro.git torch --recursive ; fi 20 | - cd torch 21 | - git pull 22 | - git submodule update 23 | - git submodule foreach git pull origin master 24 | - cd .. 25 | - cp -rf torch torch_$LUA 26 | - cd torch_$LUA 27 | - TORCH_LUA_VERSION=$LUA ./install.sh -b 28 | - cd .. 29 | 30 | install: 31 | - source ./torch_$LUA/install/bin/torch-activate 32 | - luarocks --from=$TORCH_SERVER install sundown 33 | - luarocks --from=$TORCH_SERVER install dok 34 | - luarocks --from=$TORCH_SERVER install argcheck 35 | - luarocks --from=$TORCH_SERVER install csvigo 36 | - luarocks install luafilesystem 37 | - luarocks install paths 38 | - luarocks install threads 39 | - luarocks install torchnet 40 | - luarocks install busted 41 | - luarocks install luacov 42 | - luarocks install nn 43 | - luarocks make rocks/torch-dataframe-scm-1.rockspec CFLAGS="-O2 -fPIC -fprofile-arcs -ftest-coverage" LIBFLAG="-shared --coverage" 44 | 45 | script: 46 | - cd specs 47 | - ./run_all.sh --coverage --version $LUA 48 | - ./coverage.sh --generate 49 | - cd ..
50 | 51 | after_success: 52 | - bash <(curl -s https://codecov.io/bash) 53 | 54 | notifications: 55 | email: 56 | on_success: change 57 | on_failure: always 58 | -------------------------------------------------------------------------------- /CMakeLists.txt: -------------------------------------------------------------------------------- 1 | cmake_minimum_required (VERSION 2.8) 2 | cmake_policy(VERSION 2.8) 3 | 4 | set(PKGNAME Dataframe) 5 | 6 | file(GLOB_RECURSE luafiles RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.lua") 7 | 8 | # Exclude doc helpers and spec files 9 | set (EXCLUDE_DIRS "utils/doc_helpers/" "specs/") 10 | list(REMOVE_ITEM luafiles "custom_assertions.lua") 11 | 12 | foreach (TMP_PATH ${luafiles}) 13 | 14 | foreach (EXCLUDE_DIR ${EXCLUDE_DIRS}) 15 | string (FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND) 16 | if (NOT ${EXCLUDE_DIR_FOUND} EQUAL -1) 17 | MESSAGE("Removing ${TMP_PATH}") 18 | list (REMOVE_ITEM luafiles ${TMP_PATH}) 19 | endif () 20 | endforeach(EXCLUDE_DIR) 21 | 22 | endforeach(TMP_PATH) 23 | 24 | foreach(file ${luafiles}) 25 | get_filename_component(dir ${file} PATH) 26 | install(FILES ${file} DESTINATION ${LUA_PATH}/${PKGNAME}/${dir}) 27 | endforeach() 28 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | Feel free to report a bug, suggest an enhancement or submit a new feature using [Issues][df_issues], or directly send us a [Pull Request][df_pr] :). 4 | 5 | ## Before submitting 6 | 7 | Don't forget to: 8 | - test your code 9 | - generate the doc 10 | - use the linter script in the `specs` directory 11 | 12 | You can find how we implemented our tests in the [specs directory][df_specs]. See "Behavior Driven Development" for more details on this technique. 13 | 14 | ## Coding style 15 | 16 | For a smoother contribution we ask you to follow these simple rules to keep the code as readable as possible: 17 | * Indentation is a tabulation of size 2 18 | * Every component of a function's name is separated by an underscore: `my_func_name` 19 | 20 | [df_issues]: https://github.com/AlexMili/torch-dataframe/issues 21 | [df_pr]: https://github.com/AlexMili/torch-dataframe/pulls 22 | [df_specs]: https://github.com/AlexMili/torch-dataframe/tree/readme/specs 23 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2015 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /argcheck.lua: -------------------------------------------------------------------------------- 1 | local env = require 'argcheck.env' -- retrieve argcheck environment 2 | 3 | -- From http://lua-users.org/wiki/SplitJoin 4 | function string:split(sep) 5 | local sep, fields = sep or ":", {} 6 | local pattern = string.format("([^%s]+)", sep) 7 | self:gsub(pattern, function(c) fields[#fields+1] = c end) 8 | return fields 9 | end 10 | 11 | env.istype = function(obj, typename) 12 | if (typename == "*") then 13 | return true 14 | end 15 | 16 | -- From the original argcheck env 17 | local thname = torch.typename(obj) -- empty if non-torch class 18 | local thtype = torch.type(obj) 19 | if (typename == "!table" and thtype ~= "table") then 20 | return true 21 | end 22 | 23 | if (typename:match("|")) then 24 | if (thname) then 25 | -- Do a recursive search through all the patterns for torch class objects 26 | for _,subtype in ipairs(typename:split("|")) do 27 | local ret = env.istype(obj, subtype) 28 | if (ret) then 29 | return true 30 | end 31 | end 32 | 33 | return false 34 | else 35 | -- We only need to find a basic variable match + nan values 36 | for _,subtype in ipairs(typename:split("|")) do 37 | if ((thtype == subtype) or 38 | (thtype == "nan" and isnan(obj))) 39 | then 40 | return true 41 | end 42 | end 43 | 44 | return false 45 | end 46 | end 47 | 48 | if thname then 49 | -- __typename (see below) might be absent 50 | local match = thname:match(typename) 51 | if match and (match ~= typename or match == thname) then 52 | return true 53 | end 54 | local mt = torch.getmetatable(thname) 55 | while mt do 56 | if mt.__typename then 57 | match = mt.__typename:match(typename) 58 | if match and (match ~= typename or match == mt.__typename) then 59 | return true 60 | end 61 | end 62 | mt = getmetatable(mt) 63 | end 64 | return false 65 | end 66 | 67 | return type(obj) == typename 68 | end 69 | -------------------------------------------------------------------------------- /dataframe/export_data.lua: -------------------------------------------------------------------------------- 1 | local params = {...} 2 | local Dataframe = params[1] 3 | 4 | local argcheck = require "argcheck" 5 | local doc = require "argcheck.doc" 6 | 7 | doc[[ 8 | 9 | ## Data save/export functions 10 | 11 | ]] 12 | 13 | Dataframe.to_csv = argcheck{ 14 | doc = [[ 15 | 16 | ### Dataframe.to_csv(@ARGP) 17 | 18 | Saves a Dataframe into a CSV using csvigo as backend 19 | 20 | _Return value_: self (Dataframe) 21 | 22 | @ARGT 23 | 24 | ]], 25 | {name="self", type="Dataframe"}, 26 | {name='path', type='string', doc='path to file'}, 27 | {name="separator", type='string', doc='separator (one character)', default=','}, 28 | {name='verbose', type='boolean', help='verbose load', default=false}, 29 | call = function(self, path, separator, verbose) 30 | 31 | -- Make sure that categorical columns are presented in the correct way 32 | local save_data = {} 33 | for _,k in pairs(self.column_order) do 34 | save_data[k] = self:get_column(k):to_table{boolean2string = true} 35 | end 36 | 37 | -- TODO: csvigo will have memory issues when used with regular tables 38 | csvigo.save{path = path,
39 | data = save_data, 40 | separator = separator, 41 | verbose = verbose, 42 | column_order = self.column_order, 43 | nan_as_missing = true} 44 | 45 | return self 46 | end} 47 | 48 | Dataframe.to_tensor = argcheck{ 49 | doc = [[ 50 | 51 | ### Dataframe.to_tensor(@ARGP) 52 | 53 | Convert the numeric section or specified columns of the dataset to a tensor 54 | 55 | @ARGT 56 | 57 | _Return value_: (1) torch.tensor with self:size(1) rows and self:size(2) columns, 58 | (2) exported column names 59 | 60 | ]], 61 | {name="self", type="Dataframe"}, 62 | call = function(self) 63 | 64 | return self:to_tensor(Df_Array(self:get_numerical_colnames())) 65 | end} 66 | 67 | Dataframe.to_tensor = argcheck{doc=[[ 68 | 69 | You can export selected columns using the columns argument: 70 | 71 | @ARGT 72 | ]], 73 | overload=Dataframe.to_tensor, 74 | {name="self", type="Dataframe"}, 75 | {name="columns", type='Df_Array', doc='The columns to export'}, 76 | call = function(self, columns) 77 | 78 | columns = columns.data 79 | 80 | -- Check data integrity 81 | local numeric_dataset = {} 82 | local type = -1 83 | local tensor_types = { 84 | "ByteTensor" -- contains unsigned chars 85 | ,"CharTensor" -- contains signed chars 86 | ,"ShortTensor" -- contains shorts 87 | ,"IntTensor" -- contains ints 88 | ,"LongTensor" -- contains longs 89 | ,"FloatTensor" -- contains floats 90 | ,"DoubleTensor" 91 | } 92 | for _,k in pairs(columns) do 93 | self:assert_has_column(k) 94 | assert(self:is_numerical(k), "Column " .. tostring(k) .. " is not numerical") 95 | local col = self:get_column(k) 96 | numeric_dataset[k] = col:to_tensor() 97 | local current_type = col:type() 98 | 99 | for idx,tnsr_type in ipairs(tensor_types) do 100 | if (current_type:match(tnsr_type)) then 101 | current_type = idx 102 | break 103 | end 104 | end 105 | if (current_type > type) then 106 | type = current_type 107 | end 108 | end 109 | 110 | -- Convert all tensors to the same format before concat 111 | type = ("torch.%s"):format(tensor_types[type]) 112 | for cn,col in pairs(numeric_dataset) do 113 | numeric_dataset[cn] = numeric_dataset[cn]:type(type) 114 | end 115 | 116 | local tensor_data = nil 117 | local tensor_col_names = {} 118 | for col_no = 1,#self.column_order do 119 | -- Find the next column that is present in the numerics 120 | local found = false 121 | local column_name = self.column_order[col_no] 122 | for k,v in pairs(numeric_dataset) do 123 | if (k == column_name) then 124 | found = true 125 | break 126 | end 127 | end 128 | 129 | -- If the column was found we concatenate it with our tensor_data 130 | if (found) then 131 | local next_col = numeric_dataset[column_name] 132 | if (torch.isTensor(tensor_data)) then 133 | tensor_data = torch.cat(tensor_data, next_col, 2) 134 | else 135 | tensor_data = next_col 136 | end 137 | table.insert(tensor_col_names, column_name) 138 | end 139 | end 140 | 141 | if (#tensor_col_names == 1) then 142 | -- Reshape to tabular if this is only a single column 143 | tensor_data = tensor_data:reshape(tensor_data:size(1), 1) 144 | end 145 | 146 | return tensor_data, tensor_col_names 147 | end} 148 | 149 | Dataframe.to_tensor = argcheck{ 150 | doc=[[ 151 | 152 | If a filename is provided the tensor will be saved (`torch.save`) to that file: 153 | 154 | @ARGT 155 | ]], 156 | overload=Dataframe.to_tensor, 157 | {name="self", type="Dataframe"}, 158 | {name='filename', type='string', doc='Filename for tensor.save()'}, 159 | {name="columns", type='Df_Array', doc='The columns to export', default=false}, 160 | call = function(self,
filename, columns) 161 | local tensor_data, tensor_col_names 162 | if (columns) then 163 | tensor_data, tensor_col_names = self:to_tensor{columns = columns} 164 | else 165 | tensor_data, tensor_col_names = self:to_tensor() 166 | end 167 | 168 | torch.save(filename, tensor_data) 169 | 170 | return tensor_data, tensor_col_names 171 | end} 172 | 173 | Dataframe.get = argcheck{ 174 | doc = [[ 175 | 176 | ### Dataframe.get(@ARGP) 177 | 178 | A function for *torchnet* compliance. It subsets a single index and returns the 179 | `to_tensor` on that example. 180 | 181 | @ARGT 182 | 183 | _Return value_: (1) torch.tensor with 1 row and #numerical columns 184 | 185 | ]], 186 | {name="self", type="Dataframe"}, 187 | {name="idx", type="number"}, 188 | call = function(self, idx) 189 | local row = self:sub(idx, idx) 190 | return row:to_tensor() 191 | end} 192 | -------------------------------------------------------------------------------- /dataframe/metatable.lua: -------------------------------------------------------------------------------- 1 | local params = {...} 2 | local Dataframe = params[1] 3 | 4 | local argcheck = require "argcheck" 5 | local doc = require "argcheck.doc" 6 | 7 | doc[[ 8 | 9 | ## Metatable functions 10 | 11 | ]] 12 | 13 | Dataframe.size = argcheck{ 14 | doc = [[ 15 | 16 | ### Dataframe.size(@ARGP) 17 | 18 | By providing a dimension you can get only that dimension, row == 1, col == 2. If 19 | the value is omitted it will return the number of rows in order to comply with the 20 | torchnet standard. 21 | 22 | @ARGT 23 | 24 | _Return value_: integer 25 | ]], 26 | {name="self", type="Dataframe"}, 27 | {name="dim", type="number", doc="The dimension of interest", default = 1}, 28 | call=function(self, dim) 29 | assert(isint(dim), "The dimension isn't an integer: " .. tostring(dim)) 30 | assert(dim == 1 or dim == 2, "The dimension can only be between 1 and 2 - you've provided: " .. dim) 31 | if (dim == 1) then 32 | if (not self.column_order or #self.column_order == 0) then 33 | return 0 34 | end 35 | 36 | local col = self.column_order[1] 37 | if (self:has_column(col)) then 38 | return self:get_column(self.column_order[1]):size() 39 | else 40 | -- this case happens when _copy_meta has been called and the column_order has been set 41 | -- TODO: remove the dependence of column_order for the row calc 42 | return 0 43 | end 44 | end 45 | 46 | return #self.column_order 47 | end} 48 | 49 | doc = [[ 50 | 51 | ### Dataframe.[] 52 | 53 | The `__index__` function is a powerful tool that allows quick access to regular functions 54 | 55 | - _Single integer_: it returns the raw row table (see `get_row()`) 56 | - _Df_Array()_: select rows of interest (see `_create_subset()`) 57 | - _"start:stop"_: get a row span using start/stop index, e.g. `"2:5"` (see `sub()`) 58 | - _"$column_name"_: get a column by prepending the name with `$`, e.g. `"$a column name"` (see `get_column`) 59 | - _"/subset_name"_: get a subset by prepending the name with `/`, e.g.
`"/a subset name"` (see `get_subset`) 60 | 61 | _Return value_: Table or Dataframe 62 | ]] 63 | 64 | function Dataframe:__index__(index) 65 | if (torch.type(index) == "number") then 66 | return self:get_row(index), true 67 | end 68 | 69 | if (torch.type(index) == "string") then 70 | if (index:match("^[0-9]+:[0-9]+$")) then 71 | -- Get the core data 72 | local start = index:gsub(":.*", "") 73 | start = tonumber(start) 74 | local stop = index:gsub("[^:]+:", "") 75 | stop = tonumber(stop) 76 | 77 | return self:sub{start=start, stop=stop}, true 78 | end 79 | 80 | -- Index a column using a $ at the beginning of a string 81 | if (index:match("^[$]")) then 82 | local column_name = index:gsub("^[$]", "") 83 | return self:get_column(column_name), true 84 | end 85 | 86 | -- Index a subset using a / at the beginning of a string 87 | if (index:match("^[/]")) then 88 | local subset_name = index:gsub("^[/]", "") 89 | return self:get_subset(subset_name), true 90 | end 91 | 92 | return false 93 | end 94 | 95 | if (torch.type(index) == "Df_Array") then 96 | return self:_create_subset(index), true 97 | end 98 | 99 | return false 100 | end 101 | 102 | doc = [[ 103 | 104 | ### Dataframe.[] = 105 | 106 | The `__newindex__` allows easy updating of a single row (see `_update_single_row()`) 107 | 108 | ]] 109 | 110 | function Dataframe:__newindex__(index, value) 111 | if (torch.type(index) == "number") then 112 | self:_update_single_row(index, Df_Tbl(value), Df_Tbl(self:get_row(index))) 113 | return true 114 | end 115 | 116 | return false 117 | end 118 | 119 | Dataframe.__tostring__ = argcheck{ 120 | doc=[[ 121 | 122 | ### Dataframe.__tostring__(@ARGP) 123 | 124 | A wrapper for `tostring()` 125 | 126 | @ARGT 127 | 128 | _Return value_: string 129 | ]], 130 | {name="self", type="Dataframe"}, 131 | call=function (self) 132 | return self:tostring() 133 | end} 134 | 135 | 136 | Dataframe.copy = argcheck{ 137 | doc = [[ 138 | 139 | ### Dataframe.copy(@ARGP) 140 | 141 | Copies the table together with all metadata 142 | 143 | @ARGT 144 | 145 | _Return value_: Dataframe 146 | ]], 147 | {name="self", type="Dataframe"}, 148 | call=function(self) 149 | local new_df = Dataframe.new(Df_Dict(self.dataset)) 150 | new_df = self:_copy_meta(new_df) 151 | return new_df 152 | end} 153 | 154 | Dataframe.__len__ = argcheck{ 155 | doc = [[ 156 | 157 | ### Dataframe.# 158 | 159 | Returns the number of rows 160 | 161 | _Return value_: integer 162 | ]], 163 | {name="self", type="Dataframe"}, 164 | {name="other", type="Dataframe"}, 165 | call=function(self, other) 166 | return self:size(1) 167 | end} 168 | 169 | Dataframe.__len__ = argcheck{ 170 | overload=Dataframe.__len__, 171 | {name="self", type="Dataframe"}, 172 | call=function(self) 173 | return self:size(1) 174 | end} 175 | 176 | Dataframe.__eq__ = argcheck{ 177 | doc = [[ 178 | 179 | ### Dataframe.== 180 | 181 | Checks if Dataframe's contain the same values 182 | 183 | _Return value_: boolean 184 | ]], 185 | {name="self", type="Dataframe"}, 186 | {name="other", type="Dataframe"}, 187 | call=function(self, other) 188 | -- Check that size matches 189 | if (self:size(1) ~= other:size(1) or 190 | self:size(2) ~= other:size(2)) then 191 | return false 192 | end 193 | 194 | -- Check that columns match 195 | for i=1,#self.column_order do 196 | if (not other:has_column(self.column_order[i])) then 197 | return false 198 | end 199 | end 200 | 201 | -- Check actual content (expensive why this is left to last) 202 | for i=1,#self.column_order do 203 | local self_col = 
self:get_column(self.column_order[i]) 204 | local other_col = other:get_column(self.column_order[i]) 205 | 206 | for i=1,self:size(1) do 207 | -- one is nan and not the other 208 | if ((not isnan(self_col[i]) and 209 | isnan(other_col[i])) or 210 | (isnan(self_col[i]) and 211 | not isnan(other_col[i]))) then 212 | return false 213 | end 214 | 215 | -- Actual value check if both weren't nan 216 | if (not(isnan(self_col[i]))) then 217 | if (self_col[i] ~= other_col[i]) then 218 | return false 219 | end 220 | end 221 | 222 | end 223 | end 224 | 225 | -- If the function hasn't returned by now, the two dataframes are equal 226 | return true 227 | end} 228 | -------------------------------------------------------------------------------- /dataframe/missing_data.lua: -------------------------------------------------------------------------------- 1 | local params = {...} 2 | local Dataframe = params[1] 3 | 4 | local argcheck = require "argcheck" 5 | local doc = require "argcheck.doc" 6 | 7 | doc[[ 8 | 9 | ## Missing data functions 10 | 11 | ]] 12 | 13 | Dataframe.count_na = argcheck{ 14 | doc = [[ 15 | 16 | ### Dataframe.count_na(@ARGP) 17 | 18 | Count missing values in dataset 19 | 20 | @ARGT 21 | 22 | _Return value_: Dataframe or table containing missing values per column, total na 23 | ]], 24 | {name="self", type="Dataframe"}, 25 | {name="columns", type="Df_Array", doc="The columns to count", opt=true}, 26 | {name='as_dataframe', type='boolean', default=true, 27 | doc="Return a dataframe"}, 28 | call=function(self, columns, as_dataframe) 29 | if (columns) then 30 | columns = columns.data 31 | else 32 | columns = self.column_order 33 | end 34 | 35 | local ret = {} 36 | local tot_na = 0 37 | for i=1,#columns do 38 | ret[columns[i]] = self:count_na(columns[i]) 39 | tot_na = tot_na + ret[columns[i]] 40 | end 41 | 42 | if (as_dataframe) then 43 | local ret_df = Dataframe.new() 44 | for name,val in pairs(ret) do 45 | ret_df:append{rows = Df_Dict{Column = name, Value = val}, 46 | column_order = Df_Array("Column", "Value")} 47 | end 48 | return ret_df, tot_na 49 | else 50 | return ret, tot_na 51 | end 52 | end} 53 | 54 | Dataframe.count_na = argcheck{ 55 | doc = [[ 56 | If you only want to count a single column 57 | 58 | @ARGT 59 | 60 | _Return value_: single integer 61 | ]], 62 | overload=Dataframe.count_na, 63 | {name="self", type="Dataframe"}, 64 | {name="column", type="string", doc="The column to count"}, 65 | call=function(self, column) 66 | self:assert_has_column(column) 67 | 68 | return self:get_column(column):count_na() 69 | end} 70 | 71 | Dataframe.fill_na = argcheck{ 72 | doc = [[ 73 | 74 | ### Dataframe.fill_na(@ARGP) 75 | 76 | Replace missing values in a specific column 77 | 78 | @ARGT 79 | 80 | _Return value_: self 81 | ]], 82 | {name="self", type="Dataframe"}, 83 | {name="column_name", type="string", doc="The column to fill"}, 84 | {name="default_value", type="number|string|boolean", 85 | doc="The default missing value", default=0}, 86 | call=function(self, column_name, default_value) 87 | self:assert_has_column(column_name) 88 | 89 | local column_data = self:get_column(column_name) 90 | 91 | column_data:fill_na(default_value) 92 | 93 | return self 94 | end} 95 | 96 | Dataframe.fill_all_na = argcheck{ 97 | doc = [[ 98 | 99 | ### Dataframe.fill_all_na(@ARGP) 100 | 101 | Replace missing values in all columns 102 | 103 | @ARGT 104 | 105 | _Return value_: self 106 | ]], 107 | {name="self", type="Dataframe"}, 108 | {name="default_value", type="number|string|boolean", doc="The
default missing value", default=0}, 109 | call=function(self, default_value) 110 | for i=1,#self.column_order do 111 | self:fill_na(self.column_order[i], default_value) 112 | end 113 | 114 | return self 115 | end} 116 | -------------------------------------------------------------------------------- /dataseries/export.lua: -------------------------------------------------------------------------------- 1 | local params = {...} 2 | local Dataseries = params[1] 3 | 4 | local argcheck = require "argcheck" 5 | local doc = require "argcheck.doc" 6 | 7 | doc[[ 8 | 9 | ## Export functions 10 | 11 | Here are functions are used for exporting to a different format. Generally `to_` 12 | functions should reside here. Only exception is the `tostring`. 13 | 14 | ]] 15 | 16 | Dataseries.to_tensor = argcheck{ 17 | doc=[[ 18 | 19 | ### Dataseries.to_tensor(@ARGP) 20 | 21 | Returns the values in tensor format. Note that if you don't provide a replacement 22 | for missing values and there are missing values the function will throw an error. 23 | 24 | *Note*: boolean columns are not tensors and need to be manually converted to a 25 | tensor. This since 0 would be a natural value for false but can cause issues as 26 | neurons are labeled 1 to n for classification tasks. See the `Dataframe.update` 27 | function for details or run the `boolean2tensor`. 28 | 29 | @ARGT 30 | 31 | _Return value_: `torch.*Tensor` of the current type 32 | ]], 33 | {name="self", type="Dataseries"}, 34 | {name="missing_value", type="number", 35 | doc="Set a value for the missing data", 36 | opt=true}, 37 | {name="copy", type="boolean", default=true, 38 | doc="Set to false if you want the original data to be returned."}, 39 | call=function(self, missing_value) 40 | assert(self:type():match("torch.*Tensor"), 41 | "Can only automatically retrieve columns that already are tensors") 42 | assert(self:count_na() == 0 or missing_value, 43 | "Missing data should be replaced with a default value before retrieving tensor") 44 | 45 | local ret 46 | if (copy) then 47 | ret = self:copy() 48 | else 49 | ret = self 50 | end 51 | 52 | if (missing_value and self:count_na() > 0) then 53 | assert(copy, "Replacing missing values is not allowed in to_tensor unless you are returning a copy") 54 | ret:fill_na(missing_value) 55 | end 56 | 57 | return ret.data 58 | end} 59 | 60 | Dataseries.to_table = argcheck{ 61 | doc=[[ 62 | 63 | ### Dataseries.to_table(@ARGP) 64 | 65 | Returns the values in table format 66 | 67 | @ARGT 68 | 69 | _Return value_: table 70 | ]], 71 | {name="self", type="Dataseries"}, 72 | {name="boolean2string", type="boolean", opt=true, 73 | doc="Convert boolean values to strings since they cause havoc with csvigo"}, 74 | call=function(self, boolean2string) 75 | local ret = {} 76 | for i=1,self:size() do 77 | ret[i] = self:get(i) 78 | end 79 | 80 | if (boolean2string and self:type() == "tds.Vec") then 81 | for i=1,#ret do 82 | if (type(ret[i]) == "boolean") then 83 | ret[i] = tostring(ret[i]) 84 | end 85 | end 86 | end 87 | 88 | return ret 89 | end} 90 | -------------------------------------------------------------------------------- /dataseries/metatable.lua: -------------------------------------------------------------------------------- 1 | local params = {...} 2 | local Dataseries = params[1] 3 | 4 | local argcheck = require "argcheck" 5 | local doc = require "argcheck.doc" 6 | 7 | doc[[ 8 | 9 | ## Metatable functions 10 | 11 | ]] 12 | 13 | doc = [[ 14 | 15 | ### Dataseries.[] 16 | 17 | The `__index__` function is a powerful tool that 
allows quick access to regular functions 18 | 19 | - _Single integer_: it returns the raw element (see `get()`) 20 | - _Df_Array()_: select a set of interest (see `_create_subset()`) 21 | - _"start:stop"_: get a row span using start/stop index, e.g. `"2:5"` (see `sub()`) 22 | 23 | _Return value_: Table or Dataseries 24 | ]] 25 | 26 | function Dataseries:__index__(index) 27 | local thtype = torch.type(index) 28 | -- If this is a number or a Df_Array, let `get()` method handle them both 29 | if (thtype == "number" or 30 | thtype == "Df_Array") then 31 | return self:get(index), true 32 | -- If this is a string matching "start:stop", it should be a query for a subset 33 | elseif (thtype == "string" and index:match("^[0-9]*:[0-9]*$")) then 34 | local start = index:gsub(":.*", "") 35 | start = tonumber(start) 36 | 37 | local stop = index:gsub("[^:]*:", "") 38 | stop = tonumber(stop) 39 | 40 | return self:sub(start, stop), true 41 | end 42 | 43 | return false 44 | end 45 | 46 | 47 | doc = [[ 48 | 49 | ### Dataseries.[] = 50 | 51 | The `__newindex__` allows updating of a single element (uses `set()`) 52 | 53 | ]] 54 | function Dataseries:__newindex__(index, value) 55 | if (torch.type(index) == "number") then 56 | self:set(index, value) 57 | return true 58 | end 59 | 60 | return false 61 | end 62 | 63 | Dataseries.__len__ = argcheck{ 64 | doc = [[ 65 | 66 | ### Dataseries.# 67 | 68 | Returns the number of elements 69 | 70 | _Return value_: integer 71 | ]], 72 | {name="self", type="Dataseries"}, 73 | {name="other", type="Dataseries", opt=true}, 74 | call=function(self, other) 75 | return self:size() 76 | end} 77 | 78 | Dataseries.__tostring__ = argcheck{ 79 | doc=[[ 80 | 81 | ### Dataseries.__tostring__(@ARGP) 82 | 83 | A wrapper for `tostring()` 84 | 85 | @ARGT 86 | 87 | _Return value_: string 88 | ]], 89 | {name="self", type="Dataseries"}, 90 | call=function (self) 91 | return self:tostring() 92 | end} 93 | -------------------------------------------------------------------------------- /doc.lua: -------------------------------------------------------------------------------- 1 | local paths = require 'paths' 2 | 3 | local dataframe_path = paths.thisfile():gsub("doc.lua$", "?.lua") 4 | local dataframe_dir = string.gsub(dataframe_path, "[^/]+$", "") 5 | 6 | -- Custom argument checks 7 | local argcheck_file = string.gsub(dataframe_path,"?", "argcheck") 8 | assert(loadfile(argcheck_file))() 9 | 10 | -- Get the core loader function 11 | local loader_file = string.gsub(dataframe_path,"?", "utils/loader") 12 | assert(loadfile(loader_file))() 13 | 14 | load_dir_files(dataframe_dir .. "utils/doc_helpers/") 15 | 16 | --[[ 17 | The doc.lua loads everything in the same order as the init script. Since 18 | we want to link the scripts together later, the process has three sections: 19 | 20 | 1. Load the scripts and store the full docs in the docs table. The file order is 21 | retained via the files table. 22 | 2. Parse the files in the appropriate order and generate a table of contents for each 23 | file that is written to the doc folder with the same name as the file but with 24 | `md` as file ending. 25 | 3. Merge all the table of contents data into the README so that the docs are 26 | easier to navigate. 27 | ]] 28 | local docs = {} 29 | local files = {} 30 | files.utils, docs.utils = load_dir_files{ 31 | path = dataframe_dir .. "utils/", 32 | docs = true 33 | } 34 | 35 | files.helper_classes, docs.helper_classes = load_dir_files{ 36 | path = dataframe_dir ..
"helper_classes/", 37 | docs = true 38 | } 39 | 40 | files.dataseries, docs.dataseries = load_dir_files{ 41 | path = dataframe_dir .. "dataseries/", 42 | docs = true 43 | } 44 | 45 | files.core, docs.core = load_dir_files{ 46 | path = dataframe_dir .. "dataframe/", 47 | docs = true 48 | } 49 | 50 | files.sub_classes, docs.sub_classes = 51 | -- Load all sub classes 52 | load_dir_files{ 53 | path = dataframe_dir .. "sub_classes/", 54 | params = {Dataframe}, 55 | docs = true 56 | } 57 | 58 | --[[ 59 | !!! Start section 2 !!! 60 | Parse each group, create a directory for that group, parse all files and write an 61 | MD for each file. Then add a Readme for that directory. 62 | ]] 63 | 64 | local parsed_docs = {} 65 | local doc_path = "doc" 66 | if (not paths.dirp(doc_path)) then 67 | paths.mkdir(doc_path) 68 | end 69 | 70 | local rough_toc_tbl = {} 71 | local detailed_toc_tbl = {} 72 | for group_name,group in pairs(docs) do 73 | local sub_doc_path = ("%s/%s/"):format(doc_path,group_name) 74 | if (not paths.dirp(sub_doc_path)) then 75 | paths.mkdir(sub_doc_path) 76 | end 77 | 78 | local grp_rough_toc = "" 79 | local grp_detailed_toc = "" 80 | local gnrl_rough_toc = "" 81 | local gnrl_detailed_toc = "" 82 | 83 | parsed_docs[group_name] = {} 84 | for _,file_name in ipairs(files[group_name]) do 85 | local base_fn = paths.basename(file_name) 86 | local md_path = ("%s%s"):format(sub_doc_path, 87 | base_fn:gsub("%.lua$", ".md")) 88 | 89 | parsed_docs[group_name][base_fn] = parse_doc(group[file_name], base_fn) 90 | local pd = parsed_docs[group_name][base_fn] 91 | write_doc(pd, 92 | md_path) 93 | 94 | grp_rough_toc, grp_detailed_toc = 95 | get_doc_anchors(sub_doc_path, md_path, pd, grp_rough_toc, grp_detailed_toc) 96 | gnrl_rough_toc, gnrl_detailed_toc = 97 | get_doc_anchors(doc_path, md_path, pd, gnrl_rough_toc, gnrl_detailed_toc) 98 | end 99 | 100 | local readmefile = io.open(sub_doc_path .. "README.md", "w") 101 | readmefile:write(([[# Documentation for %s 102 | 103 | This documentation ha been auto-generated from code using the `argcheck` system. 104 | 105 | ## Table of contents (file-level) 106 | 107 | Below follows a more [detailed](#detailed) table of contents with links to 108 | the different functions. Not this list may be incompleted due to failure to 109 | add apropriate anchor tags during documentation. 110 | 111 | %s 112 | 113 | ## Detailed table of contents (file-level + anchors) 114 | 115 | %s]]):format(group_name:gsub("_", " "), grp_rough_toc, grp_detailed_toc)) 116 | 117 | -- Save the group TOCS for the general README 118 | rough_toc_tbl[group_name] = gnrl_rough_toc 119 | detailed_toc_tbl[group_name] = gnrl_detailed_toc 120 | end 121 | 122 | local readmefile = io.open("doc/README.md", "w") 123 | readmefile:write(([[# Documentation for torch-dataframe 124 | 125 | This documentation ha been auto-generated from code using the `argcheck` system. 126 | 127 | Below follows a more [detailed](#detailed) table of contents with links to 128 | the different functions. Not this list may be incompleted due to failure to 129 | add apropriate anchor tags during documentation. 
130 | 131 | ## Dataframe core components 132 | 133 | %s 134 | 135 | ## Dataseries - Dataframe's data storage 136 | 137 | %s 138 | 139 | ## Dataframe sub-classes 140 | 141 | %s 142 | 143 | ## Helper classes 144 | 145 | %s]]):format(rough_toc_tbl["core"], 146 | rough_toc_tbl["dataseries"], 147 | rough_toc_tbl["sub_classes"], 148 | rough_toc_tbl["helper_classes"])) 149 | 150 | detailed_toc = ([[ 151 | 152 | # Detailed table of contents (file-level + anchors) 153 | 154 | ## Dataframe core components 155 | 156 | %s 157 | 158 | ## Dataseries - Dataframe's data storage 159 | 160 | %s 161 | 162 | ## Dataframe sub-classes 163 | 164 | %s 165 | 166 | ## Helper classes 167 | 168 | %s]]):format(detailed_toc_tbl["core"], 169 | detailed_toc_tbl["dataseries"], 170 | detailed_toc_tbl["sub_classes"], 171 | detailed_toc_tbl["helper_classes"]) 172 | 173 | -- Remove these elements from the tables in order to avoid outputting them twice 174 | for _,key in ipairs({"core", "dataseries", "sub_classes", "helper_classes"}) do 175 | rough_toc_tbl[key] = nil 176 | detailed_toc_tbl[key] = nil 177 | end 178 | 179 | for group_name, toc in pairs(rough_toc_tbl) do 180 | local group_title = group_name:sub(1,1):upper() .. group_name:sub(2):gsub("_", " ") 181 | readmefile:write(([[ 182 | 183 | ## %s 184 | 185 | %s]]):format(group_title, toc)) 186 | detailed_toc = ([[%s 187 | 188 | ## %s 189 | 190 | %s]]):format(detailed_toc, group_title, detailed_toc_tbl[group_name]) 191 | end 192 | 193 | readmefile:write(([[ 194 | 195 | %s 196 | ]]):format(detailed_toc)) 197 | 198 | readmefile:close() 199 | -------------------------------------------------------------------------------- /doc/core/README.md: -------------------------------------------------------------------------------- 1 | # Documentation for core 2 | 3 | This documentation has been auto-generated from code using the `argcheck` system. 4 | 5 | ## Table of contents (file-level) 6 | 7 | Below follows a more [detailed](#detailed) table of contents with links to 8 | the different functions. Note that this list may be incomplete due to a failure to 9 | add appropriate anchor tags during documentation.
10 | 11 | 12 | - [Core functions](init.md) 13 | - [Categorical functions](categorical.md) 14 | - [Column functions](column.md) 15 | - [Data save/export functions](export_data.md) 16 | - [Data loader functions](load_data.md) 17 | - [Metatable functions](metatable.md) 18 | - [Missing data functions](missing_data.md) 19 | - [Output functions](output.md) 20 | - [Row functions](row.md) 21 | - [Subsetting and manipulation functions](select_set_update.md) 22 | - [Statistical functions](statistics.md) 23 | - [Subsets and batches](subsets_and_batches.md) 24 | 25 | ## Detailed table of contents (file-level + anchors) 26 | 27 | 28 | - **[Core functions](init.md)** 29 | - [Dataframe.`__init`](init.md#Dataframe.__init) 30 | - [Dataframe.get_schema](init.md#Dataframe.get_schema) 31 | - [Dataframe.shape](init.md#Dataframe.shape) 32 | - [Dataframe.version](init.md#Dataframe.version) 33 | - [Dataframe.set_version](init.md#Dataframe.set_version) 34 | - [Dataframe.upgrade_frame](init.md#Dataframe.upgrade_frame) 35 | - [Dataframe.assert_is_index](init.md#Dataframe.assert_is_index) 36 | - **[Categorical functions](categorical.md)** 37 | - [Dataframe.as_categorical](categorical.md#Dataframe.as_categorical) 38 | - [Dataframe.add_cat_key](categorical.md#Dataframe.add_cat_key) 39 | - [Dataframe.as_string](categorical.md#Dataframe.as_string) 40 | - [Dataframe.clean_categorical](categorical.md#Dataframe.clean_categorical) 41 | - [Dataframe.is_categorical](categorical.md#Dataframe.is_categorical) 42 | - [Dataframe.get_cat_keys](categorical.md#Dataframe.get_cat_keys) 43 | - [Dataframe.to_categorical](categorical.md#Dataframe.to_categorical) 44 | - [Dataframe.from_categorical](categorical.md#Dataframe.from_categorical) 45 | - [Dataframe.boolean2categorical](categorical.md#Dataframe.boolean2categorical) 46 | - **[Column functions](column.md)** 47 | - [Dataframe.is_numerical](column.md#Dataframe.is_numerical) 48 | - [Dataframe.is_string](column.md#Dataframe.is_string) 49 | - [Dataframe.is_boolean](column.md#Dataframe.is_boolean) 50 | - [Dataframe.has_column](column.md#Dataframe.has_column) 51 | - [Dataframe.assert_has_column](column.md#Dataframe.assert_has_column) 52 | - [Dataframe.assert_has_not_column](column.md#Dataframe.assert_has_not_column) 53 | - [Dataframe.drop](column.md#Dataframe.drop) 54 | - [Dataframe.add_column](column.md#Dataframe.add_column) 55 | - [Dataframe.get_column](column.md#Dataframe.get_column) 56 | - [Dataframe.reset_column](column.md#Dataframe.reset_column) 57 | - [Dataframe.rename_column](column.md#Dataframe.rename_column) 58 | - [Dataframe.get_numerical_colnames](column.md#Dataframe.get_numerical_colnames) 59 | - [Dataframe.get_column_order](column.md#Dataframe.get_column_order) 60 | - [Dataframe.swap_column_order](column.md#Dataframe.swap_column_order) 61 | - [Dataframe.pos_column_order](column.md#Dataframe.pos_column_order) 62 | - [Dataframe.boolean2tensor](column.md#Dataframe.boolean2tensor) 63 | - **[Data save/export functions](export_data.md)** 64 | - [Dataframe.to_csv](export_data.md#Dataframe.to_csv) 65 | - [Dataframe.to_tensor](export_data.md#Dataframe.to_tensor) 66 | - [Dataframe.get](export_data.md#Dataframe.get) 67 | - **[Data loader functions](load_data.md)** 68 | - [Dataframe.load_csv](load_data.md#Dataframe.load_csv) 69 | - [Dataframe.bulk_load_csv](load_data.md#Dataframe.bulk_load_csv) 70 | - [Dataframe.load_table](load_data.md#Dataframe.load_table) 71 | - [Dataframe.`_clean_columns`](load_data.md#Dataframe._clean_columns) 72 | - **[Metatable functions](metatable.md)** 73 | - 
[Dataframe.size](metatable.md#Dataframe.size) 74 | - [Dataframe.`__tostring__`](metatable.md#Dataframe.__tostring__) 75 | - [Dataframe.copy](metatable.md#Dataframe.copy) 76 | - [Dataframe.#](metatable.md#Dataframe.#) 77 | - [Dataframe.==](metatable.md#Dataframe.==) 78 | - **[Missing data functions](missing_data.md)** 79 | - [Dataframe.count_na](missing_data.md#Dataframe.count_na) 80 | - [Dataframe.fill_na](missing_data.md#Dataframe.fill_na) 81 | - [Dataframe.fill_na](missing_data.md#Dataframe.fill_na) 82 | - **[Output functions](output.md)** 83 | - [Dataframe.output](output.md#Dataframe.output) 84 | - [Dataframe.show](output.md#Dataframe.show) 85 | - [Dataframe.tostring](output.md#Dataframe.tostring) 86 | - [Dataframe.`_to_html`](output.md#Dataframe._to_html) 87 | - **[Row functions](row.md)** 88 | - [Dataframe.get_row](row.md#Dataframe.get_row) 89 | - [Dataframe.insert](row.md#Dataframe.insert) 90 | - [Dataframe.insert](row.md#Dataframe.insert) 91 | - [Dataframe.append](row.md#Dataframe.append) 92 | - [Dataframe.rbind](row.md#Dataframe.rbind) 93 | - [Dataframe.remove_index](row.md#Dataframe.remove_index) 94 | - **[Subsetting and manipulation functions](select_set_update.md)** 95 | - [Dataframe.sub](select_set_update.md#Dataframe.sub) 96 | - [Dataframe.get_random](select_set_update.md#Dataframe.get_random) 97 | - [Dataframe.head](select_set_update.md#Dataframe.head) 98 | - [Dataframe.tail](select_set_update.md#Dataframe.tail) 99 | - [Dataframe.`_create_subset`](select_set_update.md#Dataframe._create_subset) 100 | - [Dataframe.where](select_set_update.md#Dataframe.where) 101 | - [Dataframe.which](select_set_update.md#Dataframe.which) 102 | - [Dataframe.update](select_set_update.md#Dataframe.update) 103 | - [Dataframe.set](select_set_update.md#Dataframe.set) 104 | - [Dataframe.wide2long](select_set_update.md#Dataframe.wide2long) 105 | - **[Statistical functions](statistics.md)** 106 | - [Dataframe.unique](statistics.md#Dataframe.unique) 107 | - [Dataframe.value_counts](statistics.md#Dataframe.value_counts) 108 | - [Dataframe.which_max](statistics.md#Dataframe.which_max) 109 | - [Dataframe.which_min](statistics.md#Dataframe.which_min) 110 | - [Dataframe.get_mode](statistics.md#Dataframe.get_mode) 111 | - [Dataframe.get_max_value](statistics.md#Dataframe.get_max_value) 112 | - [Dataframe.get_min_value](statistics.md#Dataframe.get_min_value) 113 | - **[Subsets and batches](subsets_and_batches.md)** 114 | - [Dataframe.create_subsets](subsets_and_batches.md#Dataframe.create_subsets) 115 | - [Dataframe.reset_subsets](subsets_and_batches.md#Dataframe.reset_subsets) 116 | - [Dataframe.has_subset](subsets_and_batches.md#Dataframe.has_subset) 117 | - [Dataframe.get_subset](subsets_and_batches.md#Dataframe.get_subset) -------------------------------------------------------------------------------- /doc/core/categorical.md: -------------------------------------------------------------------------------- 1 | # API documentation for [categorical functions](#__Categorical functions__) 2 | - [Dataframe.as_categorical](#Dataframe.as_categorical) 3 | - [Dataframe.add_cat_key](#Dataframe.add_cat_key) 4 | - [Dataframe.as_string](#Dataframe.as_string) 5 | - [Dataframe.clean_categorical](#Dataframe.clean_categorical) 6 | - [Dataframe.is_categorical](#Dataframe.is_categorical) 7 | - [Dataframe.get_cat_keys](#Dataframe.get_cat_keys) 8 | - [Dataframe.to_categorical](#Dataframe.to_categorical) 9 | - [Dataframe.from_categorical](#Dataframe.from_categorical) 10 | - 
[Dataframe.boolean2categorical](#Dataframe.boolean2categorical) 11 | 12 | 13 | ## Categorical functions 14 | 15 | 16 | ### Dataframe.as_categorical(self, column_name[, levels][, labels][, exclude]) 17 | 18 | Set a column to categorical type. 19 | 20 | ``` 21 | ({ 22 | self = Dataframe -- 23 | column_name = string -- The column name to convert 24 | [levels = Df_Array|boolean] -- An optional array of the values that column might have taken. 25 | The default is the unique set of values taken by Dataframe.unique, 26 | sorted into increasing order. If you provide values that aren't present 27 | within the current column the value will still be saved and may be invoked in 28 | the future. [default=false] 29 | [labels = Df_Array|boolean] -- An optional character vector of labels for the levels 30 | (in the same order as levels after removing those in exclude) [default=false] 31 | [exclude = Df_Array|boolean] -- Values to be excluded when forming the set of levels. This should be 32 | of the same type as column, and will be coerced if necessary. [default=false] 33 | }) 34 | ``` 35 | 36 | _Return value_: self 37 | 38 | ``` 39 | ({ 40 | self = Dataframe -- 41 | column_array = Df_Array -- An array with column names 42 | [levels = Df_Array|boolean] -- An optional array of the values that column might have taken. 43 | The default is the unique set of values taken by Dataframe.unique, 44 | sorted into increasing order. If you provide values that aren't present 45 | within the current column the value will still be saved and may be invoked in 46 | the future. [default=false] 47 | [labels = Df_Array|boolean] -- An optional character vector of labels for the levels 48 | (in the same order as levels after removing those in exclude) [default=false] 49 | [exclude = Df_Array|boolean] -- Values to be excluded when forming the set of levels. This should be 50 | of the same type as column, and will be coerced if necessary. [default=false] 51 | }) 52 | ``` 53 | 54 | 55 | ### Dataframe.add_cat_key(self, column_name, key) 56 | 57 | Adds a key to the keyset of a categorical column. Mostly intended for internal use. 58 | 59 | ``` 60 | ({ 61 | self = Dataframe -- 62 | column_name = string -- The column name 63 | key = number|string -- The new key to insert 64 | }) 65 | ``` 66 | 67 | _Return value_: index value for key (integer) 68 | 69 | ### Dataframe.as_string(self, column_name) 70 | 71 | Converts a categorical column to a string column. This can be used to revert 72 | the Dataframe.as_categorical or as a way to convert numericals into strings. 73 | 74 | ``` 75 | ({ 76 | self = Dataframe -- 77 | column_name = string -- The column name 78 | }) 79 | ``` 80 | 81 | _Return value_: self 82 | 83 | ### Dataframe.clean_categorical(self, column_name[, reset_keys]) 84 | 85 | ``` 86 | ({ 87 | self = Dataframe -- 88 | column_name = string -- the name of the column 89 | [reset_keys = boolean] -- if all the keys should be reinitialized [default=false] 90 | }) 91 | ``` 92 | 93 | Removes any categories no longer present from the keys 94 | 95 | _Return value_: self 96 | 97 | ### Dataframe.is_categorical(self, column_name) 98 | 99 | Check if a column is categorical 100 | 101 | ``` 102 | ({ 103 | self = Dataframe -- 104 | column_name = string -- the name of the column 105 | }) 106 | ``` 107 | 108 | _Return value_: boolean 109 | 110 | ### Dataframe.get_cat_keys(self, column_name) 111 | 112 | Get keys from a categorical column.
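For orientation, here is a minimal usage sketch of the categorical workflow documented above, ending with the key lookup whose exact signature follows below. The `my_df` frame, the CSV path and the `species` column are hypothetical:

```
-- Sketch: convert a string column to categorical and inspect its keys
local my_df = Dataframe("my_data.csv") -- hypothetical file with a "species" column
my_df:as_categorical("species")
print(my_df:is_categorical("species")) -- true
-- Keys have the form {["setosa"] = 1, ["versicolor"] = 2, ...}
local keys = my_df:get_cat_keys("species")
-- Revert back to a plain string column
my_df:as_string("species")
```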
113 | 114 | ``` 115 | ({ 116 | self = Dataframe -- 117 | column_name = string -- the name of the column 118 | }) 119 | ``` 120 | 121 | _Return value_: table with `["key"] = number` structure 122 | 123 | ### Dataframe.to_categorical(self, data, column_name) 124 | 125 | Converts values to categorical according to a column's keys 126 | 127 | ``` 128 | ({ 129 | self = Dataframe -- 130 | data = number|torch.*Tensor|Df_Array -- The integer to be converted 131 | column_name = string -- The name of the column whose keys to use 132 | }) 133 | ``` 134 | 135 | _Return value_: string with the value 136 | 137 | ### Dataframe.from_categorical(self, data, column_name[, as_tensor]) 138 | 139 | ``` 140 | ({ 141 | self = Dataframe -- 142 | data = Df_Array -- The data to be converted 143 | column_name = string -- The name of the column 144 | [as_tensor = boolean] -- If the returned value should be a tensor [default=false] 145 | }) 146 | ``` 147 | 148 | Converts categorical to numerical according to a column's keys 149 | 150 | _Return value_: table or tensor 151 | 152 | ``` 153 | ({ 154 | self = Dataframe -- 155 | data = number|string -- The data to be converted 156 | column_name = string -- The name of the column 157 | }) 158 | ``` 159 | 160 | 161 | ### Dataframe.boolean2categorical(self, column_name[, false_str][, true_str]) 162 | 163 | Converts a boolean column into a categorical column 164 | 165 | ``` 166 | ({ 167 | self = Dataframe -- 168 | column_name = string -- The boolean column that you want to convert 169 | [false_str = string] -- The string value for false [default=false] 170 | [true_str = string] -- The string value for true [default=true] 171 | }) 172 | ``` 173 | 174 | _Return value_: self -------------------------------------------------------------------------------- /doc/core/export_data.md: -------------------------------------------------------------------------------- 1 | # API documentation for [Data save/export functions](#__Data save/export functions__) 2 | - [Dataframe.to_csv](#Dataframe.to_csv) 3 | - [Dataframe.to_tensor](#Dataframe.to_tensor) 4 | - [Dataframe.get](#Dataframe.get) 5 | 6 | 7 | ## Data save/export functions 8 | 9 | 10 | ### Dataframe.to_csv(self, path[, separator][, verbose]) 11 | 12 | Saves a Dataframe into a CSV using csvigo as backend 13 | 14 | _Return value_: self (Dataframe) 15 | 16 | ``` 17 | ({ 18 | self = Dataframe -- 19 | path = string -- path to file 20 | [separator = string] -- separator (one character) [default=,] 21 | [verbose = boolean] -- verbose load [default=false] 22 | }) 23 | ``` 24 | 25 | 26 | ### Dataframe.to_tensor(self) 27 | 28 | Convert the numeric section or specified columns of the dataset to a tensor 29 | 30 | ``` 31 | ({ 32 | self = Dataframe -- 33 | }) 34 | ``` 35 | 36 | _Return value_: (1) torch.tensor with self:size(1) rows and self:size(2) columns, 37 | (2) exported column names 38 | 39 | 40 | You can export selected columns using the columns argument: 41 | 42 | ``` 43 | ({ 44 | self = Dataframe -- 45 | columns = Df_Array -- The columns to export 46 | }) 47 | ``` 48 | 49 | If a filename is provided the tensor will be saved (`torch.save`) to that file: 50 | 51 | ``` 52 | ({ 53 | self = Dataframe -- 54 | filename = string -- Filename for tensor.save() 55 | [columns = Df_Array] -- The columns to export [default=false] 56 | }) 57 | ``` 58 | 59 | ### Dataframe.get(self, idx) 60 | 61 | A function for *torchnet* compliance. It subsets a single index and returns the 62 | `to_tensor` on that example.
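As a hedged sketch of how these export functions combine (assuming `my_df` holds numerical columns named `width` and `height`; the names are hypothetical), with the exact argument signature for `get` following below:

```
-- Sketch: export two numerical columns as a single tensor
local data, colnames = my_df:to_tensor(Df_Array("width", "height"))
print(data:size(1)) -- number of rows
print(colnames) -- {"width", "height"}

-- torchnet-style access: a 1-row tensor for the example at index 3
local example = my_df:get(3)
```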
63 | 64 | ``` 65 | ({ 66 | self = Dataframe -- 67 | idx = number -- 68 | }) 69 | ``` 70 | 71 | _Return value_: (1) torch.tensor with 1 row and #numerical columns -------------------------------------------------------------------------------- /doc/core/init.md: -------------------------------------------------------------------------------- 1 | # API documentation for [core functions](#__Core functions__) 2 | - [Dataframe.`__init`](#Dataframe.__init) 3 | - [Dataframe.get_schema](#Dataframe.get_schema) 4 | - [Dataframe.shape](#Dataframe.shape) 5 | - [Dataframe.version](#Dataframe.version) 6 | - [Dataframe.set_version](#Dataframe.set_version) 7 | - [Dataframe.upgrade_frame](#Dataframe.upgrade_frame) 8 | - [Dataframe.assert_is_index](#Dataframe.assert_is_index) 9 | 10 | 11 | ## Core functions 12 | 13 | 14 | ### Dataframe.__init(self) 15 | 16 | Creates and initializes a Dataframe class. Invoked through `local my_dataframe = Dataframe()` 17 | 18 | ``` 19 | ({ 20 | self = Dataframe -- 21 | }) 22 | ``` 23 | 24 | _Return value_: Dataframe 25 | Read in a CSV file 26 | 27 | ``` 28 | ({ 29 | self = Dataframe -- 30 | csv_file = string -- The file path to the CSV 31 | }) 32 | ``` 33 | 34 | Directly input a table 35 | 36 | ``` 37 | ({ 38 | self = Dataframe -- 39 | data = Df_Dict -- The data to read in 40 | [column_order = Df_Array] -- The order of the column (has to be array and _not_ a dictionary) 41 | }) 42 | ``` 43 | 44 | If you enter a column schema* and the number of rows, a table will be initialized. Note 45 | that you can optionally set all non-set values to `nan` values but this may be 46 | time-consuming for big datasets. 47 | 48 | * A schema is a hash table with the column names as keys and the column types 49 | as values. The column types are: 50 | - `boolean` 51 | - `integer` 52 | - `long` 53 | - `double` 54 | - `string` (this is stored as a `tds.Vec` and can be any value) 55 | 56 | ``` 57 | ({ 58 | self = Dataframe -- 59 | schema = Df_Dict -- The schema to use for initialization 60 | no_rows = number -- The number of rows 61 | [column_order = Df_Array] -- The column order 62 | [set_missing = boolean] -- Whether all elements should be set to missing from start [default=false] 63 | }) 64 | ``` 65 | 66 | _Return value_: Dataframe 67 | No updates are performed on already inserted data. The purpose of this method 68 | is to prepare a Dataframe object. 69 | 70 | A schema is a hash table with the column names as keys and the column types 71 | as values. The column types are: 72 | - `boolean` 73 | - `integer` 74 | - `long` 75 | - `double` 76 | - `string` (this is stored as a `tds.Vec` and can be any value) 77 | 78 | ``` 79 | ({ 80 | self = Dataframe -- 81 | schema = Df_Dict -- The schema to use for initialization 82 | column_order = Df_Array -- The column order 83 | }) 84 | ``` 85 | 86 | 87 | ### Dataframe.get_schema(self, column_name) 88 | 89 | Returns the schema, i.e.
column types 90 | 91 | ``` 92 | ({ 93 | self = Dataframe -- 94 | column_name = string -- The column to get schema for 95 | }) 96 | ``` 97 | 98 | _Return value_: string 99 | ``` 100 | ({ 101 | self = Dataframe -- 102 | [columns = Df_Array] -- The columns to get schema for 103 | }) 104 | ``` 105 | 106 | _Return value_: table 107 | 108 | ### Dataframe.shape(self) 109 | 110 | Returns the number of rows and columns in a table 111 | 112 | ``` 113 | ({ 114 | self = Dataframe -- 115 | }) 116 | ``` 117 | 118 | _Return value_: table 119 | 120 | ### Dataframe.version(self) 121 | 122 | Returns the current data-frame version 123 | 124 | ``` 125 | ({ 126 | self = Dataframe -- 127 | }) 128 | ``` 129 | 130 | _Return value_: string 131 | 132 | ### Dataframe.set_version(self) 133 | 134 | Sets the data-frame version 135 | 136 | ``` 137 | ({ 138 | self = Dataframe -- 139 | }) 140 | ``` 141 | 142 | _Return value_: self 143 | 144 | ### Dataframe.upgrade_frame(self[, skip_version][, current_version]) 145 | 146 | Upgrades a dataframe using the old batch loading framework to the new framework 147 | by instantiating the subsets argument, copying the indexes and setting the 148 | samplers to either: 149 | 150 | - linear for test/validate or shuffle = false 151 | - permutation if shuffle = true and none of above names 152 | 153 | ``` 154 | ({ 155 | self = Dataframe -- 156 | [skip_version = boolean] -- Set to true if you want to upgrade your dataframe regardless of the version check 157 | [current_version = number] -- The current version of the dataframe 158 | }) 159 | ``` 160 | 161 | *Note:* Sometimes the version check fails to identify that the Dataframe is of 162 | an old version and you can therefore skip the version check. 163 | 164 | _Return value_: Dataframe 165 | 166 | ### Dataframe.assert_is_index(self, index[, plus_one]) 167 | 168 | Asserts that the number is a valid index. 169 | 170 | ``` 171 | ({ 172 | self = Dataframe -- 173 | index = number -- The index to investigate 174 | [plus_one = boolean] -- Count next non-existing index as good. 
-------------------------------------------------------------------------------- /doc/core/load_data.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [Data loader functions](#__Data loader functions__) 
2 | - [Dataframe.load_csv](#Dataframe.load_csv) 
3 | - [Dataframe.bulk_load_csv](#Dataframe.bulk_load_csv) 
4 | - [Dataframe.load_table](#Dataframe.load_table) 
5 | - [Dataframe.`_clean_columns`](#Dataframe._clean_columns) 
6 | 
7 | 
8 | ## Data loader functions 
9 | 
10 | 
11 | ### Dataframe.load_csv(self, path[, header][, schema][, separator][, skip][, verbose][, rows2explore]) 
12 | 
13 | Loads a CSV file into the Dataframe using csvigo as the backend 
14 | 
15 | ``` 
16 | ({ 
17 | self = Dataframe -- 
18 | path = string -- path to file 
19 | [header = boolean] -- whether the file has a header on the first line [default=true] 
20 | [schema = Df_Dict] -- The column schema types with column names as keys 
21 | [separator = string] -- separator (one character) [default=,] 
22 | [skip = number] -- skip this many lines at the start of the file [default=0] 
23 | [verbose = boolean] -- verbose load [default=false] 
24 | [rows2explore = number] -- The maximum number of rows to traverse when trying to identify the schema 
25 | }) 
26 | ``` 
27 | 
28 | _Return value_: self 
29 | 
30 | ### Dataframe.bulk_load_csv(self, path[, header][, schema][, separator][, skip][, verbose][, nthreads]) 
31 | 
32 | Loads a CSV file into the Dataframe using multithreading. 
33 | Warning: this method does not perform the same checks as load_csv would. It doesn't handle formats other than torch.*Tensor and tds.Vec. 
34 | 
35 | ``` 
36 | ({ 
37 | self = Dataframe -- 
38 | path = string -- path to file 
39 | [header = boolean] -- whether the file has a header on the first line (not used at the moment) [default=true] 
40 | [schema = Df_Dict] -- The column schema types with column names as keys 
41 | [separator = string] -- separator (one character) [default=,] 
42 | [skip = number] -- skip this many lines at the start of the file (not used at the moment) [default=0] 
43 | [verbose = boolean] -- verbose load [default=false] 
44 | [nthreads = number] -- Number of threads to use to read the csv file [default=1] 
45 | }) 
46 | ``` 
47 | 
48 | _Return value_: self 
49 | 
50 | ### Dataframe.load_table(self, data[, schema][, column_order]) 
51 | 
52 | ``` 
53 | ({ 
54 | self = Dataframe -- 
55 | data = Df_Dict -- Table (dictionary) to import. Max depth 2. 
56 | [schema = Df_Dict] -- Provide if you want to force column types 
57 | [column_order = Df_Array] -- The order of the columns (has to be an array and _not_ a dictionary) 
58 | }) 
59 | ``` 
60 | 
61 | Imports table data directly into the Dataframe. The columns should all be of equal length 
62 | or just single values. If the table contains one column with 10 rows and 
63 | another column with a single element, that element is duplicated 10 times, i.e. 
64 | the entire column is filled with that single value. 
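For example, the single-value recycling could look like this sketch (the column names and values are hypothetical):

```lua
local df = Dataframe()
df:load_table{data = Df_Dict{
	score  = {1, 2, 3, 4},  -- a full column with four rows
	source = 'file_1'       -- a single value, recycled to fill all four rows
}}
```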
65 | 
66 | 
67 | _Return value_: self 
68 | 
69 | ### Dataframe._clean_columns(self, data[, column_order][, schema]) 
70 | 
71 | ``` 
72 | { 
73 | self = Dataframe -- 
74 | data = table -- 
75 | [column_order = table] -- 
76 | [schema = table] -- 
77 | } 
78 | ``` 
79 | 
80 | Internal function to clean column names 
81 | 
82 | _Return value_: self -------------------------------------------------------------------------------- /doc/core/metatable.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [metatable functions](#__Metatable functions__) 
2 | - [Dataframe.size](#Dataframe.size) 
3 | - [Dataframe.`__tostring__`](#Dataframe.__tostring__) 
4 | - [Dataframe.copy](#Dataframe.copy) 
5 | - [Dataframe.#](#Dataframe.#) 
6 | - [Dataframe.==](#Dataframe.==) 
7 | 
8 | 
9 | ## Metatable functions 
10 | 
11 | 
12 | ### Dataframe.size(self[, dim]) 
13 | 
14 | By providing a dimension you can get only that dimension: row == 1, col == 2. If 
15 | the value is omitted it will return the number of rows in order to comply with the torchnet 
16 | standard. 
17 | 
18 | ``` 
19 | ({ 
20 | self = Dataframe -- 
21 | [dim = number] -- The dimension of interest [default=1] 
22 | }) 
23 | ``` 
24 | 
25 | _Return value_: integer 
26 | 
27 | ### Dataframe.__tostring__(self) 
28 | 
29 | A wrapper for `tostring()` 
30 | 
31 | ``` 
32 | ({ 
33 | self = Dataframe -- 
34 | }) 
35 | ``` 
36 | 
37 | _Return value_: string 
38 | 
39 | ### Dataframe.copy(self) 
40 | 
41 | Copies the table together with all metadata 
42 | 
43 | ``` 
44 | ({ 
45 | self = Dataframe -- 
46 | }) 
47 | ``` 
48 | 
49 | _Return value_: Dataframe 
50 | 
51 | ### Dataframe.# 
52 | 
53 | Returns the number of rows 
54 | 
55 | _Return value_: integer 
56 | 
57 | ### Dataframe.== 
58 | 
59 | Checks if two Dataframes contain the same values 
60 | 
61 | _Return value_: boolean -------------------------------------------------------------------------------- /doc/core/missing_data.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [missing data functions](#__Missing data functions__) 
2 | - [Dataframe.count_na](#Dataframe.count_na) 
3 | - [Dataframe.fill_na](#Dataframe.fill_na) 
4 | - [Dataframe.fill_na](#Dataframe.fill_na) 
5 | 
6 | 
7 | ## Missing data functions 
8 | 
9 | 
10 | ### Dataframe.count_na(self[, columns][, as_dataframe]) 
11 | 
12 | Count missing values in the dataset 
13 | 
14 | ``` 
15 | ({ 
16 | self = Dataframe -- 
17 | [columns = Df_Array] -- The columns to count 
18 | [as_dataframe = boolean] -- Return a dataframe [default=true] 
19 | }) 
20 | ``` 
21 | 
22 | _Return value_: Dataframe or table containing missing values per column, total na 
23 | If you only want to count a single column 
24 | 
25 | ``` 
26 | ({ 
27 | self = Dataframe -- 
28 | column = string -- The column to count 
29 | }) 
30 | ``` 
31 | 
32 | _Return value_: single integer 
33 | 
34 | ### Dataframe.fill_na(self, column_name[, default_value]) 
35 | 
36 | Replace missing values in a specific column 
37 | 
38 | ``` 
39 | ({ 
40 | self = Dataframe -- 
41 | column_name = string -- The column to fill 
42 | [default_value = number|string|boolean] -- The default missing value [default=0] 
43 | }) 
44 | ``` 
45 | 
46 | _Return value_: self 
47 | 
48 | ### Dataframe.fill_na(self[, default_value]) 
49 | 
50 | Replace missing values in all columns 
51 | 
52 | ``` 
53 | ({ 
54 | self = Dataframe -- 
55 | [default_value = number|string|boolean] -- The default missing value [default=0] 
56 | }) 
57 | ``` 
58 | 
59 | _Return value_: self 
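A short, hedged sketch of how the counting and filling functions above combine; the column name, the fill values, and the two-value return are illustrative assumptions based on the signatures:

```lua
-- Count missing values per column as a plain table instead of a Dataframe
local na_per_column, total_na = df:count_na{as_dataframe = false}
print(total_na)

-- Fill a single column, then all remaining columns, with explicit defaults
df:fill_na('score', -1)
df:fill_na(0)
```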
-------------------------------------------------------------------------------- /doc/core/output.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [output functions](#__Output functions__) 
2 | - [Dataframe.output](#Dataframe.output) 
3 | - [Dataframe.show](#Dataframe.show) 
4 | - [Dataframe.tostring](#Dataframe.tostring) 
5 | - [Dataframe.`_to_html`](#Dataframe._to_html) 
6 | 
7 | 
8 | ## Output functions 
9 | 
10 | 
11 | ### Dataframe.output(self[, html][, max_rows][, digits]) 
12 | 
13 | ``` 
14 | ({ 
15 | self = Dataframe -- 
16 | [html = boolean] -- If the output should be in html format [default=false] 
17 | [max_rows = number] -- Limit the maximum number of printed rows [default=20] 
18 | [digits = number|boolean] -- Set this to an integer >= 0 in order to reduce the number of digits shown [default=false] 
19 | }) 
20 | ``` 
21 | 
22 | Prints the table via itorch.html if running in iTorch and html == true, otherwise prints the table as a string 
23 | 
24 | _Return value_: self 
25 | 
26 | ### Dataframe.show(self[, digits]) 
27 | 
28 | ``` 
29 | ({ 
30 | self = Dataframe -- 
31 | [digits = number|boolean] -- Set this to an integer >= 0 in order to reduce the number of digits shown [default=false] 
32 | }) 
33 | ``` 
34 | 
35 | Prints the top and bottom sections of the table for a better overview. Uses itorch if available 
36 | 
37 | _Return value_: self 
38 | 
39 | ### Dataframe.tostring(self[, digits][, columns2skip][, no_rows][, min_col_width][, max_table_width]) 
40 | 
41 | Converts the table to a string representation that follows standard markdown syntax. 
42 | The table tries to respect a maximum table width inspired by the `dplyr` table print. 
43 | The core concept is that wide columns are clipped when the table risks becoming wider 
44 | than a certain maximum width. Since columns convey no information when clipped down 
45 | to just a few characters, there is also a minimum column width. 
46 | Columns that then don't fit are noted below the table as skipped columns. 
47 | 
48 | You can also specify columns that you wish to skip by providing the columns2skip 
49 | argument. If columns are skipped on user demand there won't be a ... column to 
50 | the right, but if the table is still too wide then the software may choose to skip 
51 | additional columns and thereby add a ... column. 
52 | 
53 | ``` 
54 | ({ 
55 | self = Dataframe -- 
56 | [digits = number|boolean] -- Set this to an integer >= 0 in order to reduce the number of digits shown [default=false] 
57 | [columns2skip = Df_Array] -- Columns to skip from the output [default=false] 
58 | [no_rows = number|boolean] -- The number of rows to display. If -1 then shows all. Defaults to setting in Dataframe.tostring_defaults [default=false] 
59 | [min_col_width = number|boolean] -- The minimum column width in characters. Defaults to setting in Dataframe.tostring_defaults [default=false] 
60 | [max_table_width = number|boolean] -- The maximum table width in characters. Defaults to setting in Dataframe.tostring_defaults [default=false] 
61 | }) 
62 | ``` 
63 | 
64 | _Return value_: string 
65 | 
66 | ``` 
67 | ({ 
68 | self = Dataframe -- 
69 | [digits = number|boolean] -- Set this to an integer >= 0 in order to reduce the number of digits shown [default=false] 
70 | columns2skip = string -- Columns to skip from the output as a regular expression 
71 | [no_rows = number] -- The number of rows to display. If -1 then shows all. Defaults to setting in Dataframe.tostring_defaults [default=false] 
72 | [min_col_width = number] -- The minimum column width in characters. Defaults to setting in Dataframe.tostring_defaults [default=false] 
73 | [max_table_width = number] -- The maximum table width in characters. Defaults to setting in Dataframe.tostring_defaults [default=false] 
74 | }) 
75 | ``` 
76 | 
77 | 
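As a rough usage sketch, where the concrete settings are arbitrary examples:

```lua
-- A clipped markdown rendering, capped at five rows and 80 characters
print(df:tostring{no_rows = 5, max_table_width = 80})

-- Let the Dataframe pick the output channel (itorch html or plain text)
df:output{max_rows = 10}
```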
78 | ### Dataframe._to_html(self[, split_table][, offset][, digits]) 
79 | 
80 | ``` 
81 | ({ 
82 | self = Dataframe -- 
83 | [split_table = string] -- Where the table is split. Valid input is 'none', 'top', 'bottom', 'all'. 
84 | Note that the 'bottom' removes the trailing `</table>` while the 'top' removes 
85 | the initial `<table>`. The 'all' removes both but retains the header while 
86 | the 'top' has no header. 
87 | [default=none] 
88 | [offset = number] -- The line index offset [default=0] 
89 | [digits = number|boolean] -- Set this to an integer >= 0 in order to reduce the number of digits shown [default=false] 
90 | }) 
91 | ``` 
92 | 
93 | Internal function to convert a table to html (only works for a 1D table) 
94 | 
95 | _Return value_: string -------------------------------------------------------------------------------- /doc/core/row.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [row functions](#__Row functions__) 
2 | - [Dataframe.get_row](#Dataframe.get_row) 
3 | - [Dataframe.insert](#Dataframe.insert) 
4 | - [Dataframe.insert](#Dataframe.insert) 
5 | - [Dataframe.append](#Dataframe.append) 
6 | - [Dataframe.rbind](#Dataframe.rbind) 
7 | - [Dataframe.remove_index](#Dataframe.remove_index) 
8 | 
9 | 
10 | ## Row functions 
11 | 
12 | 
13 | ### Dataframe.get_row(self, index) 
14 | 
15 | Gets a single row from the Dataframe 
16 | 
17 | ``` 
18 | ({ 
19 | self = Dataframe -- 
20 | index = number -- The row index to retrieve 
21 | }) 
22 | ``` 
23 | 
24 | _Return value_: A table with the row content 
25 | 
26 | ### Dataframe.insert(self, index, rows) 
27 | 
28 | Inserts a row or multiple rows into the Dataframe at the position of the provided index. 
29 | 
30 | ``` 
31 | ({ 
32 | self = Dataframe -- 
33 | index = number -- The row number where to insert the row(s) 
34 | rows = Df_Dict -- Insert values to the dataset 
35 | }) 
36 | ``` 
37 | 
38 | _Return value_: self 
39 | 
40 | ### Dataframe.insert(self, index, rows, schema) 
41 | 
42 | Inserts a row or multiple rows into the Dataframe at the position of the provided index and 
43 | according to the provided schema. 
44 | 
45 | ``` 
46 | ({ 
47 | self = Dataframe -- 
48 | index = number -- The row number where to insert the row(s) 
49 | rows = Df_Dict -- Insert values to the dataset 
50 | schema = Df_Dict -- Specify a schema to check before insertion 
51 | }) 
52 | ``` 
53 | 
54 | _Return value_: self 
55 | Note: if you provide a Dataframe, the primary Dataframe's meta-information will 
56 | be the one that is kept 
57 | 
58 | ``` 
59 | ({ 
60 | self = Dataframe -- 
61 | index = number -- The row number where to insert the row(s) 
62 | rows = Dataframe -- A Dataframe that you want to insert 
63 | }) 
64 | ``` 
65 | 
66 | 
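A small sketch of the row accessors above; the values and column names are hypothetical:

```lua
-- Insert two rows at position 2
df:insert(2, Df_Dict{score = {99, 98}, label = {'x', 'y'}})

-- Retrieve a single row as a plain table
local row = df:get_row(1)
print(row.score, row.label)
```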
67 | ### Dataframe.append(self, rows[, column_order][, schema]) 
68 | 
69 | Appends the row(s) to the Dataframe. 
70 | 
71 | ``` 
72 | ({ 
73 | self = Dataframe -- 
74 | rows = Df_Dict -- Values to append to the Dataframe 
75 | [column_order = Df_Array] -- The order of the columns (has to be an array and _not_ a dictionary). Only used when the Dataframe is empty 
76 | [schema = Df_Dict] -- The schema for the data - used in case the table is new 
77 | }) 
78 | ``` 
79 | 
80 | _Return value_: self 
81 | Note: if you provide a Dataframe, the primary Dataframe's meta-information will 
82 | be the one that is kept 
83 | 
84 | ``` 
85 | ({ 
86 | self = Dataframe -- 
87 | rows = Dataframe -- A Dataframe that you want to append 
88 | }) 
89 | ``` 
90 | 
91 | 
92 | ### Dataframe.rbind(self, rows) 
93 | 
94 | Alias to Dataframe.append 
95 | 
96 | ``` 
97 | ({ 
98 | self = Dataframe -- 
99 | rows = Df_Dict -- Values to append to the Dataframe 
100 | }) 
101 | ``` 
102 | 
103 | _Return value_: self 
104 | Note: if you provide a Dataframe, the primary Dataframe's meta-information will 
105 | be the one that is kept 
106 | 
107 | ``` 
108 | ({ 
109 | self = Dataframe -- 
110 | rows = Dataframe -- A Dataframe that you want to append 
111 | }) 
112 | ``` 
113 | 
114 | 
115 | ### Dataframe.remove_index(self, index) 
116 | 
117 | Deletes a given row 
118 | 
119 | ``` 
120 | ({ 
121 | self = Dataframe -- 
122 | index = number -- The row index to remove 
123 | }) 
124 | ``` 
125 | 
126 | _Return value_: self -------------------------------------------------------------------------------- /doc/dataseries/README.md: -------------------------------------------------------------------------------- 
1 | # Documentation for dataseries 
2 | 
3 | This documentation has been auto-generated from code using the `argcheck` system. 
4 | 
5 | ## Table of contents (file-level) 
6 | 
7 | Below follows a more [detailed](#detailed) table of contents with links to 
8 | the different functions. Note that this list may be incomplete due to a failure to 
9 | add appropriate anchor tags during documentation. 
10 | 
11 | 
12 | - [Dataseries](init.md) 
13 | - [Categorical functions](categorical.md) 
14 | - [Export functions](export.md) 
15 | - [Metatable functions](metatable.md) 
16 | - [Single element functions](sngl_elmnt_ops.md) 
17 | - [Statistics](statistics.md) 
18 | 
19 | ## Detailed table of contents (file-level + anchors) 
20 | 
21 | 
22 | - **[Dataseries](init.md)** 
23 | - [Dataseries.`__init`](init.md#Dataseries.__init) 
24 | - [Dataseries.load](init.md#Dataseries.load) 
25 | - [Dataseries.new_storage](init.md#Dataseries.new_storage) 
26 | - [Dataseries.copy](init.md#Dataseries.copy) 
27 | - [Dataseries.size](init.md#Dataseries.size) 
28 | - [Dataseries.resize](init.md#Dataseries.resize) 
29 | - [Dataseries.assert_is_index](init.md#Dataseries.assert_is_index) 
30 | - [Dataseries.is_numerical](init.md#Dataseries.is_numerical) 
31 | - [Dataseries.is_numerical](init.md#Dataseries.is_numerical) 
32 | - [Dataseries.is_boolean](init.md#Dataseries.is_boolean) 
33 | - [Dataseries.is_string](init.md#Dataseries.is_string) 
34 | - [Dataseries.type](init.md#Dataseries.type) 
35 | - [Dataseries.get_variable_type](init.md#Dataseries.get_variable_type) 
36 | - [Dataseries.boolean2tensor](init.md#Dataseries.boolean2tensor) 
37 | - [Dataseries.fill](init.md#Dataseries.fill) 
38 | - [Dataseries.fill_na](init.md#Dataseries.fill_na) 
39 | - [Dataseries.tostring](init.md#Dataseries.tostring) 
40 | - [Dataseries.sub](init.md#Dataseries.sub) 
41 | - [Dataseries.eq](init.md#Dataseries.eq) 
42 | - [Dataseries.get_data_mask](init.md#Dataseries.get_data_mask) 
43 | - **[Categorical functions](categorical.md)** 
44 | - [Dataseries.as_categorical](categorical.md#Dataseries.as_categorical) 
45 | - [Dataseries.add_cat_key](categorical.md#Dataseries.add_cat_key) 
46 | - [Dataseries.as_string](categorical.md#Dataseries.as_string) 
47 | - 
[Dataseries.clean_categorical](categorical.md#Dataseries.clean_categorical) 
48 | - [Dataseries.is_categorical](categorical.md#Dataseries.is_categorical) 
49 | - [Dataseries.get_cat_keys](categorical.md#Dataseries.get_cat_keys) 
50 | - [Dataseries.to_categorical](categorical.md#Dataseries.to_categorical) 
51 | - [Dataseries.from_categorical](categorical.md#Dataseries.from_categorical) 
52 | - [Dataseries.boolean2categorical](categorical.md#Dataseries.boolean2categorical) 
53 | - **[Export functions](export.md)** 
54 | - [Dataseries.to_tensor](export.md#Dataseries.to_tensor) 
55 | - [Dataseries.to_table](export.md#Dataseries.to_table) 
56 | - **[Metatable functions](metatable.md)** 
57 | - [Dataseries.#](metatable.md#Dataseries.#) 
58 | - [Dataseries.`__tostring__`](metatable.md#Dataseries.__tostring__) 
59 | - **[Single element functions](sngl_elmnt_ops.md)** 
60 | - [Dataseries.get](sngl_elmnt_ops.md#Dataseries.get) 
61 | - [Dataseries.set](sngl_elmnt_ops.md#Dataseries.set) 
62 | - [Dataseries.mutate](sngl_elmnt_ops.md#Dataseries.mutate) 
63 | - [Dataseries.append](sngl_elmnt_ops.md#Dataseries.append) 
64 | - [Dataseries.remove](sngl_elmnt_ops.md#Dataseries.remove) 
65 | - [Dataseries.insert](sngl_elmnt_ops.md#Dataseries.insert) 
66 | - **[Statistics](statistics.md)** 
67 | - [Dataseries.count_na](statistics.md#Dataseries.count_na) 
68 | - [Dataseries.unique](statistics.md#Dataseries.unique) 
69 | - [Dataseries.value_counts](statistics.md#Dataseries.value_counts) 
70 | - [Dataseries.which_max](statistics.md#Dataseries.which_max) 
71 | - [Dataseries.which_min](statistics.md#Dataseries.which_min) 
72 | - [Dataseries.get_mode](statistics.md#Dataseries.get_mode) 
73 | - [Dataseries.get_max_value](statistics.md#Dataseries.get_max_value) 
74 | - [Dataseries.get_min_value](statistics.md#Dataseries.get_min_value) -------------------------------------------------------------------------------- /doc/dataseries/categorical.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [categorical functions](#__Categorical functions__) 
2 | - [Dataseries.as_categorical](#Dataseries.as_categorical) 
3 | - [Dataseries.add_cat_key](#Dataseries.add_cat_key) 
4 | - [Dataseries.as_string](#Dataseries.as_string) 
5 | - [Dataseries.clean_categorical](#Dataseries.clean_categorical) 
6 | - [Dataseries.is_categorical](#Dataseries.is_categorical) 
7 | - [Dataseries.get_cat_keys](#Dataseries.get_cat_keys) 
8 | - [Dataseries.to_categorical](#Dataseries.to_categorical) 
9 | - [Dataseries.from_categorical](#Dataseries.from_categorical) 
10 | - [Dataseries.boolean2categorical](#Dataseries.boolean2categorical) 
11 | 
12 | 
13 | ## Categorical functions 
14 | 
15 | Here are the functions used for converting to and from the categorical type. The 
16 | categorical series type is a hash table around a torch.IntTensor that maps 
17 | numerical values between integer and string values. The standard numbering is 
18 | from 1 to n unique values. 
19 | 
20 | 
21 | ### Dataseries.as_categorical(self[, levels][, labels][, exclude]) 
22 | 
23 | Sets a series to categorical type. The keys are retrieved from Dataseries.unique. 
24 | 
25 | ``` 
26 | ({ 
27 | self = Dataseries -- 
28 | [levels = Df_Array|boolean] -- An optional array of the values that the series might have taken. 
29 | The default is the unique set of values taken by Dataseries.unique, 
30 | sorted into increasing order. If you provide values that aren't present 
31 | within the current series, the value will still be saved and may be invoked in 
32 | the future. 
33 | [labels = Df_Array|boolean] -- An optional character vector of labels for the levels 
34 | (in the same order as levels after removing those in exclude) 
35 | [exclude = Df_Array|boolean] -- Values to be excluded when forming the set of levels. This should be 
36 | of the same type as the series, and will be coerced if necessary. 
37 | }) 
38 | ``` 
39 | 
40 | _Return value_: self 
41 | 
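A hedged sketch of the categorical round trip; it assumes `df` has a string column named 'label' and uses `Dataframe.get_column` (documented in the column docs) to obtain a Dataseries:

```lua
local series = df:get_column('label')
series:as_categorical()

print(series:get_cat_keys())         -- e.g. {a = 1, b = 2, c = 3}
print(series:to_categorical(1))      -- key index back to its string value
print(series:from_categorical('a'))  -- string value back to its key index
```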
42 | ### Dataseries.add_cat_key(self, key[, key_index]) 
43 | 
44 | Adds a key to the keyset of a categorical series. Mostly intended for internal use. 
45 | 
46 | ``` 
47 | ({ 
48 | self = Dataseries -- 
49 | key = number|string -- The new key to insert 
50 | [key_index = number] -- The key index to use 
51 | }) 
52 | ``` 
53 | 
54 | _Return value_: index value for key (integer) 
55 | 
56 | ### Dataseries.as_string(self) 
57 | 
58 | Converts a categorical Dataseries to a string Dataseries. This can be used to revert 
59 | Dataseries.as_categorical or as a way to convert numericals into strings. 
60 | 
61 | ``` 
62 | ({ 
63 | self = Dataseries -- 
64 | }) 
65 | ``` 
66 | 
67 | _Return value_: self 
68 | 
69 | ### Dataseries.clean_categorical(self[, reset_keys]) 
70 | 
71 | ``` 
72 | ({ 
73 | self = Dataseries -- 
74 | [reset_keys = boolean] -- if all the keys should be reinitialized [default=false] 
75 | }) 
76 | ``` 
77 | 
78 | Removes any categories no longer present from the keys 
79 | 
80 | _Return value_: self 
81 | 
82 | ### Dataseries.is_categorical(self) 
83 | 
84 | Checks if a Dataseries is categorical 
85 | 
86 | ``` 
87 | ({ 
88 | self = Dataseries -- 
89 | }) 
90 | ``` 
91 | 
92 | _Return value_: boolean 
93 | 
94 | ### Dataseries.get_cat_keys(self) 
95 | 
96 | Get keys 
97 | 
98 | ``` 
99 | ({ 
100 | self = Dataseries -- 
101 | }) 
102 | ``` 
103 | 
104 | _Return value_: table with `["key"] = number` structure 
105 | 
106 | ### Dataseries.to_categorical(self, key_index) 
107 | 
108 | Converts values to categorical according to the series' keys 
109 | 
110 | ``` 
111 | ({ 
112 | self = Dataseries -- 
113 | key_index = number -- The integer to be converted 
114 | }) 
115 | ``` 
116 | 
117 | _Return value_: string with the value. If provided `nan` it will also 
118 | return a `nan`. 
It returns `nil` if no key is found 
119 | You can also provide a tensor 
120 | 
121 | ``` 
122 | ({ 
123 | self = Dataseries -- 
124 | data = torch.*Tensor -- The integers to be converted 
125 | }) 
126 | ``` 
127 | 
128 | _Return value_: table with values 
129 | You can also provide an array 
130 | 
131 | ``` 
132 | ({ 
133 | self = Dataseries -- 
134 | data = Df_Array -- The integers to be converted 
135 | }) 
136 | ``` 
137 | 
138 | _Return value_: table with values 
139 | 
140 | ### Dataseries.from_categorical(self, data) 
141 | 
142 | Converts categorical to numerical according to the Dataseries' keys 
143 | 
144 | ``` 
145 | ({ 
146 | self = Dataseries -- 
147 | data = number|string -- The data to be converted 
148 | }) 
149 | ``` 
150 | 
151 | _Return value_: table or tensor 
152 | You can also provide an array with values 
153 | 
154 | ``` 
155 | ({ 
156 | self = Dataseries -- 
157 | data = Df_Array -- The data to be converted 
158 | [as_tensor = boolean] -- If the returned value should be a tensor [default=false] 
159 | }) 
160 | ``` 
161 | 
162 | _Return value_: table or tensor 
163 | Checks if a categorical key exists 
164 | 
165 | ``` 
166 | ({ 
167 | self = Dataseries -- 
168 | value = number|string -- The value that should be present in the categorical hash 
169 | }) 
170 | ``` 
171 | 
172 | _Return value_: boolean 
173 | Checks if a categorical value exists 
174 | 
175 | ``` 
176 | ({ 
177 | self = Dataseries -- 
178 | value = number|string -- The value that should be present in the categorical hash 
179 | }) 
180 | ``` 
181 | 
182 | _Return value_: boolean 
183 | 
184 | ### Dataseries.boolean2categorical(self[, false_str][, true_str]) 
185 | 
186 | Converts a boolean Dataseries into a categorical tensor 
187 | 
188 | ``` 
189 | ({ 
190 | self = Dataseries -- 
191 | [false_str = string] -- The string value for false [default=false] 
192 | [true_str = string] -- The string value for true [default=true] 
193 | }) 
194 | ``` 
195 | 
196 | _Return value_: self, boolean indicating successful conversion -------------------------------------------------------------------------------- /doc/dataseries/export.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [export functions](#__Export functions__) 
2 | - [Dataseries.to_tensor](#Dataseries.to_tensor) 
3 | - [Dataseries.to_table](#Dataseries.to_table) 
4 | 
5 | 
6 | ## Export functions 
7 | 
8 | Here are the functions used for exporting to a different format. Generally, `to_` 
9 | functions should reside here; the only exception is `tostring`. 
10 | 
11 | 
12 | ### Dataseries.to_tensor(self[, missing_value][, copy]) 
13 | 
14 | Returns the values in tensor format. Note that if you don't provide a replacement 
15 | for missing values and there are missing values, the function will throw an error. 
16 | 
17 | *Note*: boolean columns are not tensors and need to be manually converted to a 
18 | tensor. This is because 0 would be the natural value for false, but that can cause issues as 
19 | neurons are labeled 1 to n for classification tasks. See the `Dataframe.update` 
20 | function for details or run `boolean2tensor`. 
21 | 
22 | ``` 
23 | ({ 
24 | self = Dataseries -- 
25 | [missing_value = number] -- Set a value for the missing data 
26 | [copy = boolean] -- Set to false if you want the original data to be returned. [default=true] 
27 | }) 
28 | ``` 
29 | 
30 | _Return value_: `torch.*Tensor` of the current type 
31 | 
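A minimal sketch of the export, assuming a numeric series that may contain missing values:

```lua
-- Missing values must be replaced, otherwise to_tensor throws an error
local t = series:to_tensor{missing_value = 0}
print(t:size(1))
```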
32 | ### Dataseries.to_table(self[, boolean2string]) 
33 | 
34 | Returns the values in table format 
35 | 
36 | ``` 
37 | ({ 
38 | self = Dataseries -- 
39 | [boolean2string = boolean] -- Convert boolean values to strings since they cause havoc with csvigo 
40 | }) 
41 | ``` 
42 | 
43 | _Return value_: table -------------------------------------------------------------------------------- /doc/dataseries/metatable.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [metatable functions](#__Metatable functions__) 
2 | - [Dataseries.#](#Dataseries.#) 
3 | - [Dataseries.`__tostring__`](#Dataseries.__tostring__) 
4 | 
5 | 
6 | ## Metatable functions 
7 | 
8 | 
9 | ### Dataseries.# 
10 | 
11 | Returns the number of elements 
12 | 
13 | _Return value_: integer 
14 | 
15 | ### Dataseries.__tostring__(self) 
16 | 
17 | A wrapper for `tostring()` 
18 | 
19 | ``` 
20 | ({ 
21 | self = Dataseries -- 
22 | }) 
23 | ``` 
24 | 
25 | _Return value_: string -------------------------------------------------------------------------------- /doc/dataseries/sngl_elmnt_ops.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [single element functions](#__Single element functions__) 
2 | - [Dataseries.get](#Dataseries.get) 
3 | - [Dataseries.set](#Dataseries.set) 
4 | - [Dataseries.mutate](#Dataseries.mutate) 
5 | - [Dataseries.append](#Dataseries.append) 
6 | - [Dataseries.remove](#Dataseries.remove) 
7 | - [Dataseries.insert](#Dataseries.insert) 
8 | 
9 | 
10 | ## Single element functions 
11 | 
12 | Here are the functions mainly used for manipulating a single element. 
13 | 
14 | 
15 | ### Dataseries.get(self, index[, as_raw]) 
16 | 
17 | Gets a single or a set of elements. 
18 | 
19 | ``` 
20 | ({ 
21 | self = Dataseries -- 
22 | index = number -- The index of the element to retrieve 
23 | [as_raw = boolean] -- Set to true if you want categorical values to be returned as their raw numeric representation [default=false] 
24 | }) 
25 | ``` 
26 | 
27 | _Return value_: number|string|boolean 
28 | If you provide a Df_Array you get back a Dataseries of elements 
29 | 
30 | ``` 
31 | ({ 
32 | self = Dataseries -- 
33 | index = Df_Array -- Indexes of wanted elements 
34 | }) 
35 | ``` 
36 | 
37 | _Return value_: Dataseries 
38 | 
39 | ### Dataseries.set(self, index, value) 
40 | 
41 | Sets a single element 
42 | 
43 | ``` 
44 | ({ 
45 | self = Dataseries -- 
46 | index = number -- The index to set the value to 
47 | value = * -- The data to set 
48 | }) 
49 | ``` 
50 | 
51 | _Return value_: self 
52 | 
53 | ### Dataseries.mutate(self, mutation[, type]) 
54 | 
55 | Modifies a dataseries. Takes a function that is applied to each element. 
56 | 
57 | ``` 
58 | ({ 
59 | self = Dataseries -- 
60 | mutation = function -- The function to apply to each value 
61 | [type = string] -- The return type of the data if other than the current 
62 | }) 
63 | ``` 
64 | 
65 | _Return value_: self 
66 | 
67 | ### Dataseries.append(self, value) 
68 | 
69 | Appends a single element to the series. This function resizes the tensor to +1 
70 | and then calls the `set` function, so if possible try to size the 
71 | series directly to the appropriate length before setting elements, as this alternative is 
72 | slow and should only be used with a few values at a time. 
73 | 
74 | ``` 
75 | ({ 
76 | self = Dataseries -- 
77 | value = * -- The data to set 
78 | }) 
79 | ``` 
80 | 
81 | _Return value_: self 
82 | 
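A brief sketch of single-element manipulation, with arbitrary example values:

```lua
series:set(1, 3.14)  -- overwrite the first element
series:append(42)    -- resizes by one and then calls set(); slow in loops
print(series:get(1))
```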
83 | ### Dataseries.remove(self, index) 
84 | 
85 | Removes a single element 
86 | 
87 | ``` 
88 | ({ 
89 | self = Dataseries -- 
90 | index = number -- The index to remove 
91 | }) 
92 | ``` 
93 | 
94 | _Return value_: self 
95 | 
96 | ### Dataseries.insert(self, index, value) 
97 | 
98 | Inserts a single element 
99 | 
100 | ``` 
101 | ({ 
102 | self = Dataseries -- 
103 | index = number -- The index to insert at 
104 | value = !table -- The value to insert 
105 | }) 
106 | ``` 
107 | 
108 | _Return value_: self -------------------------------------------------------------------------------- /doc/dataseries/statistics.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [statistics](#__Statistics__) 
2 | - [Dataseries.count_na](#Dataseries.count_na) 
3 | - [Dataseries.unique](#Dataseries.unique) 
4 | - [Dataseries.value_counts](#Dataseries.value_counts) 
5 | - [Dataseries.which_max](#Dataseries.which_max) 
6 | - [Dataseries.which_min](#Dataseries.which_min) 
7 | - [Dataseries.get_mode](#Dataseries.get_mode) 
8 | - [Dataseries.get_max_value](#Dataseries.get_max_value) 
9 | - [Dataseries.get_min_value](#Dataseries.get_min_value) 
10 | 
11 | 
12 | ## Statistics 
13 | 
14 | Here are the functions that gather commonly used descriptive statistics 
15 | 
16 | 
17 | ### Dataseries.count_na(self) 
18 | 
19 | Count missing values 
20 | 
21 | ``` 
22 | ({ 
23 | self = Dataseries -- 
24 | }) 
25 | ``` 
26 | 
27 | _Return value_: number 
28 | 
29 | ### Dataseries.unique(self[, as_keys][, as_raw]) 
30 | 
31 | Get unique elements 
32 | 
33 | ``` 
34 | ({ 
35 | self = Dataseries -- 
36 | [as_keys = boolean] -- return table with unique as keys and a count for frequency [default=false] 
37 | [as_raw = boolean] -- return table with raw data without categorical transformation [default=false] 
38 | }) 
39 | ``` 
40 | 
41 | _Return value_: tds.Vec with unique values or 
42 | tds.Hash if as_keys == true then the unique 
43 | value as key with an incremental integer 
44 | value => {'unique1':1, 'unique2':2, 'unique6':3} 
45 | 
46 | ### Dataseries.value_counts(self[, normalize][, dropna][, as_raw][, as_dataframe]) 
47 | 
48 | Counts the number of occurrences for each unique element (frequency/histogram). 
49 | 
50 | ``` 
51 | ({ 
52 | self = Dataseries -- 
53 | [normalize = boolean] -- If True then the object returned will contain the relative frequencies of 
54 | the unique values. [default=false] 
55 | [dropna = boolean] -- Don’t include counts of NaN (missing values). [default=true] 
56 | [as_raw = boolean] -- Use raw numerical values instead of the category label for categoricals [default=false] 
57 | [as_dataframe = boolean] -- Return a Dataframe with `value` and `count` columns [default=true] 
58 | }) 
59 | ``` 
60 | 
61 | _Return value_: Dataframe|table 
62 | 
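To illustrate, a hedged sketch of the frequency helpers; the return shapes are assumptions based on the signatures above:

```lua
-- Frequency table as a plain Lua table instead of a Dataframe
local counts = series:value_counts{as_dataframe = false}

-- Unique values as keys with an incremental integer, per the unique docs
local uniq = series:unique{as_keys = true}
```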
63 | ### Dataseries.which_max(self) 
64 | 
65 | Retrieves the index for the rows with the highest value. There can be more than one row 
66 | sharing the highest value. 
67 | 
68 | ``` 
69 | ({ 
70 | self = Dataseries -- 
71 | }) 
72 | ``` 
73 | 
74 | _Return value_: table with the highest indexes, max value 
75 | 
76 | ### Dataseries.which_min(self) 
77 | 
78 | Retrieves the index for the rows with the lowest value. There can be more than one row 
79 | sharing the lowest value. 
80 | 
81 | ``` 
82 | ({ 
83 | self = Dataseries -- 
84 | }) 
85 | ``` 
86 | 
87 | _Return value_: table with the lowest indexes, lowest value 
88 | 
89 | ### Dataseries.get_mode(self[, normalize][, dropna][, as_dataframe]) 
90 | 
91 | Gets the mode for a Dataseries. A mode is defined as the most frequent value. 
92 | Note that if two or more values are equally common then there are several modes. 
93 | The mode is useful as it can be viewed as any algorithm's most naive guess where 
94 | it always guesses the same value. 
95 | 
96 | ``` 
97 | ({ 
98 | self = Dataseries -- 
99 | [normalize = boolean] -- If True then the object returned will contain the relative frequencies of 
100 | the unique values. [default=false] 
101 | [dropna = boolean] -- Don’t include counts of NaN (missing values). [default=true] 
102 | [as_dataframe = boolean] -- Return a dataframe [default=true] 
103 | }) 
104 | ``` 
105 | 
106 | _Return value_: Table or Dataframe 
107 | 
108 | ### Dataseries.get_max_value(self) 
109 | 
110 | Gets the maximum value. Similar in function to which_max, but it will also return 
111 | the maximum integer value for categorical values. This can be useful when 
112 | deciding on the number of neurons in the final layer. 
113 | 
114 | ``` 
115 | ({ 
116 | self = Dataseries -- 
117 | }) 
118 | ``` 
119 | 
120 | _Return value_: number 
121 | 
122 | ### Dataseries.get_min_value(self) 
123 | 
124 | Gets the minimum value for a given column. Returns minimum values for all 
125 | numerical columns if none is provided. 
126 | 
127 | ``` 
128 | ({ 
129 | self = Dataseries -- 
130 | }) 
131 | ``` 
132 | 
133 | _Return value_: number -------------------------------------------------------------------------------- /doc/helper_classes/10_iterator.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [Df_Iterator and general about Dataframe's iterators](#__Df_Iterator and general about Dataframe's iterators__) 
2 | - [Df_Iterator](#Df_Iterator) 
3 | 
4 | 
5 | ## Df_Iterator and general about Dataframe's iterators 
6 | 
7 | The `torchnet` iterators allow a simple iteration over a dataset. If combined 
8 | with a list function you can make the iterator return a table with 
9 | the two key elements `input` and `target` that `tnt.SGDEngine` and 
10 | `tnt.OptimEngine` require. 
11 | 
12 | The Dataframe approach is to combine everything into a single iterator that directly 
13 | returns the training tensors. This is a complement to the subset `get_batch` 
14 | function and relies on the same core functions. 
15 | 
16 | Iterators implement two methods: 
17 | 
18 | - `run()` which returns a Lua iterator usable in a for loop. 
19 | - `exec(funcname, ...)` which executes a given funcname on the underlying dataset. 
20 | 
21 | Typical usage is achieved with a for loop: 
22 | ```lua 
23 | for sample in iterator:run() do 
24 | 
25 | end 
26 | ``` 
27 | 
28 | Iterators implement the `__call` event, so one might also use the `()` operator: 
29 | ```lua 
30 | for sample in iterator() do 
31 | 
32 | end 
33 | ``` 
34 | 
35 | **Important:** The `tnt.DatasetIterator` does not reset the iterator after running 
36 | to the end. In order to do this you must add a `reset_sampler` call in the endEpoch 
37 | hook for the engine: 
38 | 
39 | ```lua 
40 | engine.hooks.onEndEpoch = function(state) 
41 | state.iterator.dataset:reset_sampler() 
42 | end 
43 | ``` 
44 | 
45 | As torchnet is epoch-centered, all samplers will behave as if there was an underlying 
46 | epoch mechanism. E.g. 
the uniform sampler will never trigger a reset, but the epoch 
47 | hook will still be called as there is a "fake epoch" calculated by 
48 | `math.ceil(dataset:size()/batch_size)`. 
49 | 
50 | **Note**: The transform and filter functions are run before the 
51 | `to_tensor` call, as they are assumed to be more valuable with the raw data. As transformations 
52 | can also be useful after the tensors have been generated, the `target_transform` and `input_transform` 
53 | arguments have been added that allow transforming the two tensor elements in the return table. 
54 | 
55 | 
56 | ##### Df_Iterator(self, dataset, batch_size[, filter][, transform][, input_transform][, target_transform]) 
57 | 
58 | After creating your data split (`create_subsets`) you call `get_subset` and 
59 | get the subset that you need to feed to this method. Remember that you must define 
60 | the data and label retrievers that the `Batchframe` will use when calling 
61 | `to_tensor`. The default retrievers can be set through the `class_args` argument: 
62 | 
63 | ```lua 
64 | my_data:create_subsets{ 
65 | 	class_args = Df_Tbl({ 
66 | 		batch_args = Df_Tbl({ 
67 | 			data = function(row) return image_loader(row.filename) end, 
68 | 			label = Df_Array("Gender") 
69 | 		}) 
70 | 	}) 
71 | } 
72 | ``` 
73 | 
74 | ``` 
75 | ({ 
76 | self = Df_Iterator -- 
77 | dataset = Df_Subset -- 
78 | batch_size = number -- The size of the batches 
79 | [filter = function] -- is a closure which returns `true` if the given sample 
80 | should be considered or `false` if not. Note that filter is called _after_ 
81 | fetching the data in a threaded manner and _before_ the `to_tensor` is called. [has default value] 
82 | [transform = function] -- a function which maps the given sample to a new value. This transformation occurs before filtering. [has default value] 
83 | [input_transform = function] -- Allows transforming the input (data) values after the `Batchframe:to_tensor` call [has default value] 
84 | [target_transform = function] -- Allows transforming the target (label) values after the `Batchframe:to_tensor` call [has default value] 
85 | }) 
86 | ``` -------------------------------------------------------------------------------- /doc/helper_classes/11_paralleliterator.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [Df_ParallelIterator](#__Df_ParallelIterator__) 
2 | - [Df_ParallelIterator](#Df_ParallelIterator) 
3 | 
4 | 
5 | ## Df_ParallelIterator 
6 | 
7 | The Df_ParallelIterator allows parallel loading when calling the `to_tensor` 
8 | function. For details see the `Df_Iterator` docs. 
9 | 
10 | 
11 | ##### Df_ParallelIterator(self, dataset, batch_size[, init], nthread[, filter][, transform][, input_transform][, target_transform][, ordered]) 
12 | ``` 
13 | ({ 
14 | self = Df_ParallelIterator -- 
15 | dataset = Df_Subset -- The Dataframe subset to use for the iterator 
16 | batch_size = number -- The size of the batches 
17 | [init = function] -- `init(threadid)` (where threadid=1..nthread) is a closure which may 
18 | initialize the specified thread as needed. It loads 
19 | the libraries 'torch' and 'Dataframe' by default. [has default value] 
20 | nthread = number -- The number of threads used to parallelize is specified by `nthread`. 
21 | [filter = function] -- is a closure which returns `true` if the given sample 
22 | should be considered or `false` if not. Note that filter is called _after_ 
23 | fetching the data in a threaded manner and _before_ the `to_tensor` is called. 
[has default value] 
24 | [transform = function] -- a function which maps the given sample to a new value. This transformation occurs before filtering. [has default value] 
25 | [input_transform = function] -- Allows transforming the input (data) values after the `Batchframe:to_tensor` call [has default value] 
26 | [target_transform = function] -- Allows transforming the target (label) values after the `Batchframe:to_tensor` call [has default value] 
27 | [ordered = boolean] -- This option is particularly useful for repeatable experiments. 
28 | By default `ordered` is false, which means that order is not guaranteed by 
29 | `run()` (though often the ordering is similar in practice). 
30 | }) 
31 | ``` 
32 | 
33 | Allows iterating over a dataset in a threaded 
34 | manner. `Df_ParallelIterator:run()` guarantees that all samples 
35 | will be seen, but does not guarantee the order unless `ordered` is set to true. 
36 | 
37 | The purpose of this class is to have a minimal pre-processing cost. 
38 | The current implementation calls the `get_batch` inside the scope of the 
39 | main process while all the loaders, transformers etc. are moved into the threads. 
40 | When reading datasets on the fly from 
41 | disk (not loading them fully in memory), or performing complex 
42 | pre-processing, this can be of interest. 
43 | 
44 | A common error raised by this dataset is when `closure()` is not 
45 | serializable. Make sure that all [upvalues](http://www.lua.org/pil/27.3.3.html) of `closure()` are 
46 | serializable. It is recommended to avoid [upvalues](http://www.lua.org/pil/27.3.3.html) at all cost, 
47 | and to make sure you require all the appropriate torch packages needed to (de-)serialize 
48 | `closure()` in the `init()` function. 
49 | 
50 | For more information, check out the [threads package](https://github.com/torch/threads), 
51 | on which `Df_ParallelIterator` relies. -------------------------------------------------------------------------------- /doc/helper_classes/20_tbl.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [Df_Tbl](#__Df_Tbl__) 
2 | - [Df_Tbl.`__init`](#Df_Tbl.__init) 
3 | - [Df_Tbl.#](#Df_Tbl.#) 
4 | 
5 | 
6 | ## Df_Tbl 
7 | 
8 | The Df_Tbl is a class that is used to wrap a table. In contrast with Df_Array 
9 | and Df_Dict it does not check any input data. 
10 | 
11 | 
12 | ### Df_Tbl.__init(table) 
13 | 
14 | This is the fastest table wrapper, as it doesn't copy the original data. It should be used sparingly. 
15 | 
16 | 
17 | ### Df_Tbl.# 
18 | 
19 | Returns the number of elements -------------------------------------------------------------------------------- /doc/helper_classes/21_dict.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [Df_Dict](#__Df_Dict__) 
2 | - [Df_Dict.`__init`](#Df_Dict.__init) 
3 | - [Df_Dict.check_lengths()](#Df_Dict.check_lengths) 
4 | - [Df_Dict.set_keys](#Df_Dict.set_keys) 
5 | - [Df_Dict.[]](#Df_Dict.[]) 
6 | - [Df_Dict.#](#Df_Dict.#) 
7 | 
8 | 
9 | ## Df_Dict 
10 | 
11 | The Df_Dict is a class that is used to wrap a dictionary table. A dictionary table 
12 | has a string name corresponding to each key and an array as values, i.e. it may 
13 | not contain any tables. 
14 | 
15 | It is possible to access the Df_Dict's keys with the property `keys`. 
16 | The following properties are available: 
17 | - `Df_Dict.keys`: list of the keys 
18 | - `Df_Dict.length`: content size for each key 
19 | 
20 | ### Df_Dict.__init(table_data) 
21 | 
22 | Create a Df_Dict object given a table 
23 | 
24 | 
25 | ### Df_Dict.check_lengths() 
26 | 
27 | Ensure every column has the same size 
28 | 
29 | _Return value_: boolean 
30 | 
31 | ### Df_Dict.set_keys(table_data) 
32 | 
33 | Replace all the keys by the given values 
34 | 
35 | `table_data` must be a table and have the same item length as the keys 
36 | 
37 | 
38 | ### Df_Dict.[] 
39 | 
40 | Returns the value with the given key 
41 | - _Single integer_: returns the corresponding value 
42 | - _"$column_name"_: get a column by prepending the name with `$`, e.g. `"$a column name"` 
43 | 
44 | _Return value_: Table or single value 
45 | 
46 | 
47 | ### Df_Dict.# 
48 | 
49 | Returns the number of elements -------------------------------------------------------------------------------- /doc/helper_classes/22_array.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [Df_Array](#__Df_Array__) 
2 | - [Df_Array.`__init`](#Df_Array.__init) 
3 | - [Df_Array.[]](#Df_Array.[]) 
4 | - [Df_Array.#](#Df_Array.#) 
5 | 
6 | 
7 | ## Df_Array 
8 | 
9 | The Df_Array is a class that is used to wrap an array table. An array table 
10 | has no key names, it only uses numbers for indexing, and each element has to be 
11 | an atomic element, i.e. it may not contain any tables. 
12 | 
13 | 
14 | ### Df_Array.__init(...) 
15 | 
16 | Df_Array accepts 5 types of init values: 
17 | - single value (string, integer, float, etc) 
18 | - table 
19 | - torch.*Tensor 
20 | - Dataseries 
21 | - arguments list (e.g. Df_Array(1,2,3,4,5) ) 
22 | 
23 | 
24 | ### Df_Array.[] 
25 | 
26 | Returns the value at the given index 
27 | 
28 | 
29 | ### Df_Array.# 
30 | 
31 | Returns the number of elements 
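A compact sketch of how the three wrapper classes are typically used to disambiguate plain Lua tables in argcheck calls; the `$`-indexing follows the Df_Dict docs above:

```lua
local arr  = Df_Array(1, 2, 3)                -- positional values
local dict = Df_Dict{a = {1, 2}, b = {3, 4}}  -- column name -> values
local tbl  = Df_Tbl({anything = 'goes'})      -- no validation, no copy

print(#arr)        -- number of elements
print(dict['$a'])  -- access the column 'a'
```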
-------------------------------------------------------------------------------- /doc/helper_classes/README.md: -------------------------------------------------------------------------------- 
1 | # Documentation for helper classes 
2 | 
3 | This documentation has been auto-generated from code using the `argcheck` system. 
4 | 
5 | ## Table of contents (file-level) 
6 | 
7 | Below follows a more [detailed](#detailed) table of contents with links to 
8 | the different functions. Note that this list may be incomplete due to a failure to 
9 | add appropriate anchor tags during documentation. 
10 | 
11 | 
12 | - [Df_Iterator and general about Dataframe's iterators](10_iterator.md) 
13 | - [Df_ParallelIterator](11_paralleliterator.md) 
14 | - [Df_Tbl](20_tbl.md) 
15 | - [Df_Dict](21_dict.md) 
16 | - [Df_Array](22_array.md) 
17 | 
18 | ## Detailed table of contents (file-level + anchors) 
19 | 
20 | 
21 | - **[Df_Iterator and general about Dataframe's iterators](10_iterator.md)** 
22 | - [Df_Iterator](10_iterator.md#Df_Iterator) 
23 | - **[Df_ParallelIterator](11_paralleliterator.md)** 
24 | - [Df_ParallelIterator](11_paralleliterator.md#Df_ParallelIterator) 
25 | - **[Df_Tbl](20_tbl.md)** 
26 | - [Df_Tbl.`__init`](20_tbl.md#Df_Tbl.__init) 
27 | - [Df_Tbl.#](20_tbl.md#Df_Tbl.#) 
28 | - **[Df_Dict](21_dict.md)** 
29 | - [Df_Dict.`__init`](21_dict.md#Df_Dict.__init) 
30 | - [Df_Dict.check_lengths()](21_dict.md#Df_Dict.check_lengths) 
31 | - [Df_Dict.set_keys](21_dict.md#Df_Dict.set_keys) 
32 | - [Df_Dict.[]](21_dict.md#Df_Dict.[]) 
33 | - [Df_Dict.#](21_dict.md#Df_Dict.#) 
34 | - **[Df_Array](22_array.md)** 
35 | - [Df_Array.`__init`](22_array.md#Df_Array.__init) 
36 | - [Df_Array.[]](22_array.md#Df_Array.[]) 
37 | - [Df_Array.#](22_array.md#Df_Array.#) -------------------------------------------------------------------------------- /doc/sub_classes/README.md: -------------------------------------------------------------------------------- 
1 | # Documentation for sub classes 
2 | 
3 | This documentation has been auto-generated from code using the `argcheck` system. 
4 | 
5 | ## Table of contents (file-level) 
6 | 
7 | Below follows a more [detailed](#detailed) table of contents with links to 
8 | the different functions. Note that this list may be incomplete due to a failure to 
9 | add appropriate anchor tags during documentation. 
10 | 
11 | 
12 | - [Df_Subset](01_subset.md) 
13 | - [Batchframe](10_batchframe.md) 
14 | 
15 | ## Detailed table of contents (file-level + anchors) 
16 | 
17 | 
18 | - **[Df_Subset](01_subset.md)** 
19 | - [Df_Subset.`__init`](01_subset.md#Df_Subset.__init) 
20 | - [Df_Subset.`_clean`](01_subset.md#Df_Subset._clean) 
21 | - [Df_Subset.set_idxs](01_subset.md#Df_Subset.set_idxs) 
22 | - [Df_Subset.get_idx](01_subset.md#Df_Subset.get_idx) 
23 | - [Df_Subset.set_labels](01_subset.md#Df_Subset.set_labels) 
24 | - [Df_Subset.set_sampler](01_subset.md#Df_Subset.set_sampler) 
25 | - [Df_Subset.get_sampler](01_subset.md#Df_Subset.get_sampler) 
26 | - [Sampler: linear - Df_Subset.get_sampler_linear](01_subset.md#Df_Subset.get_sampler_linear) 
27 | - [Sampler: ordered - Df_Subset.get_sampler_ordered](01_subset.md#Df_Subset.get_sampler_ordered) 
28 | - [Sampler: uniform - Df_Subset.get_sampler_uniform](01_subset.md#Df_Subset.get_sampler_uniform) 
29 | - [Sampler: permutation - Df_Subset.get_sampler_permutation](01_subset.md#Df_Subset.get_sampler_permutation) 
30 | - [Sampler: label-uniform - Df_Subset.get_sampler_label_uniform](01_subset.md#Df_Subset.get_sampler_label_uniform) 
31 | - [Sampler: label-distribution - Df_Subset.get_sampler_label_distribution](01_subset.md#Df_Subset.get_sampler_label_distribution) 
32 | - [Sampler: label-permutation - Df_Subset.get_sampler_label_permutation](01_subset.md#Df_Subset.get_sampler_label_permutation) 
33 | - [Df_Subset.get_batch](01_subset.md#Df_Subset.get_batch) 
34 | - [Df_Subset.reset_sampler](01_subset.md#Df_Subset.reset_sampler) 
35 | - [Df_Subset.get_iterator](01_subset.md#Df_Subset.get_iterator) 
36 | - [Df_Subset.get_parallel_iterator](01_subset.md#Df_Subset.get_parallel_iterator) 
37 | - [Df_Subset.`__tostring__`](01_subset.md#Df_Subset.__tostring__) 
38 | - 
[Df_Subset.set_data_retriever](01_subset.md#Df_Subset.set_data_retriever) 
39 | - [Df_Subset.set_label_retriever](01_subset.md#Df_Subset.set_label_retriever) 
40 | - [Df_Subset.set_label_shape](01_subset.md#Df_Subset.set_label_shape) 
41 | - **[Batchframe](10_batchframe.md)** 
42 | - [Batchframe.`__init`](10_batchframe.md#Batchframe.__init) 
43 | - [Batchframe.set_data_retriever](10_batchframe.md#Batchframe.set_data_retriever) 
44 | - [Batchframe.get_data_retriever](10_batchframe.md#Batchframe.get_data_retriever) 
45 | - [Batchframe.set_label_retriever](10_batchframe.md#Batchframe.set_label_retriever) 
46 | - [Batchframe.get_label_retriever](10_batchframe.md#Batchframe.get_label_retriever) 
47 | - [Batchframe.set_label_shape](10_batchframe.md#Batchframe.set_label_shape) 
48 | - [Batchframe.to_tensor](10_batchframe.md#Batchframe.to_tensor) -------------------------------------------------------------------------------- /doc/utils/README.md: -------------------------------------------------------------------------------- 
1 | # Documentation for utils 
2 | 
3 | This documentation has been auto-generated from code using the `argcheck` system. 
4 | 
5 | ## Table of contents (file-level) 
6 | 
7 | Below follows a more [detailed](#detailed) table of contents with links to 
8 | the different functions. Note that this list may be incomplete due to a failure to 
9 | add appropriate anchor tags during documentation. 
10 | 
11 | 
12 | - [Utility functions](utils.md) 
13 | 
14 | ## Detailed table of contents (file-level + anchors) 
15 | 
16 | 
17 | - **[Utility functions](utils.md)** 
18 | - [trim](utils.md#trim) 
19 | - [trim_table_strings](utils.md#trim_table_strings) 
20 | - [table.array2hash](utils.md#table.array2hash) 
21 | - [get_variable_type](utils.md#get_variable_type) 
22 | - [warning](utils.md#warning) 
23 | - [convert_table_2_dataframe](utils.md#convert_table_2_dataframe) -------------------------------------------------------------------------------- /doc/utils/utils.md: -------------------------------------------------------------------------------- 
1 | # API documentation for [utility functions](#__Utility functions__) 
2 | - [trim](#trim) 
3 | - [trim_table_strings](#trim_table_strings) 
4 | - [table.array2hash](#table.array2hash) 
5 | - [get_variable_type](#get_variable_type) 
6 | - [warning](#warning) 
7 | - [convert_table_2_dataframe](#convert_table_2_dataframe) 
8 | 
9 | 
10 | ## Utility functions 
11 | 
12 | Here are utility functions that are not specific to the dataframe but add general 
13 | Lua functionality. 
14 | 
15 | 
16 | ### trim(s[, ignore]) 
17 | 
18 | Trims a string from whitespace chars 
19 | 
20 | ``` 
21 | ({ 
22 | s = string -- The string to trim 
23 | [ignore = number] -- Useful when the string is given directly by the gsub function; gsub also returns a number, which can be ignored through this argument [default=false] 
24 | }) 
25 | ``` 
26 | 
27 | _Return value_: string 
28 | 
29 | ### trim_table_strings(t) 
30 | 
31 | Trims the strings of a table from whitespace chars 
32 | 
33 | ``` 
34 | ({ 
35 | t = table -- The table with strings to trim 
36 | }) 
37 | ``` 
38 | 
39 | _Return value_: string 
40 | 
41 | ### table.array2hash(array) 
42 | 
43 | Converts an array to a hash table with numbers corresponding to the original 
44 | element's position in the array. Intended for use with arrays where all 
45 | values are unique. 
46 | 
47 | ``` 
48 | ({ 
49 | array = table -- An array of elements 
50 | }) 
51 | ``` 
52 | 
53 | _Return value_: table with string keys 
54 | 
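For instance, a small sketch of the two helpers above:

```lua
print(trim('  hello  '))  -- 'hello'

local hash = table.array2hash({'a', 'b', 'c'})
-- hash is expected to be {a = 1, b = 2, c = 3}
```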
55 | ### get_variable_type(value[, prev_type]) 
56 | 
57 | Checks the variable type for a string/numeric/boolean variable. Missing values 
58 | `nan` or "" are ignored. If a previous value is provided then the new variable 
59 | type will be in relation to the previous. I.e. if you provide an integer after 
60 | previously having seen a double, then the type will still be double. 
61 | 
62 | ``` 
63 | ({ 
64 | value = !table -- The value to type-check 
65 | [prev_type = string] -- The previous value type 
66 | }) 
67 | ``` 
68 | 
69 | _Return value_: string of type: 'boolean', 'integer', 'long', 'double', or 'string' 
70 | 
71 | ### warning(ARGP) 
72 | 
73 | A function for printing warnings, i.e. events that shouldn't occur but are not 
74 | serious enough to throw an error. If you want to suppress the warnings then set 
75 | `no_warnings = true` in the global environment. 
76 | 
77 | @ARPT 
78 | 
79 | ### convert_table_2_dataframe(tbl[, value_name][, key_name]) 
80 | 
81 | Converts a table to a Dataframe 
82 | 
83 | ``` 
84 | ({ 
85 | tbl = Df_Tbl -- 
86 | [value_name = string] -- The name of the value column [default=value] 
87 | [key_name = string] -- The name of the key column [default=key] 
88 | }) 
89 | ``` 
90 | 
91 | _Return value_: Dataframe -------------------------------------------------------------------------------- /examples/Facebook license/LICENSE: -------------------------------------------------------------------------------- 
1 | BSD License 
2 | 
3 | For Torchnet software 
4 | 
5 | Copyright (c) 2016-present, Facebook, Inc. All rights reserved. 
6 | 
7 | Redistribution and use in source and binary forms, with or without modification, 
8 | are permitted provided that the following conditions are met: 
9 | 
10 | * Redistributions of source code must retain the above copyright notice, this 
11 | list of conditions and the following disclaimer. 
12 | 
13 | * Redistributions in binary form must reproduce the above copyright notice, 
14 | this list of conditions and the following disclaimer in the documentation 
15 | and/or other materials provided with the distribution. 
16 | 
17 | * Neither the name Facebook nor the names of its contributors may be used to 
18 | endorse or promote products derived from this software without specific 
19 | prior written permission. 
20 | 
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 
22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 
25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 
28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
31 | -------------------------------------------------------------------------------- /examples/Facebook license/PATENTS: -------------------------------------------------------------------------------- 
1 | Additional Grant of Patent Rights Version 2 
2 | 
3 | "Software" means the Torchnet software distributed by Facebook, Inc. 
4 | 
5 | Facebook, Inc. ("Facebook") hereby grants to each recipient of the Software 
6 | ("you") a perpetual, worldwide, royalty-free, non-exclusive, irrevocable 
7 | (subject to the termination provision below) license under any Necessary 
8 | Claims, to make, have made, use, sell, offer to sell, import, and otherwise 
9 | transfer the Software. For avoidance of doubt, no license is granted under 
10 | Facebook’s rights in any patent claims that are infringed by (i) modifications 
11 | to the Software made by you or any third party or (ii) the Software in 
12 | combination with any software or other technology. 
13 | 
14 | The license granted hereunder will terminate, automatically and without notice, 
15 | if you (or any of your subsidiaries, corporate affiliates or agents) initiate 
16 | directly or indirectly, or take a direct financial interest in, any Patent 
17 | Assertion: (i) against Facebook or any of its subsidiaries or corporate 
18 | affiliates, (ii) against any party if such Patent Assertion arises in whole or 
19 | in part from any software, technology, product or service of Facebook or any of 
20 | its subsidiaries or corporate affiliates, or (iii) against any party relating 
21 | to the Software. Notwithstanding the foregoing, if Facebook or any of its 
22 | subsidiaries or corporate affiliates files a lawsuit alleging patent 
23 | infringement against you in the first instance, and you respond by filing a 
24 | patent infringement counterclaim in that lawsuit against that party that is 
25 | unrelated to the Software, the license granted hereunder will not terminate 
26 | under section (i) of this paragraph due to such counterclaim. 
27 | 
28 | A "Necessary Claim" is a claim of a patent owned by Facebook that is 
29 | necessarily infringed by the Software standing alone. 
30 | 
31 | A "Patent Assertion" is any lawsuit or other action alleging direct, indirect, 
32 | or contributory infringement or inducement to infringe any patent, including a 
33 | cross-claim or counterclaim. 
34 | -------------------------------------------------------------------------------- /examples/mnist_example.lua: -------------------------------------------------------------------------------- 
1 | --[[ 
2 | Copyright (c) 2016-present, Facebook, Inc. 
3 | All rights reserved. 
4 | This source code is licensed under the BSD-style license found in the 
5 | LICENSE file in the Facebook license in the same directory as this file. An 
6 | additional grant of patent rights can be found in the PATENTS file in the 
7 | same directory. 
8 | ]]-- 
9 | -- load torchnet: 
10 | local tnt = require 'torchnet' 
11 | 
12 | require 'Dataframe' 
13 | 
14 | -- use GPU or not: 
15 | local cmd = torch.CmdLine() 
16 | cmd:option('-usegpu', false, 'use gpu for training') 
17 | cmd:option('-parallel', false, 'use multithreaded loading for training') 
18 | 
19 | local config = cmd:parse(arg) 
20 | print(string.format('running on %s', config.usegpu and 'GPU' or 'CPU')) 
21 | print(string.format('using %s execution', config.parallel and 'parallel' or 'single thread')) 
22 | 
23 | -- function that sets up the dataset iterator: 
24 | local function getIterator(mode) 
25 | 	-- load MNIST dataset: 
26 | 	local mnist = require 'mnist' 
27 | 	local mnist_dataset = mnist[mode .. 'dataset']() 
28 | 
29 | 	-- Create a Dataframe with the label. 
The actual images will be loaded 
30 | -- as an external resource 
31 | local df = Dataframe( 
32 | Df_Dict{ 
33 | label = mnist_dataset.label:totable(), 
34 | row_id = torch.range(1, mnist_dataset.data:size(1)):totable() 
35 | }) 
36 | 
37 | -- Since the mnist package has already taken care of the data 
38 | -- splitting, we create a single subset 
39 | df:create_subsets{ 
40 | subsets = Df_Dict{core = 1}, 
41 | data_retriever = function(row) 
42 | return ext_resource[row.row_id] 
43 | end, 
44 | label_retriever = Df_Array("label") 
45 | } 
46 | 
47 | local subset = df["/core"] 
48 | if (config.parallel) then 
49 | return Df_ParallelIterator{ 
50 | dataset = subset, 
51 | batch_size = 128, 
52 | init = function(idx) 
53 | -- Load the libraries needed 
54 | require 'torch' 
55 | require 'Dataframe' 
56 | 
57 | -- Load the dataset's external resource 
58 | local mnist = require 'mnist' 
59 | local mnist_dataset = mnist[mode .. 'dataset']() 
60 | ext_resource = mnist_dataset.data:reshape(mnist_dataset.data:size(1), 
61 | mnist_dataset.data:size(2) * mnist_dataset.data:size(3)):double() 
62 | end, 
63 | nthread = 2, 
64 | target_transform = function(val) 
65 | return val + 1 
66 | end 
67 | } 
68 | else 
69 | ext_resource = mnist_dataset.data:reshape(mnist_dataset.data:size(1), 
70 | mnist_dataset.data:size(2) * mnist_dataset.data:size(3)):double() 
71 | 
72 | return Df_Iterator{ 
73 | dataset = subset, 
74 | batch_size = 128, 
75 | target_transform = function(val) 
76 | return val + 1 
77 | end 
78 | } 
79 | end 
80 | end 
81 | 
82 | -- set up logistic regressor: 
83 | local net = nn.Sequential():add(nn.Linear(784,10)) 
84 | local criterion = nn.CrossEntropyCriterion() 
85 | 
86 | -- set up training engine: 
87 | local engine = tnt.SGDEngine() 
88 | local meter = tnt.AverageValueMeter() 
89 | local clerr = tnt.ClassErrorMeter{topk = {1}} 
90 | engine.hooks.onStartEpoch = function(state) 
91 | meter:reset() 
92 | clerr:reset() 
93 | end 
94 | engine.hooks.onForwardCriterion = function(state) 
95 | meter:add(state.criterion.output) 
96 | clerr:add(state.network.output, state.sample.target) 
97 | if state.training then 
98 | print(string.format('avg. loss: %2.2f; avg. error: %2.2f', 
99 | meter:value(), clerr:value{k = 1})) 
100 | end 
101 | end 
102 | -- After each epoch we need to invoke the sampler reset (only needed for some samplers) 
103 | engine.hooks.onEndEpoch = function(state) 
104 | print("End epoch no " .. 
state.epoch) 
105 | state.iterator.dataset:reset_sampler() 
106 | end 
107 | 
108 | -- set up GPU training: 
109 | if config.usegpu then 
110 | 
111 | -- copy model to GPU: 
112 | require 'cunn' 
113 | net = net:cuda() 
114 | criterion = criterion:cuda() 
115 | 
116 | -- copy sample to GPU buffer: 
117 | local igpu, tgpu = torch.CudaTensor(), torch.CudaTensor() 
118 | engine.hooks.onSample = function(state) 
119 | igpu:resize(state.sample.input:size() ):copy(state.sample.input) 
120 | tgpu:resize(state.sample.target:size()):copy(state.sample.target) 
121 | state.sample.input = igpu 
122 | state.sample.target = tgpu 
123 | end -- alternatively, this logic can be implemented via a TransformDataset 
124 | end 
125 | 
126 | -- train the model: 
127 | engine:train{ 
128 | network = net, 
129 | iterator = getIterator('train'), 
130 | criterion = criterion, 
131 | lr = 0.2, 
132 | maxepoch = 3, 
133 | } 
134 | 
135 | -- measure test loss and error: 
136 | meter:reset() 
137 | clerr:reset() 
138 | engine:test{ 
139 | network = net, 
140 | iterator = getIterator('test'), 
141 | criterion = criterion, 
142 | } 
143 | print("\n ***** Done *****") 
144 | print(string.format('test loss: %2.2f; test error: %2.2f', 
145 | meter:value(), clerr:value{k = 1})) 
146 | 
-------------------------------------------------------------------------------- /helper_classes/10_iterator.lua: -------------------------------------------------------------------------------- 
1 | -- Skip if the Df_Iterator has already been loaded via paralleliterator 
2 | if (Df_Iterator) then 
3 | return true 
4 | end 
5 | 
6 | local argcheck = require 'argcheck' 
7 | local doc = require 'argcheck.doc' 
8 | local torchnet 
9 | if (doc.__record) then 
10 | doc.stop() 
11 | torchnet = require "torchnet" 
12 | doc.record() 
13 | else 
14 | torchnet = require "torchnet" 
15 | end 
16 | 
17 | 
18 | doc[[ 
19 | ## Df_Iterator and general notes on Dataframe's iterators 
20 | 
21 | The `torchnet` iterators allow simple iteration over a dataset. Combined 
22 | with a list function, you can make the iterator return a table with 
23 | the two key elements `input` and `target` that `tnt.SGDEngine` and 
24 | `tnt.OptimEngine` require. 
25 | 
26 | The Dataframe approach is to combine everything into a single iterator that 
27 | returns the training tensors. This complements the subset's `get_batch` 
28 | function and relies on the same core functions. 
29 | 
30 | Iterators implement two methods: 
31 | 
32 | - `run()` which returns a Lua iterator usable in a for loop. 
33 | - `exec(funcname, ...)` which executes a given funcname on the underlying dataset. 
34 | 
35 | Typical usage is achieved with a for loop: 
36 | ```lua 
37 | for sample in iterator:run() do 
38 | 
39 | end 
40 | ``` 
41 | 
42 | Iterators implement the `__call` event, so one might also use the `()` operator: 
43 | ```lua 
44 | for sample in iterator() do 
45 | 
46 | end 
47 | ``` 
48 | 
49 | **Important:** The `tnt.DatasetIterator` does not reset the iterator after running 
50 | to the end. In order to do this you must add a `reset_sampler` call in the `onEndEpoch` 
51 | hook for the engine: 
52 | 
53 | ```lua 
54 | engine.hooks.onEndEpoch = function(state) 
55 | state.iterator.dataset:reset_sampler() 
56 | end 
57 | ``` 
58 | 
59 | As torchnet is epoch-centered, all samplers behave as if there were an underlying 
60 | epoch mechanism. E.g. the uniform sampler never triggers a reset, but the epoch 
61 | hook is still called as there is a "fake epoch" calculated by 
62 | `math.ceil(dataset:size()/batch_size)`. 
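
For example, a minimal sketch of the fake-epoch arithmetic (the 60,000-row size is only an illustrative assumption, matching the standard MNIST training set):

```lua
local n_rows = 60000     -- assumed value of dataset:size()
local batch_size = 128
-- one "fake epoch" is the number of batches needed to cover the subset once
print(math.ceil(n_rows / batch_size)) --> 469
```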
63 | 
64 | **Note**: The transform and filter functions are run before `to_tensor`, 
65 | as they are assumed to be more valuable with the raw data. As transformations 
66 | can also be useful after the tensors have been generated, the `target_transform` and `input_transform` 
67 | arguments have been added; they allow transforming the two tensor elements in the return table. 
68 | 
69 | ]] 
70 | 
71 | local Df_Iterator, parent_class = torch.class('Df_Iterator', 'tnt.DatasetIterator') 
72 | 
73 | -- iterate over a dataset 
74 | Df_Iterator.__init = argcheck{ 
75 | doc = [[ 
76 | 
77 | ##### Df_Iterator(@ARGP) 
78 | 
79 | After creating your data split (`create_subsets`) you call `get_subset` to 
80 | get the subset that you need to feed to this method. Remember that you must define 
81 | the data and label retrievers that the `Batchframe` will use when calling 
82 | `to_tensor`. The default retrievers can be set through the `class_args` argument: 
83 | 
84 | ```lua 
85 | my_data:create_subsets{ 
86 | class_args = Df_Tbl({ 
87 | batch_args = Df_Tbl({ 
88 | data = function(row) return image_loader(row.filename) end, 
89 | label = Df_Array("Gender") 
90 | }) 
91 | }) 
92 | } 
93 | ``` 
94 | 
95 | @ARGT 
96 | 
97 | ]], 
98 | {name='self', type='Df_Iterator'}, 
99 | {name='dataset', type='Df_Subset'}, 
100 | {name="batch_size", type="number", doc="The size of the batches"}, 
101 | {name='filter', type='function', default=function(sample) return true end, 
102 | doc=[[is a closure which returns `true` if the given sample 
103 | should be considered or `false` if not. Note that filter is called _after_ 
104 | fetching the data in a threaded manner and _before_ `to_tensor` is called.]]}, 
105 | {name='transform', type='function', default=function(sample) return sample end, 
106 | doc='a function which maps the given sample to a new value. This transformation occurs before filtering.'}, 
107 | {name='input_transform', type='function', default=function(val) return val end, 
108 | doc="Allows transforming the input (data) values after the `Batchframe:to_tensor` call"}, 
109 | {name='target_transform', type='function', default=function(val) return val end, 
110 | doc="Allows transforming the target (label) values after the `Batchframe:to_tensor` call"}, 
111 | call = function(self, dataset, batch_size, filter, transform, input_transform, target_transform) 
112 | assert(dataset.batch_args, 
113 | "If you want to use the iterator you must prespecify the batch data/label loaders") 
114 | assert(isint(batch_size) and batch_size > 0, "The batch size must be a positive integer") 
115 | 
116 | self.dataset = dataset 
117 | 
118 | function self.run() 
119 | local size = math.ceil(self:exec("size")/batch_size) 
120 | local idx = 1 -- TODO: Should the idx be skipped since the Dataframe implementation doesn't need it? 
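-- The closure returned below is the actual Lua iterator consumed by the
-- for loop; each call fetches one batch until the fake epoch is exhausted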
121 | return function() 
122 | while idx <= size do 
123 | local sample, reset = self:exec("get_batch", batch_size) 
124 | 
125 | if (reset) then 
126 | idx = size + 1 
127 | else 
128 | idx = idx + 1 
129 | end 
130 | 
131 | -- The samplers may return a nil value if a reset is needed 
132 | if (sample) then 
133 | sample = transform(sample) 
134 | 
135 | -- Only return samples that pass the filter 
136 | if (filter(sample)) then 
137 | local input, target = sample:to_tensor() 
138 | return { 
139 | input = input_transform(input), 
140 | target = target_transform(target) 
141 | } 
142 | end 
143 | end 
144 | end -- End while 
145 | 
146 | end 
147 | end 
148 | end} 
149 | 
-------------------------------------------------------------------------------- /helper_classes/20_tbl.lua: -------------------------------------------------------------------------------- 
1 | require 'torch' 
2 | 
3 | local argcheck = require "argcheck" 
4 | local doc = require "argcheck.doc" 
5 | 
6 | doc[[ 
7 | 
8 | ## Df_Tbl 
9 | 
10 | The Df_Tbl is a class that is used to wrap a table. In contrast to Df_Array 
11 | and Df_Dict, it does not check any input data. 
12 | 
13 | ]] 
14 | 
15 | -- create class object 
16 | local dtbl = torch.class('Df_Tbl') 
17 | 
18 | doc[[ 
19 | 
20 | ### Df_Tbl.__init(table) 
21 | 
22 | This is the fastest table wrapper since it does not copy the original data. It should be used sparingly. 
23 | 
24 | ]] 
25 | function dtbl:__init(table_data) 
26 | self.data = table_data 
27 | end 
28 | 
29 | doc[[ 
30 | 
31 | ### Df_Tbl.# 
32 | 
33 | Returns the number of elements 
34 | 
35 | ]] 
36 | dtbl.__len__ = argcheck{ 
37 | {name="self", type="Df_Tbl"}, 
38 | {name="other", type="Df_Tbl"}, 
39 | call=function(self) 
40 | return table.exact_length(self.data) 
41 | end} 
42 | 
43 | return dtbl 
44 | 
-------------------------------------------------------------------------------- /helper_classes/21_dict.lua: -------------------------------------------------------------------------------- 
1 | require 'torch' 
2 | 
3 | local argcheck = require "argcheck" 
4 | local doc = require "argcheck.doc" 
5 | 
6 | doc[[ 
7 | 
8 | ## Df_Dict 
9 | 
10 | The Df_Dict is a class that is used to wrap a dictionary table. A dictionary table 
11 | has a string name corresponding to each key and an array (or a single value) as value, 
12 | i.e. the values may not contain any nested tables. 
13 | 
14 | The following properties are available: 
15 | 
16 | - `Df_Dict.keys`: list of the keys 
17 | - `Df_Dict.length`: content length for each key 
18 | ]] 
19 | 
20 | -- create class object 
21 | local dict = torch.class('Df_Dict') 
22 | 
23 | doc[[ 
24 | 
25 | ### Df_Dict.__init(table_data) 
26 | 
27 | Create a Df_Dict object given a table 
28 | 
29 | ]] 
30 | function dict:__init(table_data) 
31 | local dict_data = {} 
32 | local dict_lengths = {} -- lengths of each key's value 
33 | local dict_keys = {} 
34 | 
35 | 
36 | assert(torch.type(table_data) == "table", "Argument must be a table") 
37 | 
38 | for k,v in pairs(table_data) do 
39 | dict_lengths[k] = 0 
40 | 
41 | -- Check dimension 
42 | if (torch.type(v) == "table") then 
43 | for i=1,#v do 
44 | assert(type(v[i]) ~= "table", 
45 | ("For key '%s' at position %d the value is a table, which isn't allowed"):format(k, i)) 
46 | dict_lengths[k] = dict_lengths[k] + 1 
47 | end 
48 | else 
49 | dict_lengths[k] = 1 
50 | end 
51 | 
52 | -- store the key in another table for future access 
53 | table.insert(dict_keys,k) 
54 | 
55 | dict_data[k] = v 
56 | end 
57 | 
58 | self.keys = dict_keys 
59 | self.data = dict_data 
60 | self.length = dict_lengths 
61 | end 
62 | 
63 | doc[[ 
64 | 
65 | ### Df_Dict.check_lengths() 
66 | 
67 | Ensures every column has the same size 
68 | 
69 | _Return value_: boolean 
70 | ]] 
71 | function dict:check_lengths() 
72 | local previous_length = self.length[self.keys[1]] 
73 | 
74 | for key,value in pairs(self.length) do 
75 | if previous_length ~= value then 
76 | return false 
77 | end 
78 | 
79 | previous_length = self.length[key] 
80 | end 
81 | 
82 | return true 
83 | end 
84 | 
85 | doc[[ 
86 | 
87 | ### Df_Dict.set_keys(table_data) 
88 | 
89 | Replaces all the keys with the given values 
90 | 
91 | `table_data` must be a table with the same number of items as the current keys 
92 | 
93 | ]] 
94 | function dict:set_keys(table_data) 
95 | assert(torch.type(table_data) == "table", "You must provide a table as argument") 
96 | assert(#table_data == #self.keys, 
97 | ("The keys you provided (%d items) do not match the number of current keys (%d items)") 
98 | :format(#table_data,#self.keys)) 
99 | 
100 | local temp_data = {} 
101 | 
102 | for i=1,#self.keys do 
103 | local old_key = self.keys[i] 
104 | local new_key = table_data[i] 
105 | 
106 | temp_data[new_key] = self.data[old_key] 
107 | end 
108 | 
109 | self.keys = table_data 
110 | self.data = temp_data 
111 | end 
112 | 
113 | doc[[ 
114 | 
115 | ### Df_Dict.[] 
116 | 
117 | Returns the value with the given key 
118 | - _Single integer_: returns the corresponding value 
119 | - _"$column_name"_: gets a column by prepending the name with `$`, e.g. 
`"$a column name"` 
120 | 
121 | _Return value_: Table or single value 
122 | 
123 | ]] 
124 | function dict:__index__(key) 
125 | if (torch.type(key) == "number") then 
126 | return self.data[key], true 
127 | -- Index a column using a $ at the beginning of a string 
128 | elseif (torch.type(key) == "string" and key:match("^[$]")) then 
129 | local key_name = key:gsub("^[$]", "") 
130 | return self.data[key_name], true 
131 | end 
132 | 
133 | return false 
134 | end 
135 | 
136 | function dict:__newindex__(index) 
137 | return false 
138 | end 
139 | 
140 | doc[[ 
141 | 
142 | ### Df_Dict.# 
143 | 
144 | Returns the number of elements 
145 | 
146 | ]] 
147 | dict.__len__ = argcheck{ 
148 | {name="self", type="Df_Dict"}, 
149 | {name="other", type="Df_Dict"},-- used by lua when invoking #myDict 
150 | call=function(self) 
151 | return table.exact_length(self.data) 
152 | end} 
153 | 
154 | return dict 
155 | 
-------------------------------------------------------------------------------- /helper_classes/22_array.lua: -------------------------------------------------------------------------------- 
1 | require 'torch' 
2 | 
3 | 
4 | local argcheck = require "argcheck" 
5 | local doc = require "argcheck.doc" 
6 | 
7 | doc[[ 
8 | 
9 | ## Df_Array 
10 | 
11 | The Df_Array is a class that is used to wrap an array table. An array table 
12 | has no key names; it only uses numbers for indexing, and each element has to be 
13 | an atomic element, i.e. it may not contain any tables. 
14 | 
15 | ]] 
16 | 
17 | -- create class object 
18 | local da = torch.class('Df_Array') 
19 | 
20 | 
21 | doc[[ 
22 | 
23 | ### Df_Array.__init(...) 
24 | 
25 | Df_Array accepts five types of init values: 
26 | - single value (string, integer, float, etc) 
27 | - table 
28 | - torch.*Tensor 
29 | - Dataseries 
30 | - argument list (e.g. Df_Array(1,2,3,4,5)) 
31 | 
32 | ]] 
33 | -- (...) allows calling Df_Array with any number of arguments 
34 | function da:__init(...) 
35 | local arg = {...} 
36 | 
37 | -- If there is only one value, which can be 
38 | -- a simple type (string, number, etc), a table or a tensor 
39 | if (#arg == 1 and 
40 | (torch.type(arg[1]) == 'table' or 
41 | torch.isTensor(arg[1]) or 
42 | torch.type(arg[1]) == "Dataseries")) then 
43 | -- If this is the case, the arg var is set to its single value 
44 | arg = arg[1] 
45 | end 
46 | 
47 | local array_data = {} 
48 | if (torch.isTensor(arg)) then 
49 | -- If Df_Array is initialized with a tensor, 
50 | -- it is simply converted into a table and set 
51 | array_data = arg:totable() 
52 | elseif (torch.type(arg) == "Dataseries") then 
53 | -- The same applies to a Dataseries 
54 | array_data = arg:to_table() 
55 | else 
56 | -- If there are multiple arguments or 
57 | -- a table (thanks to the #arg == 1 condition above), 
58 | -- the values are copied element by element; 
59 | -- in the case of a table, this discards any 
60 | -- non-numerical keys and keeps only numerical indexes 
61 | for i=1,#arg do 
62 | assert(type(arg[i]) ~= "table", 
63 | ("The Dataframe array cannot contain tables - see position %d in your input"):format(i)) 
64 | array_data[i] = arg[i] 
65 | end 
66 | end 
67 | 
68 | self.data = array_data 
69 | end 
70 | 
71 | 
72 | doc[[ 
73 | 
74 | ### Df_Array.[] 
75 | 
76 | Returns the value at the given index 
77 | 
78 | ]] 
79 | function da:__index__(index) 
80 | if (torch.type(index) == "number") then 
81 | return self.data[index], true 
82 | end 
83 | 
84 | return false 
85 | end 
86 | 
87 | function da:__newindex__(index) 
88 | return false 
89 | end 
90 | 
91 | 
92 | doc[[ 
93 | 
94 | ### Df_Array.# 
95 | 
96 | Returns the number of elements 
97 | 
98 | ]] 
99 | da.__len__ = argcheck{ 
100 | {name="self", type="Df_Array"}, 
101 | {name="other", type="Df_Array"},-- used by lua when invoking #myArray 
102 | call=function(self) 
103 | return #self.data 
104 | end} 
105 | 
106 | return da 
107 | 
-------------------------------------------------------------------------------- /helper_classes/Facebok license: -------------------------------------------------------------------------------- 
1 | ../examples/Facebook license/ 
-------------------------------------------------------------------------------- /init.lua: -------------------------------------------------------------------------------- 
1 | local paths = require 'paths' 
2 | local dataframe_dir = string.gsub(paths.thisfile(), "[^/]+$", "") 
3 | 
4 | -- Custom argument checks 
5 | local argcheck_file = dataframe_dir .. "argcheck.lua" 
6 | assert(loadfile(argcheck_file))() 
7 | -- Custom busted assertions, only needed for running tests 
8 | local assert_file = dataframe_dir .. "custom_assertions.lua" 
9 | if (paths.filep(assert_file)) then 
10 | assert(loadfile(assert_file))() 
11 | end 
12 | 
13 | -- Get the loader function and start by making utils available to all 
14 | local loader_file = dataframe_dir .. "utils/loader.lua" 
15 | assert(loadfile(loader_file))() 
16 | load_dir_files(dataframe_dir .. "utils/") 
17 | 
18 | -- Load all the classes 
19 | load_dir_files(dataframe_dir .. "helper_classes/") 
20 | 
21 | load_dir_files(dataframe_dir .. "dataseries/") 
22 | 
23 | load_dir_files(dataframe_dir .. "dataframe/") 
24 | 
25 | load_dir_files(dataframe_dir .. "sub_classes/") 
26 | 
27 | return Dataframe 
28 | 
-------------------------------------------------------------------------------- /rocks/torch-dataframe-1.0-0.rockspec: -------------------------------------------------------------------------------- 
1 | package = "torch-dataframe" 
2 | version = "1.0-0" 
3 | source = { 
4 | url = "https://github.com/alexmili/torch-dataframe/archive/v1.0-0.tar.gz", 
5 | dir = "torch-dataframe-1.0-0" 
6 | } 
7 | description = { 
8 | summary = "A Dataframe class for Torch", 
9 | detailed = [[ 
10 | Dataframe is a Torch7 class to load and manipulate 
11 | Kaggle-style CSVs inspired from R's and pandas' Dataframes. 
12 | ]], 13 | homepage = "https://github.com/alexmili/torch-dataframe", 14 | license = "MIT/X11", 15 | maintainer = "AlexMili" 16 | } 17 | dependencies = { 18 | "lua ~> 5.1", 19 | "torch >= 7.0", 20 | "luafilesystem >= 1.6.3" 21 | } 22 | build = { 23 | type = 'builtin', 24 | modules = { 25 | ["Dataframe.init"] = 'init.lua', 26 | ["Dataframe.utils"] = 'utils.lua', 27 | ["Dataframe.main"] = 'main.lua', 28 | ["Dataframe.Extensions.categorical"] = 'Extensions/categorical.lua', 29 | ["Dataframe.Extensions.load_batch"] = 'Extensions/load_batch.lua', 30 | ["Dataframe.Extensions.load_data"] = 'Extensions/load_data.lua', 31 | ["Dataframe.Extensions.missing_data"] = 'Extensions/missing_data.lua', 32 | ["Dataframe.Extensions.output"] = 'Extensions/output.lua', 33 | ["Dataframe.Extensions.save_data"] = 'Extensions/save_data.lua', 34 | ["Dataframe.Extensions.select_set_update"] = 'Extensions/select_set_update.lua', 35 | ["Dataframe.Extensions.statistics"] = 'Extensions/statistics.lua' 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /rocks/torch-dataframe-1.1-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "torch-dataframe" 2 | version = "1.1-0" 3 | source = { 4 | url = "https://github.com/alexmili/torch-dataframe/archive/v1.1-0.tar.gz", 5 | dir = "torch-dataframe-1.1-0" 6 | } 7 | description = { 8 | summary = "A Dataframe class for Torch", 9 | detailed = [[ 10 | Dataframe is a Torch7 class to load and manipulate 11 | Kaggle-style CSVs inspired from R's and pandas' Dataframes. 12 | ]], 13 | homepage = "https://github.com/alexmili/torch-dataframe", 14 | license = "MIT/X11", 15 | maintainer = "AlexMili" 16 | } 17 | dependencies = { 18 | "lua ~> 5.1", 19 | "torch >= 7.0", 20 | "argcheck >= 2.0", 21 | "luafilesystem >= 1.6.3" 22 | } 23 | build = { 24 | type = 'builtin', 25 | modules = { 26 | ["Dataframe.init"] = 'init.lua', 27 | ["Dataframe.utils"] = 'utils.lua', 28 | ["Dataframe.argcheck"] = 'argcheck.lua', 29 | ["Dataframe.main"] = 'main.lua', 30 | ["Dataframe.Extensions.categorical"] = 'Extensions/categorical.lua', 31 | ["Dataframe.Extensions.column"] = 'Extensions/column.lua', 32 | ["Dataframe.Extensions.load_batch"] = 'Extensions/load_batch.lua', 33 | ["Dataframe.Extensions.load_data"] = 'Extensions/load_data.lua', 34 | ["Dataframe.Extensions.missing_data"] = 'Extensions/missing_data.lua', 35 | ["Dataframe.Extensions.output"] = 'Extensions/output.lua', 36 | ["Dataframe.Extensions.export_data"] = 'Extensions/export_data.lua', 37 | ["Dataframe.Extensions.select_set_update"] = 'Extensions/select_set_update.lua', 38 | ["Dataframe.Extensions.statistics"] = 'Extensions/statistics.lua', 39 | ["Dataframe.helper_classes.array"] = 'helper_classes/array.lua', 40 | ["Dataframe.helper_classes.dict"] = 'helper_classes/dict.lua', 41 | ["Dataframe.helper_classes.tbl"] = 'helper_classes/tbl.lua' 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /rocks/torch-dataframe-1.5-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "torch-dataframe" 2 | version = "1.5-0" 3 | source = { 4 | url = "https://github.com/alexmili/torch-dataframe/archive/v1.5-0.tar.gz", 5 | dir = "torch-dataframe-1.5-0" 6 | } 7 | 8 | description = { 9 | summary = "A Dataframe class for Torch", 10 | detailed = [[ 11 | Dataframe is a Torch7 class to load and manipulate 12 | Kaggle-style CSVs inspired from R's and pandas' Dataframes. 
13 | Compatible with torchnet. 14 | ]], 15 | homepage = "https://github.com/alexmili/torch-dataframe", 16 | license = "MIT/X11", 17 | maintainer = "AlexMili" 18 | } 19 | dependencies = { 20 | "lua >= 5.1", 21 | "torch >= 7.0", 22 | "argcheck >= 2.0", 23 | "luafilesystem >= 1.6.3", 24 | "paths", 25 | "torchnet >= 1.0", 26 | "threads >= 1.0", 27 | "nn" 28 | } 29 | build = { 30 | type = 'builtin', 31 | modules = { 32 | ["Dataframe.init"] = 'init.lua', 33 | ["Dataframe.utils"] = 'utils.lua', 34 | ["Dataframe.argcheck"] = 'argcheck.lua', 35 | ["Dataframe.main"] = 'main.lua', 36 | ["Dataframe.extensions.metatable"] = 'extensions/metatable.lua', 37 | ["Dataframe.extensions.categorical"] = 'extensions/categorical.lua', 38 | ["Dataframe.extensions.column"] = 'extensions/column.lua', 39 | ["Dataframe.extensions.row"] = 'extensions/row.lua', 40 | ["Dataframe.extensions.subsets_and_batches"] = 'extensions/subsets_and_batches.lua', 41 | ["Dataframe.extensions.load_data"] = 'extensions/load_data.lua', 42 | ["Dataframe.extensions.missing_data"] = 'extensions/missing_data.lua', 43 | ["Dataframe.extensions.output"] = 'extensions/output.lua', 44 | ["Dataframe.extensions.export_data"] = 'extensions/export_data.lua', 45 | ["Dataframe.extensions.select_set_update"] = 'extensions/select_set_update.lua', 46 | ["Dataframe.extensions.statistics"] = 'extensions/statistics.lua', 47 | 48 | ["Dataframe.sub_classes.01_subset"] = 'sub_classes/01_subset.lua', 49 | ["Dataframe.sub_classes.10_batchframe"] = 'sub_classes/10_batchframe.lua', 50 | ["Dataframe.sub_classes.subset_extensions.samplers"] = 'sub_classes/subset_extensions/samplers.lua', 51 | 52 | ["Dataframe.helper_classes.01_iterator"] = 'helper_classes/01_iterator.lua', 53 | ["Dataframe.helper_classes.02_paralleliterator"] = 'helper_classes/02_paralleliterator.lua', 54 | ["Dataframe.helper_classes.10_array"] = 'helper_classes/10_array.lua', 55 | ["Dataframe.helper_classes.11_dict"] = 'helper_classes/11_dict.lua', 56 | ["Dataframe.helper_classes.12_tbl"] = 'helper_classes/12_tbl.lua' 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /rocks/torch-dataframe-1.6-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "torch-dataframe" 2 | version = "1.6-0" 3 | source = { 4 | url = "https://github.com/alexmili/torch-dataframe/archive/v1.6-0.tar.gz", 5 | dir = "torch-dataframe-1.6-0" 6 | } 7 | 8 | description = { 9 | summary = "A Dataframe class for Torch", 10 | detailed = [[ 11 | Dataframe is a Torch7 class to load and manipulate 12 | Kaggle-style CSVs inspired from R's and pandas' Dataframes. 13 | Compatible with torchnet. 
14 | ]], 15 | homepage = "https://github.com/alexmili/torch-dataframe", 16 | license = "MIT/X11", 17 | maintainer = "AlexMili" 18 | } 19 | dependencies = { 20 | "lua >= 5.1", 21 | "torch >= 7.0", 22 | "argcheck >= 2.0", 23 | "luafilesystem >= 1.6.3", 24 | "paths", 25 | "torchnet >= 1.0", 26 | "threads >= 1.0", 27 | "tds", 28 | "nn" 29 | } 30 | build = { 31 | type = "cmake", 32 | variables = { 33 | CMAKE_BUILD_TYPE="Release", 34 | LUA_PATH="$(LUADIR)", 35 | LUA_CPATH="$(LIBDIR)" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /rocks/torch-dataframe-1.6-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "torch-dataframe" 2 | version = "1.6-1" 3 | source = { 4 | url = "https://github.com/alexmili/torch-dataframe/archive/v1.6-1.tar.gz", 5 | dir = "torch-dataframe-1.6-1" 6 | } 7 | 8 | description = { 9 | summary = "A Dataframe class for Torch", 10 | detailed = [[ 11 | Dataframe is a Torch7 class to load and manipulate 12 | Kaggle-style CSVs inspired from R's and pandas' Dataframes. 13 | Compatible with torchnet. 14 | ]], 15 | homepage = "https://github.com/alexmili/torch-dataframe", 16 | license = "MIT/X11", 17 | maintainer = "AlexMili" 18 | } 19 | dependencies = { 20 | "lua >= 5.1", 21 | "torch >= 7.0", 22 | "argcheck >= 2.0", 23 | "luafilesystem >= 1.6.3", 24 | "paths", 25 | "torchnet >= 1.0", 26 | "threads >= 1.0", 27 | "tds", 28 | "nn" 29 | } 30 | build = { 31 | type = "cmake", 32 | variables = { 33 | CMAKE_BUILD_TYPE="Release", 34 | LUA_PATH="$(LUADIR)", 35 | LUA_CPATH="$(LIBDIR)" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /rocks/torch-dataframe-1.7-0.rockspec: -------------------------------------------------------------------------------- 1 | package = "torch-dataframe" 2 | version = "1.7-0" 3 | source = { 4 | url = "https://github.com/alexmili/torch-dataframe/archive/v1.7-0.tar.gz", 5 | dir = "torch-dataframe-1.7-0" 6 | } 7 | 8 | description = { 9 | summary = "A Dataframe class for Torch", 10 | detailed = [[ 11 | Dataframe is a Torch7 class to load and manipulate 12 | Kaggle-style CSVs inspired from R's and pandas' Dataframes. 13 | Compatible with torchnet. 14 | ]], 15 | homepage = "https://github.com/alexmili/torch-dataframe", 16 | license = "MIT/X11", 17 | maintainer = "AlexMili" 18 | } 19 | dependencies = { 20 | "lua >= 5.1", 21 | "torch >= 7.0", 22 | "argcheck >= 2.0", 23 | "luafilesystem >= 1.6.3", 24 | "paths", 25 | "torchnet >= 1.0", 26 | "threads >= 1.0", 27 | "tds", 28 | "nn" 29 | } 30 | build = { 31 | type = "cmake", 32 | variables = { 33 | CMAKE_BUILD_TYPE="Release", 34 | LUA_PATH="$(LUADIR)", 35 | LUA_CPATH="$(LIBDIR)" 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /rocks/torch-dataframe-scm-1.rockspec: -------------------------------------------------------------------------------- 1 | package = "torch-dataframe" 2 | version = "scm-1" 3 | source = { 4 | url = "https://github.com/alexmili/torch-dataframe/archive/develop.tar.gz", 5 | dir = "torch-dataframe-develop" 6 | } 7 | description = { 8 | summary = "A Dataframe class for Torch", 9 | detailed = [[ 10 | Dataframe is a Torch7 class to load and manipulate 11 | Kaggle-style CSVs inspired from R's and pandas' Dataframes. 12 | Compatible with torchnet. 
13 | ]], 14 | homepage = "https://github.com/alexmili/torch-dataframe", 15 | license = "MIT/X11", 16 | maintainer = "AlexMili" 17 | } 18 | dependencies = { 19 | "lua >= 5.1", 20 | "torch >= 7.0", 21 | "argcheck >= 2.0", 22 | "luafilesystem >= 1.6.3", 23 | "paths", 24 | "torchnet >= 1.0", 25 | "threads >= 1.0", 26 | "tds", 27 | "nn" 28 | } 29 | build = { 30 | type = "cmake", 31 | variables = { 32 | CMAKE_BUILD_TYPE="Release", 33 | LUA_PATH="$(LUADIR)", 34 | LUA_CPATH="$(LIBDIR)" 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /specs/coverage.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | RUN_TESTS=true 4 | VERBOSE=false 5 | while [[ $# -gt 0 ]] 6 | do 7 | key="$1" 8 | 9 | case $key in 10 | -v|--verbose) 11 | VERBOSE=true 12 | ;; 13 | -g|--generate) 14 | RUN_TESTS=false 15 | ;; 16 | *) 17 | # unknown option 18 | ;; 19 | esac 20 | shift # past argument or value 21 | done 22 | 23 | echo -e "================="; 24 | echo -e "= Code coverage ="; 25 | echo -e "================="; 26 | echo ""; 27 | 28 | if [ "$RUN_TESTS" = true ]; then 29 | ./run_all.sh --coverage 30 | fi 31 | 32 | mv luacov.stats.out ../luacov.stats.out 33 | 34 | cd .. 35 | 36 | luacov -c .luacov 37 | 38 | if [ "$RUN_TESTS" = true ]; then 39 | 40 | mv -f luacov.stats.out specs/luacov.stats.out 41 | mv -f luacov.report.out specs/luacov.report.out 42 | 43 | cd specs 44 | 45 | if [ "$VERBOSE" = true ]; then 46 | cat luacov.report.out 47 | fi 48 | fi 49 | -------------------------------------------------------------------------------- /specs/data/advanced_short.csv: -------------------------------------------------------------------------------- 1 | Col A,Col B,Col C 2 | 1,A,8 3 | 2,B, 4 | 3,B,9 5 | -------------------------------------------------------------------------------- /specs/data/full.csv: -------------------------------------------------------------------------------- 1 | Col A , Col B,Col C, Col D 2 | 1,0.2,0.1,A 3 | 2,0.3,,B 4 | 3,0.4,9999999999, 5 | 4,0.5,-222,D -------------------------------------------------------------------------------- /specs/data/iris-label.csv: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width,class 2 | 5.1,3.5,1.4,0.2,Iris-setosa 3 | 4.9,3.0,1.4,0.2,Iris-setosa 4 | 4.7,3.2,1.3,0.2,Iris-setosa 5 | 4.6,3.1,1.5,0.2,Iris-setosa 6 | 5.0,3.6,1.4,0.2,Iris-setosa 7 | 5.4,3.9,1.7,0.4,Iris-setosa 8 | 4.6,3.4,1.4,0.3,Iris-setosa 9 | 5.0,3.4,1.5,0.2,Iris-setosa 10 | 4.4,2.9,1.4,0.2,Iris-setosa 11 | 4.9,3.1,1.5,0.1,Iris-setosa 12 | 5.4,3.7,1.5,0.2,Iris-setosa 13 | 4.8,3.4,1.6,0.2,Iris-setosa 14 | 4.8,3.0,1.4,0.1,Iris-setosa 15 | 4.3,3.0,1.1,0.1,Iris-setosa 16 | 5.8,4.0,1.2,0.2,Iris-setosa 17 | 5.7,4.4,1.5,0.4,Iris-setosa 18 | 5.4,3.9,1.3,0.4,Iris-setosa 19 | 5.1,3.5,1.4,0.3,Iris-setosa 20 | 5.7,3.8,1.7,0.3,Iris-setosa 21 | 5.1,3.8,1.5,0.3,Iris-setosa 22 | 5.4,3.4,1.7,0.2,Iris-setosa 23 | 5.1,3.7,1.5,0.4,Iris-setosa 24 | 4.6,3.6,1.0,0.2,Iris-setosa 25 | 5.1,3.3,1.7,0.5,Iris-setosa 26 | 4.8,3.4,1.9,0.2,Iris-setosa 27 | 5.0,3.0,1.6,0.2,Iris-setosa 28 | 5.0,3.4,1.6,0.4,Iris-setosa 29 | 5.2,3.5,1.5,0.2,Iris-setosa 30 | 5.2,3.4,1.4,0.2,Iris-setosa 31 | 4.7,3.2,1.6,0.2,Iris-setosa 32 | 4.8,3.1,1.6,0.2,Iris-setosa 33 | 5.4,3.4,1.5,0.4,Iris-setosa 34 | 5.2,4.1,1.5,0.1,Iris-setosa 35 | 5.5,4.2,1.4,0.2,Iris-setosa 36 | 4.9,3.1,1.5,0.1,Iris-setosa 37 | 5.0,3.2,1.2,0.2,Iris-setosa 38 | 5.5,3.5,1.3,0.2,Iris-setosa 39 | 
4.9,3.1,1.5,0.1,Iris-setosa 40 | 4.4,3.0,1.3,0.2,Iris-setosa 41 | 5.1,3.4,1.5,0.2,Iris-setosa 42 | 5.0,3.5,1.3,0.3,Iris-setosa 43 | 4.5,2.3,1.3,0.3,Iris-setosa 44 | 4.4,3.2,1.3,0.2,Iris-setosa 45 | 5.0,3.5,1.6,0.6,Iris-setosa 46 | 5.1,3.8,1.9,0.4,Iris-setosa 47 | 4.8,3.0,1.4,0.3,Iris-setosa 48 | 5.1,3.8,1.6,0.2,Iris-setosa 49 | 4.6,3.2,1.4,0.2,Iris-setosa 50 | 5.3,3.7,1.5,0.2,Iris-setosa 51 | 5.0,3.3,1.4,0.2,Iris-setosa 52 | 7.0,3.2,4.7,1.4,Iris-versicolor 53 | 6.4,3.2,4.5,1.5,Iris-versicolor 54 | 6.9,3.1,4.9,1.5,Iris-versicolor 55 | 5.5,2.3,4.0,1.3,Iris-versicolor 56 | 6.5,2.8,4.6,1.5,Iris-versicolor 57 | 5.7,2.8,4.5,1.3,Iris-versicolor 58 | 6.3,3.3,4.7,1.6,Iris-versicolor 59 | 4.9,2.4,3.3,1.0,Iris-versicolor 60 | 6.6,2.9,4.6,1.3,Iris-versicolor 61 | 5.2,2.7,3.9,1.4,Iris-versicolor 62 | 5.0,2.0,3.5,1.0,Iris-versicolor 63 | 5.9,3.0,4.2,1.5,Iris-versicolor 64 | 6.0,2.2,4.0,1.0,Iris-versicolor 65 | 6.1,2.9,4.7,1.4,Iris-versicolor 66 | 5.6,2.9,3.6,1.3,Iris-versicolor 67 | 6.7,3.1,4.4,1.4,Iris-versicolor 68 | 5.6,3.0,4.5,1.5,Iris-versicolor 69 | 5.8,2.7,4.1,1.0,Iris-versicolor 70 | 6.2,2.2,4.5,1.5,Iris-versicolor 71 | 5.6,2.5,3.9,1.1,Iris-versicolor 72 | 5.9,3.2,4.8,1.8,Iris-versicolor 73 | 6.1,2.8,4.0,1.3,Iris-versicolor 74 | 6.3,2.5,4.9,1.5,Iris-versicolor 75 | 6.1,2.8,4.7,1.2,Iris-versicolor 76 | 6.4,2.9,4.3,1.3,Iris-versicolor 77 | 6.6,3.0,4.4,1.4,Iris-versicolor 78 | 6.8,2.8,4.8,1.4,Iris-versicolor 79 | 6.7,3.0,5.0,1.7,Iris-versicolor 80 | 6.0,2.9,4.5,1.5,Iris-versicolor 81 | 5.7,2.6,3.5,1.0,Iris-versicolor 82 | 5.5,2.4,3.8,1.1,Iris-versicolor 83 | 5.5,2.4,3.7,1.0,Iris-versicolor 84 | 5.8,2.7,3.9,1.2,Iris-versicolor 85 | 6.0,2.7,5.1,1.6,Iris-versicolor 86 | 5.4,3.0,4.5,1.5,Iris-versicolor 87 | 6.0,3.4,4.5,1.6,Iris-versicolor 88 | 6.7,3.1,4.7,1.5,Iris-versicolor 89 | 6.3,2.3,4.4,1.3,Iris-versicolor 90 | 5.6,3.0,4.1,1.3,Iris-versicolor 91 | 5.5,2.5,4.0,1.3,Iris-versicolor 92 | 5.5,2.6,4.4,1.2,Iris-versicolor 93 | 6.1,3.0,4.6,1.4,Iris-versicolor 94 | 5.8,2.6,4.0,1.2,Iris-versicolor 95 | 5.0,2.3,3.3,1.0,Iris-versicolor 96 | 5.6,2.7,4.2,1.3,Iris-versicolor 97 | 5.7,3.0,4.2,1.2,Iris-versicolor 98 | 5.7,2.9,4.2,1.3,Iris-versicolor 99 | 6.2,2.9,4.3,1.3,Iris-versicolor 100 | 5.1,2.5,3.0,1.1,Iris-versicolor 101 | 5.7,2.8,4.1,1.3,Iris-versicolor 102 | 6.3,3.3,6.0,2.5,Iris-virginica 103 | 5.8,2.7,5.1,1.9,Iris-virginica 104 | 7.1,3.0,5.9,2.1,Iris-virginica 105 | 6.3,2.9,5.6,1.8,Iris-virginica 106 | 6.5,3.0,5.8,2.2,Iris-virginica 107 | 7.6,3.0,6.6,2.1,Iris-virginica 108 | 4.9,2.5,4.5,1.7,Iris-virginica 109 | 7.3,2.9,6.3,1.8,Iris-virginica 110 | 6.7,2.5,5.8,1.8,Iris-virginica 111 | 7.2,3.6,6.1,2.5,Iris-virginica 112 | 6.5,3.2,5.1,2.0,Iris-virginica 113 | 6.4,2.7,5.3,1.9,Iris-virginica 114 | 6.8,3.0,5.5,2.1,Iris-virginica 115 | 5.7,2.5,5.0,2.0,Iris-virginica 116 | 5.8,2.8,5.1,2.4,Iris-virginica 117 | 6.4,3.2,5.3,2.3,Iris-virginica 118 | 6.5,3.0,5.5,1.8,Iris-virginica 119 | 7.7,3.8,6.7,2.2,Iris-virginica 120 | 7.7,2.6,6.9,2.3,Iris-virginica 121 | 6.0,2.2,5.0,1.5,Iris-virginica 122 | 6.9,3.2,5.7,2.3,Iris-virginica 123 | 5.6,2.8,4.9,2.0,Iris-virginica 124 | 7.7,2.8,6.7,2.0,Iris-virginica 125 | 6.3,2.7,4.9,1.8,Iris-virginica 126 | 6.7,3.3,5.7,2.1,Iris-virginica 127 | 7.2,3.2,6.0,1.8,Iris-virginica 128 | 6.2,2.8,4.8,1.8,Iris-virginica 129 | 6.1,3.0,4.9,1.8,Iris-virginica 130 | 6.4,2.8,5.6,2.1,Iris-virginica 131 | 7.2,3.0,5.8,1.6,Iris-virginica 132 | 7.4,2.8,6.1,1.9,Iris-virginica 133 | 7.9,3.8,6.4,2.0,Iris-virginica 134 | 6.4,2.8,5.6,2.2,Iris-virginica 135 | 6.3,2.8,5.1,1.5,Iris-virginica 136 | 
6.1,2.6,5.6,1.4,Iris-virginica 137 | 7.7,3.0,6.1,2.3,Iris-virginica 138 | 6.3,3.4,5.6,2.4,Iris-virginica 139 | 6.4,3.1,5.5,1.8,Iris-virginica 140 | 6.0,3.0,4.8,1.8,Iris-virginica 141 | 6.9,3.1,5.4,2.1,Iris-virginica 142 | 6.7,3.1,5.6,2.4,Iris-virginica 143 | 6.9,3.1,5.1,2.3,Iris-virginica 144 | 5.8,2.7,5.1,1.9,Iris-virginica 145 | 6.8,3.2,5.9,2.3,Iris-virginica 146 | 6.7,3.3,5.7,2.5,Iris-virginica 147 | 6.7,3.0,5.2,2.3,Iris-virginica 148 | 6.3,2.5,5.0,1.9,Iris-virginica 149 | 6.5,3.0,5.2,2.0,Iris-virginica 150 | 6.2,3.4,5.4,2.3,Iris-virginica 151 | 5.9,3.0,5.1,1.8,Iris-virginica -------------------------------------------------------------------------------- /specs/data/iris-no-header.csv: -------------------------------------------------------------------------------- 1 | 5.1,3.5,1.4,0.2,Iris-setosa 2 | 4.9,3.0,1.4,0.2,Iris-setosa 3 | 4.7,3.2,1.3,0.2,Iris-setosa 4 | 4.6,3.1,1.5,0.2,Iris-setosa 5 | 5.0,3.6,1.4,0.2,Iris-setosa 6 | 5.4,3.9,1.7,0.4,Iris-setosa 7 | 4.6,3.4,1.4,0.3,Iris-setosa 8 | 5.0,3.4,1.5,0.2,Iris-setosa 9 | 4.4,2.9,1.4,0.2,Iris-setosa 10 | 4.9,3.1,1.5,0.1,Iris-setosa 11 | 5.4,3.7,1.5,0.2,Iris-setosa 12 | 4.8,3.4,1.6,0.2,Iris-setosa 13 | 4.8,3.0,1.4,0.1,Iris-setosa 14 | 4.3,3.0,1.1,0.1,Iris-setosa 15 | 5.8,4.0,1.2,0.2,Iris-setosa 16 | 5.7,4.4,1.5,0.4,Iris-setosa 17 | 5.4,3.9,1.3,0.4,Iris-setosa 18 | 5.1,3.5,1.4,0.3,Iris-setosa 19 | 5.7,3.8,1.7,0.3,Iris-setosa 20 | 5.1,3.8,1.5,0.3,Iris-setosa 21 | 5.4,3.4,1.7,0.2,Iris-setosa 22 | 5.1,3.7,1.5,0.4,Iris-setosa 23 | 4.6,3.6,1.0,0.2,Iris-setosa 24 | 5.1,3.3,1.7,0.5,Iris-setosa 25 | 4.8,3.4,1.9,0.2,Iris-setosa 26 | 5.0,3.0,1.6,0.2,Iris-setosa 27 | 5.0,3.4,1.6,0.4,Iris-setosa 28 | 5.2,3.5,1.5,0.2,Iris-setosa 29 | 5.2,3.4,1.4,0.2,Iris-setosa 30 | 4.7,3.2,1.6,0.2,Iris-setosa 31 | 4.8,3.1,1.6,0.2,Iris-setosa 32 | 5.4,3.4,1.5,0.4,Iris-setosa 33 | 5.2,4.1,1.5,0.1,Iris-setosa 34 | 5.5,4.2,1.4,0.2,Iris-setosa 35 | 4.9,3.1,1.5,0.1,Iris-setosa 36 | 5.0,3.2,1.2,0.2,Iris-setosa 37 | 5.5,3.5,1.3,0.2,Iris-setosa 38 | 4.9,3.1,1.5,0.1,Iris-setosa 39 | 4.4,3.0,1.3,0.2,Iris-setosa 40 | 5.1,3.4,1.5,0.2,Iris-setosa 41 | 5.0,3.5,1.3,0.3,Iris-setosa 42 | 4.5,2.3,1.3,0.3,Iris-setosa 43 | 4.4,3.2,1.3,0.2,Iris-setosa 44 | 5.0,3.5,1.6,0.6,Iris-setosa 45 | 5.1,3.8,1.9,0.4,Iris-setosa 46 | 4.8,3.0,1.4,0.3,Iris-setosa 47 | 5.1,3.8,1.6,0.2,Iris-setosa 48 | 4.6,3.2,1.4,0.2,Iris-setosa 49 | 5.3,3.7,1.5,0.2,Iris-setosa 50 | 5.0,3.3,1.4,0.2,Iris-setosa 51 | 7.0,3.2,4.7,1.4,Iris-versicolor 52 | 6.4,3.2,4.5,1.5,Iris-versicolor 53 | 6.9,3.1,4.9,1.5,Iris-versicolor 54 | 5.5,2.3,4.0,1.3,Iris-versicolor 55 | 6.5,2.8,4.6,1.5,Iris-versicolor 56 | 5.7,2.8,4.5,1.3,Iris-versicolor 57 | 6.3,3.3,4.7,1.6,Iris-versicolor 58 | 4.9,2.4,3.3,1.0,Iris-versicolor 59 | 6.6,2.9,4.6,1.3,Iris-versicolor 60 | 5.2,2.7,3.9,1.4,Iris-versicolor 61 | 5.0,2.0,3.5,1.0,Iris-versicolor 62 | 5.9,3.0,4.2,1.5,Iris-versicolor 63 | 6.0,2.2,4.0,1.0,Iris-versicolor 64 | 6.1,2.9,4.7,1.4,Iris-versicolor 65 | 5.6,2.9,3.6,1.3,Iris-versicolor 66 | 6.7,3.1,4.4,1.4,Iris-versicolor 67 | 5.6,3.0,4.5,1.5,Iris-versicolor 68 | 5.8,2.7,4.1,1.0,Iris-versicolor 69 | 6.2,2.2,4.5,1.5,Iris-versicolor 70 | 5.6,2.5,3.9,1.1,Iris-versicolor 71 | 5.9,3.2,4.8,1.8,Iris-versicolor 72 | 6.1,2.8,4.0,1.3,Iris-versicolor 73 | 6.3,2.5,4.9,1.5,Iris-versicolor 74 | 6.1,2.8,4.7,1.2,Iris-versicolor 75 | 6.4,2.9,4.3,1.3,Iris-versicolor 76 | 6.6,3.0,4.4,1.4,Iris-versicolor 77 | 6.8,2.8,4.8,1.4,Iris-versicolor 78 | 6.7,3.0,5.0,1.7,Iris-versicolor 79 | 6.0,2.9,4.5,1.5,Iris-versicolor 80 | 5.7,2.6,3.5,1.0,Iris-versicolor 81 | 
5.5,2.4,3.8,1.1,Iris-versicolor 82 | 5.5,2.4,3.7,1.0,Iris-versicolor 83 | 5.8,2.7,3.9,1.2,Iris-versicolor 84 | 6.0,2.7,5.1,1.6,Iris-versicolor 85 | 5.4,3.0,4.5,1.5,Iris-versicolor 86 | 6.0,3.4,4.5,1.6,Iris-versicolor 87 | 6.7,3.1,4.7,1.5,Iris-versicolor 88 | 6.3,2.3,4.4,1.3,Iris-versicolor 89 | 5.6,3.0,4.1,1.3,Iris-versicolor 90 | 5.5,2.5,4.0,1.3,Iris-versicolor 91 | 5.5,2.6,4.4,1.2,Iris-versicolor 92 | 6.1,3.0,4.6,1.4,Iris-versicolor 93 | 5.8,2.6,4.0,1.2,Iris-versicolor 94 | 5.0,2.3,3.3,1.0,Iris-versicolor 95 | 5.6,2.7,4.2,1.3,Iris-versicolor 96 | 5.7,3.0,4.2,1.2,Iris-versicolor 97 | 5.7,2.9,4.2,1.3,Iris-versicolor 98 | 6.2,2.9,4.3,1.3,Iris-versicolor 99 | 5.1,2.5,3.0,1.1,Iris-versicolor 100 | 5.7,2.8,4.1,1.3,Iris-versicolor 101 | 6.3,3.3,6.0,2.5,Iris-virginica 102 | 5.8,2.7,5.1,1.9,Iris-virginica 103 | 7.1,3.0,5.9,2.1,Iris-virginica 104 | 6.3,2.9,5.6,1.8,Iris-virginica 105 | 6.5,3.0,5.8,2.2,Iris-virginica 106 | 7.6,3.0,6.6,2.1,Iris-virginica 107 | 4.9,2.5,4.5,1.7,Iris-virginica 108 | 7.3,2.9,6.3,1.8,Iris-virginica 109 | 6.7,2.5,5.8,1.8,Iris-virginica 110 | 7.2,3.6,6.1,2.5,Iris-virginica 111 | 6.5,3.2,5.1,2.0,Iris-virginica 112 | 6.4,2.7,5.3,1.9,Iris-virginica 113 | 6.8,3.0,5.5,2.1,Iris-virginica 114 | 5.7,2.5,5.0,2.0,Iris-virginica 115 | 5.8,2.8,5.1,2.4,Iris-virginica 116 | 6.4,3.2,5.3,2.3,Iris-virginica 117 | 6.5,3.0,5.5,1.8,Iris-virginica 118 | 7.7,3.8,6.7,2.2,Iris-virginica 119 | 7.7,2.6,6.9,2.3,Iris-virginica 120 | 6.0,2.2,5.0,1.5,Iris-virginica 121 | 6.9,3.2,5.7,2.3,Iris-virginica 122 | 5.6,2.8,4.9,2.0,Iris-virginica 123 | 7.7,2.8,6.7,2.0,Iris-virginica 124 | 6.3,2.7,4.9,1.8,Iris-virginica 125 | 6.7,3.3,5.7,2.1,Iris-virginica 126 | 7.2,3.2,6.0,1.8,Iris-virginica 127 | 6.2,2.8,4.8,1.8,Iris-virginica 128 | 6.1,3.0,4.9,1.8,Iris-virginica 129 | 6.4,2.8,5.6,2.1,Iris-virginica 130 | 7.2,3.0,5.8,1.6,Iris-virginica 131 | 7.4,2.8,6.1,1.9,Iris-virginica 132 | 7.9,3.8,6.4,2.0,Iris-virginica 133 | 6.4,2.8,5.6,2.2,Iris-virginica 134 | 6.3,2.8,5.1,1.5,Iris-virginica 135 | 6.1,2.6,5.6,1.4,Iris-virginica 136 | 7.7,3.0,6.1,2.3,Iris-virginica 137 | 6.3,3.4,5.6,2.4,Iris-virginica 138 | 6.4,3.1,5.5,1.8,Iris-virginica 139 | 6.0,3.0,4.8,1.8,Iris-virginica 140 | 6.9,3.1,5.4,2.1,Iris-virginica 141 | 6.7,3.1,5.6,2.4,Iris-virginica 142 | 6.9,3.1,5.1,2.3,Iris-virginica 143 | 5.8,2.7,5.1,1.9,Iris-virginica 144 | 6.8,3.2,5.9,2.3,Iris-virginica 145 | 6.7,3.3,5.7,2.5,Iris-virginica 146 | 6.7,3.0,5.2,2.3,Iris-virginica 147 | 6.3,2.5,5.0,1.9,Iris-virginica 148 | 6.5,3.0,5.2,2.0,Iris-virginica 149 | 6.2,3.4,5.4,2.3,Iris-virginica 150 | 5.9,3.0,5.1,1.8,Iris-virginica -------------------------------------------------------------------------------- /specs/data/iris-no-label.csv: -------------------------------------------------------------------------------- 1 | sepal_length,sepal_width,petal_length,petal_width 2 | 5.1,3.5,1.4,0.2 3 | 4.9,3.0,1.4,0.2 4 | 4.7,3.2,1.3,0.2 5 | 4.6,3.1,1.5,0.2 6 | 5.0,3.6,1.4,0.2 7 | 5.4,3.9,1.7,0.4 8 | 4.6,3.4,1.4,0.3 9 | 5.0,3.4,1.5,0.2 10 | 4.4,2.9,1.4,0.2 11 | 4.9,3.1,1.5,0.1 12 | 5.4,3.7,1.5,0.2 13 | 4.8,3.4,1.6,0.2 14 | 4.8,3.0,1.4,0.1 15 | 4.3,3.0,1.1,0.1 16 | 5.8,4.0,1.2,0.2 17 | 5.7,4.4,1.5,0.4 18 | 5.4,3.9,1.3,0.4 19 | 5.1,3.5,1.4,0.3 20 | 5.7,3.8,1.7,0.3 21 | 5.1,3.8,1.5,0.3 22 | 5.4,3.4,1.7,0.2 23 | 5.1,3.7,1.5,0.4 24 | 4.6,3.6,1.0,0.2 25 | 5.1,3.3,1.7,0.5 26 | 4.8,3.4,1.9,0.2 27 | 5.0,3.0,1.6,0.2 28 | 5.0,3.4,1.6,0.4 29 | 5.2,3.5,1.5,0.2 30 | 5.2,3.4,1.4,0.2 31 | 4.7,3.2,1.6,0.2 32 | 4.8,3.1,1.6,0.2 33 | 5.4,3.4,1.5,0.4 34 | 5.2,4.1,1.5,0.1 35 | 5.5,4.2,1.4,0.2 36 | 
4.9,3.1,1.5,0.1 37 | 5.0,3.2,1.2,0.2 38 | 5.5,3.5,1.3,0.2 39 | 4.9,3.1,1.5,0.1 40 | 4.4,3.0,1.3,0.2 41 | 5.1,3.4,1.5,0.2 42 | 5.0,3.5,1.3,0.3 43 | 4.5,2.3,1.3,0.3 44 | 4.4,3.2,1.3,0.2 45 | 5.0,3.5,1.6,0.6 46 | 5.1,3.8,1.9,0.4 47 | 4.8,3.0,1.4,0.3 48 | 5.1,3.8,1.6,0.2 49 | 4.6,3.2,1.4,0.2 50 | 5.3,3.7,1.5,0.2 51 | 5.0,3.3,1.4,0.2 52 | 7.0,3.2,4.7,1.4 53 | 6.4,3.2,4.5,1.5 54 | 6.9,3.1,4.9,1.5 55 | 5.5,2.3,4.0,1.3 56 | 6.5,2.8,4.6,1.5 57 | 5.7,2.8,4.5,1.3 58 | 6.3,3.3,4.7,1.6 59 | 4.9,2.4,3.3,1.0 60 | 6.6,2.9,4.6,1.3 61 | 5.2,2.7,3.9,1.4 62 | 5.0,2.0,3.5,1.0 63 | 5.9,3.0,4.2,1.5 64 | 6.0,2.2,4.0,1.0 65 | 6.1,2.9,4.7,1.4 66 | 5.6,2.9,3.6,1.3 67 | 6.7,3.1,4.4,1.4 68 | 5.6,3.0,4.5,1.5 69 | 5.8,2.7,4.1,1.0 70 | 6.2,2.2,4.5,1.5 71 | 5.6,2.5,3.9,1.1 72 | 5.9,3.2,4.8,1.8 73 | 6.1,2.8,4.0,1.3 74 | 6.3,2.5,4.9,1.5 75 | 6.1,2.8,4.7,1.2 76 | 6.4,2.9,4.3,1.3 77 | 6.6,3.0,4.4,1.4 78 | 6.8,2.8,4.8,1.4 79 | 6.7,3.0,5.0,1.7 80 | 6.0,2.9,4.5,1.5 81 | 5.7,2.6,3.5,1.0 82 | 5.5,2.4,3.8,1.1 83 | 5.5,2.4,3.7,1.0 84 | 5.8,2.7,3.9,1.2 85 | 6.0,2.7,5.1,1.6 86 | 5.4,3.0,4.5,1.5 87 | 6.0,3.4,4.5,1.6 88 | 6.7,3.1,4.7,1.5 89 | 6.3,2.3,4.4,1.3 90 | 5.6,3.0,4.1,1.3 91 | 5.5,2.5,4.0,1.3 92 | 5.5,2.6,4.4,1.2 93 | 6.1,3.0,4.6,1.4 94 | 5.8,2.6,4.0,1.2 95 | 5.0,2.3,3.3,1.0 96 | 5.6,2.7,4.2,1.3 97 | 5.7,3.0,4.2,1.2 98 | 5.7,2.9,4.2,1.3 99 | 6.2,2.9,4.3,1.3 100 | 5.1,2.5,3.0,1.1 101 | 5.7,2.8,4.1,1.3 102 | 6.3,3.3,6.0,2.5 103 | 5.8,2.7,5.1,1.9 104 | 7.1,3.0,5.9,2.1 105 | 6.3,2.9,5.6,1.8 106 | 6.5,3.0,5.8,2.2 107 | 7.6,3.0,6.6,2.1 108 | 4.9,2.5,4.5,1.7 109 | 7.3,2.9,6.3,1.8 110 | 6.7,2.5,5.8,1.8 111 | 7.2,3.6,6.1,2.5 112 | 6.5,3.2,5.1,2.0 113 | 6.4,2.7,5.3,1.9 114 | 6.8,3.0,5.5,2.1 115 | 5.7,2.5,5.0,2.0 116 | 5.8,2.8,5.1,2.4 117 | 6.4,3.2,5.3,2.3 118 | 6.5,3.0,5.5,1.8 119 | 7.7,3.8,6.7,2.2 120 | 7.7,2.6,6.9,2.3 121 | 6.0,2.2,5.0,1.5 122 | 6.9,3.2,5.7,2.3 123 | 5.6,2.8,4.9,2.0 124 | 7.7,2.8,6.7,2.0 125 | 6.3,2.7,4.9,1.8 126 | 6.7,3.3,5.7,2.1 127 | 7.2,3.2,6.0,1.8 128 | 6.2,2.8,4.8,1.8 129 | 6.1,3.0,4.9,1.8 130 | 6.4,2.8,5.6,2.1 131 | 7.2,3.0,5.8,1.6 132 | 7.4,2.8,6.1,1.9 133 | 7.9,3.8,6.4,2.0 134 | 6.4,2.8,5.6,2.2 135 | 6.3,2.8,5.1,1.5 136 | 6.1,2.6,5.6,1.4 137 | 7.7,3.0,6.1,2.3 138 | 6.3,3.4,5.6,2.4 139 | 6.4,3.1,5.5,1.8 140 | 6.0,3.0,4.8,1.8 141 | 6.9,3.1,5.4,2.1 142 | 6.7,3.1,5.6,2.4 143 | 6.9,3.1,5.1,2.3 144 | 5.8,2.7,5.1,1.9 145 | 6.8,3.2,5.9,2.3 146 | 6.7,3.3,5.7,2.5 147 | 6.7,3.0,5.2,2.3 148 | 6.3,2.5,5.0,1.9 149 | 6.5,3.0,5.2,2.0 150 | 6.2,3.4,5.4,2.3 151 | 5.9,3.0,5.1,1.8 -------------------------------------------------------------------------------- /specs/data/realistic_29_row_data.csv: -------------------------------------------------------------------------------- 1 | Filename,Gender,Weight,Comments 2 | /home/test/wow.png,Male,55.5, 3 | /home/test/wow2.png,Female,77,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud ex" 4 | /home/test/wow3.png,Female,66, 5 | /home/test/wow4.png,Female,90, 6 | /home/test/wow5.png,Male,78, 7 | /home/test/wow2.png,Male,55, 8 | /home/test/wow3.png,Male,66, 9 | /home/test/wow4.png,Male,89,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim veniam, quis nostrud ex" 10 | /home/test/wow5.png,Female,87, 11 | /home/test/wow2.png,Female,67, 12 | /home/test/wow3.png,Female,88, 13 | /home/test/wow4.png,Male,66,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor" 14 | /home/test/wow5.png,Male,54, 15 | /home/test/wow2.png,Male,66, 16 | /home/test/wow3.png,Male,87, 17 | /home/test/wow4.png,Female,87,Lorem ipsum 18 | /home/test/wow5.png,Female,57, 19 | /home/test/wow2.png,Female,67,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud ex" 20 | /home/test/wow3.png,Male,55, 21 | /home/test/wow4.png,Male,76, 22 | /home/test/wow5.png,Male,88, 23 | /home/test/wow2.png,Male,99, 24 | /home/test/wow3.png,Female,111,"Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud ex" 25 | /home/test/wow4.png,Female,44, 26 | /home/test/wow5.png,Female,56, 27 | /home/test/wow2.png,Male,88, 28 | /home/test/wow3.png,Male,99, 29 | /home/test/wow4.png,Male,99, 30 | -------------------------------------------------------------------------------- /specs/data/sampler_csv_files/index.csv: -------------------------------------------------------------------------------- 1 | filename,label1,label2,label3 2 | 1,A,, 3 | 2,A,, 4 | 3,B,A, 5 | 4,B,, 6 | 5,A,, 7 | 6,A,, 8 | 7,B,, 9 | 8,B,, 10 | 9,A,B,C 11 | 10,A,, 12 | 11,A,, 13 | 12,A,, 14 | 13,B,A, 15 | 14,A,B, 16 | 15,B,, 17 | 16,A,, 18 | 17,B,, 19 | 18,A,, 20 | 19,A,, 21 | 20,A,, 22 | -------------------------------------------------------------------------------- /specs/data/sampler_csv_files/index3.csv: -------------------------------------------------------------------------------- 1 | filename,label 2 | 1,A 3 | 2,A 4 | 3,B 5 | 4,B 6 | 5,A 7 | 6,A 8 | 7,B 9 | 8,B 10 | 9,A 11 | 10,A 12 | 11,A 13 | 12,A 14 | 13,B 15 | 14,A 16 | 15,B 17 | 16,A 18 | 17,B 19 | 18,A 20 | 19,A 21 | 20,A 22 | -------------------------------------------------------------------------------- /specs/data/simple_short.csv: -------------------------------------------------------------------------------- 1 | Col A,Col B,Col C 2 | 1,0.2,1000 3 | 2,0.3,0.1 4 | 3,0.4,9999999999 5 | 4,0.5,-222 6 | -------------------------------------------------------------------------------- /specs/dataframe/column_order_spec.lua: -------------------------------------------------------------------------------- 1 | require 'lfs' 2 | 3 | -- Ensure the test is launched within the specs/ folder 4 | assert(string.match(lfs.currentdir(), "specs")~=nil, "You must run this test in specs folder") 5 | 6 | local initial_dir = lfs.currentdir() 7 | 8 | -- Go to specs folder 9 | while (not string.match(lfs.currentdir(), "/specs$")) do 10 | lfs.chdir("..") 11 | end 12 | 13 | local specs_dir = lfs.currentdir() 14 | lfs.chdir("..")-- one more directory and it is lib root 15 | 16 | -- Include Dataframe lib 17 | dofile("init.lua") 18 | 19 | -- Go back into initial dir 20 | lfs.chdir(initial_dir) 21 | 22 | describe("Column order functionality", function() 23 | 24 | it("Keeps the right order when loading a CSV",function() 25 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 26 | assert.are.same(a.column_order, 27 | {[1] = "Col A", 28 | [2] = "Col B", 29 | [3] = "Col C"}) 30 | end) 31 | 32 | it("Keeps the right order when loading a table",function() 33 | local a = Dataframe() 34 | local first = {1,2,3} 35 | local second = 
{"2","1","3"} 
36 | local third = {"2","a","3"} 
37 | local column_order = {[1] = 'firstColumn', 
38 | [2] = 'secondColumn', 
39 | [3] = 'thirdColumn'} 
40 | local data = {['firstColumn']=first, 
41 | ['secondColumn']=second, 
42 | ['thirdColumn']=third} 
43 | 
44 | a:load_table{data=Df_Dict(data), column_order = Df_Array(column_order)} 
45 | 
46 | assert.are.same(a.column_order, column_order) 
47 | 
48 | column_order[2] = nil 
49 | assert.is.error(function() a:load_table{data=Df_Dict(data), column_order = column_order} end) 
50 | end) 
51 | 
52 | it("Keeps the right order when saving to CSV",function() 
53 | local a = Dataframe() 
54 | local first = {1,2,3} 
55 | local second = {"Wow it's tricky","1,2","323."} 
56 | local third = {"\"","a\"a","3"} 
57 | 
58 | local data = {['firstColumn']=first, 
59 | ['secondColumn']=second, 
60 | ['thirdColumn']=third} 
61 | 
62 | c_order = {[1] = "firstColumn", 
63 | [4] = "secondColumn", 
64 | [3] = "thirdColumn"} 
65 | 
66 | assert.is.error(function() a:load_table{data=Df_Dict(data), column_order=Df_Array(c_order)} end) 
67 | 
68 | c_order = {[1] = "firstColumn", 
69 | [3] = "thirdColumn"} 
70 | 
71 | assert.is.error(function() a:load_table{data=Df_Dict(data), column_order=Df_Array(c_order)} end) 
72 | 
73 | c_order = {[1] = "firstColumn", 
74 | [2] = "secondColumn", 
75 | [3] = "thirdColumn"} 
76 | 
77 | a:load_table{data=Df_Dict(data), column_order=Df_Array(c_order)} 
78 | a:to_csv{path = "tricky_csv.csv"} 
79 | a:load_csv{path = "tricky_csv.csv", verbose = false} 
80 | 
81 | for cn,cols in pairs(a.dataset) do 
82 | assert.are.same(cols, data[cn]) 
83 | end 
84 | assert.are.same(a.column_order, c_order) 
85 | 
86 | os.remove("tricky_csv.csv") 
87 | end) 
88 | 
89 | it("Keeps the right order when saving to Tensor",function() 
90 | local a = Dataframe() 
91 | local first = {1,2,3} 
92 | local second = {"A","B","323."} 
93 | local third = 2.2 
94 | 
95 | data = {['1st']=first, 
96 | ['2nd']=second, 
97 | ['3rd']=third} 
98 | 
99 | c_order = {[1] = "1st", 
100 | [2] = "2nd", 
101 | [3] = "3rd"} 
102 | 
103 | a:load_table{data=Df_Dict(data), column_order=Df_Array(c_order)} 
104 | tnsr = a:to_tensor() 
105 | 
106 | assert.is.equal(tnsr:size(1),a:shape()["rows"]) 
107 | assert.is.equal(tnsr:size(2),a:shape()["cols"] - 1) 
108 | 
109 | sum = 0 
110 | col_no = a:get_column_order{column_name='1st', as_tensor = true} 
111 | 
112 | for i=1,tnsr:size(1) do 
113 | sum = math.abs(tnsr[i][col_no] - a:get_column('1st')[i]) 
114 | end 
115 | 
116 | assert.is_true(sum < 10^-5) 
117 | 
118 | sum = 0 
119 | col_no = a:get_column_order{column_name='3rd', as_tensor = true} 
120 | 
121 | for i=1,tnsr:size(1) do 
122 | sum = math.abs(tnsr[i][col_no] - a:get_column('3rd')[i]) 
123 | end 
124 | 
125 | assert.is_true(sum < 10^-5) 
126 | 
127 | assert.is.equal(a:get_column_order{column_name = '2nd', as_tensor = true}, nil) 
128 | end) 
129 | 
130 | 
131 | it("Check that orders can be swapped",function() 
132 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 
133 | a:swap_column_order("Col A", "Col B") 
134 | assert.are.same(a.column_order, 
135 | {[1] = "Col B", 
136 | [2] = "Col A", 
137 | [3] = "Col C"}) 
138 | end) 
139 | 
140 | it("Check that orders can be set using pos_column_order",function() 
141 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 
142 | a:pos_column_order("Col B", 2) 
143 | assert.are.same(a.column_order, 
144 | {[1] = "Col A", 
145 | [2] = "Col B", 
146 | [3] = "Col C"}) 
147 | 
148 | a:pos_column_order("Col B", 1) 
149 | assert.are.same(a.column_order, 
150 | {[1] = "Col B", 
151 | [2] = "Col A", 
152 | [3] = "Col C"}) 
153 | 
154 | 
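-- The asserts below also cover out-of-range positions: -1 and 100 are
-- expected to be clamped to the first and last valid position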
a:pos_column_order("Col C", 1) 
155 | assert.are.same(a.column_order, 
156 | {[1] = "Col C", 
157 | [2] = "Col B", 
158 | [3] = "Col A"}) 
159 | 
160 | 
161 | a:pos_column_order("Col C", -1) 
162 | assert.are.same(a.column_order, 
163 | {[1] = "Col C", 
164 | [2] = "Col B", 
165 | [3] = "Col A"}) 
166 | 
167 | a:pos_column_order("Col C", 100) 
168 | assert.are.same(a.column_order, 
169 | {[1] = "Col B", 
170 | [2] = "Col A", 
171 | [3] = "Col C"}) 
172 | end) 
173 | end) 
174 | 
-------------------------------------------------------------------------------- /specs/dataframe/export_data_spec.lua: -------------------------------------------------------------------------------- 
1 | require 'lfs' 
2 | 
3 | -- Ensure the test is launched within the specs/ folder 
4 | assert(string.match(lfs.currentdir(), "specs")~=nil, "You must run this test in specs folder") 
5 | 
6 | local initial_dir = lfs.currentdir() 
7 | 
8 | -- Go to specs folder 
9 | while (not string.match(lfs.currentdir(), "/specs$")) do 
10 | lfs.chdir("..") 
11 | end 
12 | 
13 | local specs_dir = lfs.currentdir() 
14 | lfs.chdir("..")-- one more directory and it is lib root 
15 | 
16 | -- Include Dataframe lib 
17 | dofile("init.lua") 
18 | 
19 | -- Go back into initial dir 
20 | lfs.chdir(initial_dir) 
21 | 
22 | describe("Exporting data process", function() 
23 | 
24 | describe("for CSV files",function() 
25 | it("Exports the Dataframe to a CSV file",function() 
26 | local a = Dataframe(specs_dir.."/data/full.csv") 
27 | 
28 | local file_name = specs_dir.."/data/copy_of_full.csv" 
29 | a:to_csv(file_name) 
30 | local b = Dataframe(file_name) 
31 | 
32 | for k,v in pairs(a.dataset) do 
33 | -- Avoid errors on NaN values 
34 | a:fill_na(k,8) 
35 | b:fill_na(k,8) 
36 | 
37 | assert.are.same(a:get_column(k), 
38 | b:get_column(k)) 
39 | end 
40 | 
41 | os.remove(file_name) 
42 | end) 
43 | 
44 | describe("Column order functionality",function() 
45 | local a = Dataframe() 
46 | local data = { 
47 | ['firstColumn']={1,2,3}, 
48 | ['secondColumn']={"Wow it's tricky","1,2","323."}, 
49 | ['thirdColumn']={"\"","a\"a","3"} 
50 | } 
51 | 
52 | it("Raises an error if the provided column order has non-contiguous indexes",function() 
53 | c_order = { 
54 | [1] = "firstColumn", 
55 | [4] = "secondColumn", 
56 | [3] = "thirdColumn" 
57 | } 
58 | 
59 | assert.has.error(function() a:load_table{data=Df_Dict(data), column_order=Df_Array(c_order)} end) 
60 | 
61 | c_order = { 
62 | [1] = "firstColumn", 
63 | [3] = "thirdColumn" 
64 | } 
65 | 
66 | assert.has.error(function() a:load_table{data=Df_Dict(data), column_order=Df_Array(c_order)} end) 
67 | end) 
68 | 
69 | it("Keeps the column order when exporting",function() 
70 | c_order = { 
71 | [1] = "firstColumn", 
72 | [2] = "secondColumn", 
73 | [3] = "thirdColumn" 
74 | } 
75 | 
76 | a:load_table{data=Df_Dict(data), column_order=Df_Array(c_order)} 
77 | a:to_csv(specs_dir.."/data/tricky_csv.csv") 
78 | a:load_csv(specs_dir.."/data/tricky_csv.csv") 
79 | 
80 | assert.are.same(a.column_order, c_order) 
81 | 
82 | os.remove(specs_dir.."/data/tricky_csv.csv") 
83 | end) 
84 | end) 
85 | end) 
86 | 
87 | describe("for torch tensors",function() 
88 | 
89 | it("Exports the Dataframe to a tensor",function() 
90 | local a = Dataframe(specs_dir.."/data/advanced_short.csv") 
91 | -- Avoid NaN comparison (which is always false) 
92 | a:fill_all_na(2) 
93 | a:to_tensor{filename=specs_dir.."/data/tensor_test.th7"} 
94 | 
95 | tnsr = a:to_tensor() 
96 | tnsr2 = torch.load('./data/tensor_test.th7') 
97 | 
98 | assert.is_true(torch.all(tnsr:eq(tnsr2))) 
99 | 
100 | assert.is.equal(tnsr:size(1),a:shape()["rows"]) 
101 | 
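-- to_tensor only exports numerical columns, hence the comparison against
-- get_numerical_colnames() rather than the dataframe's full column count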
assert.is.equal(tnsr:size(2),table.exact_length(a:get_numerical_colnames())) 102 | 103 | sum = 0 104 | col_no = a:get_column_order('Col A') 105 | 106 | for i=1,tnsr:size(1) do 107 | sum = sum + math.abs(tnsr[i][col_no] - a:get_column('Col A')[i]) 108 | end 109 | 110 | assert.near(0, sum, 10^-5) 111 | os.remove(specs_dir.."/data/tensor_test.th7") 112 | end) 113 | 114 | it("Keeps the right order when saving to Tensor",function() 115 | local a = Dataframe() 116 | 117 | data = { 118 | ['1st']={1,2,3}, 119 | ['2nd']={"A","B","323."}, 120 | ['3rd']=2.2 121 | } 122 | 123 | c_order = { 124 | [1] = "1st", 125 | [2] = "2nd", 126 | [3] = "3rd" 127 | } 128 | 129 | a:load_table{data=Df_Dict(data), column_order=Df_Array(c_order)} 130 | tnsr = a:to_tensor() 131 | 132 | assert.is.equal(tnsr:size(1),a:shape()["rows"]) 133 | assert.is.equal(tnsr:size(2),a:shape()["cols"] - 1) 134 | 135 | sum = 0 136 | col_no = a:get_column_order{column_name='1st', as_tensor = true} 137 | for i=1,tnsr:size(1) do 138 | sum = sum + math.abs(tnsr[i][col_no] - a:get_column('1st')[i]) 139 | end 140 | 141 | assert.near(0, sum, 10^-5) 142 | 143 | sum = 0 144 | col_no = a:get_column_order{column_name='3rd', as_tensor = true} 145 | for i=1,tnsr:size(1) do 146 | sum = sum + math.abs(tnsr[i][col_no] - a:get_column('3rd')[i]) 147 | end 148 | 149 | assert.near(0, sum, 10^-5) 150 | end) 151 | end) 152 | 153 | describe("torchnet get compatibility",function() 154 | it("The get should retrieve a single row in tensor format",function() 155 | local a = Dataframe(specs_dir.."/data/advanced_short.csv") 156 | 157 | tnsr = a:get(1) 158 | 159 | assert.is.equal(tnsr:size(1),1) 160 | assert.is.equal(tnsr:size(2),table.exact_length(a:get_numerical_colnames())) 161 | end) 162 | end) 163 | 164 | describe("to_csv with boolean values", function() 165 | -- Do not use advanced_short since it has nan (0/0) values, and nan ~= nan is always true 166 | local df = Dataframe() 167 | 168 | df:load_table{ 169 | data = Df_Dict{ 170 | A = {1,2,3}, 171 | B = {"A", "B", 'true'}, 172 | C = {true, false, false} 173 | } 174 | } 175 | 176 | it("Saves with a boolean", function() 177 | df:to_csv("test.csv") 178 | local df2 = Dataframe("test.csv") 179 | 180 | os.remove("test.csv") 181 | 182 | assert.are.same(df.column_order, df2.column_order) 183 | for _,cn in ipairs(df.column_order) do 184 | assert.are.same(df:get_column(cn), df2:get_column(cn)) 185 | end 186 | end) 187 | end) 188 | 189 | end) 190 | -------------------------------------------------------------------------------- /specs/dataframe/metatable_spec.lua: -------------------------------------------------------------------------------- 1 | require 'lfs' 2 | 3 | -- Ensure the test is launched within the specs/ folder 4 | assert(string.match(lfs.currentdir(), "specs")~=nil, "You must run this test in specs folder") 5 | 6 | local initial_dir = lfs.currentdir() 7 | 8 | -- Go to specs folder 9 | while (not string.match(lfs.currentdir(), "/specs$")) do 10 | lfs.chdir("..") 11 | end 12 | 13 | local specs_dir = lfs.currentdir() 14 | lfs.chdir("..")-- one more directory and it is lib root 15 | 16 | -- Include Dataframe lib 17 | dofile("init.lua") 18 | 19 | -- Go back into initial dir 20 | lfs.chdir(initial_dir) 21 | 22 | describe("Indexing the dataframe", function() 23 | 24 | describe("Retrieving index",function() 25 | local df = Dataframe(specs_dir.."/data/simple_short.csv") 26 | assert.are.same(df["$Col A"], df:get_column('Col A')) 27 | assert.are.same(df["$Col C"], df:get_column('Col C')) 28 | end) 29 | 30 | describe("Retrieving rows",function() 31 | 
local df = Dataframe(specs_dir.."/data/simple_short.csv") 32 | -- Wait until https://github.com/torch/torch7/issues/693 is resolved 33 | it("Retrieves a single row",function() 34 | local subset = df[1] 35 | assert.is.truthy(subset, "Fails to subset row") 36 | assert.are.same(subset["Col A"], 1) 37 | assert.are.same(subset["Col C"], 1000) 38 | end) 39 | 40 | it("Retrieves several rows",function() 41 | local subset = df[Df_Array(1, 3)] 42 | assert.is.truthy(subset, "Fails to subset rows") 43 | assert.are.same(subset:size(1), 2) 44 | assert.are.same(subset:size(2), df:size(2)) 45 | end) 46 | 47 | it("Retrieves a continuous set of rows",function() 48 | local subset = df["1:4"] 49 | assert.is.truthy(subset, "Fails to subset rows with continuous syntax") 50 | assert.are.same(subset:size(1), 4) 51 | assert.are.same(subset:size(2), df:size(2)) 52 | end) 53 | end) 54 | 55 | describe("Set row via the newindex",function() 56 | local df = Dataframe(specs_dir.."/data/simple_short.csv") 57 | 58 | it("Set a single row",function() 59 | df[1] = {["Col A"] = 3231} 60 | assert.are.same(df[1]["Col A"], 3231) 61 | end) 62 | end) 63 | 64 | describe("Create a copy of the table",function() 65 | local df = Dataframe(Df_Dict({a={1,2,3}})) 66 | 67 | it("Check that it's a true copy and not a reference",function() 68 | local new_df = df:copy() 69 | new_df[1] = {a=2} 70 | assert.are.same(new_df:size(1), df:size(1)) 71 | assert.are.same(new_df:size(2), df:size(2)) 72 | assert.is_false(new_df[1].a == df[1].a) 73 | 74 | -- Check that this also matches the shape 75 | assert.are.same(new_df:shape(), df:shape()) 76 | end) 77 | end) 78 | 79 | it("Returns the size of the Dataframe",function() 80 | local a = Dataframe(Df_Dict({test = {1,nil,3, 4}, test2 = {5, 9, 99, 88}})) 81 | 82 | assert.are.same(a:size(1), 4) 83 | assert.are.same(a:size(2), 2) 84 | end) 85 | 86 | describe("Gets the version number",function() 87 | local df = Dataframe() 88 | 89 | it("The torch.version goes to version()",function() 90 | assert.are.same(torch.version(df), df:version()) 91 | end) 92 | end) 93 | 94 | describe("Check the __len__",function() 95 | local df = Dataframe(Df_Dict{a={1,2,3,4,5}}) 96 | 97 | it("__len__ should return the n_rows",function() 98 | assert.are.same(df:__len__(), df.n_rows) 99 | end) 100 | 101 | it("# should return the n_rows #skip_version_LUA51",function() 102 | assert.are.same(#df, df.n_rows) 103 | end) 104 | end) 105 | 106 | describe("Check the __eq__",function() 107 | it("Should be equal",function() 108 | local a = Dataframe(Df_Dict{a={1,2,3,4,5}}) 109 | local b = Dataframe(Df_Dict{a={1,2,3,4,5}}) 110 | 111 | assert.is_true(a == b) 112 | assert.is_false(a ~= b) 113 | 114 | a:set(2, Df_Dict{a=0/0}) 115 | b:set(2, Df_Dict{a=0/0}) 116 | assert.is_true(a == b, "Fails with nan values") 117 | assert.is_false(a ~= b, "Fails with nan values") 118 | end) 119 | 120 | it("Should not be equal",function() 121 | local a = Dataframe(Df_Dict{a={1,2,3,4,5}}) 122 | local b = Dataframe(Df_Dict{a={1,3,4,5}}) 123 | local c = Dataframe(Df_Dict{a={1,2,3,4,6}}) 124 | local d = Dataframe(Df_Dict{a={1,2,3,0/0,6}}) 125 | local e = Dataframe(Df_Dict{b={1,2,3,4,5}}) 126 | local f = Dataframe(Df_Dict{a={1,2,3,4,5}, 127 | b={1,2,3,4,5}}) 128 | 129 | assert.is_true(a ~= b, "Fails to detect differing row lengths") 130 | assert.is_true(a ~= c, "Fails to detect differing values") 131 | assert.is_true(a ~= d, "Fails to detect a nan difference") 132 | assert.is_true(a ~= e, "Fails to detect differing column names") 133 | assert.is_true(a ~= f, "Fails to detect a differing number of columns") 134 | end) 135 | end) 136 | 
137 | end) 138 | -------------------------------------------------------------------------------- /specs/dataframe/missing_data_spec.lua: -------------------------------------------------------------------------------- 1 | require 'lfs' 2 | 3 | -- Ensure the test is launched within the specs/ folder 4 | assert(string.match(lfs.currentdir(), "specs")~=nil, "You must run this test in specs folder") 5 | 6 | local initial_dir = lfs.currentdir() 7 | 8 | -- Go to specs folder 9 | while (not string.match(lfs.currentdir(), "/specs$")) do 10 | lfs.chdir("..") 11 | end 12 | 13 | local specs_dir = lfs.currentdir() 14 | lfs.chdir("..")-- one more directory and it is lib root 15 | 16 | -- Include Dataframe lib 17 | dofile("init.lua") 18 | 19 | -- Go back into initial dir 20 | lfs.chdir(initial_dir) 21 | 22 | describe("Dataframe class", function() 23 | 24 | it("Counts missing values", function() 25 | local a = Dataframe(specs_dir.."/data/full.csv") 26 | 27 | assert.are.same(a:count_na{as_dataframe = false}, {["Col A"]= 0, ["Col B"]= 0, ["Col C"]=1, ["Col D"]=1}) 28 | end) 29 | 30 | it("Fills missing value(s) for a given column(s)",function() 31 | local a = Dataframe(specs_dir.."/data/advanced_short.csv") 32 | 33 | assert.has.error(function() a:fill_na("Random column") end) 34 | 35 | a:fill_na("Col A", 1) 36 | assert.are.same(a:count_na{as_dataframe = false}, 37 | {["Col A"]= 0, ["Col B"]= 0, ["Col C"]=1}) 38 | 39 | a:fill_na("Col C", 1) 40 | assert.are.same(a:count_na{as_dataframe = false}, {["Col A"]= 0, ["Col B"]= 0, ["Col C"]=0}) 41 | 42 | assert.are.same(a:get_column("Col C"), {8, 1, 9}) 43 | end) 44 | 45 | it("Fills all Dataframe's missing values", function() 46 | local a = Dataframe(specs_dir.."/data/advanced_short.csv") 47 | 48 | a.dataset['Col A'][3] = nil 49 | 50 | local cnt, tot = a:count_na{as_dataframe = false} 51 | assert.are.same(cnt, {["Col A"]= 1, ["Col B"]= 0, ["Col C"]=1}) 52 | assert.are.same(tot, 2) 53 | 54 | 55 | a:fill_all_na(-1) 56 | 57 | assert.are.same(a:count_na{as_dataframe = false}, {["Col A"]= 0, ["Col B"]= 0, ["Col C"]=0}) 58 | assert.are.same(a:get_column('Col A'), {1,2,-1}) 59 | end) 60 | 61 | it("The count_na should #1 return a Dataframe by default", function() 62 | local a = Dataframe(specs_dir.."/data/advanced_short.csv") 63 | 64 | local ret = a:count_na() 65 | 66 | assert.are.same(torch.type(ret), "Dataframe") 67 | 68 | assert.are.same(ret:size(), 3, "3 columns should render 3 rows") 69 | end) 70 | 71 | end) 72 | -------------------------------------------------------------------------------- /specs/dataframe/row_spec.lua: -------------------------------------------------------------------------------- 1 | require 'lfs' 2 | 3 | -- Ensure the test is launched within the specs/ folder 4 | assert(string.match(lfs.currentdir(), "specs")~=nil, "You must run this test in specs folder") 5 | 6 | local initial_dir = lfs.currentdir() 7 | 8 | -- Go to specs folder 9 | while (not string.match(lfs.currentdir(), "/specs$")) do 10 | lfs.chdir("..") 11 | end 12 | 13 | local specs_dir = lfs.currentdir() 14 | lfs.chdir("..")-- one more directory and it is lib root 15 | 16 | -- Include Dataframe lib 17 | dofile("init.lua") 18 | 19 | -- Go back into initial dir 20 | lfs.chdir(initial_dir) 21 | 22 | describe("Row functions", function() 23 | it("Appends new data",function() 24 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 25 | 26 | a:append(Df_Dict({['Col A']={15},['Col B']={25},['Col C']={35}})) 27 | assert.are.same(a:shape(), {rows=5, cols=3})-- "The simple_short.csv is 4x3 
after insert should be 5x3" 28 | end) 29 | 30 | it("Appends new columns together with new data",function() 31 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 32 | 33 | a:append(Df_Dict({['Col A']={15},['Col D']={25},['Col C']={35}})) 34 | assert.are.same(a:shape(), {rows=5, cols=4})-- "The simple_short.csv is 4x3 after insert should be 5x3" 35 | end) 36 | 37 | it("Appends dataframe",function() 38 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 39 | 40 | b = Dataframe() 41 | b:load_table{data = Df_Dict({['Col A']={15},['Col B']={25},['Col C']={35}}), 42 | column_order = Df_Array('Col B', 'Col C', 'Col A')} 43 | a:append(b) 44 | assert.are.same(a:shape(), {rows=5, cols=3})-- "The simple_short.csv is 4x3 after insert should be 5x3" 45 | end) 46 | 47 | it("Appends dataframe to empty dataset should copy the original including specs", 48 | function() 49 | local a = Dataframe() 50 | 51 | b = Dataframe() 52 | b:load_table{data = Df_Dict({['Col A']={15},['Col B']={25},['Col C']={35}}), 53 | column_order = Df_Array('Col B', 'Col C', 'Col A')} 54 | a:append(b) 55 | assert.are.same(a:shape(), {rows=1, cols=3})-- "The simple_short.csv is 4x3 after insert should be 5x3" 56 | assert.are.same(a.column_order, b.column_order) 57 | end) 58 | 59 | it("Check rbind new columns together with new data",function() 60 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 61 | 62 | a:rbind(Df_Dict({['Col A']={15},['Col D']={25},['Col C']={35}})) 63 | assert.are.same(a:shape(), {rows=5, cols=4})-- "The simple_short.csv is 4x3 after insert should be 5x3" 64 | end) 65 | 66 | it("Check rbind with dataframe",function() 67 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 68 | 69 | b = Dataframe() 70 | b:load_table{data = Df_Dict({['Col A']={15},['Col B']={25},['Col C']={35}})} 71 | a:rbind(b) 72 | assert.are.same(a:shape(), {rows=5, cols=3})-- "The simple_short.csv is 4x3 after insert should be 5x3" 73 | end) 74 | 75 | it("Inserts a row", function() 76 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 77 | 78 | a:insert(2, Df_Dict({['Col A']={15},['Col E']={25},['Col C']={35}})) 79 | assert.are.same(a:shape(), {rows=5, cols=4}) 80 | assert.are.same(a:get_column('Col A'), {1, 15, 2, 3, 4}) 81 | assert.are.same(a:get_column('Col B'), {0.2, 0/0, 0.3, 0.4, 0.5}) 82 | end) 83 | 84 | it("Inserts three rows", function() 85 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 86 | a:insert(2, Df_Dict({['Col A']={15, 16, 17}})) 87 | assert.are.same(a:shape(), {rows=7, cols=3}) 88 | assert.are.same(a:get_column('Col A'), {1, 15, 16, 17, 2, 3, 4}) 89 | assert.are.same(a:get_column('Col B'), {.2, 0/0, 0/0, 0/0, .3, .4, .5}) 90 | end) 91 | 92 | it("Removes a row given an index",function() 93 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 94 | 95 | a:remove_index(1) 96 | assert.are.same(a:shape(), {rows=3, cols=3})-- "The simple_short.csv is 4x3" 97 | assert.are.same(a:get_column('Col A'), {2,3,4}) 98 | 99 | a:remove_index(1) 100 | a:remove_index(1) 101 | a:remove_index(1) 102 | assert.are.same(a:shape(), {rows=0, cols=3}) 103 | end) 104 | 105 | it("Check that append calls load_table", function() 106 | local a = Dataframe() 107 | a:append(Df_Dict{b=1, a=2}) 108 | 109 | a:assert_has_column('a') 110 | a:assert_has_column('b') 111 | 112 | assert.are.same(a:get_column('a')[1], 2) 113 | end) 114 | 115 | it("Check that append calls load_table with column order", function() 116 | local a = Dataframe() 117 | a:append(Df_Dict{b=1, a=2}, Df_Array("b", "a")) 118 | 119 | local b = 
Dataframe() 120 | b:append(Df_Dict{b=1, a=2}, Df_Array("a", "b")) 121 | 122 | assert.are.not_equal(a.column_order, b.column_order) 123 | assert.are.same(a:get_column('a'), b:get_column('a')) 124 | assert.are.same(a:get_column('b'), b:get_column('b')) 125 | end) 126 | end) 127 | -------------------------------------------------------------------------------- /specs/dataframe/select_set_update_spec.lua: -------------------------------------------------------------------------------- 1 | require 'lfs' 2 | 3 | -- Ensure the test is launched within the specs/ folder 4 | assert(string.match(lfs.currentdir(), "specs")~=nil, "You must run this test in specs folder") 5 | 6 | local initial_dir = lfs.currentdir() 7 | 8 | -- Go to specs folder 9 | while (not string.match(lfs.currentdir(), "/specs$")) do 10 | lfs.chdir("..") 11 | end 12 | 13 | local specs_dir = lfs.currentdir() 14 | lfs.chdir("..")-- one more directory and it is lib root 15 | 16 | -- Include Dataframe lib 17 | dofile("init.lua") 18 | 19 | -- Go back into initial dir 20 | lfs.chdir(initial_dir) 21 | 22 | describe("Data manipulation incl. where, update etc.", function() 23 | 24 | it("Retrieves a value in a column #where",function() 25 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 26 | 27 | local ret_val = a:where('Col A', 2) 28 | assert.are.same(ret_val:get_column("Col A"), {2}) 29 | assert.are.same(ret_val:get_column("Col C"), {.1}) 30 | assert.is.equal(torch.type(ret_val), "Dataframe") 31 | assert.are.same(ret_val:shape(), {rows = 1, cols = 3}) 32 | 33 | local ret_val = a:where('Col A', 222222222) 34 | assert.are.same(ret_val:shape(), {rows = 0, cols = 3}) 35 | 36 | a:__init() 37 | a:load_csv{path = specs_dir.."/data/advanced_short.csv", 38 | verbose = false} 39 | ret_val = a:where('Col B', 'B') 40 | assert.are.same(ret_val:shape(), {rows = 2, cols = 3}) 41 | col_c = ret_val:get_column('Col C') 42 | assert.is_true(isnan(col_c[1])) 43 | assert.is.equal(col_c[2], 9) 44 | assert.are.same(ret_val:get_column('Col A'), {2, 3}) 45 | end) 46 | 47 | it("Updates multiple rows according to a custom condition", function() 48 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 49 | 50 | local start_val = a:get_column('Col B') 51 | start_val[1] = start_val[1] * 2 52 | 53 | a:update( 54 | function(s_row) return s_row['Col A'] == 1 end, 55 | function(upd_row) upd_row['Col B'] = upd_row['Col B'] * 2 return upd_row end 56 | ) 57 | assert.are.same(a:get_column('Col B'), start_val) 58 | 59 | -- Check a double match 60 | local b = Dataframe(specs_dir.."/data/advanced_short.csv") 61 | 62 | start_val = b:get_column('Col A') 63 | start_val[2] = start_val[2] * 2 64 | start_val[3] = start_val[3] * 2 65 | b:update( 66 | function(s_row) return s_row['Col B'] == 1 end, 67 | function(upd_row) upd_row['Col A'] = upd_row['Col A'] * 2 return upd_row end 68 | ) 69 | 70 | assert.are.same(b:get_column('Col A'), start_val) 71 | end) 72 | 73 | it("Updates a single cell given a column name and a value #set",function() 74 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 75 | 76 | a:set(1000, 'Col C', Df_Dict({['Col A']=99})) 77 | assert.is.equal(a:get_column('Col A')[1], 99) 78 | end) 79 | 80 | it("Updates all matching cells when using #set",function() 81 | local a = Dataframe(Df_Dict{a = {1,2,3}, b = {1,1,2}}) 82 | 83 | a:set(1, 'b', Df_Dict({['a']=4})) 84 | assert.are.same(a:get_column('a'), {4,4,3}) 85 | end) 86 | 87 | it("Updates a single cell given an index",function() 88 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 89 | 
90 | a:set(2, Df_Dict({['Col A']=99})) 91 | assert.is.equal(a:get_column('Col A')[2], 99) 92 | end) 93 | 94 | it("Updates a unique row given an index",function() 95 | local a = Dataframe(specs_dir.."/data/simple_short.csv") 96 | 97 | new = { 98 | ['Col A']=4, 99 | ['Col B']=4, 100 | ['Col C']=4 101 | } 102 | a:_update_single_row(1, Df_Tbl(new), Df_Tbl(a:get_row(1))) 103 | assert.are.same(a:get_row(1), new) 104 | end) 105 | 106 | describe("Check #wide2long", function() 107 | local df = Dataframe(Df_Dict({a = {1,2,3}, b={4,nil,5}, c={[3] = 6}})) 108 | a = df:wide2long(Df_Array("c", "b"), "id", "value") 109 | 110 | it("Check that the number of rows is correct", function() 111 | assert.are.same(a:where('a', 1):size(1), 1) 112 | assert.are.same(a:where('a', 2):size(1), 1) 113 | assert.are.same(a:where('a', 3):size(1), 2) 114 | end) 115 | 116 | it("Check that the value is correct when having one value", function() 117 | local row = a:where('a', 1):get_row(1) 118 | assert.are.same(row['id'], 'b') 119 | assert.are.same(row['value'], 4) 120 | end) 121 | 122 | 123 | it("Check that the value is correct when having no value", function() 124 | local row = a:where('a', 2):get_row(1) 125 | assert.is_true(isnan(row['id'])) 126 | assert.is_true(isnan(row['value'])) 127 | end) 128 | 129 | it("Check that the order is correct when having multiple values", function() 130 | local row = a:where('a', 3): 131 | where('id', 'b'): 132 | get_row(1) 133 | assert.are.same(row['id'], 'b') 134 | assert.are.same(row['value'], 5) 135 | 136 | local row = a:where('a', 3): 137 | where('id', 'c'): 138 | get_row(1) 139 | assert.are.same(row['id'], 'c') 140 | assert.are.same(row['value'], 6) 141 | end) 142 | 143 | local df = Dataframe(Df_Dict({a = {1,2,3}, b={4,nil,5}, c={[3] = 6}})) 144 | b = df:wide2long("[bc]", "id", "value") 145 | it("Check that this works the same with regular expressions", function() 146 | assert.are.same(b:where('a', 1):size(1), 1) 147 | assert.are.same(b:where('a', 2):size(1), 1) 148 | assert.are.same(b:where('a', 3):size(1), 2) 149 | 150 | local row = b:where('a', 3): 151 | where('id', 'b'): 152 | get_row(1) 153 | assert.are.same(row['id'], 'b') 154 | assert.are.same(row['value'], 5) 155 | 156 | local row = b:where('a', 3): 157 | where('id', 'c'): 158 | get_row(1) 159 | assert.are.same(row['id'], 'c') 160 | assert.are.same(row['value'], 6) 161 | end) 162 | 163 | c = df:wide2long("c", "id", "value") 164 | it("Check that different columns result in a different result", function() 165 | assert.is_false(a == c) 166 | end) 167 | end) 168 | end) 169 | -------------------------------------------------------------------------------- /specs/dataframe/serialization_spec.lua: -------------------------------------------------------------------------------- 1 | require 'lfs' 2 | 3 | -- Ensure the test is launched within the specs/ folder 4 | assert(string.match(lfs.currentdir(), "specs")~=nil, "You must run this test in specs folder") 5 | 6 | local initial_dir = lfs.currentdir() 7 | 8 | -- Go to specs folder 9 | while (not string.match(lfs.currentdir(), "/specs$")) do 10 | lfs.chdir("..") 11 | end 12 | 13 | local specs_dir = lfs.currentdir() 14 | lfs.chdir("..")-- one more directory and it is lib root 15 | 16 | -- Include Dataframe lib 17 | dofile("init.lua") 18 | 19 | -- Go back into initial dir 20 | lfs.chdir(initial_dir) 21 | 22 | describe("Serialization", function() 23 | -- Do not use advanced_short since it has nan (0/0) values, and nan ~= nan is always true 24 | local df = Dataframe() 25 | 26 | it("Deserializes a simple 
Dataframe object",function() 27 | df:load_csv{path = specs_dir.."/data/simple_short.csv", verbose = false} 28 | 29 | b = torch.serialize(df) 30 | c = torch.deserialize(b) 31 | 32 | assert.is.equal(torch.typename(c), "Dataframe") 33 | 34 | --tester:eq(df, c) 35 | end) 36 | 37 | it("Saves then load a Dataframe object",function() 38 | torch.save("test.t7", df) 39 | c = torch.load("test.t7") 40 | 41 | os.remove("test.t7") 42 | 43 | assert.is.equal(torch.typename(c), "Dataframe") 44 | 45 | --tester:eq(df, c) 46 | end) 47 | 48 | it("Saves with init",function() 49 | local a = Dataframe(specs_dir.."/data/realistic_29_row_data.csv") 50 | 51 | a:create_subsets() 52 | a:fill_all_na() 53 | 54 | torch.save("test.t7", a) 55 | c = torch.load("test.t7") 56 | 57 | os.remove("test.t7") 58 | 59 | assert.is.equal(torch.typename(c), "Dataframe") 60 | 61 | --tester:eq(a, c) 62 | end) 63 | end) 64 | -------------------------------------------------------------------------------- /specs/helper_classes/df_array_spec.lua: -------------------------------------------------------------------------------- 1 | require 'lfs' 2 | 3 | -- Ensure the test is launched within the specs/ folder 4 | assert(string.match(lfs.currentdir(), "specs")~=nil, "You must run this test in specs folder") 5 | 6 | local initial_dir = lfs.currentdir() 7 | 8 | -- Go to specs folder 9 | while (not string.match(lfs.currentdir(), "/specs$")) do 10 | lfs.chdir("..") 11 | end 12 | 13 | local specs_dir = lfs.currentdir() 14 | lfs.chdir("..")-- one more directory and it is lib root 15 | 16 | -- Include Dataframe lib 17 | dofile("init.lua") 18 | 19 | -- Go back into initial dir 20 | lfs.chdir(initial_dir) 21 | 22 | describe("Df_Array", function() 23 | local tableData = {1,2,3,4} 24 | 25 | it("can be init with a table",function() 26 | local array = Df_Array(tableData) 27 | 28 | assert.are.same(tableData,array.data) 29 | end) 30 | 31 | it("can be init with a Dataseries",function() 32 | local series = Dataseries(Df_Array(tableData)) 33 | local array = Df_Array(series) 34 | 35 | assert.are.same(tableData,array.data) 36 | end) 37 | 38 | it("can be init with a tensor",function() 39 | local tensor = torch.IntTensor(tableData) 40 | local array = Df_Array(tensor) 41 | 42 | assert.are.same(tableData,array.data) 43 | end) 44 | 45 | it("can be init with 'infinite' arguments",function() 46 | local array = Df_Array(1,2,3,4) 47 | 48 | assert.are.same(tableData,array.data) 49 | end) 50 | 51 | it("returns asked index with brackets",function() 52 | local array = Df_Array(tableData) 53 | 54 | assert.are.same(array[3],3) 55 | end) 56 | 57 | it("returns nil if index does not exists or it is not a number",function() 58 | local array = Df_Array(tableData) 59 | 60 | assert.are.same(array[42],nil) 61 | end) 62 | 63 | it("# returns its length",function() 64 | local array = Df_Array(tableData) 65 | 66 | assert.are.same(#array,4) 67 | end) 68 | end) -------------------------------------------------------------------------------- /specs/helper_classes/df_dict_spec.lua: -------------------------------------------------------------------------------- 1 | require 'lfs' 2 | 3 | -- Ensure the test is launched within the specs/ folder 4 | assert(string.match(lfs.currentdir(), "specs")~=nil, "You must run this test in specs folder") 5 | 6 | local initial_dir = lfs.currentdir() 7 | 8 | -- Go to specs folder 9 | while (not string.match(lfs.currentdir(), "/specs$")) do 10 | lfs.chdir("..") 11 | end 12 | 13 | local specs_dir = lfs.currentdir() 14 | lfs.chdir("..")-- one more directory and it 
is lib root 15 | 16 | -- Include Dataframe lib 17 | dofile("init.lua") 18 | 19 | -- Go back into initial dir 20 | lfs.chdir(initial_dir) 21 | 22 | describe("Df_Dict",function() 23 | local simpleTable = {1,2,3,4} 24 | local simpleTableData = {["col1"]=1,["col2"]=2,["col3"]=3,["col4"]=4} 25 | local dimTableData = {["col1"]=1,["col2"]=2,["col3"]=3,["col4"]={4,5,6}} 26 | 27 | it("can be init with a simple table without keys",function() 28 | local dic = Df_Dict(simpleTable) 29 | assert.are.same(dic.data,simpleTable) 30 | assert.are.same(dic.keys,simpleTable) 31 | end) 32 | 33 | it("can be init with a simple table with keys",function() 34 | local dic = Df_Dict(simpleTableData) 35 | assert.are.same(dic.data,simpleTableData) 36 | end) 37 | 38 | it("can be init with a multi-dimensional table",function() 39 | local dic = Df_Dict(dimTableData) 40 | assert.are.same(dic.data,dimTableData) 41 | end) 42 | 43 | it("can check if all columns are the same size",function() 44 | local dic = Df_Dict(simpleTable) 45 | assert.is_true(dic:check_lengths()) 46 | 47 | dic = Df_Dict(simpleTableData) 48 | assert.is_true(dic:check_lengths()) 49 | 50 | dic = Df_Dict(dimTableData) 51 | assert.is_false(dic:check_lengths()) 52 | end) 53 | 54 | it("returns asked key's value with brackets",function() 55 | local dic = Df_Dict(simpleTable) 56 | assert.are.same(dic[3],3) 57 | 58 | dic = Df_Dict(simpleTableData) 59 | assert.are.same(dic["$col3"],3) 60 | 61 | dic = Df_Dict(dimTableData) 62 | assert.are.same(dic["$col4"],{4,5,6}) 63 | end) 64 | 65 | it("returns nil if index does not exist or it is not a number",function() 66 | local dic = Df_Dict(simpleTable) 67 | assert.are.same(dic[42],nil) 68 | end) 69 | 70 | it("# returns its length",function() 71 | local dic = Df_Dict(simpleTable) 72 | assert.are.same(#dic,4) 73 | 74 | dic = Df_Dict(simpleTableData) 75 | assert.are.same(#dic,4) 76 | 77 | dic = Df_Dict(dimTableData) 78 | assert.are.same(#dic,4) 79 | end) 80 | end) 81 | 82 | describe("Df_Tbl",function() 83 | local simpleTable = {1,2,3,4} 84 | 85 | it("can be init with a table",function() 86 | local tbl = Df_Tbl(simpleTable) 87 | assert.are.same(tbl.data,simpleTable) 88 | end) 89 | 90 | it("# returns its length",function() 91 | local tbl = Df_Tbl(simpleTable) 92 | assert.are.same(#tbl,4) 93 | end) 94 | end) 95 | -------------------------------------------------------------------------------- /specs/helper_classes/df_tbl_spec.lua: -------------------------------------------------------------------------------- 1 | require 'lfs' 2 | 3 | -- Ensure the test is launched within the specs/ folder 4 | assert(string.match(lfs.currentdir(), "specs")~=nil, "You must run this test in specs folder") 5 | 6 | local initial_dir = lfs.currentdir() 7 | 8 | -- Go to specs folder 9 | while (not string.match(lfs.currentdir(), "/specs$")) do 10 | lfs.chdir("..") 11 | end 12 | 13 | local specs_dir = lfs.currentdir() 14 | lfs.chdir("..")-- one more directory and it is lib root 15 | 16 | -- Include Dataframe lib 17 | dofile("init.lua") 18 | 19 | -- Go back into initial dir 20 | lfs.chdir(initial_dir) 21 | 22 | describe("Df_Tbl",function() 23 | local simpleTable = {1,2,3,4} 24 | 25 | it("can be init with a table",function() 26 | local tbl = Df_Tbl(simpleTable) 27 | assert.are.same(tbl.data,simpleTable) 28 | end) 29 | 30 | it("# returns its length",function() 31 | local tbl = Df_Tbl(simpleTable) 32 | assert.are.same(#tbl,4) 33 | end) 34 | end) 35 | -------------------------------------------------------------------------------- /specs/linter.sh: 
-------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo -e "**********"; 3 | echo -e "* Linter *"; 4 | echo -e "**********"; 5 | echo ""; 6 | 7 | luacheck ../ --no-global --no-self --exclude-files ../specs/* -------------------------------------------------------------------------------- /specs/output/Wiki-templates/Readme.md: -------------------------------------------------------------------------------- 1 | Much of the Wiki requires examples with tables. It is therefore useful to use iTorch and export to markdown, which is then cleaned of the scripts and entered into the Wiki. 2 | -------------------------------------------------------------------------------- /specs/output/Wiki-templates/Where_update_and_set.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "iTorch", 5 | "language": "lua", 6 | "name": "itorch" 7 | }, 8 | "language_info": { 9 | "name": "lua", 10 | "version": "5.1" 11 | }, 12 | "name": "" 13 | }, 14 | "nbformat": 3, 15 | "nbformat_minor": 0, 16 | "worksheets": [ 17 | { 18 | "cells": [ 19 | { 20 | "cell_type": "heading", 21 | "level": 1, 22 | "metadata": {}, 23 | "source": [ 24 | "Load the packages" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "collapsed": false, 30 | "input": [ 31 | "require 'torch'\n", 32 | "require 'Dataframe'" 33 | ], 34 | "language": "python", 35 | "metadata": {}, 36 | "outputs": [] 37 | }, 38 | { 39 | "cell_type": "heading", 40 | "level": 1, 41 | "metadata": {}, 42 | "source": [ 43 | "Load the data" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "collapsed": false, 49 | "input": [ 50 | "my_data = Dataframe('../../data/realistic_29_row_data.csv')" 51 | ], 52 | "language": "python", 53 | "metadata": {}, 54 | "outputs": [] 55 | }, 56 | { 57 | "cell_type": "markdown", 58 | "metadata": {}, 59 | "source": [ 60 | "# Checkout the first couple of rows\n", 61 | "\n", 62 | "The simplest way to have a quick look at the data is to use `output` together with `head`/`tail` - the simplest form of subsetting" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "collapsed": false, 68 | "input": [ 69 | "my_data:head(2):output()\n", 70 | "my_data:tail(2):output()" 71 | ], 72 | "language": "python", 73 | "metadata": {}, 74 | "outputs": [] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "# Searching the dataframe\n", 81 | "\n", 82 | "The where can be convenient when you want to find a particular subset" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "collapsed": false, 88 | "input": [ 89 | "my_data:where('Gender', 'Male'):head(2):output()" 90 | ], 91 | "language": "python", 92 | "metadata": {}, 93 | "outputs": [] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "More flexible searching is allowed through custom search functions" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "collapsed": false, 105 | "input": [ 106 | "my_data:where(function(row) return row.Gender == \"Male\" and row.Weight > 70 end):output()" 107 | ], 108 | "language": "python", 109 | "metadata": {}, 110 | "outputs": [] 111 | }, 112 | { 113 | "cell_type": "markdown", 114 | "metadata": {}, 115 | "source": [ 116 | "# Update\n", 117 | "\n", 118 | "We can easily update the table using an update function" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "collapsed": false, 124 | "input": [ 125 | "my_data:\n", 126 | " update(\n", 
127 | " function(row) return row.Weight > 88 end,\n", 128 | " function(row)\n", 129 | " row.Weight = 88\n", 130 | " return row\n", 131 | " end)\n", 132 | "\n", 133 | "my_data:\n", 134 | " where(function(row) return row.Gender == \"Male\" and row.Weight > 70 end):\n", 135 | " output()" 136 | ], 137 | "language": "python", 138 | "metadata": {}, 139 | "outputs": [] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "# The set function\n", 146 | "\n", 147 | "Closely related to the update is the simpler set function" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "collapsed": false, 153 | "input": [ 154 | "my_data:\n", 155 | " set{item_to_find = 55.5, \n", 156 | " column_name = 'Weight', \n", 157 | " new_value = Df_Dict({Gender = \"Female\"})}\n", 158 | "\n", 159 | "my_data:\n", 160 | " where(function(row) return row.Gender == \"Female\" and row.Weight < 60 end):\n", 161 | " output()" 162 | ], 163 | "language": "python", 164 | "metadata": {}, 165 | "outputs": [] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "collapsed": true, 170 | "input": [], 171 | "language": "python", 172 | "metadata": {}, 173 | "outputs": [] 174 | } 175 | ], 176 | "metadata": {} 177 | } 178 | ] 179 | } -------------------------------------------------------------------------------- /specs/output/cli_output.lua: -------------------------------------------------------------------------------- 1 | require 'torch' 2 | 3 | -- Make sure that directory structure is always the same 4 | require('lfs') 5 | if (string.match(lfs.currentdir(), "/specs/output$")) then 6 | lfs.chdir("../..") 7 | end 8 | paths.dofile('init.lua') 9 | 10 | -- Go into tests so that the loading of CSV:s is the same as always 11 | lfs.chdir("specs/output") 12 | 13 | -- A quick way to get a feeling for how the __tostring method works 14 | local a = Dataframe() 15 | a:load_csv{path = "../data/simple_short.csv", 16 | verbose = false} 17 | a:add_column('boolean', true) 18 | a:set(2, Df_Dict{boolean = false}) 19 | a:set(3, Df_Dict{boolean = 0/0}) 20 | 21 | print("-- Simple table with boolean column --") 22 | print(a) 23 | 24 | a:output() 25 | 26 | print("-- Advanced table --") 27 | a:load_csv{path = "../data/advanced_short.csv", 28 | verbose = false} 29 | print(a) 30 | 31 | print(" - check digits") 32 | 33 | a:output{digits = 2} 34 | 35 | print("-- Long table --") 36 | a:load_csv{path = "../data/realistic_29_row_data.csv", 37 | verbose = false} 38 | a.tostring_defaults.no_rows = 5 39 | print(a) 40 | 41 | a.tostring_defaults.no_rows = 20 42 | print(a) 43 | 44 | a:as_categorical('Gender') 45 | a.tostring_defaults.no_rows = 5 46 | print(a) 47 | 48 | females = a:where('Gender', 'Female') 49 | print(females) 50 | 51 | math.randomseed(10) 52 | left_right = {} 53 | for i = 1,a:shape()["rows"] do 54 | if (math.random() > 0.5) then 55 | table.insert(left_right, "left") 56 | else 57 | table.insert(left_right, "right") 58 | end 59 | end 60 | a:add_column("Side", Dataseries(Df_Array(left_right))) 61 | print(a:head(4):tostring(Df_Array("Weight"))) 62 | 63 | a:as_categorical("Side") 64 | print(a:head(4):tostring("Comm")) 65 | 66 | tbl = { 67 | no = {}, 68 | one = {}, 69 | two = {}, 70 | three = {}, 71 | four = {}, 72 | five = {}, 73 | six = {}, 74 | seven = {}, 75 | eight = {}, 76 | nine = {} 77 | } 78 | 79 | local long_txt = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. 
Ut enim ad minim veniam, quis nostrud ex" 80 | for k,v in pairs(tbl) do 81 | for i=1,4 do 82 | if (k == "no") then 83 | v[#v + 1] = i 84 | else 85 | v[#v + 1] = long_txt 86 | end 87 | end 88 | end 89 | 90 | a = Dataframe{data=Df_Dict(tbl), 91 | column_order=Df_Array("no", "one", "two", "three", "four", "five", 92 | "six", "seven", "eight", "nine")} 93 | a:output() 94 | 95 | a = Dataframe(Df_Dict{ 96 | Filename = 11, 97 | fracture = 11, 98 | Side = 11, 99 | Exam_view = 11, 100 | osteoarthritis = 11, 101 | styloid = 11, 102 | prev_fracture = 11, 103 | Exam_body_part = 11 104 | }) 105 | 106 | print(a) 107 | -------------------------------------------------------------------------------- /specs/output/itorch_notebook_df_test.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "kernelspec": { 4 | "display_name": "iTorch", 5 | "language": "lua", 6 | "name": "itorch" 7 | }, 8 | "language_info": { 9 | "name": "lua", 10 | "version": "5.1" 11 | }, 12 | "name": "" 13 | }, 14 | "nbformat": 3, 15 | "nbformat_minor": 0, 16 | "worksheets": [ 17 | { 18 | "cells": [ 19 | { 20 | "cell_type": "code", 21 | "collapsed": false, 22 | "input": [ 23 | "require 'torch'\n", 24 | "require 'lfs'\n", 25 | "\n", 26 | "-- Make sure that directory structure is always the same\n", 27 | "if (string.match(lfs.currentdir(), \"/specs/output$\")) then\n", 28 | " lfs.chdir(\"../..\")\n", 29 | "end\n", 30 | "\n", 31 | "paths.dofile(lfs.currentdir() .. '/init.lua')\n", 32 | "\n", 33 | "-- Go into tests so that the loading of CSV:s is the same as always\n", 34 | "lfs.chdir(\"./specs/\")" 35 | ], 36 | "language": "python", 37 | "metadata": {}, 38 | "outputs": [] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "collapsed": false, 43 | "input": [ 44 | "itorch ~= nil" 45 | ], 46 | "language": "python", 47 | "metadata": {}, 48 | "outputs": [] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "collapsed": false, 53 | "input": [ 54 | "-- A quick way to get a feeling for how the __tostring method works\n", 55 | "a = Dataframe('./data/simple_short.csv')" 56 | ], 57 | "language": "python", 58 | "metadata": {}, 59 | "outputs": [] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "collapsed": false, 64 | "input": [ 65 | "print(\"-- Regular print with a Dataframe --\")\n", 66 | "print(a)" 67 | ], 68 | "language": "python", 69 | "metadata": {}, 70 | "outputs": [] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "collapsed": false, 75 | "input": [ 76 | "print(\" -- Check regular table -- \")\n", 77 | "print({1, 2, 3, {1,2,3, {4,5,6}}})" 78 | ], 79 | "language": "python", 80 | "metadata": {}, 81 | "outputs": [] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "collapsed": false, 86 | "input": [ 87 | "print(\"-- Long table --\")\n", 88 | "local a = Dataframe()\n", 89 | "a:load_csv{path = \"data/realistic_29_row_data.csv\",\n", 90 | " verbose = false}\n", 91 | "\n", 92 | "math.randomseed(10)\n", 93 | "left_right = {}\n", 94 | "for i = 1,a:shape()[\"rows\"] do\n", 95 | " if (math.random() > 0.5) then\n", 96 | " table.insert(left_right, \"left\")\n", 97 | " else\n", 98 | " table.insert(left_right, \"right\")\n", 99 | " end\n", 100 | "end\n", 101 | "a:add_column(\"Side\", Df_Array(left_right))\n", 102 | "a:output()" 103 | ], 104 | "language": "python", 105 | "metadata": {}, 106 | "outputs": [] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "collapsed": false, 111 | "input": [ 112 | "local a = Dataframe()\n", 113 | "a:load_csv{path = \"data/realistic_29_row_data.csv\",\n", 114 | " verbose = false}\n", 
115 | "a:as_categorical(\"Gender\")\n", 116 | "print(\"With set number of digits\")\n", 117 | "a:output{digits = 1}" 118 | ], 119 | "language": "python", 120 | "metadata": {}, 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "collapsed": true, 126 | "input": [], 127 | "language": "python", 128 | "metadata": {}, 129 | "outputs": [] 130 | } 131 | ], 132 | "metadata": {} 133 | } 134 | ] 135 | } -------------------------------------------------------------------------------- /specs/run_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | echo -e "\x1B[32m+++++++++++++++++++++++++++++++\x1B[0m"; 3 | echo -e "\x1B[32m+\x1B[0m Start torch-dataframe specs \x1B[32m+\x1B[0m"; 4 | echo -e "\x1B[32m+++++++++++++++++++++++++++++++\x1B[0m"; 5 | echo ""; 6 | 7 | VERSION="any" 8 | COVERAGE=false 9 | while [[ $# -gt 0 ]] 10 | do 11 | key="$1" 12 | 13 | case $key in 14 | -v|--version) 15 | VERSION="$2" 16 | shift # past argument 17 | ;; 18 | -c|--coverage) 19 | COVERAGE=true 20 | ;; 21 | *) 22 | # unknown option 23 | ;; 24 | esac 25 | shift # past argument or value 26 | done 27 | 28 | var=0 29 | count=0 30 | failed_scripts=() 31 | exclude_tags="skip_version_$VERSION" 32 | for f in `find . -name "*_spec*"`; do 33 | echo ""; 34 | echo "********************************************"; 35 | echo "Running specs in $f"; 36 | 37 | if [ "$COVERAGE" = true ]; then 38 | busted -v --coverage --exclude-tags=$exclude_tags,skip_all $f; 39 | else 40 | busted -v --exclude-tags=$exclude_tags,skip_all $f; 41 | fi 42 | 43 | fail=$? 44 | var=$(($var+$fail)) 45 | count=$(($count+1)) 46 | if [ $fail -ne 0 ] ; then 47 | failed_scripts+=($f) 48 | fi 49 | echo "End $f"; 50 | echo "********************************************"; 51 | done 52 | 53 | echo "" 54 | echo -e "\x1B[93m==============================================\x1B[0m" 55 | if [ $var -gt 0 ] 56 | then 57 | echo -e "Number of scripts failed: \x1B[31m$var\x1B[0m (total scripts: $count)" 58 | echo "Script(s) that failed:" 59 | for i in "${failed_scripts[@]}"; do 60 | echo " -!- $i"; 61 | done 62 | else 63 | echo "Number of scripts failed: $var (total scripts: $count)" 64 | fi 65 | echo " - exclude-tags used: $exclude_tags" 66 | echo -e "\x1B[93m==============================================\x1B[0m" 67 | 68 | exit $var 69 | -------------------------------------------------------------------------------- /specs/utils/test.lua: -------------------------------------------------------------------------------- 1 | require 'lfs' 2 | 3 | -- Make sure that directory structure is always the same 4 | if (string.match(lfs.currentdir(), "/specs$")) then 5 | lfs.chdir("..") 6 | end 7 | 8 | -- Include Dataframe lib 9 | dofile('init.lua') 10 | 11 | a = Dataframe() 12 | a:load_csv{ 13 | path = "/media/max/Ext_Enc_Rack/Extracted/dataset_4_torch_lda.csv", 14 | verbose = true, 15 | rows2explore = 1e4 16 | } 17 | -------------------------------------------------------------------------------- /specs/utils/utils_spec.lua: -------------------------------------------------------------------------------- 1 | require 'lfs' 2 | 3 | -- Ensure the test is launched within the specs/ folder 4 | assert(string.match(lfs.currentdir(), "specs")~=nil, "You must run this test in specs folder") 5 | 6 | local initial_dir = lfs.currentdir() 7 | 8 | -- Go to specs folder 9 | while (not string.match(lfs.currentdir(), "/specs$")) do 10 | lfs.chdir("..") 11 | end 12 | 13 | local specs_dir = lfs.currentdir() 14 | lfs.chdir("..")-- one more 
directory and it is lib root 15 | 16 | -- Include Dataframe lib 17 | dofile("init.lua") 18 | 19 | -- Go back into initial dir 20 | lfs.chdir(initial_dir) 21 | 22 | describe("#get_variable_type tests", function() 23 | describe("check integer rules", function() 24 | it("Single integer should give integer as result", function() 25 | local type = get_variable_type("1") 26 | assert.are.same(type, "integer") 27 | type = get_variable_type(23213) 28 | assert.are.same(type, "integer") 29 | end) 30 | 31 | it("previous double should give double", function() 32 | local type = get_variable_type("1", "double") 33 | assert.are.same(type, "double") 34 | type = get_variable_type(23213, "double") 35 | assert.are.same(type, "double") 36 | end) 37 | 38 | it("previous boolean should give string", function() 39 | local type = get_variable_type("1", "boolean") 40 | assert.are.same(type, "string") 41 | type = get_variable_type(23213, "boolean") 42 | assert.are.same(type, "string") 43 | end) 44 | end) 45 | 46 | describe("check double rules", function() 47 | it("Single double should give double as result", function() 48 | local type = get_variable_type("1.2") 49 | assert.are.same(type, "double") 50 | type = get_variable_type(23213.2) 51 | assert.are.same(type, "double") 52 | end) 53 | 54 | it("previous integer should give double", function() 55 | local type = get_variable_type("1.1", "integer") 56 | assert.are.same(type, "double") 57 | type = get_variable_type(23213.2, "integer") 58 | assert.are.same(type, "double") 59 | end) 60 | 61 | it("previous boolean should give string", function() 62 | local type = get_variable_type("1.2", "boolean") 63 | assert.are.same(type, "string") 64 | type = get_variable_type(23213.2, "boolean") 65 | assert.are.same(type, "string") 66 | end) 67 | end) 68 | 69 | describe("check boolean rules", function() 70 | it("Single boolean should give boolean as result", function() 71 | local type = get_variable_type("true") 72 | assert.are.same(type, "boolean") 73 | type = get_variable_type(true) 74 | assert.are.same(type, "boolean") 75 | end) 76 | 77 | it("previous integer should give string", function() 78 | local type = get_variable_type("true", "integer") 79 | assert.are.same(type, "string") 80 | type = get_variable_type(false, "integer") 81 | assert.are.same(type, "string") 82 | end) 83 | 84 | it("previous boolean should give boolean", function() 85 | local type = get_variable_type("false", "boolean") 86 | assert.are.same(type, "boolean") 87 | type = get_variable_type(true, "boolean") 88 | assert.are.same(type, "boolean") 89 | end) 90 | 91 | it("True/false should be case independent", function() 92 | for _,spelling in pairs({"tRue", "fAlse", "FALSE", "TRUE", "True", "False"}) do 93 | local type = get_variable_type(spelling) 94 | assert.are.same(type, "boolean") 95 | end 96 | end) 97 | end) 98 | end) 99 | -------------------------------------------------------------------------------- /utils/doc_helpers/get_anchors.lua: -------------------------------------------------------------------------------- 1 | function get_anchor_link(title, md_path, tag, indent) 2 | indent = indent or " " 3 | md_path = md_path or "" 4 | title = trim(title) 5 | title = title:gsub("(.+)%([^)]+%)", "%1") 6 | title = title:gsub("([^ `]+)%.__([^_()]+)__([^_`]*)", "%1.`__%2__`%3") 7 | title = title:gsub("%.__([^_()`]+)$", ".`__%1`") 8 | title = title:gsub("%._(.+)$", ".`_%1`") 9 | tag = trim(tag) 10 | 11 | return ("\n%s- [%s](%s#%s)"): 12 | format(indent, title, md_path, tag) 13 | end 14 | 15 | function 
get_doc_anchors(base_path, md_path, pd, rough_toc, detailed_toc) 16 | if (not base_path:match("/$")) then 17 | base_path = base_path .. "/" 18 | end 19 | local rel_md_path = md_path:gsub((base_path):quote(), "") 20 | rough_toc = rough_toc .. "\n- [".. pd.title .."]("..rel_md_path..")" 21 | detailed_toc = detailed_toc .. "\n- **[".. pd.title .."]("..rel_md_path..")**" 22 | for i=1,#pd.anchors.titles do 23 | detailed_toc = detailed_toc .. get_anchor_link(pd.anchors.titles[i], rel_md_path, pd.anchors.tags[i]) 24 | end 25 | return rough_toc, detailed_toc 26 | end 27 | -------------------------------------------------------------------------------- /utils/doc_helpers/parse_file.lua: -------------------------------------------------------------------------------- 1 | 2 | function parse_doc(raw_docs, file_name) 3 | -- Get documentation 4 | local doc_tbl = { 5 | content = trim(raw_docs), 6 | anchors = { 7 | tags = {}, 8 | titles = {} 9 | }, 10 | title = nil, 11 | title_rno = 0 12 | } 13 | 14 | local rows = doc_tbl.content:split("\n") 15 | for row_no,row in ipairs(rows) do 16 | if (row:match("^#")) then 17 | doc_tbl.title = trim(row:gsub("#", "")) 18 | doc_tbl.title_rno = row_no 19 | break 20 | end 21 | end 22 | 23 | -- If title not found use the file name 24 | if (not doc_tbl.title) then 25 | doc_tbl.title = "File: " .. file_name 26 | end 27 | 28 | if (doc_tbl.content:len() > 0) then 29 | rows = doc_tbl.content:split("\n") 30 | 31 | -- Remove empty rows and initial rows that are part of the title 32 | local tmp = {} 33 | for row_no,row in ipairs(rows) do 34 | if (row_no > doc_tbl.title_rno) then 35 | if(trim(row):len() > 0) then 36 | tmp[#tmp + 1] = row 37 | end 38 | end 39 | end 40 | rows = tmp 41 | 42 | -- Find all the anchors (i.e. <a name="..."> tags) in the text 43 | for idx,row in ipairs(rows) do 44 | if (row:match("<a name=\"[^\"]+\">")) then 45 | local subanchor_tag = row:gsub(".*<a name=\"([^\"]+)\">.*", "%1") 46 | local subtitle = subanchor_tag 47 | 48 | if (rows[idx + 1] and 49 | rows[idx + 1]:match("^%s*#")) then 50 | subtitle = trim(rows[idx + 1]:gsub("^#+", "")) 51 | end 52 | 53 | if (subtitle ~= doc_tbl.title) then 54 | doc_tbl.anchors.titles[#doc_tbl.anchors.titles + 1] = subtitle 55 | doc_tbl.anchors.tags[#doc_tbl.anchors.tags + 1] = subanchor_tag 56 | end 57 | end 58 | end 59 | end 60 | 61 | return doc_tbl 62 | end 63 | -------------------------------------------------------------------------------- /utils/doc_helpers/write_doc.lua: -------------------------------------------------------------------------------- 1 | 2 | function write_doc(parsed_data, file_name) 3 | 4 | -- Set the general anchor 5 | local anchor = "__" .. parsed_data.title .. "__" 6 | local title = parsed_data.title 7 | if (title:match("^[A-Z][a-z]") and 8 | not title:match("^Data") and 9 | not title:match("^Df") and 10 | not title:match("^Batc")) then 11 | title = title:sub(1,1):lower() .. title:sub(2) 12 | end 13 | local header = ("# API documentation for [%s](#%s)"): 14 | format(title, anchor) 15 | 16 | for i=1,#parsed_data.anchors.tags do 17 | header = header .. 
get_anchor_link(parsed_data.anchors.titles[i], nil, parsed_data.anchors.tags[i], "") 18 | end 19 | 20 | local docfile = io.open(file_name, "w") 21 | docfile:write(header) 22 | docfile:write(("\n\n<a name=\"%s\">\n%s"):format(anchor, parsed_data.content)) 23 | docfile:close() 24 | 25 | end 26 | -------------------------------------------------------------------------------- /utils/loader.lua: -------------------------------------------------------------------------------- 1 | local argcheck = require "argcheck" 2 | local paths = require "paths" 3 | local argdoc = require 'argcheck.doc' 4 | 5 | argdoc[[ 6 | 7 | ## Package load functions 8 | 9 | ]] 10 | 11 | paths.get_sorted_files = argcheck{ 12 | doc=[[ 13 | 14 | ### paths.get_sorted_files(@ARGP) 15 | 16 | Calls the `paths.files()` with the directory and sorts the files according to 17 | name. 18 | 19 | @ARGT 20 | 21 | _Return value_: table with sorted file names 22 | ]], 23 | {name="path", type="string", 24 | doc="The directory path"}, 25 | {name="match_str", type="string", default="[.]lua$", 26 | doc="The file matching string to search for. Defaults to lua file endings."}, 27 | call=function(path, match_str) 28 | local files = {} 29 | for f in paths.files(path) do 30 | if (f:match(match_str)) then 31 | files[#files + 1] = f 32 | end 33 | end 34 | 35 | table.sort(files) 36 | 37 | return files 38 | end} 39 | 40 | load_dir_files = argcheck{ 41 | doc=[[ 42 | 43 | ### load_dir_files(@ARGP) 44 | 45 | Traverses a directory and loads all files within 46 | 47 | @ARGT 48 | 49 | _Return values_: 50 | 1. The files loaded in the processed order 51 | 2. The doc content if `docs` argument was true - otherwise it's an empty table 52 | ]], 53 | {name="path", type="string", doc="The directory"}, 54 | {name="params", type="table", doc="Objects to pass to the files", default={}}, 55 | {name="docs", type="boolean", doc="Run with argcheck.doc", default=false}, 56 | call = (function() 57 | -- Hidden variable that makes sure we don't reload files 58 | local loaded_files = {paths.thisfile()} 59 | 60 | local function is_loaded(file) 61 | for _,fn in ipairs(loaded_files) do 62 | if (fn == file) then 63 | return true 64 | end 65 | end 66 | 67 | return false 68 | end 69 | 70 | local function load_file(file, params, docs, ret_docs, ret_fpaths) 71 | if (docs) then 72 | argdoc.record() 73 | end 74 | 75 | local ret = assert(loadfile(file))(table.unpack(params)) 76 | 77 | if (docs) then 78 | 79 | -- Assigns to parent ret_docs 80 | ret_docs[file] = argdoc.stop() 81 | end 82 | 83 | table.insert(loaded_files, file) 84 | table.insert(ret_fpaths, file) 85 | return ret 86 | end 87 | 88 | return function(path, params, docs) 89 | assert(paths.dirp(path), ("The path '%s' isn't a valid directory"):format(path)) 90 | table.insert(params, path) 91 | local ret_docs = {} 92 | local ret_fpaths = {} 93 | 94 | if (paths.filep(path .. "init.lua")) then 95 | local obj = load_file(path .. "init.lua", params, docs, ret_docs, ret_fpaths) 96 | table.insert(params, 1, obj) 97 | end 98 | 99 | local files = paths.get_sorted_files(path) 100 | for _,file in pairs(files) do 101 | file = path .. file 102 | 103 | if (not is_loaded(file)) then 104 | 105 | load_file(file, params, docs, ret_docs, ret_fpaths) 106 | 107 | end 108 | end 109 | 110 | return ret_fpaths, ret_docs 111 | end 112 | end)()} 113 | --------------------------------------------------------------------------------
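For illustration, a minimal usage sketch for `load_dir_files` above. The `dataframe/` path and the `Dataframe` class table passed via `params` are assumptions made for this example (presumably similar to how the package's `init.lua` wires its modules together), not code taken from the repository:

-- Hypothetical call site for load_dir_files (illustrative sketch only).
-- Assumes a Dataframe class table already exists for the loaded files to
-- extend. load_dir_files appends the path to params, so every file under
-- dataframe/ is executed with (Dataframe, "dataframe/") as its varargs.
local loaded_paths, file_docs = load_dir_files{
	path = "dataframe/",
	params = {Dataframe},
	docs = true
}

-- Files come back in the order they were processed; with docs = true the
-- argcheck documentation recorded for each file is returned alongside.
for _,file in ipairs(loaded_paths) do
	print("Loaded: " .. file)
end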