├── .editorconfig ├── .gitattributes ├── .gitignore ├── CODE_OF_CONDUCT.md ├── LICENSE ├── Makefile ├── README.md ├── examples ├── data.json ├── main.v ├── people-500000.csv └── titanic.parquet ├── src ├── core.v ├── explore.v ├── explore_test.v ├── funcs.v ├── funcs_test.v ├── io.v ├── models.v ├── mutation.v └── mutation_test.v └── v.mod /.editorconfig: -------------------------------------------------------------------------------- 1 | [*] 2 | charset = utf-8 3 | end_of_line = lf 4 | insert_final_newline = true 5 | trim_trailing_whitespace = true 6 | 7 | [*.v] 8 | indent_style = tab 9 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | * text=auto eol=lf 2 | *.bat eol=crlf 3 | 4 | **/*.v linguist-language=V 5 | **/*.vv linguist-language=V 6 | **/*.vsh linguist-language=V 7 | **/v.mod linguist-language=V 8 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Binaries for programs and plugins 2 | vframes 3 | 4 | # Ignore binary output folders 5 | tmp/ 6 | 7 | # Ignore common editor/system specific metadata 8 | .DS_Store 9 | .vscode/ 10 | docs/ 11 | 12 | # ENV 13 | .env 14 | 15 | # Local databases 16 | *.db 17 | 18 | # Reserved files 19 | TODO.md 20 | Makefile 21 | DEV.md -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # VFrames Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 
14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | rodrigo.abt@gmail.com. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. 
Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Rodrigo Abt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: link docs
2 |
3 | link:
4 | 	ln -s /home/rabt/.vmodules/rodabt/vframes /home/rabt/devel/vframes
5 |
6 | docs:
7 | 	VDOC_SORT=false v doc -comments -color -f html -m vframes -o docs -readme -inline-assets
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # vframes 0.1.2
2 |
3 | A DataFrame library for V, inspired by Python's Pandas. It should work on Linux, Windows, and macOS (still being tested). It uses the powerful DuckDB database as a backend.
4 |
5 | This is still a WIP. More functions, documentation, tutorials, and examples will be added soon.
6 |
7 | ## Dependencies
8 |
9 | [VDuckDB wrapper library](https://github.com/rodabt/vduckdb)
10 |
11 | ## Installation
12 |
13 | ```bash
14 | v install https://github.com/rodabt/vduckdb
15 | v install https://github.com/rodabt/vframes
16 | ```
17 |
18 | ## Basic usage example
19 |
20 | Make sure the files `people-500000.csv`, `titanic.parquet`, and `data.json` are in the same directory as your `.v` file (check the `examples` dir).
21 |
22 | ```v
23 | import vframes
24 |
25 | // A convenience function for better printing
26 | fn printlne(s string) {
27 | 	println('\n${s}\n')
28 | }
29 |
30 | fn main() {
31 |
32 | 	printlne("VFrames version: ${vframes.version()}")
33 |
34 | 	printlne("First initialize a new context. If no arguments are given, an in-memory database is used")
35 | 	mut ctx := vframes.init()
36 |
37 | 	printlne("Load 500,000 records from a CSV")
38 | 	df := ctx.read_auto('people-500000.csv')!
39 |
40 | 	printlne("Print first 5 records:")
41 | 	df.head(5)
42 |
43 | 	printlne("Assign first 10 records to variable data as []map[string]json2.Any")
44 | 	data := df.head(10, to_stdout: false)
45 | 	println(data)
46 |
47 | 	printlne("Print last 5 records:")
48 | 	df.tail(5)
49 |
50 | 	printlne("DataFrame info:")
51 | 	df.info()
52 |
53 | 	printlne("DataFrame shape: ${df.shape()}")
54 |
55 | 	printlne("Describe DataFrame:")
56 | 	df.describe()
57 |
58 | 	printlne("Create new DF with new column 'new_col'=Index*5, and select a subset of columns (Email, Phone, new_col):")
59 | 	df2 := df
60 | 		.add_column('new_col', 'Index*5')
61 | 		.subset(['Email','Phone','new_col'])
62 | 	df2.head(10)
63 |
64 | 	printlne("Delete Email from new DF:")
65 | 	df3 := df2.delete_column('Email')
66 | 	df3.head(10)
67 |
68 | 	printlne("Load parquet (Titanic):")
69 | 	df4 := ctx.read_auto('titanic.parquet')!
70 | 	df4.head(10)
71 |
72 | 	printlne("Describe:")
73 | 	df4.describe()
74 |
75 | 	printlne("Average of Age and Fare by Sex and Embarked:")
76 | 	df5 := df4.group_by(['Sex','Embarked'],{"age_avg": "avg(Age)", "avg_fare": "avg(Fare)"})
77 | 	df5.head(10)
78 |
79 | 	printlne("Slice(2,3) of first DataFrame:")
80 | 	df6 := df.slice(2,3)
81 | 	df6.head(10)
82 |
83 | 	println("Reading a JSON file:")
84 | 	df7 := ctx.read_auto("data.json")!
85 | 	df7.head(10)
86 |
87 | 	printlne("Error control: try to load an invalid file")
88 | 	_ := ctx.read_auto('no_valid.csv') or {
89 | 		eprintln(err.msg())
90 | 		vframes.empty()
91 | 	}
92 |
93 | 	ctx.close()
94 | }
95 | ```
96 |
97 | ## Considerations
98 |
99 | - VFrames uses DuckDB under the hood through the VDuckDB wrapper library, so in theory every operation DuckDB allows should eventually be supported by VFrames.
100 | - DataFrames are currently immutable by design, so every mutating operation returns a new DataFrame that you assign to a new variable (see the sketch below).
101 |
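For example, here is a minimal sketch of that immutable, chainable style. The `doubled`, `trimmed`, and `final` variables and the `double_index` column are just illustrative names; `df`, `Index`, and `Email` come from the usage example above:

```v
// Every operation returns a brand-new DataFrame; `df` itself is never modified
doubled := df.add_column('double_index', 'Index*2') // 'Index*2' can be any expression DuckDB understands
trimmed := doubled.subset(['Email', 'double_index']) // a new frame with only these two columns
final := trimmed.delete_column('Email') // yet another new frame
final.head(5) // `df`, `doubled`, and `trimmed` are all still available and unchanged
```

Under the hood, each of these frames is backed by its own DuckDB table, which is why the intermediate results stay intact.
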
102 | ## Initial settings
103 |
104 | ### DataFrameContext
105 |
106 | To use the library you must first initialize a DataFrame context, which defines the kind of storage your DataFrames will use:
107 |
108 | ```v
109 | mut ctx := vframes.init() // In memory
110 | mut ctx := vframes.init(location: 'mycontext.db') // Persisted to 'mycontext.db'
111 | ```
112 |
113 | ### DataFrame
114 |
115 | If you want to suppress console output for functions returning `Data`, set the optional parameter `to_stdout` to `false` (see the examples dir).
116 |
117 | ## Accepted data file formats
118 |
119 | The structure of most CSV, Parquet, and JSON files is inferred automatically. In the future, there will be options to fine-tune loading parameters such as delimiters, column renames, partial loading, etc.
120 |
121 | ## Current functions
122 |
123 | Last updated on 2024-12-21
124 |
125 | - [X] columns
126 | - [X] dtypes
127 | - [X] empty
128 | - [X] group_by
129 | - [X] head
130 | - [X] info
131 | - [X] query
132 | - [X] shape
133 | - [X] tail
134 | - [X] values
135 | - [X] abs
136 | - [X] add
137 | - [X] add_prefix
138 | - [X] add_suffix
139 | - [X] max
140 | - [X] min
141 | - [X] mean
142 | - [X] median
143 | - [X] sum
144 | - [X] dropna
145 | - [X] pow
146 |
147 | ## Roadmap
148 |
149 | Although the library is heavily inspired by Pandas, **its purpose is NOT to be a one-to-one replacement**. Some of the planned functionalities are listed below:
150 |
151 | - [ ] DataFrame joins
152 | - [ ] Query deferral
153 | - [ ] Basic plotting
154 |
155 | ## How to contribute
156 |
157 | Comments, bug reports, requests, and pull requests are welcome.
158 |
--------------------------------------------------------------------------------
/examples/data.json:
--------------------------------------------------------------------------------
1 | {"x":10,"y":5,"z":"test"}
2 | {"x":3,"y":6,"z":"test2"}
--------------------------------------------------------------------------------
/examples/main.v:
--------------------------------------------------------------------------------
1 | import vframes
2 |
3 | fn printlne(s string) {
4 | 	println('\n${s}\n')
5 | }
6 |
7 | fn main() {
8 |
9 | 	printlne("VFrames version: ${vframes.version()}")
10 |
11 | 	printlne("Initialize context in memory")
12 | 	mut ctx := vframes.init() // location: 'ctx.db'
13 |
14 | 	printlne("Load 500,000 records from a CSV")
15 | 	df := ctx.read_auto('people-500000.csv')!
16 |
17 | 	printlne("Print first 5 records:")
18 | 	df.head(5)
19 |
20 | 	printlne("Assign first 10 records to variable data as []map[string]json2.Any")
21 | 	data := df.head(10, to_stdout: false)
22 | 	println(data)
23 |
24 | 	printlne("Print last 5 records:")
25 | 	df.tail(5)
26 |
27 | 	printlne("DataFrame info:")
28 | 	df.info()
29 |
30 | 	printlne("DataFrame shape: ${df.shape()}")
31 |
32 | 	printlne("Describe DataFrame:")
33 | 	df.describe()
34 |
35 | 	printlne("Create new DF with new column 'new_col'=Index*5, and select a subset of columns (Email, Phone, new_col):")
36 | 	df2 := df
37 | 		.add_column('new_col', 'Index*5')
38 | 		.subset(['Email','Phone','new_col'])
39 | 	df2.head(10)
40 |
41 | 	printlne("Delete Email from new DF:")
42 | 	df3 := df2.delete_column('Email')
43 | 	df3.head(10)
44 |
45 | 	printlne("Load parquet (Titanic):")
46 | 	df4 := ctx.read_auto('titanic.parquet')!
47 | df4.head(10) 48 | 49 | printlne("Describe:") 50 | df4.describe() 51 | 52 | printlne("Average of Age and Fare by Sex and Embarked:") 53 | df5 := df4.group_by(['Sex','Embarked'],{"age_avg": "avg(Age)", "avg_fare": "avg(Fare)"}) 54 | df5.head(10) 55 | 56 | printlne("Slice(2,3) of first DataFrame:") 57 | df6 := df.slice(2,3) 58 | df6.head(10) 59 | 60 | println("Reading a JSON file:") 61 | df7 := ctx.read_auto("data.json")! 62 | df7.head(100) 63 | 64 | printlne("Error control: try to load a non valid file") 65 | _ := ctx.read_auto('no_valid.csv') or { 66 | eprintln(err.msg()) 67 | vframes.empty() 68 | } 69 | 70 | ctx.close() 71 | } 72 | 73 | -------------------------------------------------------------------------------- /examples/titanic.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rodabt/vframes/9fd969d826a5ff335859f4be8629f85f41c922ec/examples/titanic.parquet -------------------------------------------------------------------------------- /src/core.v: -------------------------------------------------------------------------------- 1 | module vframes 2 | 3 | import vduckdb 4 | import rand 5 | import v.vmod 6 | 7 | // Initializes a new DataFrame context 8 | pub fn init(cfg ContextConfig) DataFrameContext { 9 | mut db := vduckdb.DuckDB{} 10 | _ := db.open(cfg.location) or { panic(err) } 11 | _ := db.query("select 1") or { panic(err) } 12 | return DataFrameContext{ 13 | dpath: cfg.location 14 | db: db 15 | } 16 | } 17 | 18 | // Closes a DataFrame context 19 | pub fn (mut ctx DataFrameContext) close() { 20 | ctx.db.close() 21 | } 22 | 23 | // Prints vframes version 24 | pub fn version() string { 25 | vm := vmod.decode(@VMOD_FILE) or { panic(err) } 26 | return vm.version 27 | } 28 | 29 | // Returns an empty in-memory DataFrame. Mainly used as a Result parameter for `read_auto` function 30 | pub fn empty() DataFrame { 31 | mut ctx := init() 32 | id := 'tbl_${rand.ulid()}' 33 | return DataFrame{ 34 | id: id 35 | ctx: ctx 36 | } 37 | } -------------------------------------------------------------------------------- /src/explore.v: -------------------------------------------------------------------------------- 1 | module vframes 2 | 3 | import x.json2 4 | 5 | // Shows first `n` records from DataFrame. 
Use `to_stdout: false` to return the data as `[]map[string]json2.Any` instead of the console 6 | // Example: 7 | // ```v 8 | // df.head(10) // Prints the first 10 records to console 9 | // data := df.head(10, to_stdout: false) // Assigns the result as []map[string]json2.Any to data 10 | // ``` 11 | pub fn (df DataFrame) head(n int, dconf DFConfig) Data { 12 | if n <= 0 { 13 | return Data([]map[string]json2.Any{}) 14 | } 15 | mut db := &df.ctx.db 16 | _ := db.query("select * from ${df.id} limit ${n}") or { panic(err) } 17 | if dconf.to_stdout { 18 | println(db.print_table(max_rows: df.display_max_rows, mode: df.display_mode)) 19 | } 20 | return Data(db.get_array()) 21 | } 22 | 23 | // Same as `head`, but for last `n`records 24 | pub fn (df DataFrame) tail(n int, dconf DFConfig) Data { 25 | if n <= 0 { 26 | return Data([]map[string]json2.Any{}) 27 | } 28 | mut db := &df.ctx.db 29 | q := " 30 | WITH _base as ( 31 | SELECT row_number() OVER() as _row_num,* 32 | FROM ${df.id} 33 | ) SELECT * EXCLUDE(_row_num) FROM (SELECT * FROM _base ORDER BY _row_num DESC limit ${n}) ORDER BY _row_num ASC 34 | " 35 | _ := db.query(q) or { panic(err) } 36 | if dconf.to_stdout { 37 | println(db.print_table(max_rows: df.display_max_rows, mode: df.display_mode)) 38 | } 39 | return db.get_array() 40 | } 41 | 42 | // Shows DataFrame columns names and data types 43 | pub fn (df DataFrame) info(dconf DFConfig) Data { 44 | mut db := &df.ctx.db 45 | _ := db.query("SELECT column_name,column_type FROM (DESCRIBE SELECT * FROM ${df.id})") or { panic(err) } 46 | if dconf.to_stdout { 47 | println(db.print_table(max_rows: df.display_max_rows, mode: df.display_mode)) 48 | } 49 | return db.get_array() 50 | } 51 | 52 | // Shows columns basic statistics (nulls, max, min, etc.) 53 | pub fn (df DataFrame) describe(dconf DFConfig) Data { 54 | mut db := &df.ctx.db 55 | _ := db.query("SELECT * FROM (SUMMARIZE SELECT * FROM ${df.id})") or { panic(err) } 56 | if dconf.to_stdout { 57 | println(db.print_table(max_rows: df.display_max_rows, mode: df.display_mode)) 58 | } 59 | return db.get_array() 60 | } 61 | 62 | // Returns the number of rows and columns of the DataFrame 63 | pub fn (df DataFrame) shape() []int { 64 | mut db := &df.ctx.db 65 | _ := db.query('SELECT COUNT(*) as rows FROM ${df.id}') or { panic(err) } 66 | res_rows := db.get_array() 67 | num_rows := (res_rows[0]["rows"] or {0}).int() 68 | 69 | _ := db.query('SELECT COUNT(DISTINCT column_name) as cols FROM (SUMMARIZE SELECT * FROM ${df.id})') or { panic(err) } 70 | res_cols := db.get_array() 71 | num_cols := (res_cols[0]["cols"] or {0}).int() 72 | 73 | return [num_rows,num_cols] 74 | } 75 | 76 | @[params] 77 | struct ValuesParams { 78 | as_string bool 79 | } 80 | 81 | // Returns all the data from DataFrame as []map[string]json2.Any or []map[string]string if `as_string` is true 82 | // NOTE: Use with caution because it will dump all the DataFrame data to memory 83 | pub fn (df DataFrame) values(vp ValuesParams) Data { 84 | mut db := &df.ctx.db 85 | _ := db.query('SELECT * FROM ${df.id}') or { panic(err) } 86 | if vp.as_string { 87 | return db.get_array_as_string() 88 | } 89 | return db.get_array() 90 | } 91 | 92 | // Returns an array of column names 93 | pub fn (df DataFrame) columns() []string { 94 | mut db := &df.ctx.db 95 | _ := db.query('SELECT * FROM ${df.id} where 1=0') or { panic(err) } 96 | return db.columns.keys() 97 | } 98 | 99 | // Returns a map of columns and their types 100 | pub fn (df DataFrame) dtypes() map[string]string { 101 | mut db := &df.ctx.db 102 | _ := 
db.query('SELECT * FROM ${df.id} where 1=0') or { panic(err) } 103 | return db.columns 104 | } -------------------------------------------------------------------------------- /src/explore_test.v: -------------------------------------------------------------------------------- 1 | module vframes 2 | 3 | import x.json2 4 | 5 | const data = [ 6 | {"x": json2.Any(1), "y": json2.Any("a"), "z": json2.Any(100.0) }, 7 | {"x": json2.Any(2), "y": json2.Any("bb"), "z": json2.Any(250.0) }, 8 | {"x": json2.Any(3), "y": json2.Any("ccc"), "z": json2.Any(400.5) } 9 | ] 10 | 11 | fn df_init(d []map[string]json2.Any) DataFrame { 12 | mut ctx := vframes.init() 13 | df := ctx.read_records(d) 14 | return df 15 | } 16 | 17 | fn test__head_zero() { 18 | mut df := df_init(data) 19 | assert df.head(0) == Data([]map[string]json2.Any{}) 20 | } 21 | 22 | fn test__head_two() { 23 | mut df := df_init(data) 24 | result := Data([ 25 | {"x": json2.Any(1), "y": json2.Any("a"), "z": json2.Any(100.0) }, 26 | {"x": json2.Any(2), "y": json2.Any("bb"), "z": json2.Any(250.0) } 27 | ]) 28 | assert df.head(2, to_stdout: false).str() == result.str() 29 | } 30 | 31 | fn test__head_hundred() { 32 | mut df := df_init(data) 33 | assert df.head(100, to_stdout: false).str() == vframes.Data(data).str() 34 | } 35 | 36 | fn test__tail_zero() { 37 | mut df := df_init(data) 38 | assert df.tail(0) == Data([]map[string]json2.Any{}) 39 | } 40 | 41 | fn test__tail_one() { 42 | mut df := df_init(data) 43 | result := Data([ 44 | {"x": json2.Any(3), "y": json2.Any("ccc"), "z": json2.Any(400.5) } 45 | ]) 46 | assert df.tail(1, to_stdout: false).str() == result.str() 47 | } 48 | 49 | fn test__tail_hundred() { 50 | mut df := df_init(data) 51 | assert df.tail(100, to_stdout: false).str() == vframes.Data(data).str() 52 | } -------------------------------------------------------------------------------- /src/funcs.v: -------------------------------------------------------------------------------- 1 | module vframes 2 | 3 | import rand 4 | 5 | // Internal: Apply function 'func' to numeric values 6 | fn (df DataFrame) v_apply(func string, args ...string) !DataFrame { 7 | id := 'tbl_${rand.ulid()}' 8 | mut db := &df.ctx.db 9 | mut cols := []string{} 10 | for k,v in df.dtypes() { 11 | if v in ['integer','decimal','float','bigint','double','hugeint'] { 12 | cols << if args.len > 0 { '${func}("${k}",${args.join(',')}) as "${k}"' } else { '${func}("${k}") as "${k}"' } 13 | } else { 14 | cols << k 15 | } 16 | } 17 | _ := db.query("create table ${id} as select ${cols.join(',')} from ${df.id}")! 18 | return DataFrame{ 19 | id: id 20 | ctx: df.ctx 21 | } 22 | } 23 | 24 | @[params] 25 | struct FuncOptions { 26 | axis int = 1 27 | skipna bool = true 28 | } 29 | 30 | // Internal: Apply grouping function 'func' to numeric values 31 | fn (df DataFrame) min_max_apply(func string, fo FuncOptions) !DataFrame { 32 | id := 'tbl_${rand.ulid()}' 33 | order_by := if func == 'min' { 'desc' } else { 'asc' } 34 | mut db := &df.ctx.db 35 | mut cols := []string{} 36 | for k,v in df.dtypes() { 37 | cols << if v in ['integer','decimal','float','bigint','double','hugeint'] { 38 | '${func}("${k}") as "${k}"' 39 | } else { 40 | 'last("${k}" order by "${k}" ${order_by}) as "${k}"' 41 | } 42 | } 43 | _ := db.query("create table ${id} as select ${cols.join(',')} from ${df.id}")! 44 | return DataFrame{ 45 | id: id 46 | ctx: df.ctx 47 | } 48 | } 49 | 50 | // Calculates the `func` value for each of the rows (`axis: 0`) or columns (`axis: 1`. 
default) of the DataFrame 51 | // NOTE: Only returns the numeric values 52 | fn (df DataFrame) g_apply(func string, fo FuncOptions) DataFrame { 53 | id := 'tbl_${rand.ulid()}' 54 | mut db := &df.ctx.db 55 | mut cols := []string{} 56 | for k,v in df.dtypes() { 57 | if v in ['integer','decimal','float','bigint','double','hugeint'] { 58 | cols << '${func}("${k}") as "${k}"' 59 | } 60 | } 61 | _ := db.query("create table ${id} as select ${cols.join(',')} from ${df.id}") or { panic(err) } 62 | return DataFrame{ 63 | id: id 64 | ctx: df.ctx 65 | } 66 | } 67 | 68 | // Adds value `n` to all numeric values 69 | pub fn (df DataFrame) add[T](n T) DataFrame { 70 | id := 'tbl_${rand.ulid()}' 71 | mut db := &df.ctx.db 72 | mut cols := []string{} 73 | for k,v in df.dtypes() { 74 | cols << if v in ['integer','decimal','float','bigint','double','hugeint'] { '${k}+${n.str()} as "${k}"'} else { k } 75 | } 76 | _ := db.query("create table ${id} as select ${cols.join(',')} from ${df.id}") or { panic(err) } 77 | return DataFrame{ 78 | id: id 79 | ctx: df.ctx 80 | } 81 | } 82 | 83 | // Calculates the absolute value for each element of the DataFrame 84 | pub fn (df DataFrame) abs() DataFrame { 85 | new_df := df.v_apply('abs') or { panic(err) } 86 | return new_df 87 | } 88 | 89 | // Calculates the max value for each of the rows (`axis: 0`) or columns (`axis: 1`. default) of the DataFrame 90 | pub fn (df DataFrame) max(fo FuncOptions) DataFrame { 91 | new_df := df.min_max_apply('max', fo) or { panic(err) } 92 | return new_df 93 | } 94 | 95 | // Calculates the max value for each of the rows (`axis: 0`) or columns (`axis: 1`. default) of the DataFrame 96 | pub fn (df DataFrame) min(fo FuncOptions) DataFrame { 97 | new_df := df.min_max_apply('min', fo) or { panic(err) } 98 | return new_df 99 | } 100 | 101 | // Calculates the mean value for each of the rows (`axis: 0`) or columns (`axis: 1`. default) of the DataFrame 102 | pub fn (df DataFrame) mean(fo FuncOptions) DataFrame { 103 | new_df := df.g_apply('mean', fo) 104 | return new_df 105 | } 106 | 107 | // Calculates the median value for each of the rows (`axis: 0`) or columns (`axis: 1`. default) of the DataFrame 108 | pub fn (df DataFrame) median(fo FuncOptions) DataFrame { 109 | new_df := df.g_apply('median', fo) 110 | return new_df 111 | } 112 | 113 | 114 | // Calculates the sum for each of the rows (`axis: 0`) or columns (`axis: 1`. 
default) of the DataFrame 115 | pub fn (df DataFrame) sum(fo FuncOptions) DataFrame { 116 | new_df := df.g_apply('sum', fo) 117 | return new_df 118 | } 119 | 120 | // Calculates the exponential power (`element^n`) for each element of the Dataframe 121 | pub fn (df DataFrame) pow(n int, fo FuncOptions) DataFrame { 122 | new_df := df.v_apply('pow', n.str()) or { panic(err) } 123 | return new_df 124 | } 125 | 126 | -------------------------------------------------------------------------------- /src/funcs_test.v: -------------------------------------------------------------------------------- 1 | module vframes 2 | 3 | import x.json2 4 | 5 | const data = [ 6 | {"x": json2.Any(1), "y": json2.Any("a"), "z": json2.Any(-100.0) }, 7 | {"x": json2.Any(3), "y": json2.Any("c"), "z": json2.Any(300.0) } 8 | ] 9 | 10 | fn df_init(d []map[string]json2.Any) DataFrame { 11 | mut ctx := vframes.init() 12 | df := ctx.read_records(d) 13 | return df 14 | } 15 | 16 | 17 | fn test__add_integer() { 18 | mut df := df_init(data) 19 | result := Data([ 20 | {"x": json2.Any(3), "y": json2.Any("a"), "z": json2.Any(-98.0) }, 21 | {"x": json2.Any(5), "y": json2.Any("c"), "z": json2.Any(302.0) } 22 | ]) 23 | assert df.add[int](2).values().str() == result.str() 24 | } 25 | 26 | fn test__add_decimal() { 27 | mut df := df_init(data) 28 | result := Data([ 29 | {"x": json2.Any(2.2), "y": json2.Any("a"), "z": json2.Any(-98.8) }, 30 | {"x": json2.Any(4.2), "y": json2.Any("c"), "z": json2.Any(301.2) } 31 | ]) 32 | assert df.add(1.2).values().str() == result.str() 33 | } 34 | 35 | fn test__abs() { 36 | mut df := df_init(data) 37 | result := Data([ 38 | {"x": json2.Any(1), "y": json2.Any("a"), "z": json2.Any(100.0) }, 39 | {"x": json2.Any(3), "y": json2.Any("c"), "z": json2.Any(300.0) } 40 | ]) 41 | assert df.abs().values().str() == result.str() 42 | } 43 | 44 | fn test__max() { 45 | mut df := df_init(data) 46 | result := Data([ 47 | {"x": json2.Any(3), "y": json2.Any("c"), "z": json2.Any(300.0) } 48 | ]) 49 | assert df.max().values().str() == result.str() 50 | } 51 | 52 | fn test__min() { 53 | mut df := df_init(data) 54 | result := Data([ 55 | {"x": json2.Any(1), "y": json2.Any("a"), "z": json2.Any(-100.0) } 56 | ]) 57 | assert df.min().values().str() == result.str() 58 | } 59 | 60 | fn test__mean() { 61 | mut df := df_init(data) 62 | result := Data([ 63 | {"x": json2.Any(2), "z": json2.Any(100.0) } 64 | ]) 65 | assert df.mean().values().str() == result.str() 66 | } 67 | 68 | fn test__median() { 69 | d := [ 70 | {"x": json2.Any(-10.3),"y": json2.Any(-50000),"z": json2.Any('a')}, 71 | {"x": json2.Any(-1),"y": json2.Any(0),"z": json2.Any('b')}, 72 | {"x": json2.Any(2),"y": json2.Any(-3),"z": json2.Any('c')} 73 | ] 74 | mut df := df_init(d) 75 | result := Data([ 76 | {"x": json2.Any(-1), "y": json2.Any(-3) } 77 | ]) 78 | assert df.median().values().str() == result.str() 79 | } 80 | 81 | fn test__sum() { 82 | d := [ 83 | {"x": json2.Any(10),"y": json2.Any(14),"z": json2.Any('a')}, 84 | {"x": json2.Any(4),"y": json2.Any(10),"z": json2.Any('b')}, 85 | {"x": json2.Any(2),"y": json2.Any(15),"z": json2.Any('c')} 86 | ] 87 | mut df := df_init(d) 88 | result := Data([ 89 | {"x": json2.Any(16), "y": json2.Any(39) } 90 | ]) 91 | assert df.sum().values().str() == result.str() 92 | } 93 | 94 | fn test__pow() { 95 | d := [ 96 | {"x": json2.Any(10),"y": json2.Any(14),"z": json2.Any('a')}, 97 | {"x": json2.Any(4),"y": json2.Any(10),"z": json2.Any('b')}, 98 | {"x": json2.Any(2),"y": json2.Any(15),"z": json2.Any('c')} 99 | ] 100 | mut df := 
df_init(d) 101 | result := Data([ 102 | {"x": json2.Any(100),"y": json2.Any(196),"z": json2.Any('a')}, 103 | {"x": json2.Any(16),"y": json2.Any(100),"z": json2.Any('b')}, 104 | {"x": json2.Any(4),"y": json2.Any(225),"z": json2.Any('c')} 105 | ]) 106 | assert df.pow(2).values().str() == result.str() 107 | } -------------------------------------------------------------------------------- /src/io.v: -------------------------------------------------------------------------------- 1 | module vframes 2 | 3 | import os 4 | import rand 5 | import x.json2 6 | 7 | // Reads a data file from disk. It tries automatically to infer the structure directly from the file 8 | // Currently Accepted formats: .csv, .json, .parquet 9 | // NOTE: The json parser is still under testing 10 | pub fn (mut ctx DataFrameContext) read_auto(filename string) !DataFrame { 11 | if !os.is_file(filename) { 12 | return error("Incorrect filename: ${filename}") 13 | } 14 | id := 'tbl_${rand.ulid()}' 15 | mut df := DataFrame{ 16 | ctx: ctx 17 | } 18 | mut db := &df.ctx.db 19 | _ := db.query("create table ${id} as select * from '${filename}'") or { panic(err) } 20 | return DataFrame{ 21 | id: id 22 | ctx: ctx 23 | } 24 | } 25 | 26 | // Reads []map[string]json2.Any and store in DataFrame 27 | pub fn (mut ctx DataFrameContext) read_records(dict []map[string]json2.Any) DataFrame { 28 | id := 'tbl_${rand.ulid()}' 29 | tmp_dict := dict.map(it.str()) 30 | tmp_file := os.join_path_single(os.temp_dir(),'tmp_${rand.ulid()}.json') 31 | os.write_file(tmp_file, tmp_dict.join_lines()) or { panic(err) } 32 | _ := ctx.db.query("create table ${id} as select * from '${tmp_file}'") or { panic(err) } 33 | os.rm(tmp_file) or { panic(err) } 34 | return DataFrame{ 35 | id: id 36 | ctx: ctx 37 | } 38 | } -------------------------------------------------------------------------------- /src/models.v: -------------------------------------------------------------------------------- 1 | module vframes 2 | 3 | import rand 4 | import x.json2 5 | import vduckdb 6 | 7 | type Data = []map[string]json2.Any | []map[string]string 8 | 9 | @[params] 10 | pub struct ContextConfig { 11 | pub: 12 | location string = ":memory:" 13 | } 14 | 15 | @[params] 16 | pub struct DFConfig { 17 | pub mut: 18 | to_stdout bool = true 19 | } 20 | 21 | @[noinit] 22 | struct DataFrameContext { 23 | dpath string 24 | mut: 25 | db vduckdb.DuckDB 26 | } 27 | 28 | @[noinit] 29 | pub struct DataFrame { 30 | id string = 'tbl_${rand.ulid()}' 31 | ctx DataFrameContext 32 | pub mut: 33 | display_mode string = 'box' 34 | display_max_rows int = 100 35 | } 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /src/mutation.v: -------------------------------------------------------------------------------- 1 | module vframes 2 | 3 | import rand 4 | 5 | // Deletes a column from the DataFrame 6 | pub fn (df DataFrame) delete_column(col string) DataFrame { 7 | id := 'tbl_${rand.ulid()}' 8 | mut db := &df.ctx.db 9 | _ := db.query("create table ${id} as select * exclude(${col}) from ${df.id}") or { panic(err) } 10 | return DataFrame{ 11 | id: id 12 | ctx: df.ctx 13 | } 14 | } 15 | 16 | // Adds a new column to DataFrame where `expr` should be a valid expression (see examples) 17 | pub fn (df DataFrame) add_column(col string, expr string) DataFrame { 18 | id := 'tbl_${rand.ulid()}' 19 | mut db := &df.ctx.db 20 | _ := db.query("create table ${id} as select *, ${expr} as ${col} from ${df.id}") or { panic(err) } 21 | return DataFrame{ 22 | id: id 23 | ctx: 
df.ctx 24 | } 25 | } 26 | 27 | // Returns a subset of the DataFrame columns passed as an array 28 | pub fn (df DataFrame) subset(cols []string) DataFrame { 29 | id := 'tbl_${rand.ulid()}' 30 | mut db := &df.ctx.db 31 | _ := db.query("create table ${id} as select ${cols.join(',')} from ${df.id}") or { panic(err) } 32 | return DataFrame{ 33 | id: id 34 | ctx: df.ctx 35 | } 36 | } 37 | 38 | // Returns a subset of rows between `start` row and `end` row (both inclusive) 39 | pub fn (df DataFrame) slice(start int, end int) DataFrame { 40 | id := 'tbl_${rand.ulid()}' 41 | offset := start - 1 42 | limit := end - start + 1 43 | mut db := &df.ctx.db 44 | _ := db.query("create table ${id} as select * from ${df.id} limit ${limit} offset ${offset}") or { panic(err) } 45 | return DataFrame{ 46 | id: id 47 | ctx: df.ctx 48 | } 49 | } 50 | 51 | // Performs a group by operation where `dimensions` is an array of grouping labels, and metrics is a map of columns metrics and grouping operations (see examples) 52 | pub fn (df DataFrame) group_by(dimensions []string, metrics map[string]string) DataFrame { 53 | id := 'tbl_${rand.ulid()}' 54 | mut db := &df.ctx.db 55 | mut sets := []string{} 56 | for k,v in metrics { 57 | sets << '${v} as ${k}' 58 | } 59 | _ := db.query("create table ${id} as select ${dimensions.join(',')}, ${sets.join(',')} from ${df.id} group by ${dimensions.join(',')}") or { panic(err) } 60 | return DataFrame{ 61 | id: id 62 | ctx: df.ctx 63 | } 64 | } 65 | 66 | 67 | // Allows you to use a valid sql expression with the DataFrame. It returns a DataFrame Result 68 | // Examples: `df.query("value*2 as new_value, lower(name) as lowercase_name")` 69 | pub fn (df DataFrame) query(q string, dconf DFConfig) !DataFrame { 70 | id := 'tbl_${rand.ulid()}' 71 | mut db := &df.ctx.db 72 | _ := db.query('SELECT ${q} FROM ${df.id}') or { 73 | eprintln("Invalid query syntax: ${err.msg()}") 74 | return error("Invalid query syntax: ${err.msg()}") 75 | } 76 | return DataFrame{ 77 | id: id 78 | ctx: df.ctx 79 | } 80 | } 81 | 82 | // Adds prefix `prefix` to every column 83 | pub fn (df DataFrame) add_prefix(prefix string) DataFrame { 84 | id := 'tbl_${rand.ulid()}' 85 | mut db := &df.ctx.db 86 | _ := db.query("create table ${id} as select columns('(.*)') as '${prefix}_\\1' from ${df.id}") or { panic(err) } 87 | return DataFrame{ 88 | id: id 89 | ctx: df.ctx 90 | } 91 | } 92 | 93 | // Adds suffix `suffix` to every column 94 | pub fn (df DataFrame) add_suffix(suffix string) DataFrame { 95 | id := 'tbl_${rand.ulid()}' 96 | mut db := &df.ctx.db 97 | _ := db.query("create table ${id} as select columns('(.*)') as '\\1_${suffix}' from ${df.id}") or { panic(err) } 98 | return DataFrame{ 99 | id: id 100 | ctx: df.ctx 101 | } 102 | } 103 | 104 | @[params] 105 | struct DropOptions { 106 | axis int // 0: drop rows, 1: drop columns 107 | how string = 'any' // 'any': drop if any NA values, 'all': drop if all NA values 108 | thresh int // Minimum number of non-NA values to keep 109 | subset []string // Subset of columns to consider 110 | nullstr string = 'null' 111 | } 112 | 113 | // Drops NA rows or columns from DataFrame. If how is 'any', it drops the row/column if any NA values are present. 
114 | // If how is 'all', it drops the row/column if all NA values are present 115 | // If subset is passed, it only considers the columns passed in the subset as final columns for output 116 | pub fn (df DataFrame) dropna(do DropOptions) DataFrame { 117 | id := 'tbl_${rand.ulid()}' 118 | mut db := &df.ctx.db 119 | selected_columns := if do.subset.len > 0 { do.subset } else { df.columns() } 120 | conn := if do.how == 'any' { 'and' } else { 'or' } 121 | predicate := df.columns().map("${it} is not null").join(' ${conn} ') 122 | _ := db.query("create table ${id} as select ${selected_columns.join(',')} from ${df.id} where ${predicate}") or { panic(err) } 123 | return DataFrame{ 124 | id: id 125 | ctx: df.ctx 126 | } 127 | } -------------------------------------------------------------------------------- /src/mutation_test.v: -------------------------------------------------------------------------------- 1 | module vframes 2 | 3 | import x.json2 4 | 5 | const data = [ 6 | {"x": json2.Any(1), "y": json2.Any("a"), "z": json2.Any(-100.0) }, 7 | {"x": json2.Any(3), "y": json2.Any("c"), "z": json2.Any(300.0) } 8 | ] 9 | 10 | fn df_init(d []map[string]json2.Any) DataFrame { 11 | mut ctx := vframes.init() 12 | df := ctx.read_records(d) 13 | return df 14 | } 15 | 16 | fn test__add_prefix() { 17 | mut df := df_init(data) 18 | result := Data([ 19 | {"col_x": json2.Any(1), "col_y": json2.Any("a"), "col_z": json2.Any(-100.0) }, 20 | {"col_x": json2.Any(3), "col_y": json2.Any("c"), "col_z": json2.Any(300.0) } 21 | ]) 22 | assert df.add_prefix('col').values().str() == result.str() 23 | } 24 | 25 | fn test__add_suffix() { 26 | mut df := df_init(data) 27 | result := Data([ 28 | {"x_col": json2.Any(1), "y_col": json2.Any("a"), "z_col": json2.Any(-100.0) }, 29 | {"x_col": json2.Any(3), "y_col": json2.Any("c"), "z_col": json2.Any(300.0) } 30 | ]) 31 | assert df.add_suffix('col').values().str() == result.str() 32 | } 33 | 34 | fn test__dropna() { 35 | tdata := [ 36 | {"x_col": json2.Any(1), "y_col": json2.Any("a"), "z_col": json2.Any(-100.0) }, 37 | {"x_col": json2.Any(3), "y_col": json2.null, "z_col": json2.Any(300.0) }, 38 | {"x_col": json2.Any(5), "y_col": json2.Any("f"), "z_col": json2.null }, 39 | {"x_col": json2.Any(json2.null), "y_col": json2.null, "z_col": json2.null } 40 | ] 41 | df := df_init(tdata) 42 | // ANY 43 | result1 := Data([{"x_col": "1", "y_col": "a", "z_col": "-100" }]) 44 | // ALL 45 | result2 := Data([ 46 | {"x_col": "1", "y_col": "a", "z_col": "-100" }, 47 | {"x_col": "3", "y_col": "", "z_col": "300" }, 48 | {"x_col": "5", "y_col": "f", "z_col": "" } 49 | ]) 50 | result3 := Data([{"x_col": "1", "y_col": "a" }]) 51 | assert df.dropna().values(as_string: true) == result1 52 | assert df.dropna(how: 'all').values(as_string: true) == result2 53 | assert df.dropna(subset: ['x_col','y_col']).values(as_string: true) == result3 54 | } -------------------------------------------------------------------------------- /v.mod: -------------------------------------------------------------------------------- 1 | Module { 2 | name: 'vframes' 3 | description: 'DataFrames for V' 4 | version: '0.1.2' 5 | license: 'MIT' 6 | dependencies: ['https://github.com/rodabt/vduckdb'] 7 | } --------------------------------------------------------------------------------