├── .gitignore ├── Cargo.toml ├── README.md ├── dask_benchmark.py ├── get_wikipedia_table.py ├── native-rust-optimized ├── Cargo.toml └── src │ ├── RUST.code-workspace │ ├── main.rs │ └── utils.rs ├── native-rust ├── Cargo.toml └── src │ ├── RUST.code-workspace │ ├── main.rs │ └── utils.rs ├── pandas_benchmark.py ├── polars-eager ├── Cargo.toml └── src │ └── main.rs └── polars-lazy ├── Cargo.toml └── src └── main.rs /.gitignore: -------------------------------------------------------------------------------- 1 | Cargo.lock 2 | /.vscode 3 | /data 4 | /target 5 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | 3 | members = [ 4 | "polars-lazy", 5 | "polars-eager", 6 | "native-rust", 7 | "native-rust-optimized", 8 | ] 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dataframe-python-rust 2 | 3 | Comparing Polars vs Pandas vs Dask vs Rust native :) 4 | 5 | 6 | 7 | To run: 8 | 9 | ## Download the data at: 10 | 11 | - `train_October_9_2012.csv` at https://www.kaggle.com/c/predict-closed-questions-on-stack-overflow/data?select=train_October_9_2012.csv 12 | - wikipedia.csv at https://en.wikipedia.org/wiki/Comparison_of_programming_languages with `python get_wikipedia_table.py` 13 | 14 | ### Polars Lazy 15 | 16 | ```bash 17 | cd polars-lazy 18 | cargo build --release 19 | ../target/release/polars-lazy 20 | ``` 21 | 22 | ### Polars Eager 23 | 24 | ```bash 25 | cd polars-eager 26 | cargo build --release 27 | ../target/release/polars-eager 28 | ``` 29 | 30 | ### Native rust 31 | 32 | ```bash 33 | cd native-rust 34 | cargo build --release 35 | ../target/release/native-rust 36 | ``` 37 | 38 | The resulting CSV files will be written to the `data` directory. 
39 | -------------------------------------------------------------------------------- /dask_benchmark.py: -------------------------------------------------------------------------------- 1 | import dask.dataframe as dd 2 | import pandas as pd 3 | from io import StringIO 4 | from datetime import datetime 5 | 6 | t_initial = datetime.now() 7 | 8 | # 1. Reading 9 | PATH = "/home/peter/Documents/TEST/RUST/stack-overflow/data/train_October_9_2012.csv" 10 | 11 | PATH_DASK = "/home/peter/Documents/TEST/RUST/stack-overflow/data/SO.csv" 12 | PATH_WIKIPEDIA = ( 13 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/wikipedia.csv" 14 | ) 15 | PATH_OUTPUT = ( 16 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/python_output.csv" 17 | ) 18 | PATH_DASK_OUTPUT = ( 19 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/dask-output-*.csv" 20 | ) 21 | 22 | df = pd.read_csv( 23 | PATH, 24 | ) 25 | 26 | df = dd.from_pandas(df, npartitions=2) 27 | df_wikipedia = dd.read_csv(PATH_WIKIPEDIA) 28 | 29 | t_reading = datetime.now() 30 | 31 | # 2. Formatting date 32 | df["PostCreationDate"] = dd.to_datetime( 33 | df["PostCreationDate"], format="%m/%d/%Y %H:%M:%S" 34 | ) 35 | 36 | t_formatting = datetime.now() 37 | 38 | # 3. Formatting custom field 39 | count_words = lambda x: len(x.split(" ")) 40 | 41 | df["BodyMarkdown"] = df["BodyMarkdown"].map( 42 | count_words, 43 | ) 44 | 45 | t_count_words = datetime.now() 46 | 47 | # 4. Merging 48 | df = dd.merge( 49 | df, df_wikipedia, left_on="Tag1", right_on="Language", how="left" 50 | ).fillna(0) 51 | 52 | t_merging = datetime.now() 53 | 54 | # 4. 
Groupby 55 | groupby_series = [df["OpenStatus"]] 56 | target_column = [ 57 | "ReputationAtPostCreation", 58 | "OwnerUndeletedAnswerCountAtPostTime", 59 | "Imperative", 60 | "Object-oriented", 61 | "Functional", 62 | "Procedural", 63 | "Generic", 64 | "Reflective", 65 | "Event-driven", 66 | ] 67 | 68 | groups = df.groupby(by=groupby_series)[target_column].mean() 69 | 70 | t_groupby = datetime.now() 71 | 72 | # 5. Filtering 73 | df = df[df["Tag1"] == "rust"] 74 | 75 | t_filtering = datetime.now() 76 | 77 | # 6. Writing 78 | 79 | groups.compute().to_csv(PATH_DASK_OUTPUT) 80 | t_writing = datetime.now() 81 | 82 | # 7. printing 83 | 84 | timings = [ 85 | t_initial, 86 | t_reading, 87 | t_formatting, 88 | t_count_words, 89 | t_merging, 90 | t_groupby, 91 | t_filtering, 92 | t_writing, 93 | ] 94 | 95 | names = [ 96 | "reading", 97 | "formatting", 98 | "count_words", 99 | "merging", 100 | "groupby", 101 | "filtering", 102 | "writing", 103 | ] 104 | 105 | for i, name in enumerate(names): 106 | 107 | print(f"{name}: {(timings[i+1] - timings[i]).total_seconds() * 1000}") 108 | 109 | # df = dd.read_csv( 110 | # "partitions/*.csv", 111 | # dtype={ 112 | # "OwnerUndeletedAnswerCountAtPostTime": "float64", 113 | # "OwnerUserId": "object", 114 | # "PostId": "object", 115 | # "ReputationAtPostCreation": "float64", 116 | # "Unnamed: 0": "object", 117 | # }, 118 | # ) 119 | 120 | 121 | # df.repartition(npartitions=100).to_csv("partitiions/*.csv", index=False) 122 | 123 | # group = df.groupby(df.Tag1).ReputationAtPostCreation.sum().compute() 124 | # group.to_csv("dask_output-*.csv") 125 | # df["BodyMarkdown"] = df["BodyMarkdown"].str.replace("\r\n", " ") 126 | 127 | # df["PostCreationDate"] = pd.to_datetime( 128 | # df["PostCreationDate"], format="%m/%d/%Y %H:%M:%S" 129 | # ) 130 | # df["OwnerCreationDate"] = pd.to_datetime(df["OwnerCreationDate"], infer_datetime_format=True 131 | # ) 132 | -------------------------------------------------------------------------------- 
/get_wikipedia_table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | df = pd.read_html( 4 | "https://en.wikipedia.org/wiki/Comparison_of_programming_languages" 5 | )[1] 6 | 7 | # df = df.replace("Yes", 1) 8 | # df = df.replace("No", 0) 9 | df = df.fillna(0) 10 | 11 | columns = [ 12 | "Imperative", 13 | "Object-oriented", 14 | "Functional", 15 | "Procedural", 16 | "Generic", 17 | "Reflective", 18 | "Event-driven", 19 | ] 20 | 21 | for col in columns: 22 | index = df[col].str.contains("Yes*") == True 23 | 24 | df.loc[index, col] = 1 25 | df.loc[~index, col] = 0 26 | 27 | df["Language"] = df["Language"].str.lower() 28 | df.to_csv("data/wikipedia.csv", index=False) 29 | -------------------------------------------------------------------------------- /native-rust-optimized/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "native-rust-optimized" 3 | version = "0.1.0" 4 | authors = ["auterium"] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | csv = "1.1" 11 | serde = { version = "1", features = ["derive"] } 12 | serde_json = "1.0.63" 13 | serde_with = "1.6.4" 14 | chrono = "0.4.19" 15 | itertools = "0.10.0" 16 | rayon = "1.5" -------------------------------------------------------------------------------- /native-rust-optimized/src/RUST.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../.." 
5 | }, 6 | { 7 | "path": "../../../polars" 8 | } 9 | ], 10 | "settings": {} 11 | } -------------------------------------------------------------------------------- /native-rust-optimized/src/main.rs: -------------------------------------------------------------------------------- 1 | mod utils; 2 | 3 | use rayon::prelude::*; 4 | use std::{collections::HashMap, fs::File, time::Instant}; 5 | use utils::{NativeDataFrame, NativeDataFrameRaw}; 6 | 7 | fn use_native_rust( 8 | path: &str, 9 | path_wikipedia: &str, 10 | output_path: &str, 11 | ) -> Result<(), Box> { 12 | let start = Instant::now(); 13 | 14 | let file = File::open(path_wikipedia)?; 15 | let mut rdr_wiki = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 16 | let hash_wikipedia: HashMap = rdr_wiki 17 | .deserialize() 18 | .into_iter() 19 | .filter_map(|result: csv::Result| { 20 | result.map(|x| (x.language.clone(), x)).ok() 21 | }) 22 | .collect(); 23 | 24 | let file = File::open(path)?; 25 | let mut rdr = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 26 | let groups_hash = rdr.deserialize().into_iter().fold( 27 | HashMap::::new(), 28 | |mut hash_group, record: csv::Result| { 29 | let record = record.unwrap(); 30 | let (group, count) = hash_group.entry(record.open_status.clone()).or_default(); 31 | group.status = record.open_status; 32 | group.reputation_at_post_creation += record.reputation_at_post_creation; 33 | group.owner_undeleted_answer_count_at_post_time += 34 | record.owner_undeleted_answer_count_at_post_time; 35 | 36 | if let Some(wiki) = hash_wikipedia.get(&record.tag1) { 37 | group.imperative += wiki.imperative; 38 | group.object_oriented += wiki.object_oriented; 39 | group.functional += wiki.functional; 40 | group.procedural += wiki.procedural; 41 | group.generic += wiki.generic; 42 | group.reflective += wiki.reflective; 43 | group.event_driven += wiki.event_driven; 44 | } 45 | 46 | *count += 1; 47 | 48 | hash_group 49 | }, 50 | ); 51 | 52 | let groups = 
groups_hash.into_iter().map(|(_, (mut group, count))| { 53 | group.reputation_at_post_creation /= count as f64; 54 | group.owner_undeleted_answer_count_at_post_time /= count as f64; 55 | group.imperative /= count as f64; 56 | group.object_oriented /= count as f64; 57 | group.functional /= count as f64; 58 | group.procedural /= count as f64; 59 | group.generic /= count as f64; 60 | group.reflective /= count as f64; 61 | group.event_driven /= count as f64; 62 | 63 | group 64 | }); 65 | 66 | let mut wtr = csv::Writer::from_path(output_path)?; 67 | for record in groups { 68 | wtr.serialize(record)?; 69 | } 70 | 71 | println!("Overall time taken: {:?}", start.elapsed()); 72 | 73 | Ok(()) 74 | } 75 | 76 | fn use_native_parallel_rust( 77 | path: &str, 78 | path_wikipedia: &str, 79 | output_path: &str, 80 | ) -> Result<(), Box> { 81 | let start = Instant::now(); 82 | 83 | let file = File::open(path_wikipedia)?; 84 | let mut rdr_wiki = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 85 | let hash_wikipedia: HashMap = rdr_wiki 86 | .deserialize() 87 | .into_iter() 88 | .filter_map(|result: csv::Result| { 89 | result.map(|x| (x.language.clone(), x)).ok() 90 | }) 91 | .collect(); 92 | 93 | let file = File::open(path)?; 94 | let mut rdr = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 95 | 96 | let mut buffer = Vec::::with_capacity(1000); 97 | let mut groups_hash: HashMap = HashMap::new(); 98 | 99 | for record in rdr.deserialize().filter_map(Result::ok) { 100 | buffer.push(record); 101 | 102 | if buffer.len() < 1000 { 103 | continue; 104 | } 105 | 106 | process_buffer(&mut groups_hash, &mut buffer, &hash_wikipedia); 107 | } 108 | 109 | process_buffer(&mut groups_hash, &mut buffer, &hash_wikipedia); 110 | 111 | let groups = groups_hash.into_iter().map(|(_, (mut group, count))| { 112 | group.reputation_at_post_creation /= count as f64; 113 | group.owner_undeleted_answer_count_at_post_time /= count as f64; 114 | group.imperative /= count as f64; 115 | 
group.object_oriented /= count as f64; 116 | group.functional /= count as f64; 117 | group.procedural /= count as f64; 118 | group.generic /= count as f64; 119 | group.reflective /= count as f64; 120 | group.event_driven /= count as f64; 121 | 122 | group 123 | }); 124 | 125 | let mut wtr = csv::Writer::from_path(output_path)?; 126 | for record in groups { 127 | wtr.serialize(record)?; 128 | } 129 | 130 | println!("Overall time taken: {:?}", start.elapsed()); 131 | 132 | Ok(()) 133 | } 134 | 135 | fn process_buffer( 136 | groups_hash: &mut HashMap, 137 | buffer: &mut Vec, 138 | hash_wikipedia: &HashMap, 139 | ) { 140 | let data = buffer 141 | .drain(..) 142 | .collect::>() 143 | .into_par_iter() 144 | .map(NativeDataFrame::from) 145 | .collect::>(); 146 | 147 | for record in data { 148 | let (group, count) = groups_hash.entry(record.open_status.clone()).or_default(); 149 | group.status = record.open_status; 150 | group.reputation_at_post_creation += record.reputation_at_post_creation; 151 | group.owner_undeleted_answer_count_at_post_time += 152 | record.owner_undeleted_answer_count_at_post_time; 153 | 154 | if let Some(wiki) = hash_wikipedia.get(&record.tag1) { 155 | group.imperative += wiki.imperative; 156 | group.object_oriented += wiki.object_oriented; 157 | group.functional += wiki.functional; 158 | group.procedural += wiki.procedural; 159 | group.generic += wiki.generic; 160 | group.reflective += wiki.reflective; 161 | group.event_driven += wiki.event_driven; 162 | } 163 | 164 | *count += 1; 165 | } 166 | } 167 | 168 | fn main() { 169 | let path = 170 | "/home/peter/Documents/TEST/RUST/dataframe-python-rust/data/train_October_9_2012.csv"; 171 | let output_native_rust_path = "native_rust_optimized_output.csv"; 172 | let path_wikipedia = "wikipedia.csv"; 173 | 174 | use_native_parallel_rust(path, path_wikipedia, output_native_rust_path) 175 | .expect("Test of polar oriented result."); 176 | } 177 | 
-------------------------------------------------------------------------------- /native-rust-optimized/src/utils.rs: -------------------------------------------------------------------------------- 1 | use chrono::NaiveDateTime; 2 | use serde::{Deserialize, de::Deserializer, Serialize}; 3 | 4 | #[derive(Debug, Deserialize)] 5 | #[serde(rename_all = "PascalCase")] 6 | pub struct NativeDataFrameRaw { 7 | pub post_creation_date: String, 8 | pub reputation_at_post_creation: f64, 9 | pub body_markdown: String, 10 | pub tag1: String, 11 | pub open_status: String, 12 | pub owner_undeleted_answer_count_at_post_time: f64, 13 | } 14 | 15 | impl From for NativeDataFrame { 16 | fn from(item: NativeDataFrameRaw) -> Self { 17 | Self { 18 | post_creation_date: NaiveDateTime::parse_from_str(&item.post_creation_date, "%m/%d/%Y %H:%M:%S").unwrap(), 19 | reputation_at_post_creation: item.reputation_at_post_creation, 20 | count_words: item.body_markdown.split(' ').count() as f64, 21 | tag1: item.tag1, 22 | open_status: item.open_status, 23 | owner_undeleted_answer_count_at_post_time: item.owner_undeleted_answer_count_at_post_time, 24 | } 25 | } 26 | } 27 | 28 | #[derive(Debug, Deserialize)] 29 | #[serde(rename_all = "PascalCase")] 30 | pub struct NativeDataFrame { 31 | #[serde(deserialize_with = "datetime_parser")] 32 | pub post_creation_date: NaiveDateTime, 33 | pub reputation_at_post_creation: f64, 34 | #[serde(deserialize_with = "word_counter", rename = "BodyMarkdown")] 35 | pub count_words: f64, 36 | pub tag1: String, 37 | pub open_status: String, 38 | pub owner_undeleted_answer_count_at_post_time: f64, 39 | } 40 | 41 | fn datetime_parser<'de, D: Deserializer<'de>>(deserializer: D) -> Result { 42 | let raw: &str = Deserialize::deserialize(deserializer)?; 43 | 44 | NaiveDateTime::parse_from_str(raw, "%m/%d/%Y %H:%M:%S") 45 | .map_err(|x| serde::de::Error::custom(x.to_string())) 46 | } 47 | 48 | fn word_counter<'de, D: Deserializer<'de>>(deserializer: D) -> Result { 49 | let raw: 
String = Deserialize::deserialize(deserializer)?; 50 | 51 | Ok(raw.split(' ').count() as f64) 52 | } 53 | 54 | #[derive(Debug, Clone, Deserialize)] 55 | #[serde(rename_all = "PascalCase")] 56 | pub struct WikiDataFrame { 57 | pub language: String, 58 | pub procedural: f64, 59 | #[serde(rename = "Object-oriented")] 60 | pub object_oriented: f64, 61 | pub imperative: f64, 62 | pub functional: f64, 63 | pub generic: f64, 64 | pub reflective: f64, 65 | #[serde(rename = "Event-driven")] 66 | pub event_driven: f64, 67 | } 68 | 69 | #[derive(Debug, Default, Clone, Deserialize, Serialize)] 70 | #[serde(rename_all = "PascalCase")] 71 | pub struct GroupBy { 72 | pub status: String, 73 | pub reputation_at_post_creation: f64, 74 | pub owner_undeleted_answer_count_at_post_time: f64, 75 | pub imperative: f64, 76 | pub object_oriented: f64, 77 | pub functional: f64, 78 | pub procedural: f64, 79 | pub generic: f64, 80 | pub reflective: f64, 81 | pub event_driven: f64, 82 | } 83 | -------------------------------------------------------------------------------- /native-rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "native-rust" 3 | version = "0.1.0" 4 | authors = ["xavier tao "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | csv = "1.1" 11 | serde = { version = "1", features = ["derive"] } 12 | serde_json = "1.0.63" 13 | serde_with = "1.6.4" 14 | chrono = "0.4.19" 15 | itertools = "0.10.0" 16 | rayon = "1.5" -------------------------------------------------------------------------------- /native-rust/src/RUST.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../.." 
5 | }, 6 | { 7 | "path": "../../../polars" 8 | } 9 | ], 10 | "settings": {} 11 | } -------------------------------------------------------------------------------- /native-rust/src/main.rs: -------------------------------------------------------------------------------- 1 | mod utils; 2 | use chrono::DateTime; 3 | use itertools::Itertools; 4 | use rayon::prelude::*; 5 | use std::collections::HashMap; 6 | use std::collections::HashSet; 7 | use std::fs::File; 8 | use std::time::Instant; 9 | 10 | fn use_native_rust( 11 | path: &str, 12 | path_wikipedia: &str, 13 | output_path: &str, 14 | ) -> std::result::Result<(), Box> { 15 | let t_initial = Instant::now(); 16 | 17 | let file = File::open(path)?; 18 | 19 | let mut rdr = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 20 | let mut records: Vec = rdr 21 | .deserialize() 22 | .into_iter() 23 | .filter_map(|result| match result { 24 | Ok(rec) => rec, 25 | Err(e) => None, 26 | }) 27 | .collect(); 28 | 29 | let file = File::open(path_wikipedia)?; 30 | let mut rdr_wiki = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 31 | let records_wikipedia: Vec = rdr_wiki 32 | .deserialize() 33 | .into_iter() 34 | .filter_map(|result| match result { 35 | Ok(rec) => rec, 36 | Err(e) => None, 37 | }) 38 | .collect(); 39 | 40 | let t_reading = Instant::now(); 41 | 42 | // 1. Apply Format Date 43 | let fmt = "%m/%d/%Y %H:%M:%S"; 44 | 45 | records 46 | .iter_mut() 47 | .for_each(|record: &mut utils::NativeDataFrame| { 48 | record.PostCreationDatetime = 49 | match DateTime::parse_from_str(record.PostCreationDate.as_ref().unwrap(), fmt) { 50 | Ok(dates) => Some(dates), 51 | Err(_) => None, 52 | } 53 | }); 54 | 55 | let t_formatting = Instant::now(); 56 | 57 | // 2. 
Apply Custom Formatting 58 | records 59 | .iter_mut() 60 | .for_each(|record: &mut utils::NativeDataFrame| { 61 | record.CountWords = 62 | Some(record.BodyMarkdown.as_ref().unwrap().split(' ').count() as f64) 63 | }); 64 | 65 | let t_count_words = Instant::now(); 66 | let hash_wikipedia: &HashMap<&String, &utils::WikiDataFrame> = &records_wikipedia 67 | .iter() 68 | .map(|record| (record.Language.as_ref().unwrap(), record)) 69 | .collect(); 70 | 71 | records.iter_mut().for_each(|record| { 72 | record.Wikipedia = match hash_wikipedia.get(&record.Tag1.as_ref().unwrap()) { 73 | Some(wikipedia) => Some(wikipedia.clone().clone()), 74 | None => None, 75 | } 76 | }); 77 | 78 | let t_merging = Instant::now(); 79 | 80 | let groups_hash: HashMap = records 81 | .par_iter() 82 | .fold( 83 | || HashMap::with_capacity(10), // || HashMap::new() 84 | |mut hash_group: HashMap, record| { 85 | let group: utils::GroupBy = if let Some(wiki) = &record.Wikipedia { 86 | utils::GroupBy { 87 | status: record.OpenStatus.as_ref().unwrap().to_string(), 88 | ReputationAtPostCreation: record.ReputationAtPostCreation.unwrap(), 89 | OwnerUndeletedAnswerCountAtPostTime: record 90 | .OwnerUndeletedAnswerCountAtPostTime 91 | .unwrap(), 92 | Imperative: wiki.Imperative.unwrap(), 93 | ObjectOriented: wiki.ObjectOriented.unwrap(), 94 | Functional: wiki.Functional.unwrap(), 95 | Procedural: wiki.Procedural.unwrap(), 96 | Generic: wiki.Generic.unwrap(), 97 | Reflective: wiki.Reflective.unwrap(), 98 | EventDriven: wiki.EventDriven.unwrap(), 99 | } 100 | } else { 101 | utils::GroupBy { 102 | status: record.OpenStatus.as_ref().unwrap().to_string(), 103 | ReputationAtPostCreation: record.ReputationAtPostCreation.unwrap(), 104 | OwnerUndeletedAnswerCountAtPostTime: record 105 | .OwnerUndeletedAnswerCountAtPostTime 106 | .unwrap(), 107 | ..Default::default() 108 | } 109 | }; 110 | if let Some((previous, count)) = hash_group.get_mut(&group.status.to_string()) { 111 | *previous = previous.clone() + group; 112 | 
*count += 1; 113 | } else { 114 | hash_group.insert(group.status.to_string(), (group, 1)); 115 | }; 116 | hash_group 117 | }, 118 | ) 119 | .reduce( 120 | || HashMap::new(), 121 | |prev, other| { 122 | let set1: HashSet = prev.keys().cloned().collect(); 123 | let set2: HashSet = other.keys().cloned().collect(); 124 | let unions: HashSet = set1.union(&set2).cloned().collect(); 125 | let mut map = HashMap::new(); 126 | for key in unions.iter() { 127 | map.insert( 128 | key.to_string(), 129 | match (prev.get(key), other.get(key)) { 130 | (Some((previous, count_prev)), Some((group, count_other))) => { 131 | (previous.clone() + group.clone(), count_prev + count_other) 132 | } 133 | (Some(previous), None) => previous.clone(), 134 | (None, Some(other)) => other.clone(), 135 | (None, None) => (utils::GroupBy::new(), 0), 136 | }, 137 | ); 138 | } 139 | map 140 | }, 141 | ); 142 | 143 | let groups: Vec = groups_hash 144 | .iter() 145 | .map(|(_, (group, count))| utils::GroupBy { 146 | status: group.status.to_string(), 147 | ReputationAtPostCreation: group.ReputationAtPostCreation / count.clone() as f64, 148 | OwnerUndeletedAnswerCountAtPostTime: group.OwnerUndeletedAnswerCountAtPostTime 149 | / count.clone() as f64, 150 | Imperative: group.Imperative / count.clone() as f64, 151 | ObjectOriented: group.ObjectOriented / count.clone() as f64, 152 | Functional: group.Functional / count.clone() as f64, 153 | Procedural: group.Procedural / count.clone() as f64, 154 | Generic: group.Generic / count.clone() as f64, 155 | Reflective: group.Reflective / count.clone() as f64, 156 | EventDriven: group.EventDriven / count.clone() as f64, 157 | }) 158 | .collect(); 159 | 160 | let t_groupby = Instant::now(); 161 | 162 | let mut wtr = csv::Writer::from_path(output_path)?; 163 | 164 | for record in groups { 165 | wtr.serialize(record)?; 166 | } 167 | 168 | let t_writing = Instant::now(); 169 | 170 | let _ = records 171 | .iter() 172 | .filter(|record| record.Tag1 == 
Some("rust".to_string())) 173 | .collect::>(); 174 | 175 | let t_filtering = Instant::now(); 176 | 177 | let timings = [ 178 | t_initial, 179 | t_reading, 180 | t_formatting, 181 | t_count_words, 182 | t_merging, 183 | t_groupby, 184 | t_writing, 185 | t_filtering, 186 | ]; 187 | let names = [ 188 | "reading", 189 | "formatting", 190 | "count_words", 191 | "merging", 192 | "groupby", 193 | "writing", 194 | "filtering", 195 | ]; 196 | for (i, name) in names.iter().enumerate() { 197 | println!("{}: {:#?}", name, (timings[i + 1] - timings[i]).as_millis()); 198 | } 199 | 200 | Ok(()) 201 | } 202 | 203 | fn main() { 204 | let path = "/home/peter/Documents/TEST/RUST/stack-overflow/data/train_October_9_2012.csv"; 205 | let output_native_rust_path = 206 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/native_rust_output.csv"; 207 | let path_wikipedia = "/home/peter/Documents/BLOG/dataframe-python-rust/data/wikipedia.csv"; 208 | 209 | use_native_rust(path, path_wikipedia, output_native_rust_path) 210 | .expect("Test of polar oriented result."); 211 | } 212 | -------------------------------------------------------------------------------- /native-rust/src/utils.rs: -------------------------------------------------------------------------------- 1 | use chrono::DateTime; 2 | use serde::{Deserialize, Serialize}; 3 | use serde_with::skip_serializing_none; 4 | use std::collections::HashMap; 5 | use std::ops::Add; 6 | 7 | #[skip_serializing_none] 8 | #[derive(Debug, Deserialize, Serialize)] 9 | pub struct NativeDataFrame { 10 | #[serialize_always] 11 | pub OwnerUserId: Option, 12 | #[serialize_always] 13 | pub PostClosedDate: Option, 14 | #[serialize_always] 15 | pub PostCreationDate: Option, 16 | #[serialize_always] 17 | pub PostId: Option, 18 | #[serialize_always] 19 | pub ReputationAtPostCreation: Option, 20 | #[serialize_always] 21 | pub BodyMarkdown: Option, 22 | #[serialize_always] 23 | pub Tag4: Option, 24 | #[serialize_always] 25 | pub Tag1: Option, 26 | 
#[serialize_always] 27 | pub OwnerCreationDate: Option, 28 | #[serialize_always] 29 | pub Tag5: Option, 30 | #[serialize_always] 31 | pub Tag3: Option, 32 | #[serialize_always] 33 | pub OpenStatus: Option, 34 | #[serialize_always] 35 | pub Tag2: Option, 36 | #[serialize_always] 37 | pub OwnerUndeletedAnswerCountAtPostTime: Option, 38 | #[serialize_always] 39 | pub Title: Option, 40 | #[serde(skip)] 41 | pub PostCreationDatetime: Option>, 42 | #[serialize_always] 43 | pub CountWords: Option, 44 | #[serde(skip)] 45 | pub Wikipedia: Option, 46 | } 47 | 48 | #[skip_serializing_none] 49 | #[derive(Debug, Clone, Deserialize, Serialize)] 50 | pub struct WikiDataFrame { 51 | #[serialize_always] 52 | pub Language: Option, 53 | #[serialize_always] 54 | pub Procedural: Option, 55 | #[serialize_always] 56 | #[serde(rename(serialize = "Object-oriented", deserialize = "Object-oriented"))] 57 | pub ObjectOriented: Option, 58 | #[serialize_always] 59 | pub Imperative: Option, 60 | #[serialize_always] 61 | pub Functional: Option, 62 | #[serialize_always] 63 | pub Generic: Option, 64 | #[serialize_always] 65 | pub Reflective: Option, 66 | #[serialize_always] 67 | #[serde(rename(serialize = "Event-driven", deserialize = "Event-driven"))] 68 | pub EventDriven: Option, 69 | #[serialize_always] 70 | #[serde(rename(serialize = "Other paradigm(s)", deserialize = "Other paradigm(s)"))] 71 | pub OtherParadigm: Option, 72 | #[serialize_always] 73 | #[serde(rename(serialize = "Intended use", deserialize = "Intended use"))] 74 | pub IntendedUse: Option, 75 | #[serialize_always] 76 | #[serde(rename(serialize = "Standardized?", deserialize = "Standardized?"))] 77 | pub Standardized: Option, 78 | } 79 | 80 | pub type Record = HashMap; 81 | 82 | #[derive(Debug, Clone, Deserialize, Serialize)] 83 | pub struct GroupBy { 84 | pub status: String, 85 | pub ReputationAtPostCreation: f64, 86 | pub OwnerUndeletedAnswerCountAtPostTime: f64, 87 | pub Imperative: f64, 88 | pub ObjectOriented: f64, 89 | pub 
Functional: f64, 90 | pub Procedural: f64, 91 | pub Generic: f64, 92 | pub Reflective: f64, 93 | pub EventDriven: f64, 94 | } 95 | 96 | impl Default for GroupBy { 97 | fn default() -> GroupBy { 98 | GroupBy { 99 | status: "".to_string(), 100 | ReputationAtPostCreation: 0., 101 | OwnerUndeletedAnswerCountAtPostTime: 0., 102 | Imperative: 0., 103 | ObjectOriented: 0., 104 | Functional: 0., 105 | Procedural: 0., 106 | Generic: 0., 107 | Reflective: 0., 108 | EventDriven: 0., 109 | } 110 | } 111 | } 112 | 113 | impl GroupBy { 114 | pub fn new() -> GroupBy { 115 | GroupBy { 116 | status: "".to_string(), 117 | ReputationAtPostCreation: 0., 118 | OwnerUndeletedAnswerCountAtPostTime: 0., 119 | Imperative: 0., 120 | ObjectOriented: 0., 121 | Functional: 0., 122 | Procedural: 0., 123 | Generic: 0., 124 | Reflective: 0., 125 | EventDriven: 0., 126 | } 127 | } 128 | } 129 | 130 | impl Add for GroupBy { 131 | type Output = GroupBy; 132 | fn add(self, other: GroupBy) -> GroupBy { 133 | GroupBy { 134 | status: self.status, 135 | ReputationAtPostCreation: self.ReputationAtPostCreation 136 | + other.ReputationAtPostCreation, 137 | OwnerUndeletedAnswerCountAtPostTime: self.OwnerUndeletedAnswerCountAtPostTime 138 | + other.OwnerUndeletedAnswerCountAtPostTime, 139 | Imperative: self.Imperative + other.Imperative, 140 | ObjectOriented: self.ObjectOriented + other.ObjectOriented, 141 | Functional: self.Functional + other.Functional, 142 | Procedural: self.Procedural + other.Procedural, 143 | Generic: self.Generic + other.Generic, 144 | Reflective: self.Reflective + other.Reflective, 145 | EventDriven: self.EventDriven + other.EventDriven, 146 | } 147 | } 148 | } 149 | 150 | pub fn inspect(path: &str) { 151 | let mut record: Record = HashMap::new(); 152 | 153 | let mut rdr = csv::Reader::from_path(path).unwrap(); 154 | 155 | for result in rdr.deserialize() { 156 | match result { 157 | Ok(rec) => { 158 | record = rec; 159 | break; 160 | } 161 | Err(_e) => (), 162 | }; 163 | } 164 | // 
Print Struct 165 | println!("#[skip_serializing_none]"); 166 | println!("#[derive(Debug, Deserialize, Serialize)]"); 167 | println!("struct lib::DataFrame {{"); 168 | for (key, value) in &record { 169 | println!(" #[serialize_always]"); 170 | 171 | match value.parse::() { 172 | Ok(_n) => { 173 | // println!(" #[serde(deserialize_with = \"empty_string_as_none\")]"); 174 | println!(" {}: Option,", key); 175 | continue; 176 | } 177 | Err(_e) => (), 178 | } 179 | match value.parse::() { 180 | Ok(_n) => { 181 | // println!(" #[serde(deserialize_with = \"empty_string_as_none\")]"); 182 | println!(" {}: Option,", key); 183 | continue; 184 | } 185 | Err(_e) => (), 186 | } 187 | println!(" {}: Option,", key); 188 | } 189 | println!("}}"); 190 | } 191 | -------------------------------------------------------------------------------- /pandas_benchmark.py: -------------------------------------------------------------------------------- 1 | # import dask.dataframe as dd 2 | import pandas as pd 3 | from io import StringIO 4 | from datetime import datetime 5 | 6 | t_initial = datetime.now() 7 | 8 | # 1. Reading 9 | PATH = "/home/peter/Documents/TEST/RUST/stack-overflow/data/train_October_9_2012.csv" 10 | PATH_WIKIPEDIA = ( 11 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/wikipedia.csv" 12 | ) 13 | PATH_OUTPUT = ( 14 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/python_output.csv" 15 | ) 16 | 17 | df = pd.read_csv(PATH) 18 | df_wikipedia = pd.read_csv(PATH_WIKIPEDIA) 19 | 20 | t_reading = datetime.now() 21 | 22 | # 2. Formatting date 23 | df["PostCreationDate"] = pd.to_datetime( 24 | df["PostCreationDate"], format="%m/%d/%Y %H:%M:%S" 25 | ) 26 | 27 | t_formatting = datetime.now() 28 | 29 | # 3. Formatting custom field 30 | count_words = lambda x: len(x.split(" ")) 31 | 32 | df["BodyMarkdown"] = df["BodyMarkdown"].map(count_words) 33 | 34 | t_count_words = datetime.now() 35 | 36 | # 4. 
Merging 37 | df = pd.merge( 38 | df, df_wikipedia, left_on="Tag1", right_on="Language", how="left" 39 | ).fillna(0) 40 | 41 | t_merging = datetime.now() 42 | 43 | # 4. Groupby 44 | groupby_series = [df["OpenStatus"]] 45 | target_column = [ 46 | "ReputationAtPostCreation", 47 | "OwnerUndeletedAnswerCountAtPostTime", 48 | "Imperative", 49 | "Object-oriented", 50 | "Functional", 51 | "Procedural", 52 | "Generic", 53 | "Reflective", 54 | "Event-driven", 55 | ] 56 | 57 | groups = df.groupby(by=groupby_series)[target_column].mean() 58 | 59 | t_groupby = datetime.now() 60 | 61 | # 5. Filtering 62 | df = df[df["Tag1"] == "rust"] 63 | 64 | t_filtering = datetime.now() 65 | 66 | # 6. Writing 67 | 68 | groups.to_csv(PATH_OUTPUT) 69 | t_writing = datetime.now() 70 | 71 | # 7. printing 72 | 73 | timings = [ 74 | t_initial, 75 | t_reading, 76 | t_formatting, 77 | t_count_words, 78 | t_merging, 79 | t_groupby, 80 | t_filtering, 81 | t_writing, 82 | ] 83 | 84 | names = [ 85 | "reading", 86 | "formatting", 87 | "count_words", 88 | "merging", 89 | "groupby", 90 | "filtering", 91 | "writing", 92 | ] 93 | 94 | for i, name in enumerate(names): 95 | 96 | print(f"{name}: {(timings[i+1] - timings[i]).total_seconds() * 1000}") 97 | 98 | # df = dd.read_csv( 99 | # "partitions/*.csv", 100 | # dtype={ 101 | # "OwnerUndeletedAnswerCountAtPostTime": "float64", 102 | # "OwnerUserId": "object", 103 | # "PostId": "object", 104 | # "ReputationAtPostCreation": "float64", 105 | # "Unnamed: 0": "object", 106 | # }, 107 | # ) 108 | 109 | 110 | # df.repartition(npartitions=100).to_csv("partitiions/*.csv", index=False) 111 | 112 | # group = df.groupby(df.Tag1).ReputationAtPostCreation.sum().compute() 113 | # group.to_csv("dask_output-*.csv") 114 | # df["BodyMarkdown"] = df["BodyMarkdown"].str.replace("\r\n", " ") 115 | 116 | # df["PostCreationDate"] = pd.to_datetime( 117 | # df["PostCreationDate"], format="%m/%d/%Y %H:%M:%S" 118 | # ) 119 | # df["OwnerCreationDate"] = 
pd.to_datetime(df["OwnerCreationDate"], infer_datetime_format=True 120 | # ) 121 | -------------------------------------------------------------------------------- /polars-eager/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "polars-eager" 3 | version = "0.1.0" 4 | authors = ["xavier tao "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | polars = { git = "https://github.com/ritchie46/polars", features = [], version = "0.15.1"} 11 | rayon = "1.5.1" -------------------------------------------------------------------------------- /polars-eager/src/main.rs: -------------------------------------------------------------------------------- 1 | use polars::prelude::*; 2 | use rayon::prelude::*; 3 | use std::fs::File; 4 | use std::time::Instant; 5 | 6 | fn str_to_date(dates: &Series) -> std::result::Result { 7 | let fmt = Some("%m/%d/%Y %H:%M:%S"); 8 | 9 | Ok(dates.utf8()?.as_date64(fmt)?.into_series()) 10 | } 11 | 12 | fn count_words(column: &Series) -> std::result::Result { 13 | Ok(column 14 | .utf8()? 15 | .into_iter() 16 | .map(|opt_name: Option<&str>| opt_name.map(|name: &str| name.split(' ').count() as f64)) 17 | .collect::() 18 | .into_series()) 19 | } 20 | 21 | fn use_polars( 22 | path: &str, 23 | path_wikipedia: &str, 24 | output_path: &str, 25 | ) -> std::result::Result<(), Box> { 26 | let t_initial = Instant::now(); 27 | 28 | let target_column = vec![ 29 | "Language", 30 | "Imperative", 31 | "Object-oriented", 32 | "Functional", 33 | "Procedural", 34 | "Generic", 35 | "Reflective", 36 | "Event-driven", 37 | ]; 38 | 39 | let mut df = CsvReader::from_path(path)? 40 | .with_encoding(CsvEncoding::LossyUtf8) 41 | // .with_n_threads(Some(1)) 42 | .has_header(true) 43 | .finish()?; 44 | let df_wikipedia = CsvReader::from_path(path_wikipedia)? 
45 | .with_encoding(CsvEncoding::LossyUtf8) 46 | // .with_n_threads(Some(1)) 47 | .has_header(true) 48 | .finish()? 49 | .select(target_column)?; 50 | 51 | let t_reading = Instant::now(); 52 | 53 | // 1. Apply Format Date 54 | df.may_apply("PostCreationDate", str_to_date)?; 55 | 56 | let t_formatting = Instant::now(); 57 | 58 | // 2. Apply Custom Formatting 59 | df.may_apply("BodyMarkdown", count_words)?; 60 | 61 | let t_count_words = Instant::now(); 62 | 63 | df = df 64 | .join(&df_wikipedia, "Tag1", "Language", JoinType::Left)? 65 | .fill_none(FillNoneStrategy::Min)?; 66 | 67 | let t_merging = Instant::now(); 68 | 69 | // 3. groupby 70 | let groupby_series = vec![df.column("OpenStatus")?.clone()]; 71 | 72 | let target_column = vec![ 73 | "ReputationAtPostCreation", 74 | "OwnerUndeletedAnswerCountAtPostTime", 75 | "Imperative", 76 | "Object-oriented", 77 | "Functional", 78 | "Procedural", 79 | "Generic", 80 | "Reflective", 81 | "Event-driven", 82 | ]; 83 | 84 | let groups = df 85 | .groupby_with_series(groupby_series, false)? 86 | .select(target_column) 87 | .mean()?; 88 | 89 | let t_groupby = Instant::now(); 90 | 91 | // 4. Filtering 92 | let values = df.column("Tag1")?; 93 | let mask = values.eq("rust"); 94 | let _ = df.filter(&mask)?; 95 | 96 | let t_filtering = Instant::now(); 97 | 98 | let mut buffer = File::create(output_path)?; 99 | 100 | CsvWriter::new(&mut buffer) 101 | .has_headers(true) 102 | .finish(&mut groups.sort("OpenStatus", false)?) 
103 | .expect("csv written"); 104 | let t_writing = Instant::now(); 105 | 106 | let timings = [ 107 | t_initial, 108 | t_reading, 109 | t_formatting, 110 | t_count_words, 111 | t_merging, 112 | t_groupby, 113 | t_filtering, 114 | t_writing, 115 | ]; 116 | 117 | let names = [ 118 | "reading", 119 | "formatting", 120 | "count_words", 121 | "merging", 122 | "groupby", 123 | "filtering", 124 | "writing", 125 | ]; 126 | 127 | for (i, name) in names.iter().enumerate() { 128 | println!("{}: {:#?}", name, (timings[i + 1] - timings[i]).as_millis()); 129 | } 130 | 131 | Ok(()) 132 | } 133 | 134 | fn main() { 135 | let path = 136 | "/home/peter/Documents/TEST/RUST/dataframe-python-rust/data/train_October_9_2012.csv"; 137 | let output_polars_eager_path = 138 | "/home/peter/Documents/TEST/RUST/dataframe-python-rust/data/polars_eager_output.csv"; 139 | let path_wikipedia = "/home/peter/Documents/BLOG/dataframe-python-rust/data/wikipedia.csv"; 140 | 141 | use_polars(path, path_wikipedia, output_polars_eager_path).expect("Polar eager failed."); 142 | } 143 | -------------------------------------------------------------------------------- /polars-lazy/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "polars-lazy" 3 | version = "0.1.0" 4 | authors = ["xavier tao "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | polars-lazy = { git = "https://github.com/ritchie46/polars", features = [ "csv-file"], version = "0.15.1"} 11 | polars = { git = "https://github.com/ritchie46/polars", features = [ "csv-file"], version = "0.15.1"} 12 | # remove simd for stable cargo 13 | rayon = "1.5.1" -------------------------------------------------------------------------------- /polars-lazy/src/main.rs: -------------------------------------------------------------------------------- 1 | use polars::prelude::*; 2 | use 
polars_lazy::dsl::col; 3 | use polars_lazy::prelude::*; 4 | use rayon::prelude::*; 5 | use std::fs::File; 6 | use std::time::Instant; 7 | 8 | fn lazy_str_to_date(dates: Series) -> std::result::Result { 9 | let fmt = Some("%m/%d/%Y %H:%M:%S"); 10 | 11 | Ok(dates.utf8()?.as_date64(fmt)?.into_series()) 12 | } 13 | 14 | fn lazy_date_to_hour(dates: Series) -> std::result::Result { 15 | Ok(dates.date64()?.hour().into_series()) 16 | } 17 | 18 | fn lazy_count_words(dates: Series) -> std::result::Result { 19 | Ok(dates 20 | .utf8()? 21 | .into_iter() 22 | .map(|opt_name: Option<&str>| opt_name.map(|name: &str| name.split(' ').count() as f64)) 23 | .collect::() 24 | .into_series()) 25 | } 26 | 27 | fn use_lazy_polars( 28 | path: &str, 29 | path_wikipedia: &str, 30 | output_path: &str, 31 | ) -> std::result::Result<(), Box> { 32 | let t_initial = Instant::now(); 33 | let df_wikipedia = LazyCsvReader::new(path_wikipedia.to_string()) 34 | // .with_encoding(CsvEncoding::LossyUtf8) 35 | // .with_n_threads(Some(1)) 36 | .has_header(true) 37 | .finish(); 38 | 39 | let mut df = LazyCsvReader::new(path.to_string()) 40 | // .with_encoding(CsvEncoding::LossyUtf8) 41 | // .with_n_threads(Some(1)) 42 | .has_header(true) 43 | .finish() 44 | .with_columns(vec![ 45 | col("PostCreationDate") 46 | .map(lazy_str_to_date, GetOutput::from_type(DataType::Date64)) 47 | .map(lazy_date_to_hour, GetOutput::from_type(DataType::Date64)) 48 | .alias("hour"), 49 | col("BodyMarkdown") 50 | .map(lazy_count_words, GetOutput::from_type(DataType::UInt64)) 51 | .alias("newBodyMarkdown"), 52 | ]) 53 | .inner_join(df_wikipedia, col("Tag1"), col("Language")) 54 | .groupby(vec![col("OpenStatus")]) 55 | .agg(vec![ 56 | col("ReputationAtPostCreation").mean(), 57 | col("OwnerUndeletedAnswerCountAtPostTime").mean(), 58 | col("Imperative").mean(), 59 | col("Object-oriented").mean(), 60 | col("Functional").mean(), 61 | col("Procedural").mean(), 62 | col("Generic").mean(), 63 | col("Reflective").mean(), 64 | 
col("Event-driven").mean(), 65 | ]) 66 | .select(&[ 67 | col("OpenStatus"), 68 | col("ReputationAtPostCreation_mean"), 69 | col("OwnerUndeletedAnswerCountAtPostTime_mean"), 70 | col("Imperative_mean"), 71 | col("Object-oriented_mean"), 72 | col("Functional_mean"), 73 | col("Procedural_mean"), 74 | col("Generic_mean"), 75 | col("Reflective_mean"), 76 | col("Event-driven_mean"), 77 | ]) 78 | .sort("OpenStatus", false) 79 | .collect()?; 80 | 81 | let mut buffer = File::create(output_path)?; 82 | 83 | CsvWriter::new(&mut buffer) 84 | .has_headers(true) 85 | .finish(&mut df) 86 | .expect("csv written"); 87 | 88 | let t_writing = Instant::now(); 89 | println!("Read to write: {}", (t_writing - t_initial).as_millis()); 90 | Ok(()) 91 | } 92 | 93 | fn main() { 94 | let path = 95 | "/home/peter/Documents/TEST/RUST/dataframe-python-rust/data/train_October_9_2012.csv"; 96 | let output_polars_lazy_path = 97 | "/home/peter/Documents/TEST/RUST/dataframe-python-rust/data/polars_lazy_output.csv"; 98 | let path_wikipedia = "/home/peter/Documents/BLOG/dataframe-python-rust/data/wikipedia.csv"; 99 | 100 | use_lazy_polars(path, path_wikipedia, output_polars_lazy_path) 101 | .expect("Test of polar lazy failed."); 102 | } 103 | --------------------------------------------------------------------------------