├── .gitignore ├── Cargo.toml ├── README.md ├── dask_benchmark.py ├── get_wikipedia_table.py ├── native-rust-optimized ├── Cargo.toml └── src │ ├── RUST.code-workspace │ ├── main.rs │ └── utils.rs ├── native-rust ├── Cargo.toml └── src │ ├── RUST.code-workspace │ ├── main.rs │ └── utils.rs ├── pandas_benchmark.py ├── polars-eager ├── Cargo.toml └── src │ └── main.rs └── polars-lazy ├── Cargo.toml └── src └── main.rs /.gitignore: -------------------------------------------------------------------------------- 1 | Cargo.lock 2 | /.vscode 3 | /data 4 | /target 5 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [workspace] 2 | 3 | members = [ 4 | "polars-lazy", 5 | "polars-eager", 6 | "native-rust", 7 | "native-rust-optimized", 8 | ] 9 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dataframe-python-rust 2 | 3 | Comparing Polars vs Pandas vs Dask vs Rust native :) 4 | 5 | 6 | 7 | To run: 8 | 9 | ## Download the data at: 10 | 11 | - `train_October_9_2012.csv` at https://www.kaggle.com/c/predict-closed-questions-on-stack-overflow/data?select=train_October_9_2012.csv 12 | - wikipedia.csv at https://en.wikipedia.org/wiki/Comparison_of_programming_languages with `python get_wikipedia_table.py` 13 | 14 | ### Polars Lazy 15 | 16 | ```bash 17 | cd polars-lazy 18 | cargo build --release 19 | ../target/release/polars-lazy 20 | ``` 21 | 22 | ### Polars Eager 23 | 24 | ```bash 25 | cd polars-eager 26 | cargo build --release 27 | ../target/release/polars-eager 28 | ``` 29 | 30 | ### Native rust 31 | 32 | ```bash 33 | cd native-rust 34 | cargo build --release 35 | ../target/release/native-rust 36 | ``` 37 | 38 | The resulting CSV files will be written to the `data` directory. 
39 | -------------------------------------------------------------------------------- /dask_benchmark.py: -------------------------------------------------------------------------------- 1 | import dask.dataframe as dd 2 | import pandas as pd 3 | from io import StringIO 4 | from datetime import datetime 5 | 6 | t_initial = datetime.now() 7 | 8 | # 1. Reading 9 | PATH = "/home/peter/Documents/TEST/RUST/stack-overflow/data/train_October_9_2012.csv" 10 | 11 | PATH_DASK = "/home/peter/Documents/TEST/RUST/stack-overflow/data/SO.csv" 12 | PATH_WIKIPEDIA = ( 13 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/wikipedia.csv" 14 | ) 15 | PATH_OUTPUT = ( 16 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/python_output.csv" 17 | ) 18 | PATH_DASK_OUTPUT = ( 19 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/dask-output-*.csv" 20 | ) 21 | 22 | df = pd.read_csv( 23 | PATH, 24 | ) 25 | 26 | df = dd.from_pandas(df, npartitions=2) 27 | df_wikipedia = dd.read_csv(PATH_WIKIPEDIA) 28 | 29 | t_reading = datetime.now() 30 | 31 | # 2. Formatting date 32 | df["PostCreationDate"] = dd.to_datetime( 33 | df["PostCreationDate"], format="%m/%d/%Y %H:%M:%S" 34 | ) 35 | 36 | t_formatting = datetime.now() 37 | 38 | # 3. Formatting custom field 39 | count_words = lambda x: len(x.split(" ")) 40 | 41 | df["BodyMarkdown"] = df["BodyMarkdown"].map( 42 | count_words, 43 | ) 44 | 45 | t_count_words = datetime.now() 46 | 47 | # 4. Merging 48 | df = dd.merge( 49 | df, df_wikipedia, left_on="Tag1", right_on="Language", how="left" 50 | ).fillna(0) 51 | 52 | t_merging = datetime.now() 53 | 54 | # 4. 
Groupby 55 | groupby_series = [df["OpenStatus"]] 56 | target_column = [ 57 | "ReputationAtPostCreation", 58 | "OwnerUndeletedAnswerCountAtPostTime", 59 | "Imperative", 60 | "Object-oriented", 61 | "Functional", 62 | "Procedural", 63 | "Generic", 64 | "Reflective", 65 | "Event-driven", 66 | ] 67 | 68 | groups = df.groupby(by=groupby_series)[target_column].mean() 69 | 70 | t_groupby = datetime.now() 71 | 72 | # 5. Filtering 73 | df = df[df["Tag1"] == "rust"] 74 | 75 | t_filtering = datetime.now() 76 | 77 | # 6. Writing 78 | 79 | groups.compute().to_csv(PATH_DASK_OUTPUT) 80 | t_writing = datetime.now() 81 | 82 | # 7. printing 83 | 84 | timings = [ 85 | t_initial, 86 | t_reading, 87 | t_formatting, 88 | t_count_words, 89 | t_merging, 90 | t_groupby, 91 | t_filtering, 92 | t_writing, 93 | ] 94 | 95 | names = [ 96 | "reading", 97 | "formatting", 98 | "count_words", 99 | "merging", 100 | "groupby", 101 | "filtering", 102 | "writing", 103 | ] 104 | 105 | for i, name in enumerate(names): 106 | 107 | print(f"{name}: {(timings[i+1] - timings[i]).total_seconds() * 1000}") 108 | 109 | # df = dd.read_csv( 110 | # "partitions/*.csv", 111 | # dtype={ 112 | # "OwnerUndeletedAnswerCountAtPostTime": "float64", 113 | # "OwnerUserId": "object", 114 | # "PostId": "object", 115 | # "ReputationAtPostCreation": "float64", 116 | # "Unnamed: 0": "object", 117 | # }, 118 | # ) 119 | 120 | 121 | # df.repartition(npartitions=100).to_csv("partitiions/*.csv", index=False) 122 | 123 | # group = df.groupby(df.Tag1).ReputationAtPostCreation.sum().compute() 124 | # group.to_csv("dask_output-*.csv") 125 | # df["BodyMarkdown"] = df["BodyMarkdown"].str.replace("\r\n", " ") 126 | 127 | # df["PostCreationDate"] = pd.to_datetime( 128 | # df["PostCreationDate"], format="%m/%d/%Y %H:%M:%S" 129 | # ) 130 | # df["OwnerCreationDate"] = pd.to_datetime(df["OwnerCreationDate"], infer_datetime_format=True 131 | # ) 132 | -------------------------------------------------------------------------------- 
/get_wikipedia_table.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | df = pd.read_html( 4 | "https://en.wikipedia.org/wiki/Comparison_of_programming_languages" 5 | )[1] 6 | 7 | # df = df.replace("Yes", 1) 8 | # df = df.replace("No", 0) 9 | df = df.fillna(0) 10 | 11 | columns = [ 12 | "Imperative", 13 | "Object-oriented", 14 | "Functional", 15 | "Procedural", 16 | "Generic", 17 | "Reflective", 18 | "Event-driven", 19 | ] 20 | 21 | for col in columns: 22 | index = df[col].str.contains("Yes*") == True 23 | 24 | df.loc[index, col] = 1 25 | df.loc[~index, col] = 0 26 | 27 | df["Language"] = df["Language"].str.lower() 28 | df.to_csv("data/wikipedia.csv", index=False) 29 | -------------------------------------------------------------------------------- /native-rust-optimized/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "native-rust-optimized" 3 | version = "0.1.0" 4 | authors = ["auterium"] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | csv = "1.1" 11 | serde = { version = "1", features = ["derive"] } 12 | serde_json = "1.0.63" 13 | serde_with = "1.6.4" 14 | chrono = "0.4.19" 15 | itertools = "0.10.0" 16 | rayon = "1.5" -------------------------------------------------------------------------------- /native-rust-optimized/src/RUST.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../.." 
5 | }, 6 | { 7 | "path": "../../../polars" 8 | } 9 | ], 10 | "settings": {} 11 | } -------------------------------------------------------------------------------- /native-rust-optimized/src/main.rs: -------------------------------------------------------------------------------- 1 | mod utils; 2 | 3 | use rayon::prelude::*; 4 | use std::{collections::HashMap, fs::File, time::Instant}; 5 | use utils::{NativeDataFrame, NativeDataFrameRaw}; 6 | 7 | fn use_native_rust( 8 | path: &str, 9 | path_wikipedia: &str, 10 | output_path: &str, 11 | ) -> Result<(), Box> { 12 | let start = Instant::now(); 13 | 14 | let file = File::open(path_wikipedia)?; 15 | let mut rdr_wiki = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 16 | let hash_wikipedia: HashMap = rdr_wiki 17 | .deserialize() 18 | .into_iter() 19 | .filter_map(|result: csv::Result| { 20 | result.map(|x| (x.language.clone(), x)).ok() 21 | }) 22 | .collect(); 23 | 24 | let file = File::open(path)?; 25 | let mut rdr = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 26 | let groups_hash = rdr.deserialize().into_iter().fold( 27 | HashMap::::new(), 28 | |mut hash_group, record: csv::Result| { 29 | let record = record.unwrap(); 30 | let (group, count) = hash_group.entry(record.open_status.clone()).or_default(); 31 | group.status = record.open_status; 32 | group.reputation_at_post_creation += record.reputation_at_post_creation; 33 | group.owner_undeleted_answer_count_at_post_time += 34 | record.owner_undeleted_answer_count_at_post_time; 35 | 36 | if let Some(wiki) = hash_wikipedia.get(&record.tag1) { 37 | group.imperative += wiki.imperative; 38 | group.object_oriented += wiki.object_oriented; 39 | group.functional += wiki.functional; 40 | group.procedural += wiki.procedural; 41 | group.generic += wiki.generic; 42 | group.reflective += wiki.reflective; 43 | group.event_driven += wiki.event_driven; 44 | } 45 | 46 | *count += 1; 47 | 48 | hash_group 49 | }, 50 | ); 51 | 52 | let groups = 
groups_hash.into_iter().map(|(_, (mut group, count))| { 53 | group.reputation_at_post_creation /= count as f64; 54 | group.owner_undeleted_answer_count_at_post_time /= count as f64; 55 | group.imperative /= count as f64; 56 | group.object_oriented /= count as f64; 57 | group.functional /= count as f64; 58 | group.procedural /= count as f64; 59 | group.generic /= count as f64; 60 | group.reflective /= count as f64; 61 | group.event_driven /= count as f64; 62 | 63 | group 64 | }); 65 | 66 | let mut wtr = csv::Writer::from_path(output_path)?; 67 | for record in groups { 68 | wtr.serialize(record)?; 69 | } 70 | 71 | println!("Overall time taken: {:?}", start.elapsed()); 72 | 73 | Ok(()) 74 | } 75 | 76 | fn use_native_parallel_rust( 77 | path: &str, 78 | path_wikipedia: &str, 79 | output_path: &str, 80 | ) -> Result<(), Box> { 81 | let start = Instant::now(); 82 | 83 | let file = File::open(path_wikipedia)?; 84 | let mut rdr_wiki = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 85 | let hash_wikipedia: HashMap = rdr_wiki 86 | .deserialize() 87 | .into_iter() 88 | .filter_map(|result: csv::Result| { 89 | result.map(|x| (x.language.clone(), x)).ok() 90 | }) 91 | .collect(); 92 | 93 | let file = File::open(path)?; 94 | let mut rdr = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 95 | 96 | let mut buffer = Vec::::with_capacity(1000); 97 | let mut groups_hash: HashMap = HashMap::new(); 98 | 99 | for record in rdr.deserialize().filter_map(Result::ok) { 100 | buffer.push(record); 101 | 102 | if buffer.len() < 1000 { 103 | continue; 104 | } 105 | 106 | process_buffer(&mut groups_hash, &mut buffer, &hash_wikipedia); 107 | } 108 | 109 | process_buffer(&mut groups_hash, &mut buffer, &hash_wikipedia); 110 | 111 | let groups = groups_hash.into_iter().map(|(_, (mut group, count))| { 112 | group.reputation_at_post_creation /= count as f64; 113 | group.owner_undeleted_answer_count_at_post_time /= count as f64; 114 | group.imperative /= count as f64; 115 | 
group.object_oriented /= count as f64; 116 | group.functional /= count as f64; 117 | group.procedural /= count as f64; 118 | group.generic /= count as f64; 119 | group.reflective /= count as f64; 120 | group.event_driven /= count as f64; 121 | 122 | group 123 | }); 124 | 125 | let mut wtr = csv::Writer::from_path(output_path)?; 126 | for record in groups { 127 | wtr.serialize(record)?; 128 | } 129 | 130 | println!("Overall time taken: {:?}", start.elapsed()); 131 | 132 | Ok(()) 133 | } 134 | 135 | fn process_buffer( 136 | groups_hash: &mut HashMap, 137 | buffer: &mut Vec, 138 | hash_wikipedia: &HashMap, 139 | ) { 140 | let data = buffer 141 | .drain(..) 142 | .collect::>() 143 | .into_par_iter() 144 | .map(NativeDataFrame::from) 145 | .collect::>(); 146 | 147 | for record in data { 148 | let (group, count) = groups_hash.entry(record.open_status.clone()).or_default(); 149 | group.status = record.open_status; 150 | group.reputation_at_post_creation += record.reputation_at_post_creation; 151 | group.owner_undeleted_answer_count_at_post_time += 152 | record.owner_undeleted_answer_count_at_post_time; 153 | 154 | if let Some(wiki) = hash_wikipedia.get(&record.tag1) { 155 | group.imperative += wiki.imperative; 156 | group.object_oriented += wiki.object_oriented; 157 | group.functional += wiki.functional; 158 | group.procedural += wiki.procedural; 159 | group.generic += wiki.generic; 160 | group.reflective += wiki.reflective; 161 | group.event_driven += wiki.event_driven; 162 | } 163 | 164 | *count += 1; 165 | } 166 | } 167 | 168 | fn main() { 169 | let path = 170 | "/home/peter/Documents/TEST/RUST/dataframe-python-rust/data/train_October_9_2012.csv"; 171 | let output_native_rust_path = "native_rust_optimized_output.csv"; 172 | let path_wikipedia = "wikipedia.csv"; 173 | 174 | use_native_parallel_rust(path, path_wikipedia, output_native_rust_path) 175 | .expect("Test of polar oriented result."); 176 | } 177 | 
-------------------------------------------------------------------------------- /native-rust-optimized/src/utils.rs: -------------------------------------------------------------------------------- 1 | use chrono::NaiveDateTime; 2 | use serde::{Deserialize, de::Deserializer, Serialize}; 3 | 4 | #[derive(Debug, Deserialize)] 5 | #[serde(rename_all = "PascalCase")] 6 | pub struct NativeDataFrameRaw { 7 | pub post_creation_date: String, 8 | pub reputation_at_post_creation: f64, 9 | pub body_markdown: String, 10 | pub tag1: String, 11 | pub open_status: String, 12 | pub owner_undeleted_answer_count_at_post_time: f64, 13 | } 14 | 15 | impl From for NativeDataFrame { 16 | fn from(item: NativeDataFrameRaw) -> Self { 17 | Self { 18 | post_creation_date: NaiveDateTime::parse_from_str(&item.post_creation_date, "%m/%d/%Y %H:%M:%S").unwrap(), 19 | reputation_at_post_creation: item.reputation_at_post_creation, 20 | count_words: item.body_markdown.split(' ').count() as f64, 21 | tag1: item.tag1, 22 | open_status: item.open_status, 23 | owner_undeleted_answer_count_at_post_time: item.owner_undeleted_answer_count_at_post_time, 24 | } 25 | } 26 | } 27 | 28 | #[derive(Debug, Deserialize)] 29 | #[serde(rename_all = "PascalCase")] 30 | pub struct NativeDataFrame { 31 | #[serde(deserialize_with = "datetime_parser")] 32 | pub post_creation_date: NaiveDateTime, 33 | pub reputation_at_post_creation: f64, 34 | #[serde(deserialize_with = "word_counter", rename = "BodyMarkdown")] 35 | pub count_words: f64, 36 | pub tag1: String, 37 | pub open_status: String, 38 | pub owner_undeleted_answer_count_at_post_time: f64, 39 | } 40 | 41 | fn datetime_parser<'de, D: Deserializer<'de>>(deserializer: D) -> Result { 42 | let raw: &str = Deserialize::deserialize(deserializer)?; 43 | 44 | NaiveDateTime::parse_from_str(raw, "%m/%d/%Y %H:%M:%S") 45 | .map_err(|x| serde::de::Error::custom(x.to_string())) 46 | } 47 | 48 | fn word_counter<'de, D: Deserializer<'de>>(deserializer: D) -> Result { 49 | let raw: 
String = Deserialize::deserialize(deserializer)?; 50 | 51 | Ok(raw.split(' ').count() as f64) 52 | } 53 | 54 | #[derive(Debug, Clone, Deserialize)] 55 | #[serde(rename_all = "PascalCase")] 56 | pub struct WikiDataFrame { 57 | pub language: String, 58 | pub procedural: f64, 59 | #[serde(rename = "Object-oriented")] 60 | pub object_oriented: f64, 61 | pub imperative: f64, 62 | pub functional: f64, 63 | pub generic: f64, 64 | pub reflective: f64, 65 | #[serde(rename = "Event-driven")] 66 | pub event_driven: f64, 67 | } 68 | 69 | #[derive(Debug, Default, Clone, Deserialize, Serialize)] 70 | #[serde(rename_all = "PascalCase")] 71 | pub struct GroupBy { 72 | pub status: String, 73 | pub reputation_at_post_creation: f64, 74 | pub owner_undeleted_answer_count_at_post_time: f64, 75 | pub imperative: f64, 76 | pub object_oriented: f64, 77 | pub functional: f64, 78 | pub procedural: f64, 79 | pub generic: f64, 80 | pub reflective: f64, 81 | pub event_driven: f64, 82 | } 83 | -------------------------------------------------------------------------------- /native-rust/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "native-rust" 3 | version = "0.1.0" 4 | authors = ["xavier tao "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | csv = "1.1" 11 | serde = { version = "1", features = ["derive"] } 12 | serde_json = "1.0.63" 13 | serde_with = "1.6.4" 14 | chrono = "0.4.19" 15 | itertools = "0.10.0" 16 | rayon = "1.5" -------------------------------------------------------------------------------- /native-rust/src/RUST.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../.." 
5 | }, 6 | { 7 | "path": "../../../polars" 8 | } 9 | ], 10 | "settings": {} 11 | } -------------------------------------------------------------------------------- /native-rust/src/main.rs: -------------------------------------------------------------------------------- 1 | mod utils; 2 | use chrono::DateTime; 3 | use itertools::Itertools; 4 | use rayon::prelude::*; 5 | use std::collections::HashMap; 6 | use std::collections::HashSet; 7 | use std::fs::File; 8 | use std::time::Instant; 9 | 10 | fn use_native_rust( 11 | path: &str, 12 | path_wikipedia: &str, 13 | output_path: &str, 14 | ) -> std::result::Result<(), Box> { 15 | let t_initial = Instant::now(); 16 | 17 | let file = File::open(path)?; 18 | 19 | let mut rdr = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 20 | let mut records: Vec = rdr 21 | .deserialize() 22 | .into_iter() 23 | .filter_map(|result| match result { 24 | Ok(rec) => rec, 25 | Err(e) => None, 26 | }) 27 | .collect(); 28 | 29 | let file = File::open(path_wikipedia)?; 30 | let mut rdr_wiki = csv::ReaderBuilder::new().delimiter(b',').from_reader(file); 31 | let records_wikipedia: Vec = rdr_wiki 32 | .deserialize() 33 | .into_iter() 34 | .filter_map(|result| match result { 35 | Ok(rec) => rec, 36 | Err(e) => None, 37 | }) 38 | .collect(); 39 | 40 | let t_reading = Instant::now(); 41 | 42 | // 1. Apply Format Date 43 | let fmt = "%m/%d/%Y %H:%M:%S"; 44 | 45 | records 46 | .iter_mut() 47 | .for_each(|record: &mut utils::NativeDataFrame| { 48 | record.PostCreationDatetime = 49 | match DateTime::parse_from_str(record.PostCreationDate.as_ref().unwrap(), fmt) { 50 | Ok(dates) => Some(dates), 51 | Err(_) => None, 52 | } 53 | }); 54 | 55 | let t_formatting = Instant::now(); 56 | 57 | // 2. 
Apply Custom Formatting 58 | records 59 | .iter_mut() 60 | .for_each(|record: &mut utils::NativeDataFrame| { 61 | record.CountWords = 62 | Some(record.BodyMarkdown.as_ref().unwrap().split(' ').count() as f64) 63 | }); 64 | 65 | let t_count_words = Instant::now(); 66 | let hash_wikipedia: &HashMap<&String, &utils::WikiDataFrame> = &records_wikipedia 67 | .iter() 68 | .map(|record| (record.Language.as_ref().unwrap(), record)) 69 | .collect(); 70 | 71 | records.iter_mut().for_each(|record| { 72 | record.Wikipedia = match hash_wikipedia.get(&record.Tag1.as_ref().unwrap()) { 73 | Some(wikipedia) => Some(wikipedia.clone().clone()), 74 | None => None, 75 | } 76 | }); 77 | 78 | let t_merging = Instant::now(); 79 | 80 | let groups_hash: HashMap = records 81 | .par_iter() 82 | .fold( 83 | || HashMap::with_capacity(10), // || HashMap::new() 84 | |mut hash_group: HashMap, record| { 85 | let group: utils::GroupBy = if let Some(wiki) = &record.Wikipedia { 86 | utils::GroupBy { 87 | status: record.OpenStatus.as_ref().unwrap().to_string(), 88 | ReputationAtPostCreation: record.ReputationAtPostCreation.unwrap(), 89 | OwnerUndeletedAnswerCountAtPostTime: record 90 | .OwnerUndeletedAnswerCountAtPostTime 91 | .unwrap(), 92 | Imperative: wiki.Imperative.unwrap(), 93 | ObjectOriented: wiki.ObjectOriented.unwrap(), 94 | Functional: wiki.Functional.unwrap(), 95 | Procedural: wiki.Procedural.unwrap(), 96 | Generic: wiki.Generic.unwrap(), 97 | Reflective: wiki.Reflective.unwrap(), 98 | EventDriven: wiki.EventDriven.unwrap(), 99 | } 100 | } else { 101 | utils::GroupBy { 102 | status: record.OpenStatus.as_ref().unwrap().to_string(), 103 | ReputationAtPostCreation: record.ReputationAtPostCreation.unwrap(), 104 | OwnerUndeletedAnswerCountAtPostTime: record 105 | .OwnerUndeletedAnswerCountAtPostTime 106 | .unwrap(), 107 | ..Default::default() 108 | } 109 | }; 110 | if let Some((previous, count)) = hash_group.get_mut(&group.status.to_string()) { 111 | *previous = previous.clone() + group; 112 | 
*count += 1; 113 | } else { 114 | hash_group.insert(group.status.to_string(), (group, 1)); 115 | }; 116 | hash_group 117 | }, 118 | ) 119 | .reduce( 120 | || HashMap::new(), 121 | |prev, other| { 122 | let set1: HashSet = prev.keys().cloned().collect(); 123 | let set2: HashSet = other.keys().cloned().collect(); 124 | let unions: HashSet = set1.union(&set2).cloned().collect(); 125 | let mut map = HashMap::new(); 126 | for key in unions.iter() { 127 | map.insert( 128 | key.to_string(), 129 | match (prev.get(key), other.get(key)) { 130 | (Some((previous, count_prev)), Some((group, count_other))) => { 131 | (previous.clone() + group.clone(), count_prev + count_other) 132 | } 133 | (Some(previous), None) => previous.clone(), 134 | (None, Some(other)) => other.clone(), 135 | (None, None) => (utils::GroupBy::new(), 0), 136 | }, 137 | ); 138 | } 139 | map 140 | }, 141 | ); 142 | 143 | let groups: Vec = groups_hash 144 | .iter() 145 | .map(|(_, (group, count))| utils::GroupBy { 146 | status: group.status.to_string(), 147 | ReputationAtPostCreation: group.ReputationAtPostCreation / count.clone() as f64, 148 | OwnerUndeletedAnswerCountAtPostTime: group.OwnerUndeletedAnswerCountAtPostTime 149 | / count.clone() as f64, 150 | Imperative: group.Imperative / count.clone() as f64, 151 | ObjectOriented: group.ObjectOriented / count.clone() as f64, 152 | Functional: group.Functional / count.clone() as f64, 153 | Procedural: group.Procedural / count.clone() as f64, 154 | Generic: group.Generic / count.clone() as f64, 155 | Reflective: group.Reflective / count.clone() as f64, 156 | EventDriven: group.EventDriven / count.clone() as f64, 157 | }) 158 | .collect(); 159 | 160 | let t_groupby = Instant::now(); 161 | 162 | let mut wtr = csv::Writer::from_path(output_path)?; 163 | 164 | for record in groups { 165 | wtr.serialize(record)?; 166 | } 167 | 168 | let t_writing = Instant::now(); 169 | 170 | let _ = records 171 | .iter() 172 | .filter(|record| record.Tag1 == 
Some("rust".to_string())) 173 | .collect::>(); 174 | 175 | let t_filtering = Instant::now(); 176 | 177 | let timings = [ 178 | t_initial, 179 | t_reading, 180 | t_formatting, 181 | t_count_words, 182 | t_merging, 183 | t_groupby, 184 | t_writing, 185 | t_filtering, 186 | ]; 187 | let names = [ 188 | "reading", 189 | "formatting", 190 | "count_words", 191 | "merging", 192 | "groupby", 193 | "writing", 194 | "filtering", 195 | ]; 196 | for (i, name) in names.iter().enumerate() { 197 | println!("{}: {:#?}", name, (timings[i + 1] - timings[i]).as_millis()); 198 | } 199 | 200 | Ok(()) 201 | } 202 | 203 | fn main() { 204 | let path = "/home/peter/Documents/TEST/RUST/stack-overflow/data/train_October_9_2012.csv"; 205 | let output_native_rust_path = 206 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/native_rust_output.csv"; 207 | let path_wikipedia = "/home/peter/Documents/BLOG/dataframe-python-rust/data/wikipedia.csv"; 208 | 209 | use_native_rust(path, path_wikipedia, output_native_rust_path) 210 | .expect("Test of polar oriented result."); 211 | } 212 | -------------------------------------------------------------------------------- /native-rust/src/utils.rs: -------------------------------------------------------------------------------- 1 | use chrono::DateTime; 2 | use serde::{Deserialize, Serialize}; 3 | use serde_with::skip_serializing_none; 4 | use std::collections::HashMap; 5 | use std::ops::Add; 6 | 7 | #[skip_serializing_none] 8 | #[derive(Debug, Deserialize, Serialize)] 9 | pub struct NativeDataFrame { 10 | #[serialize_always] 11 | pub OwnerUserId: Option, 12 | #[serialize_always] 13 | pub PostClosedDate: Option, 14 | #[serialize_always] 15 | pub PostCreationDate: Option, 16 | #[serialize_always] 17 | pub PostId: Option, 18 | #[serialize_always] 19 | pub ReputationAtPostCreation: Option, 20 | #[serialize_always] 21 | pub BodyMarkdown: Option, 22 | #[serialize_always] 23 | pub Tag4: Option, 24 | #[serialize_always] 25 | pub Tag1: Option, 26 | 
#[serialize_always] 27 | pub OwnerCreationDate: Option, 28 | #[serialize_always] 29 | pub Tag5: Option, 30 | #[serialize_always] 31 | pub Tag3: Option, 32 | #[serialize_always] 33 | pub OpenStatus: Option, 34 | #[serialize_always] 35 | pub Tag2: Option, 36 | #[serialize_always] 37 | pub OwnerUndeletedAnswerCountAtPostTime: Option, 38 | #[serialize_always] 39 | pub Title: Option, 40 | #[serde(skip)] 41 | pub PostCreationDatetime: Option>, 42 | #[serialize_always] 43 | pub CountWords: Option, 44 | #[serde(skip)] 45 | pub Wikipedia: Option, 46 | } 47 | 48 | #[skip_serializing_none] 49 | #[derive(Debug, Clone, Deserialize, Serialize)] 50 | pub struct WikiDataFrame { 51 | #[serialize_always] 52 | pub Language: Option, 53 | #[serialize_always] 54 | pub Procedural: Option, 55 | #[serialize_always] 56 | #[serde(rename(serialize = "Object-oriented", deserialize = "Object-oriented"))] 57 | pub ObjectOriented: Option, 58 | #[serialize_always] 59 | pub Imperative: Option, 60 | #[serialize_always] 61 | pub Functional: Option, 62 | #[serialize_always] 63 | pub Generic: Option, 64 | #[serialize_always] 65 | pub Reflective: Option, 66 | #[serialize_always] 67 | #[serde(rename(serialize = "Event-driven", deserialize = "Event-driven"))] 68 | pub EventDriven: Option, 69 | #[serialize_always] 70 | #[serde(rename(serialize = "Other paradigm(s)", deserialize = "Other paradigm(s)"))] 71 | pub OtherParadigm: Option, 72 | #[serialize_always] 73 | #[serde(rename(serialize = "Intended use", deserialize = "Intended use"))] 74 | pub IntendedUse: Option, 75 | #[serialize_always] 76 | #[serde(rename(serialize = "Standardized?", deserialize = "Standardized?"))] 77 | pub Standardized: Option, 78 | } 79 | 80 | pub type Record = HashMap; 81 | 82 | #[derive(Debug, Clone, Deserialize, Serialize)] 83 | pub struct GroupBy { 84 | pub status: String, 85 | pub ReputationAtPostCreation: f64, 86 | pub OwnerUndeletedAnswerCountAtPostTime: f64, 87 | pub Imperative: f64, 88 | pub ObjectOriented: f64, 89 | pub 
Functional: f64, 90 | pub Procedural: f64, 91 | pub Generic: f64, 92 | pub Reflective: f64, 93 | pub EventDriven: f64, 94 | } 95 | 96 | impl Default for GroupBy { 97 | fn default() -> GroupBy { 98 | GroupBy { 99 | status: "".to_string(), 100 | ReputationAtPostCreation: 0., 101 | OwnerUndeletedAnswerCountAtPostTime: 0., 102 | Imperative: 0., 103 | ObjectOriented: 0., 104 | Functional: 0., 105 | Procedural: 0., 106 | Generic: 0., 107 | Reflective: 0., 108 | EventDriven: 0., 109 | } 110 | } 111 | } 112 | 113 | impl GroupBy { 114 | pub fn new() -> GroupBy { 115 | GroupBy { 116 | status: "".to_string(), 117 | ReputationAtPostCreation: 0., 118 | OwnerUndeletedAnswerCountAtPostTime: 0., 119 | Imperative: 0., 120 | ObjectOriented: 0., 121 | Functional: 0., 122 | Procedural: 0., 123 | Generic: 0., 124 | Reflective: 0., 125 | EventDriven: 0., 126 | } 127 | } 128 | } 129 | 130 | impl Add for GroupBy { 131 | type Output = GroupBy; 132 | fn add(self, other: GroupBy) -> GroupBy { 133 | GroupBy { 134 | status: self.status, 135 | ReputationAtPostCreation: self.ReputationAtPostCreation 136 | + other.ReputationAtPostCreation, 137 | OwnerUndeletedAnswerCountAtPostTime: self.OwnerUndeletedAnswerCountAtPostTime 138 | + other.OwnerUndeletedAnswerCountAtPostTime, 139 | Imperative: self.Imperative + other.Imperative, 140 | ObjectOriented: self.ObjectOriented + other.ObjectOriented, 141 | Functional: self.Functional + other.Functional, 142 | Procedural: self.Procedural + other.Procedural, 143 | Generic: self.Generic + other.Generic, 144 | Reflective: self.Reflective + other.Reflective, 145 | EventDriven: self.EventDriven + other.EventDriven, 146 | } 147 | } 148 | } 149 | 150 | pub fn inspect(path: &str) { 151 | let mut record: Record = HashMap::new(); 152 | 153 | let mut rdr = csv::Reader::from_path(path).unwrap(); 154 | 155 | for result in rdr.deserialize() { 156 | match result { 157 | Ok(rec) => { 158 | record = rec; 159 | break; 160 | } 161 | Err(_e) => (), 162 | }; 163 | } 164 | // 
Print Struct 165 | println!("#[skip_serializing_none]"); 166 | println!("#[derive(Debug, Deserialize, Serialize)]"); 167 | println!("struct lib::DataFrame {{"); 168 | for (key, value) in &record { 169 | println!(" #[serialize_always]"); 170 | 171 | match value.parse::() { 172 | Ok(_n) => { 173 | // println!(" #[serde(deserialize_with = \"empty_string_as_none\")]"); 174 | println!(" {}: Option,", key); 175 | continue; 176 | } 177 | Err(_e) => (), 178 | } 179 | match value.parse::() { 180 | Ok(_n) => { 181 | // println!(" #[serde(deserialize_with = \"empty_string_as_none\")]"); 182 | println!(" {}: Option,", key); 183 | continue; 184 | } 185 | Err(_e) => (), 186 | } 187 | println!(" {}: Option,", key); 188 | } 189 | println!("}}"); 190 | } 191 | -------------------------------------------------------------------------------- /pandas_benchmark.py: -------------------------------------------------------------------------------- 1 | # import dask.dataframe as dd 2 | import pandas as pd 3 | from io import StringIO 4 | from datetime import datetime 5 | 6 | t_initial = datetime.now() 7 | 8 | # 1. Reading 9 | PATH = "/home/peter/Documents/TEST/RUST/stack-overflow/data/train_October_9_2012.csv" 10 | PATH_WIKIPEDIA = ( 11 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/wikipedia.csv" 12 | ) 13 | PATH_OUTPUT = ( 14 | "/home/peter/Documents/TEST/RUST/stack-overflow/data/python_output.csv" 15 | ) 16 | 17 | df = pd.read_csv(PATH) 18 | df_wikipedia = pd.read_csv(PATH_WIKIPEDIA) 19 | 20 | t_reading = datetime.now() 21 | 22 | # 2. Formatting date 23 | df["PostCreationDate"] = pd.to_datetime( 24 | df["PostCreationDate"], format="%m/%d/%Y %H:%M:%S" 25 | ) 26 | 27 | t_formatting = datetime.now() 28 | 29 | # 3. Formatting custom field 30 | count_words = lambda x: len(x.split(" ")) 31 | 32 | df["BodyMarkdown"] = df["BodyMarkdown"].map(count_words) 33 | 34 | t_count_words = datetime.now() 35 | 36 | # 4. 
Merging 37 | df = pd.merge( 38 | df, df_wikipedia, left_on="Tag1", right_on="Language", how="left" 39 | ).fillna(0) 40 | 41 | t_merging = datetime.now() 42 | 43 | # 4. Groupby 44 | groupby_series = [df["OpenStatus"]] 45 | target_column = [ 46 | "ReputationAtPostCreation", 47 | "OwnerUndeletedAnswerCountAtPostTime", 48 | "Imperative", 49 | "Object-oriented", 50 | "Functional", 51 | "Procedural", 52 | "Generic", 53 | "Reflective", 54 | "Event-driven", 55 | ] 56 | 57 | groups = df.groupby(by=groupby_series)[target_column].mean() 58 | 59 | t_groupby = datetime.now() 60 | 61 | # 5. Filtering 62 | df = df[df["Tag1"] == "rust"] 63 | 64 | t_filtering = datetime.now() 65 | 66 | # 6. Writing 67 | 68 | groups.to_csv(PATH_OUTPUT) 69 | t_writing = datetime.now() 70 | 71 | # 7. printing 72 | 73 | timings = [ 74 | t_initial, 75 | t_reading, 76 | t_formatting, 77 | t_count_words, 78 | t_merging, 79 | t_groupby, 80 | t_filtering, 81 | t_writing, 82 | ] 83 | 84 | names = [ 85 | "reading", 86 | "formatting", 87 | "count_words", 88 | "merging", 89 | "groupby", 90 | "filtering", 91 | "writing", 92 | ] 93 | 94 | for i, name in enumerate(names): 95 | 96 | print(f"{name}: {(timings[i+1] - timings[i]).total_seconds() * 1000}") 97 | 98 | # df = dd.read_csv( 99 | # "partitions/*.csv", 100 | # dtype={ 101 | # "OwnerUndeletedAnswerCountAtPostTime": "float64", 102 | # "OwnerUserId": "object", 103 | # "PostId": "object", 104 | # "ReputationAtPostCreation": "float64", 105 | # "Unnamed: 0": "object", 106 | # }, 107 | # ) 108 | 109 | 110 | # df.repartition(npartitions=100).to_csv("partitiions/*.csv", index=False) 111 | 112 | # group = df.groupby(df.Tag1).ReputationAtPostCreation.sum().compute() 113 | # group.to_csv("dask_output-*.csv") 114 | # df["BodyMarkdown"] = df["BodyMarkdown"].str.replace("\r\n", " ") 115 | 116 | # df["PostCreationDate"] = pd.to_datetime( 117 | # df["PostCreationDate"], format="%m/%d/%Y %H:%M:%S" 118 | # ) 119 | # df["OwnerCreationDate"] = 
pd.to_datetime(df["OwnerCreationDate"], infer_datetime_format=True 120 | # ) 121 | -------------------------------------------------------------------------------- /polars-eager/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "polars-eager" 3 | version = "0.1.0" 4 | authors = ["xavier tao "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | polars = { git = "https://github.com/ritchie46/polars", features = [], version = "0.15.1"} 11 | rayon = "1.5.1" -------------------------------------------------------------------------------- /polars-eager/src/main.rs: -------------------------------------------------------------------------------- 1 | use polars::prelude::*; 2 | use rayon::prelude::*; 3 | use std::fs::File; 4 | use std::time::Instant; 5 | 6 | fn str_to_date(dates: &Series) -> std::result::Result { 7 | let fmt = Some("%m/%d/%Y %H:%M:%S"); 8 | 9 | Ok(dates.utf8()?.as_date64(fmt)?.into_series()) 10 | } 11 | 12 | fn count_words(column: &Series) -> std::result::Result { 13 | Ok(column 14 | .utf8()? 15 | .into_iter() 16 | .map(|opt_name: Option<&str>| opt_name.map(|name: &str| name.split(' ').count() as f64)) 17 | .collect::() 18 | .into_series()) 19 | } 20 | 21 | fn use_polars( 22 | path: &str, 23 | path_wikipedia: &str, 24 | output_path: &str, 25 | ) -> std::result::Result<(), Box> { 26 | let t_initial = Instant::now(); 27 | 28 | let target_column = vec![ 29 | "Language", 30 | "Imperative", 31 | "Object-oriented", 32 | "Functional", 33 | "Procedural", 34 | "Generic", 35 | "Reflective", 36 | "Event-driven", 37 | ]; 38 | 39 | let mut df = CsvReader::from_path(path)? 40 | .with_encoding(CsvEncoding::LossyUtf8) 41 | // .with_n_threads(Some(1)) 42 | .has_header(true) 43 | .finish()?; 44 | let df_wikipedia = CsvReader::from_path(path_wikipedia)? 
45 | .with_encoding(CsvEncoding::LossyUtf8) 46 | // .with_n_threads(Some(1)) 47 | .has_header(true) 48 | .finish()? 49 | .select(target_column)?; 50 | 51 | let t_reading = Instant::now(); 52 | 53 | // 1. Apply Format Date 54 | df.may_apply("PostCreationDate", str_to_date)?; 55 | 56 | let t_formatting = Instant::now(); 57 | 58 | // 2. Apply Custom Formatting 59 | df.may_apply("BodyMarkdown", count_words)?; 60 | 61 | let t_count_words = Instant::now(); 62 | 63 | df = df 64 | .join(&df_wikipedia, "Tag1", "Language", JoinType::Left)? 65 | .fill_none(FillNoneStrategy::Min)?; 66 | 67 | let t_merging = Instant::now(); 68 | 69 | // 3. groupby 70 | let groupby_series = vec![df.column("OpenStatus")?.clone()]; 71 | 72 | let target_column = vec![ 73 | "ReputationAtPostCreation", 74 | "OwnerUndeletedAnswerCountAtPostTime", 75 | "Imperative", 76 | "Object-oriented", 77 | "Functional", 78 | "Procedural", 79 | "Generic", 80 | "Reflective", 81 | "Event-driven", 82 | ]; 83 | 84 | let groups = df 85 | .groupby_with_series(groupby_series, false)? 86 | .select(target_column) 87 | .mean()?; 88 | 89 | let t_groupby = Instant::now(); 90 | 91 | // 4. Filtering 92 | let values = df.column("Tag1")?; 93 | let mask = values.eq("rust"); 94 | let _ = df.filter(&mask)?; 95 | 96 | let t_filtering = Instant::now(); 97 | 98 | let mut buffer = File::create(output_path)?; 99 | 100 | CsvWriter::new(&mut buffer) 101 | .has_headers(true) 102 | .finish(&mut groups.sort("OpenStatus", false)?) 
103 | .expect("csv written"); 104 | let t_writing = Instant::now(); 105 | 106 | let timings = [ 107 | t_initial, 108 | t_reading, 109 | t_formatting, 110 | t_count_words, 111 | t_merging, 112 | t_groupby, 113 | t_filtering, 114 | t_writing, 115 | ]; 116 | 117 | let names = [ 118 | "reading", 119 | "formatting", 120 | "count_words", 121 | "merging", 122 | "groupby", 123 | "filtering", 124 | "writing", 125 | ]; 126 | 127 | for (i, name) in names.iter().enumerate() { 128 | println!("{}: {:#?}", name, (timings[i + 1] - timings[i]).as_millis()); 129 | } 130 | 131 | Ok(()) 132 | } 133 | 134 | fn main() { 135 | let path = 136 | "/home/peter/Documents/TEST/RUST/dataframe-python-rust/data/train_October_9_2012.csv"; 137 | let output_polars_eager_path = 138 | "/home/peter/Documents/TEST/RUST/dataframe-python-rust/data/polars_eager_output.csv"; 139 | let path_wikipedia = "/home/peter/Documents/BLOG/dataframe-python-rust/data/wikipedia.csv"; 140 | 141 | use_polars(path, path_wikipedia, output_polars_eager_path).expect("Polar eager failed."); 142 | } 143 | -------------------------------------------------------------------------------- /polars-lazy/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "polars-lazy" 3 | version = "0.1.0" 4 | authors = ["xavier tao "] 5 | edition = "2018" 6 | 7 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 8 | 9 | [dependencies] 10 | polars-lazy = { git = "https://github.com/ritchie46/polars", features = [ "csv-file"], version = "0.15.1"} 11 | polars = { git = "https://github.com/ritchie46/polars", features = [ "csv-file"], version = "0.15.1"} 12 | # remove simd for stable cargo 13 | rayon = "1.5.1" -------------------------------------------------------------------------------- /polars-lazy/src/main.rs: -------------------------------------------------------------------------------- 1 | use polars::prelude::*; 2 | use 
polars_lazy::dsl::col; 3 | use polars_lazy::prelude::*; 4 | use rayon::prelude::*; 5 | use std::fs::File; 6 | use std::time::Instant; 7 | 8 | fn lazy_str_to_date(dates: Series) -> std::result::Result { 9 | let fmt = Some("%m/%d/%Y %H:%M:%S"); 10 | 11 | Ok(dates.utf8()?.as_date64(fmt)?.into_series()) 12 | } 13 | 14 | fn lazy_date_to_hour(dates: Series) -> std::result::Result { 15 | Ok(dates.date64()?.hour().into_series()) 16 | } 17 | 18 | fn lazy_count_words(dates: Series) -> std::result::Result { 19 | Ok(dates 20 | .utf8()? 21 | .into_iter() 22 | .map(|opt_name: Option<&str>| opt_name.map(|name: &str| name.split(' ').count() as f64)) 23 | .collect::() 24 | .into_series()) 25 | } 26 | 27 | fn use_lazy_polars( 28 | path: &str, 29 | path_wikipedia: &str, 30 | output_path: &str, 31 | ) -> std::result::Result<(), Box> { 32 | let t_initial = Instant::now(); 33 | let df_wikipedia = LazyCsvReader::new(path_wikipedia.to_string()) 34 | // .with_encoding(CsvEncoding::LossyUtf8) 35 | // .with_n_threads(Some(1)) 36 | .has_header(true) 37 | .finish(); 38 | 39 | let mut df = LazyCsvReader::new(path.to_string()) 40 | // .with_encoding(CsvEncoding::LossyUtf8) 41 | // .with_n_threads(Some(1)) 42 | .has_header(true) 43 | .finish() 44 | .with_columns(vec![ 45 | col("PostCreationDate") 46 | .map(lazy_str_to_date, GetOutput::from_type(DataType::Date64)) 47 | .map(lazy_date_to_hour, GetOutput::from_type(DataType::Date64)) 48 | .alias("hour"), 49 | col("BodyMarkdown") 50 | .map(lazy_count_words, GetOutput::from_type(DataType::UInt64)) 51 | .alias("newBodyMarkdown"), 52 | ]) 53 | .inner_join(df_wikipedia, col("Tag1"), col("Language")) 54 | .groupby(vec![col("OpenStatus")]) 55 | .agg(vec![ 56 | col("ReputationAtPostCreation").mean(), 57 | col("OwnerUndeletedAnswerCountAtPostTime").mean(), 58 | col("Imperative").mean(), 59 | col("Object-oriented").mean(), 60 | col("Functional").mean(), 61 | col("Procedural").mean(), 62 | col("Generic").mean(), 63 | col("Reflective").mean(), 64 | 
col("Event-driven").mean(), 65 | ]) 66 | .select(&[ 67 | col("OpenStatus"), 68 | col("ReputationAtPostCreation_mean"), 69 | col("OwnerUndeletedAnswerCountAtPostTime_mean"), 70 | col("Imperative_mean"), 71 | col("Object-oriented_mean"), 72 | col("Functional_mean"), 73 | col("Procedural_mean"), 74 | col("Generic_mean"), 75 | col("Reflective_mean"), 76 | col("Event-driven_mean"), 77 | ]) 78 | .sort("OpenStatus", false) 79 | .collect()?; 80 | 81 | let mut buffer = File::create(output_path)?; 82 | 83 | CsvWriter::new(&mut buffer) 84 | .has_headers(true) 85 | .finish(&mut df) 86 | .expect("csv written"); 87 | 88 | let t_writing = Instant::now(); 89 | println!("Read to write: {}", (t_writing - t_initial).as_millis()); 90 | Ok(()) 91 | } 92 | 93 | fn main() { 94 | let path = 95 | "/home/peter/Documents/TEST/RUST/dataframe-python-rust/data/train_October_9_2012.csv"; 96 | let output_polars_lazy_path = 97 | "/home/peter/Documents/TEST/RUST/dataframe-python-rust/data/polars_lazy_output.csv"; 98 | let path_wikipedia = "/home/peter/Documents/BLOG/dataframe-python-rust/data/wikipedia.csv"; 99 | 100 | use_lazy_polars(path, path_wikipedia, output_polars_lazy_path) 101 | .expect("Test of polar lazy failed."); 102 | } 103 | --------------------------------------------------------------------------------