├── .dockerignore
├── .gitignore
├── Dockerfile
├── LICENSE
├── README.md
└── examples
    ├── pyfake-v1
        ├── pyfake
        │   ├── __init__.py
        │   ├── benchmark.py
        │   ├── generate.py
        │   └── generate_row.py
        └── pyproject.toml
    ├── pyfake-v2
        ├── pyfake
        │   ├── __init__.py
        │   └── generate.py
        └── pyproject.toml
    ├── rsfake-v1
        ├── Cargo.toml
        └── src
        │   └── main.rs
    └── rsfake-v2
        ├── Cargo.toml
        ├── schema.json
        └── src
            ├── extract.rs
            ├── generate.rs
            └── main.rs


/.dockerignore:
--------------------------------------------------------------------------------
1 | **/target
2 | **/__pycache__/
3 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # IDEs
 2 | .vscode
 3 | .idea
 4 | 
 5 | # Mac/OSX
 6 | .DS_Store
 7 | 
 8 | # cache files
 9 | .cache
10 | __pycache__/
11 | .pytest_cache/
12 | .mypy_cache/
13 | *.pyc
14 | *.lock
15 | 
16 | target/
17 | 
18 | # virtenv
19 | venv/
20 | 
21 | # environment variables
22 | .env*
23 | 
24 | # dat files (too large for github)
25 | *.dat
26 | *.parquet
27 | *.zip
28 | 
29 | # temporary files
30 | */temp/
31 | tmp/
32 | 
33 | # temp build dir
34 | build/
35 | # archive/
36 | dist/
37 | 
38 | data/
39 | 
40 | 
41 | # history
42 | .bash_history
43 | .python_history
44 | 
45 | node_modules/
46 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM public.ecr.aws/docker/library/rust:slim-bookworm
 2 | 
 3 | RUN apt-get update --fix-missing \
 4 |     && apt-get -y install --no-install-recommends \
 5 |         bash \
 6 |         zip \
 7 |         python3-pip \
 8 |     && ln -s /usr/bin/python3 /usr/bin/python \
 9 |     && pip3 install --break-system-packages \
10 |       faker==21.0.0 \
11 |       polars==0.20.2 \
12 |     && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
13 | 
14 | COPY ./examples /examples
15 | RUN \
16 |   cd /examples/rsfake-v1 && cargo build --release \
17 |   && mkdir -p bin && cp target/release/rsfake bin/rsfake \
18 |   && rm -rf target \
19 |   && cd /examples/rsfake-v2 && cargo build --release \
20 |   && mkdir -p bin && cp target/release/rsfake bin/rsfake \
21 |   && rm -rf target
22 | 
23 | RUN useradd -u 4000 -ms /bin/bash foo \
24 |   && chown -R foo:foo /examples /usr/local/cargo
25 | WORKDIR /examples
26 | USER foo


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Anthony Potappel
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Generating data 100x faster with Rust
 2 | 
 3 | Example scripts to demonstrate the ability of Rust to accelerate data generation. Performance improvements can vary; initial measurements show a 100-150x gain for a typical dataset/configuration in v1.
 4 | 
 5 | The v2 version is still a work in progress: most functionality works, but it may have some rough edges. Performance with Rayon threading is still something I am exploring; when scaling up, it does not saturate all the cores perfectly. If you know why, feel free to send me a PR.
 6 | 
 7 | PRs for any other performance improvements are welcome, as the goal of this project is to get the fastest data generation possible. While we are currently at 100x, I have a sense we can push this much higher over time.
 8 | 
 9 | ## Versions
10 | ### v1 (Python / Rust)
11 | * generate dataset
12 | 
13 | ### v2 (Python / Rust)
14 | * ability to pass parameters via CLI
15 | * dynamic schema loading (Rust-only)
16 | * enable threading (multi-core)
17 | * convert to dataframe
18 | * export to Parquet
19 | 
20 | ## Run examples
21 | 
22 | ### Python
23 | 
24 | ```
25 | # enter directory
26 | cd examples/pyfake-v1
27 | 
28 | # v1 requires faker
29 | pip install faker
30 | # v2 requires faker and polars
31 | pip install faker polars
32 | 
33 | # run
34 | python pyfake/generate.py
35 | 
36 | # benchmark row (average over 10 runs)
37 | python -c 'import pyfake; pyfake.benchmark_row()'
38 | 
39 | # benchmark column (average over 10 runs)
40 | python -c 'import pyfake; pyfake.benchmark_column()'
41 | ```
42 | 
43 | ### Python with Poetry
44 | ```
45 | # enter directory
46 | cd examples/pyfake-v1
47 | 
48 | # install dependencies
49 | poetry update
50 | 
51 | # run script
52 | poetry run pyfake
53 | ```
54 | 
55 | ### Rust
56 | ```
57 | # enter directory
58 | cd examples/rsfake-v1
59 | 
60 | cargo build --release
61 | target/release/rsfake
62 | ```
63 | 
64 | ### Docker
65 | For convenience a Dockerfile is included with both Python and Rust dependencies pre-installed.
66 | 
67 | ```
68 | # build
69 | docker build -t fakeroo .
70 | 
71 | # run interactive shell
72 | docker run -ti  --rm fakeroo bash
73 | 
74 | # run Python example
75 | cd /examples/pyfake-v1
76 | python pyfake/generate.py
77 | 
78 | # run Rust example
79 | cd /examples/rsfake-v1
80 | bin/rsfake
81 | ```


--------------------------------------------------------------------------------
/examples/pyfake-v1/pyfake/__init__.py:
--------------------------------------------------------------------------------
1 | from .benchmark import benchmark_column, benchmark_row


--------------------------------------------------------------------------------
/examples/pyfake-v1/pyfake/benchmark.py:
--------------------------------------------------------------------------------
 1 | import timeit
 2 | 
 3 | NO_ROWS = 10000
 4 | NO_EXECUTIONS = 10
 5 | 
 6 | 
 7 | def benchmark_column() -> None:
 8 |     setup_code = f"""
 9 | from pyfake.generate import ColumnTable
10 | NO_ROWS = {NO_ROWS}
11 | """
12 |     execution_time = timeit.timeit(
13 |         "ColumnTable(NO_ROWS)",
14 |         setup=setup_code,
15 |         number=NO_EXECUTIONS,
16 |     ) 
17 |     print(f"Average time taken to generate {NO_ROWS} people:")
18 |     print(f"--- {round((execution_time / NO_EXECUTIONS), 3)} seconds ---")
19 | 
20 | 
def benchmark_row() -> None:
    """Benchmark row-wise generation and print the mean time per run.

    Runs ``generate_person_list(NO_ROWS)`` ``NO_EXECUTIONS`` times through
    ``timeit`` and reports the average duration in seconds, rounded to three
    decimals.
    """
    timer = timeit.Timer(
        stmt="generate_person_list(NO_ROWS)",
        # The setup snippet runs once, outside the timed region.
        setup=f"""
from pyfake.generate_row import generate_person_list
NO_ROWS = {NO_ROWS}
""",
    )
    total_seconds = timer.timeit(number=NO_EXECUTIONS)
    average_seconds = round((total_seconds / NO_EXECUTIONS), 3)

    print(f"Average time taken to generate {NO_ROWS} people:")
    print(f"--- {average_seconds} seconds ---")


--------------------------------------------------------------------------------
/examples/pyfake-v1/pyfake/generate.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import random
 3 | import time
 4 | 
 5 | from faker import Faker
 6 | 
 7 | NO_ROWS = 10000
 8 | 
 9 | fake = Faker()
10 | 
class ColumnTable:
    """Column-oriented table of fake people.

    Each attribute is a list holding ``count`` values; index ``i`` across all
    lists forms one record. Columns are filled one after another, in the
    order they are assigned below.
    """

    def __init__(self, count: int):
        # A range object can be iterated repeatedly, so one instance serves
        # every column.
        rows = range(count)

        self.ids = [random.randrange(1000, 9999999999999) for _ in rows]
        self.first_names = [fake.first_name() for _ in rows]
        self.last_names = [fake.last_name() for _ in rows]
        # `fake.unique` asks Faker not to repeat a value within this proxy.
        self.emails = [fake.unique.ascii_email() for _ in rows]
        self.companies = [fake.company() for _ in rows]
        self.phone_numbers = [fake.phone_number() for _ in rows]
21 | 
22 | 
def main() -> int:
    """Generate ``NO_ROWS`` fake people column-wise, preview them and time it.

    Returns:
        Process exit code; always 0.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by wall-clock adjustments as time.time() can.
    start_time = time.perf_counter()
    table = ColumnTable(NO_ROWS)
    end_time = time.perf_counter()

    print("First 3 records:")
    # Bound the preview so a NO_ROWS below 3 does not raise IndexError.
    for i in range(min(3, NO_ROWS)):
        print(
            f"Record {i + 1}: {{ id: {table.ids[i]}, "
            f"first_name: \"{table.first_names[i]}\", "
            f"last_name: \"{table.last_names[i]}\", "
            f"email: \"{table.emails[i]}\", "
            f"company: \"{table.companies[i]}\", "
            f"phone_number: \"{table.phone_numbers[i]}\" }}"
        )

    print(f"Time taken to generate {NO_ROWS} people:")
    print(f"--- {round((end_time - start_time), 3)} seconds ---")
    return 0
42 | 
43 | 
44 | if __name__ == "__main__":
45 |     sys.exit(main())


--------------------------------------------------------------------------------
/examples/pyfake-v1/pyfake/generate_row.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import random
 3 | import time
 4 | 
 5 | from faker import Faker
 6 | from typing import Any
 7 | 
 8 | NO_ROWS = 10000
 9 | 
10 | fake = Faker()
11 | 
def get_person() -> dict[str, Any]:
    """Build one fake person record.

    Returns:
        A dict with the keys ``id``, ``first_name``, ``last_name``, ``email``,
        ``company`` and ``phone``. Values are evaluated in that order.
    """
    return {
        "id": random.randrange(1000, 9999999999999),
        "first_name": fake.first_name(),
        "last_name": fake.last_name(),
        # `fake.unique` asks Faker not to repeat a value within this proxy.
        "email": fake.unique.ascii_email(),
        "company": fake.company(),
        "phone": fake.phone_number(),
    }
22 | 
23 | 
def generate_person_list(count: int) -> list[dict[str, Any]]:
    """Return ``count`` person records, each produced by ``get_person()``."""
    return [get_person() for _ in range(count)]
27 | 
28 | 
def main() -> int:
    """Generate ``NO_ROWS`` fake people row-wise, preview them and time it.

    Returns:
        Process exit code; always 0.
    """
    # perf_counter() is monotonic and high-resolution, so the measured
    # interval cannot be skewed by wall-clock adjustments as time.time() can.
    start_time = time.perf_counter()
    person_list = generate_person_list(NO_ROWS)
    end_time = time.perf_counter()

    print("First 3 records:", person_list[:3])
    print(f"Time taken to generate {NO_ROWS} people:")
    print(f"--- {round((end_time - start_time), 3)} seconds ---")
    # Return an explicit status so the function honours its `-> int`
    # annotation; falling off the end returned None.
    return 0
37 | 
38 | 
39 | if __name__ == "__main__":
40 |     sys.exit(main())


--------------------------------------------------------------------------------
/examples/pyfake-v1/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "pyfake"
 3 | version = "0.0.1"
 4 | description = ""
 5 | authors = ["aprxi <mail@aprxi.com>"]
 6 | readme = "README.md"
 7 | 
 8 | [tool.poetry.dependencies]
 9 | python = "^3.11"
10 | faker = "^21.0.1"
11 | 
12 | [tool.poetry.scripts]
13 | pyfake = "pyfake.generate:main"
14 | pyfake_row = "pyfake.generate_row:main"
15 | benchmark_row = "pyfake.benchmark:benchmark_row"
16 | benchmark_column = "pyfake.benchmark:benchmark_column"
17 | 
18 | [build-system]
19 | requires = ["poetry-core"]
20 | build-backend = "poetry.core.masonry.api"
21 | 


--------------------------------------------------------------------------------
/examples/pyfake-v2/pyfake/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aprxi/faster-data-generation/1962cb098eb813bfdee3032a2684b4362d06ebc5/examples/pyfake-v2/pyfake/__init__.py


--------------------------------------------------------------------------------
/examples/pyfake-v2/pyfake/generate.py:
--------------------------------------------------------------------------------
  1 | import sys
  2 | import random
  3 | import time
  4 | import argparse
  5 | from concurrent import futures
  6 | 
  7 | import polars as pl
  8 | from faker import Faker
  9 | 
 10 | VERSION = "0.0.1"
 11 | 
 12 | fake = Faker()
 13 | 
 14 | 
 15 | class ColumnTable:
 16 |     def __init__(self, count: int):
 17 |         self.ids = [
 18 |             random.randrange(1000, 9999999999999) for _ in range(count)
 19 |         ]
 20 |         self.first_names = [fake.first_name() for _ in range(count)]
 21 |         self.last_names = [fake.last_name() for _ in range(count)]
 22 |         self.emails = [fake.unique.ascii_email() for _ in range(count)]
 23 |         self.companies = [fake.company() for _ in range(count)]
 24 |         self.phone_numbers = [fake.phone_number() for _ in range(count)]
 25 | 
 26 | 
 27 | def generate_dataframe(no_rows: int, no_threads: int) -> pl.DataFrame:
 28 |     rows_per_thread = no_rows // no_threads
 29 | 
 30 |     with futures.ProcessPoolExecutor(max_workers=no_threads) as executor:
 31 |         # Submitting tasks
 32 |         tasks = [executor.submit(ColumnTable, rows_per_thread) for _ in range(no_threads)]
 33 | 
 34 |         # Collecting and combining results
 35 |         try:
 36 |             combined_data = {
 37 |                 "ids": [],
 38 |                 "first_names": [],
 39 |                 "last_names": [],
 40 |                 "emails": [],
 41 |                 "companies": [],
 42 |                 "phone_numbers": []
 43 |             }
 44 | 
 45 |             for future in futures.as_completed(tasks):
 46 |                 result = future.result()
 47 |                 combined_data["ids"].extend(result.ids)
 48 |                 combined_data["first_names"].extend(result.first_names)
 49 |                 combined_data["last_names"].extend(result.last_names)
 50 |                 combined_data["emails"].extend(result.emails)
 51 |                 combined_data["companies"].extend(result.companies)
 52 |                 combined_data["phone_numbers"].extend(result.phone_numbers)
 53 | 
 54 |             return pl.DataFrame(combined_data)
 55 |         except Exception as exc:
 56 |             print(f"A task raised an exception: {exc}")
 57 |             return None
 58 | 
 59 | 
 60 | def main() -> int:
 61 |     parser = argparse.ArgumentParser(
 62 |         description="Example script."
 63 |     )
 64 | 
 65 |     parser.add_argument(
 66 |         "-r",
 67 |         "--rows",
 68 |         type=int,
 69 |         default=10000,
 70 |         help="Number of rows"
 71 |     )
 72 |     parser.add_argument(
 73 |         "-t", 
 74 |         "--threads",
 75 |         type=int,
 76 |         default=1,
 77 |         help="Number of threads"
 78 |     )
 79 |     parser.add_argument(
 80 |         "-V",
 81 |         "--version",
 82 |         action="store_true",
 83 |         help="Print version"
 84 |     )
 85 | 
 86 |     # Parse the arguments
 87 |     args = parser.parse_args()
 88 | 
 89 |     if args.version:
 90 |         print(VERSION)
 91 |         return 0
 92 | 
 93 |     no_rows = args.rows
 94 |     no_threads = args.threads
 95 | 
 96 |     start_time = time.time()
 97 | 
 98 |     df = generate_dataframe(no_rows, no_threads)
 99 |     end_time = time.time()
100 |     print(df)
101 |     print(f"Time taken to generate {no_rows} people into a dataframe:")
102 |     print(f"--- {round((end_time - start_time), 3)} seconds ---")
103 | 
104 |     start_time = time.time()
105 |     df.write_parquet("people.parquet")
106 |     end_time = time.time()
107 |     print(f"Time taken to write to Parquet:")
108 |     print(f"--- {round((end_time - start_time), 3)} seconds ---")
109 |     return 0
110 | 
111 | 
112 | if __name__ == "__main__":
113 |     sys.exit(main())


--------------------------------------------------------------------------------
/examples/pyfake-v2/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "pyfake"
 3 | version = "0.0.2"
 4 | description = ""
 5 | authors = ["aprxi <mail@aprxi.com>"]
 6 | 
 7 | [tool.poetry.dependencies]
 8 | python = "^3.11"
 9 | faker = "^21.0.1"
10 | polars = "^0.20.0"
11 | 
12 | [tool.poetry.scripts]
13 | pyfake = "pyfake.generate:main"
14 | 
15 | [build-system]
16 | requires = ["poetry-core"]
17 | build-backend = "poetry.core.masonry.api"
18 | 


--------------------------------------------------------------------------------
/examples/rsfake-v1/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "rsfake"
 3 | version = "0.0.1"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | fake = { version = "2.9", features = ["derive"] }
 8 | 
 9 | [profile.release]
10 | opt-level = 3
11 | 


--------------------------------------------------------------------------------
/examples/rsfake-v1/src/main.rs:
--------------------------------------------------------------------------------
 1 | use std::time::Instant;
 2 | use fake::Dummy;
 3 | use fake::{Fake, Faker};
 4 | 
 5 | use fake::faker::name::en::*;
 6 | use fake::faker::internet::en::*;
 7 | use fake::faker::company::en::*;
 8 | use fake::faker::phone_number::en::*;
 9 | 
10 | 
11 | const NO_ROWS: usize = 10000;
12 | 
13 | 
/// Column-oriented table of fake people: one `Vec` per attribute, where index
/// `i` across all vectors forms one record.
///
/// Every field pairs a value generator with `NO_ROWS` in its `#[dummy]`
/// attribute; presumably `fake` uses the second tuple element as the vector
/// length — confirm against the `fake` crate documentation.
#[derive(Debug, Dummy)]
struct TableColumns {
    // Integers drawn from the range 1000..9999999999999.
    #[dummy(faker = "(1000..9999999999999, NO_ROWS)")]
    pub ids: Vec<i64>,

    // English-locale first names.
    #[dummy(faker = "(FirstName(), NO_ROWS)")]
    pub first_names: Vec<String>,

    // English-locale last names.
    #[dummy(faker = "(LastName(), NO_ROWS)")]
    pub last_names: Vec<String>,

    // E-mail addresses produced by the `FreeEmail` generator.
    #[dummy(faker = "(FreeEmail(), NO_ROWS)")]
    pub emails: Vec<String>,

    // Company names.
    #[dummy(faker = "(CompanyName(), NO_ROWS)")]
    pub companies: Vec<String>,

    // Phone numbers, kept as strings.
    #[dummy(faker = "(PhoneNumber(), NO_ROWS)")]
    pub phone_numbers: Vec<String>,
}
34 | 
35 | 
36 | fn generate_table() {
37 |     let start_time = Instant::now();
38 |     let table: TableColumns = Faker.fake();
39 |     let elapsed = start_time.elapsed().as_secs_f64();
40 | 
41 |     println!("First 3 records:");
42 |     for i in 0..3 {
43 |         println!(
44 |         "Record {}: {{ id: {}, first_name: \"{}\", last_name: \"{}\",\
45 |          email: \"{}\", company: \"{}\", phone_number: \"{}\" }}", 
46 |             i + 1,
47 |             table.ids[i], 
48 |             table.first_names[i], 
49 |             table.last_names[i], 
50 |             table.emails[i], 
51 |             table.companies[i], 
52 |             table.phone_numbers[i]
53 |         );
54 |     }
55 | 
56 |     println!("Time taken to generate {NO_ROWS} people:");
57 |     println!("--- {:.3} seconds ---", elapsed);
58 | }
59 | 
60 | 
// Entry point: generate one table and print its preview and timing.
fn main() {
    generate_table();
}


--------------------------------------------------------------------------------
/examples/rsfake-v2/Cargo.toml:
--------------------------------------------------------------------------------
 1 | [package]
 2 | name = "rsfake"
 3 | version = "0.0.1"
 4 | edition = "2021"
 5 | 
 6 | [dependencies]
 7 | fake = { version = "2.9", features = ["derive"] }
 8 | 
 9 | clap = { version = "4.4" , default-features = false, features = ["std", "env", "help"]}
10 | 
11 | serde = "1.0.136"
12 | serde_json = "1.0.108"
13 | rayon = "1.8"
14 | polars = { version = "0.35", features = ["parquet"] }
15 | 
16 | [profile.release]
17 | opt-level = 3
18 | 


--------------------------------------------------------------------------------
/examples/rsfake-v2/schema.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "columns": [
 3 |     {
 4 |       "name": "id",
 5 |       "type": "u64"
 6 |     },
 7 |     {
 8 |       "name": "first_name",
 9 |       "type": "FirstName"
10 |     },
11 |     {
12 |       "name": "last_name",
13 |       "type": "LastName"
14 |     },
15 |     {
16 |       "name": "email",
17 |       "type": "FreeEmail"
18 |     },
19 |     {
20 |       "name": "company",
21 |       "type": "CompanyName"
22 |     },
23 |     {
24 |       "name": "phone",
25 |       "type": "PhoneNumber"
26 |     }
27 |   ]
28 | }


--------------------------------------------------------------------------------
/examples/rsfake-v2/src/extract.rs:
--------------------------------------------------------------------------------
  1 | use std::error::Error;
  2 | use std::fs::{self, File};
  3 | use std::path::Path;
  4 | 
  5 | use polars::prelude::*;
  6 | use std::io::BufWriter;
  7 | 
  8 | pub fn write_dataframe_to_single_parquet(
  9 |     df: &mut DataFrame,
 10 |     file_path: &str,
 11 | ) -> Result<(), Box<dyn Error>> {
 12 |     let file = File::create(file_path)?;
 13 |     let writer = BufWriter::new(file);
 14 |     ParquetWriter::new(writer).finish(df)?;
 15 |     Ok(())
 16 | }
 17 | 
 18 | pub fn cleanup_dataset_parquet_files(dataset_dir: &str) -> Result<(), Box<dyn Error>> {
 19 |     if Path::new(&dataset_dir).exists() {
 20 |         for entry in fs::read_dir(&dataset_dir)? {
 21 |             let path = entry?.path();
 22 |             if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("parquet") {
 23 |                 fs::remove_file(path)?;
 24 |             }
 25 |         }
 26 |     }
 27 | 
 28 |     Ok(())
 29 | }
 30 | 
 31 | pub fn write_dataframe_chunk_to_parquet(
 32 |     df_chunk: &mut DataFrame,
 33 |     dataset_id: &str,
 34 |     base_dir: &str,
 35 |     part_number: usize,
 36 | ) -> Result<(), Box<dyn Error>> {
 37 |     // Path for the dataset directory
 38 |     let dataset_dir = format!("{}/dataset={}", base_dir, dataset_id);
 39 | 
 40 |     // Ensure the dataset directory exists
 41 |     if !std::path::Path::new(&dataset_dir).exists() {
 42 |         fs::create_dir_all(&dataset_dir)?;
 43 |     }
 44 |     // Generate the part file name
 45 |     let file_path = format!("{}/part-{:05}.parquet", dataset_dir, part_number);
 46 | 
 47 |     // Write the DataFrame chunk to the Parquet file
 48 |     let file = File::create(&file_path)?;
 49 |     let writer = BufWriter::new(file);
 50 |     ParquetWriter::new(writer).finish(df_chunk)?;
 51 |     Ok(())
 52 | }
 53 | 
 54 | pub fn write_dataframe_to_multi_parquet(
 55 |     df: &DataFrame,
 56 |     dataset_id: &str,
 57 |     base_dir: &str,
 58 |     chunk_size: usize,
 59 | ) -> Result<(), Box<dyn Error>> {
 60 |     // Ensure the base directory and dataset directory exist
 61 |     let dataset_dir = format!("{}/dataset={}", base_dir, dataset_id);
 62 | 
 63 |     // create dataset directory if not exist, else clean up
 64 |     if !std::path::Path::new(&dataset_dir).exists() {
 65 |         fs::create_dir_all(&dataset_dir)?;
 66 |     } else {
 67 |         cleanup_dataset_parquet_files(&dataset_dir)?;
 68 |     }
 69 | 
 70 |     let n_rows = df.height();
 71 |     let mut part_number = 0;
 72 | 
 73 |     for start in (0..n_rows).step_by(chunk_size) {
 74 |         let end = std::cmp::min(start + chunk_size, n_rows);
 75 |         let chunk = df.slice(start as i64, end - start);
 76 | 
 77 |         // Convert chunk to mutable for writing
 78 |         let mut chunk_mut = chunk.clone();
 79 | 
 80 |         // write the chunk
 81 |         write_dataframe_chunk_to_parquet(&mut chunk_mut, dataset_id, base_dir, part_number)?;
 82 |         part_number += 1;
 83 |     }
 84 |     Ok(())
 85 | }
 86 | 
 87 | pub fn read_single_parquet_file(file_path: &str) -> Result<DataFrame, Box<dyn Error>> {
 88 |     let file = File::open(file_path)?;
 89 |     let df = ParquetReader::new(file).finish()?;
 90 |     Ok(df)
 91 | }
 92 | 
 93 | pub fn read_partitioned_parquet(base_dir: &str) -> Result<DataFrame, Box<dyn Error>> {
 94 |     let mut dataframes: Vec<DataFrame> = Vec::new();
 95 | 
 96 |     fn read_parquet_files(
 97 |         path: &Path,
 98 |         dataframes: &mut Vec<DataFrame>,
 99 |     ) -> Result<(), Box<dyn Error>> {
100 |         if path.is_dir() {
101 |             for entry in fs::read_dir(path)? {
102 |                 let entry = entry?;
103 |                 let path = entry.path();
104 |                 if path.is_dir() {
105 |                     // Recursively read nested directories
106 |                     read_parquet_files(&path, dataframes)?;
107 |                 } else if path.is_file()
108 |                     && path.extension().and_then(|s| s.to_str()) == Some("parquet")
109 |                 {
110 |                     let df = ParquetReader::new(File::open(path)?).finish()?;
111 |                     dataframes.push(df);
112 |                 }
113 |             }
114 |         }
115 |         Ok(())
116 |     }
117 | 
118 |     let base_path = Path::new(base_dir);
119 |     read_parquet_files(base_path, &mut dataframes)?;
120 | 
121 |     // Iteratively vstack DataFrames
122 |     let mut combined_df = match dataframes.get(0) {
123 |         Some(df) => df.clone(),
124 |         None => return Err("No dataframes found".into()),
125 |     };
126 | 
127 |     for df in dataframes.iter().skip(1) {
128 |         combined_df = combined_df.vstack(df)?;
129 |     }
130 | 
131 |     Ok(combined_df)
132 | }
133 | 


--------------------------------------------------------------------------------
/examples/rsfake-v2/src/generate.rs:
--------------------------------------------------------------------------------
  1 | use fake::{Fake, Faker};
  2 | use std::fs;
  3 | 
  4 | use serde_json::Value;
  5 | 
  6 | use polars::prelude::*;
  7 | use rayon::prelude::*;
  8 | 
  9 | use fake::faker::address::raw::*;
 10 | use fake::faker::company::raw::*;
 11 | use fake::faker::internet::raw::*;
 12 | use fake::faker::name::raw::*;
 13 | use fake::faker::phone_number::raw::*;
 14 | use fake::locales::*;
 15 | 
 16 | pub fn load_json(json_file: &str) -> Result<Value, Box<dyn std::error::Error>> {
 17 |     let json_str = fs::read_to_string(json_file)?;
 18 |     let json: Value = serde_json::from_str(&json_str)?;
 19 |     Ok(json)
 20 | }
 21 | 
 22 | pub fn generate_from_json(
 23 |     json_file: &str,
 24 |     no_rows: usize,
 25 | ) -> Result<DataFrame, Box<dyn std::error::Error>> {
 26 |     let json = load_json(json_file)?;
 27 | 
 28 |     let mut columns = Vec::new();
 29 | 
 30 |     if let Some(columns_def) = json.get("columns").and_then(|c| c.as_array()) {
 31 |         for col_def in columns_def {
 32 |             let col_name = col_def
 33 |                 .get("name")
 34 |                 .and_then(|n| n.as_str())
 35 |                 .unwrap_or_default();
 36 |             let col_type = col_def
 37 |                 .get("type")
 38 |                 .and_then(|t| t.as_str())
 39 |                 .unwrap_or_default();
 40 | 
 41 |             let series_en = create_series_from_type(col_type, col_name, no_rows, EN);
 42 |             columns.push(series_en);
 43 |         }
 44 |     }
 45 |     Ok(DataFrame::new(columns)?)
 46 | }
 47 | 
 48 | fn create_series_from_type<L>(type_name: &str, col_name: &str, no_rows: usize, locale: L) -> Series
 49 | where
 50 |     L: Data + Sync + Send + Copy,
 51 | {
 52 |     match type_name {
 53 |         "u64" => {
 54 |             let data = (0..no_rows)
 55 |                 .into_par_iter()
 56 |                 .map(|_| Faker.fake::<u64>())
 57 |                 .collect::<Vec<u64>>();
 58 |             Series::new(col_name, data)
 59 |         }
 60 |         "FirstName" => {
 61 |             let data: Vec<String> = (0..no_rows)
 62 |                 .into_par_iter()
 63 |                 .map(|_| FirstName(locale).fake::<String>())
 64 |                 .collect();
 65 |             Series::new(col_name, data)
 66 |         }
 67 |         "LastName" => {
 68 |             let data: Vec<String> = (0..no_rows)
 69 |                 .into_par_iter()
 70 |                 .map(|_| LastName(locale).fake::<String>())
 71 |                 .collect();
 72 |             Series::new(col_name, data)
 73 |         }
 74 |         "FreeEmail" => {
 75 |             let data: Vec<String> = (0..no_rows)
 76 |                 .into_par_iter()
 77 |                 .map(|_| FreeEmail(locale).fake::<String>())
 78 |                 .collect();
 79 |             Series::new(col_name, data)
 80 |         }
 81 |         "CompanyName" => {
 82 |             let data: Vec<String> = (0..no_rows)
 83 |                 .into_par_iter()
 84 |                 .map(|_| CompanyName(locale).fake::<String>())
 85 |                 .collect();
 86 |             Series::new(col_name, data)
 87 |         }
 88 |         "PhoneNumber" => {
 89 |             let data: Vec<String> = (0..no_rows)
 90 |                 .into_par_iter()
 91 |                 .map(|_| PhoneNumber(locale).fake::<String>())
 92 |                 .collect();
 93 |             Series::new(col_name, data)
 94 |         }
 95 |         "StreetName" => {
 96 |             let data: Vec<String> = (0..no_rows)
 97 |                 .into_par_iter()
 98 |                 .map(|_| StreetName(locale).fake::<String>())
 99 |                 .collect();
100 |             Series::new(col_name, data)
101 |         }
102 |         _ => panic!("Unsupported type: {}", type_name),
103 |     }
104 | }
105 | 


--------------------------------------------------------------------------------
/examples/rsfake-v2/src/main.rs:
--------------------------------------------------------------------------------
  1 | use std::env;
  2 | use std::path::Path;
  3 | use std::time::Instant;
  4 | 
  5 | use clap::{Arg, Command};
  6 | use polars::frame::DataFrame;
  7 | 
  8 | mod extract;
  9 | mod generate;
 10 | 
 11 | use extract::{
 12 |     read_partitioned_parquet, read_single_parquet_file, write_dataframe_to_multi_parquet,
 13 |     write_dataframe_to_single_parquet,
 14 | };
 15 | use generate::generate_from_json;
 16 | 
 17 | const PROGRAM_NAME: &str = "rsfake";
 18 | const DEFAULT_SCHEMA_FILE: &str = "schema.json";
 19 | const DEFAULT_NO_ROWS: &str = "10000";
 20 | const RAYON_NUM_THREADS: &str = "1";
 21 | 
 22 | fn parse_cli_arguments() -> Command {
 23 |     Command::new(PROGRAM_NAME)
 24 |         .version(env!("CARGO_PKG_VERSION")) // set version from Cargo.toml
 25 |         .about("Generates fake data based on the provided schema file.")
 26 |         .long_about(format!(
 27 |             "This program generates fake data based on a JSON schema file. \
 28 |             You can specify the number of rows, the number of threads for \
 29 |             parallel processing, and the schema file to be used.\n\n\
 30 |             Example usage:\n    {} -s schema.json -r {} -t {}",
 31 |             PROGRAM_NAME, DEFAULT_NO_ROWS, RAYON_NUM_THREADS
 32 |         ))
 33 |         .arg(
 34 |             Arg::new("schema")
 35 |                 .short('s')
 36 |                 .long("schema")
 37 |                 .env("FAKER_SCHEMA_FILE")
 38 |                 .value_name("SCHEMA_FILE")
 39 |                 .help("JSON file to describe column names and types")
 40 |                 .default_value(DEFAULT_SCHEMA_FILE),
 41 |         )
 42 |         .arg(
 43 |             Arg::new("rows")
 44 |                 .short('r')
 45 |                 .long("rows")
 46 |                 .env("FAKER_NUM_ROWS")
 47 |                 .value_name("NUM_ROWS")
 48 |                 .help("Number of rows to generate")
 49 |                 .default_value(DEFAULT_NO_ROWS),
 50 |         )
 51 |         .arg(
 52 |             Arg::new("threads")
 53 |                 .short('t')
 54 |                 .long("threads")
 55 |                 .env("RAYON_NUM_THREADS")
 56 |                 .value_name("NO_THREADS")
 57 |                 .help("Number of threads to use")
 58 |                 .default_value(RAYON_NUM_THREADS),
 59 |         )
 60 |         .arg(
 61 |             Arg::new("output")
 62 |                 .short('o')
 63 |                 .long("output")
 64 |                 .env("FAKER_OUTPUT_PATH")
 65 |                 .value_name("OUTPUT_PATH")
 66 |                 .help("Output path to write to"),
 67 |         )
 68 |         .arg(
 69 |             Arg::new("input")
 70 |                 .short('i')
 71 |                 .long("input")
 72 |                 .env("FAKER_INPUT_PATH")
 73 |                 .value_name("INPUT_PATH")
 74 |                 .help("Input path to read from"),
 75 |         )
 76 | }
 77 | 
 78 | fn main() {
 79 |     let args: Vec<String> = env::args().collect();
 80 |     let app = parse_cli_arguments();
 81 |     let matches = app.try_get_matches_from(args).unwrap_or_else(|e| {
 82 |         e.exit();
 83 |     });
 84 | 
 85 |     let schema_file = matches
 86 |         .get_one::<String>("schema")
 87 |         .expect("Failed to parse schema file");
 88 | 
 89 |     // additional check to see if schema file exists
 90 |     if !std::path::Path::new(&schema_file).exists() {
 91 |         println!("Schema file \"{}\" does not exist", schema_file);
 92 |         parse_cli_arguments().print_help().unwrap();
 93 |         std::process::exit(1);
 94 |     }
 95 | 
 96 |     let no_threads = matches
 97 |         .get_one::<String>("threads")
 98 |         .map(|s| s.parse::<usize>().expect("Failed to parse thread count"))
 99 |         .expect("Failed to parse default thread count");
100 | 
101 |     let no_rows = matches
102 |         .get_one::<String>("rows")
103 |         .map(|s| s.parse::<usize>().expect("Failed to parse row count"))
104 |         .expect("Failed to parse default row count");
105 | 
106 |     let output_path = matches.get_one::<String>("output");
107 |     let input_path = matches.get_one::<String>("input");
108 | 
109 |     // set RAYON_NUM_THREADS in env for Rayon to use
110 |     env::set_var("RAYON_NUM_THREADS", no_threads.to_string());
111 | 
112 |     let mut df: DataFrame;
113 | 
114 |     // read from parquet if input_path is specified
115 |     if let Some(input_path) = input_path {
116 |         let start_time = Instant::now();
117 |         let path = Path::new(input_path);
118 | 
119 |         df = if path.is_dir() {
120 |             match read_partitioned_parquet(input_path) {
121 |                 Ok(data) => data,
122 |                 Err(e) => {
123 |                     println!("Error reading partitioned Parquet: {:?}", e);
124 |                     return;
125 |                 }
126 |             }
127 |         } else if path.is_file() {
128 |             match read_single_parquet_file(input_path) {
129 |                 Ok(data) => data,
130 |                 Err(e) => {
131 |                     println!("Error reading single Parquet file: {:?}", e);
132 |                     return;
133 |                 }
134 |             }
135 |         } else {
136 |             // input path is neither a file nor a directory
137 |             println!(
138 |                 "Error: Input path \"{}\" is neither a file nor a directory",
139 |                 input_path
140 |             );
141 |             return;
142 |         };
143 | 
144 |         let elapsed = start_time.elapsed().as_secs_f64();
145 |         println!("{:?}", df);
146 |         println!("Time taken to read from Parquet: {:.3} seconds", elapsed);
147 |     } else {
148 |         let start_time = Instant::now();
149 |         df = generate_from_json(DEFAULT_SCHEMA_FILE, no_rows).unwrap();
150 |         let elapsed = start_time.elapsed().as_secs_f64();
151 |         println!("{:?}", df);
152 |         println!(
153 |             "Time taken to generate {no_rows} people into a dataframe using \
154 |             {no_threads} threads:"
155 |         );
156 |         println!("--- {:.3} seconds ---", elapsed);
157 |     }
158 | 
159 |     // write to parquet if output_path is specified
160 |     if let Some(output_path) = output_path {
161 |         let path = Path::new(output_path);
162 |         let mut is_partitioned = false;
163 | 
164 |         // Check if the path contains a "/" indicating a multi-parquet file
165 |         if path.to_str().unwrap_or("").contains("/") {
166 |             is_partitioned = true;
167 | 
168 |             // Check if a file with the same base name already exists
169 |             let base_path = Path::new(output_path.trim_end_matches('/'));
170 |             if base_path.exists() && base_path.is_file() {
171 |                 println!(
172 |                     "Error: A file with the name '{}' already exists.",
173 |                     base_path.display()
174 |                 );
175 |                 return;
176 |             }
177 |         }
178 | 
179 |         let start_time: Instant;
180 |         let elapsed: f64;
181 | 
182 |         if is_partitioned {
183 |             // partitioned parquet file
184 |             println!(
185 |                 "Output directory for multi-parquet file data: {}",
186 |                 output_path
187 |             );
188 |             let dataset_id = "0";
189 |             let chunk_size = no_rows / no_threads;
190 |             start_time = Instant::now();
191 |             let _ = write_dataframe_to_multi_parquet(&mut df, dataset_id, &output_path, chunk_size)
192 |                 .unwrap();
193 |             elapsed = start_time.elapsed().as_secs_f64();
194 |         } else {
195 |             // single parquet file
196 |             println!("Output file for single-parquet file data: {}", output_path);
197 |             start_time = Instant::now();
198 |             let _ = write_dataframe_to_single_parquet(&mut df, &output_path).unwrap();
199 |             elapsed = start_time.elapsed().as_secs_f64();
200 |         }
201 |         println!("Time taken to write to Parquet: {:.3} seconds", elapsed);
202 |     }
203 | }
204 | 


--------------------------------------------------------------------------------