├── .dockerignore ├── .gitignore ├── Dockerfile ├── LICENSE ├── README.md └── examples ├── pyfake-v1 ├── pyfake │ ├── __init__.py │ ├── benchmark.py │ ├── generate.py │ └── generate_row.py └── pyproject.toml ├── pyfake-v2 ├── pyfake │ ├── __init__.py │ └── generate.py └── pyproject.toml ├── rsfake-v1 ├── Cargo.toml └── src │ └── main.rs └── rsfake-v2 ├── Cargo.toml ├── schema.json └── src ├── extract.rs ├── generate.rs └── main.rs /.dockerignore: -------------------------------------------------------------------------------- 1 | **/target 2 | **/__pycache__/ 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # IDEs 2 | .vscode 3 | .idea 4 | 5 | # Mac/OSX 6 | .DS_Store 7 | 8 | # cache files 9 | .cache 10 | __pycache__/ 11 | .pytest_cache/ 12 | .mypy_cache/ 13 | .pyc 14 | *.lock 15 | 16 | target/ 17 | 18 | # virtenv 19 | venv/ 20 | 21 | # environment variables 22 | .env* 23 | 24 | # dat files (too large for github) 25 | *.dat 26 | *.parquet 27 | *.zip 28 | 29 | # temporary files 30 | */temp/ 31 | tmp/ 32 | 33 | # temp build dir 34 | build/ 35 | # archive/ 36 | dist/ 37 | 38 | data/ 39 | 40 | 41 | # history 42 | .bash_history 43 | .python_history 44 | 45 | node_modules/ 46 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/docker/library/rust:slim-bookworm 2 | 3 | RUN apt-get update --fix-missing \ 4 | && apt-get -y install --no-install-recommends \ 5 | bash \ 6 | zip \ 7 | python3-pip \ 8 | && ln -s /usr/bin/python3 /usr/bin/python \ 9 | && pip3 install --break-system-packages \ 10 | faker==21.0.0 \ 11 | polars==0.20.2 \ 12 | && apt-get clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* 13 | 14 | COPY ./examples /examples 15 | RUN \ 16 | cd /examples/rsfake-v1 && cargo build --release \ 17 | && mkdir -p bin && cp target/release/rsfake bin/rsfake \ 18 | && rm -rf target \ 19 | && cd /examples/rsfake-v2 && cargo build --release \ 20 | && mkdir -p bin && cp target/release/rsfake bin/rsfake \ 21 | && rm -rf target 22 | 23 | RUN useradd -u 4000 -ms /bin/bash foo \ 24 | && chown -R foo:foo /examples /usr/local/cargo 25 | WORKDIR /examples 26 | USER foo -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Anthony Potappel 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Generating data 100x faster with Rust 2 | 3 | Example scripts to demonstrate the ability of Rust to accelerate data generation. Performance improvement can vary, initial measurements currently show 100-150x gain for a typical dataset/ configuration in v1. 4 | 5 | v2 version is still a work in progress, although most functionality is working it may have some rough edges. Performance with Rayon threading is still something I am exploring, when scaling up it does not saturate all the cores perfectly. If you know why feel free to send me a PR. 6 | 7 | PRs on any other performance improvement are welcome, as the goal of this project is to get the fastest data generation possible. While we are currently at 100x -- have a sense we can push this much higher over time. 8 | 9 | ## Versions 10 | ### v1 (Python / Rust) 11 | * generate dataset 12 | 13 | ### v2 (Python / Rust) 14 | * ability to pass parameters via CLI 15 | * dynamic schema loading (Rust-only) 16 | * enable threading (multi-core) 17 | * convert to dataframe 18 | * export to Parquet 19 | 20 | ## Run examples 21 | 22 | ### Python 23 | 24 | ``` 25 | # enter directory 26 | cd examples/pyfake-v1 27 | 28 | # v1 requires faker 29 | pip install faker 30 | # v2 requires faker and polars 31 | pip install faker polars 32 | 33 | # run 34 | python pyfake/generate.py 35 | 36 | # benchmark row (average over 10 runs) 37 | python -c 'import pyfake; pyfake.benchmark_row()' 38 | 39 | # benchmark column (average over 10 runs) 40 | python -c 'import pyfake; pyfake.benchmark_column()' 41 | ``` 42 | 43 | ### Python with Poetry 44 | ``` 45 | # enter directory 46 | cd examples/pyfake-v1 47 | 48 | # install dependencies 49 | poetry update 50 | 51 | # run script 52 | poetry run pyfake 53 | ``` 54 | 55 | ### Rust 56 | ``` 57 | # enter directory 58 | cd examples/rsfake-v1 59 | 60 | cargo build --release 61 | target/release/rsfake 62 | ``` 63 | 64 | ### Docker 65 | For convenience a Dockerfile is included with both Python and Rust dependencies pre-installed. 66 | 67 | ``` 68 | # build 69 | docker build -t fakeroo . 70 | 71 | # run interactive shell 72 | docker run -ti --rm fakeroo bash 73 | 74 | # run Python example 75 | cd /examples/pyfake-v1 76 | python pyfake/generate.py 77 | 78 | # run Rust example 79 | cd /examples/rsfake-v1 80 | bin/rsfake 81 | ``` -------------------------------------------------------------------------------- /examples/pyfake-v1/pyfake/__init__.py: -------------------------------------------------------------------------------- 1 | from .benchmark import benchmark_column, benchmark_row -------------------------------------------------------------------------------- /examples/pyfake-v1/pyfake/benchmark.py: -------------------------------------------------------------------------------- 1 | import timeit 2 | 3 | NO_ROWS = 10000 4 | NO_EXECUTIONS = 10 5 | 6 | 7 | def benchmark_column() -> None: 8 | setup_code = f""" 9 | from pyfake.generate import ColumnTable 10 | NO_ROWS = {NO_ROWS} 11 | """ 12 | execution_time = timeit.timeit( 13 | "ColumnTable(NO_ROWS)", 14 | setup=setup_code, 15 | number=NO_EXECUTIONS, 16 | ) 17 | print(f"Average time taken to generate {NO_ROWS} people:") 18 | print(f"--- {round((execution_time / NO_EXECUTIONS), 3)} seconds ---") 19 | 20 | 21 | def benchmark_row() -> None: 22 | setup_code = f""" 23 | from pyfake.generate_row import generate_person_list 24 | NO_ROWS = {NO_ROWS} 25 | """ 26 | execution_time = timeit.timeit( 27 | "generate_person_list(NO_ROWS)", 28 | setup=setup_code, 29 | number=NO_EXECUTIONS, 30 | ) 31 | 32 | print(f"Average time taken to generate {NO_ROWS} people:") 33 | print(f"--- {round((execution_time / NO_EXECUTIONS), 3)} seconds ---") -------------------------------------------------------------------------------- /examples/pyfake-v1/pyfake/generate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | import time 4 | 5 | from faker import Faker 6 | 7 | NO_ROWS = 10000 8 | 9 | fake = Faker() 10 | 11 | class ColumnTable: 12 | def __init__(self, count: int): 13 | self.ids = [ 14 | random.randrange(1000, 9999999999999) for _ in range(count) 15 | ] 16 | self.first_names = [fake.first_name() for _ in range(count)] 17 | self.last_names = [fake.last_name() for _ in range(count)] 18 | self.emails = [fake.unique.ascii_email() for _ in range(count)] 19 | self.companies = [fake.company() for _ in range(count)] 20 | self.phone_numbers = [fake.phone_number() for _ in range(count)] 21 | 22 | 23 | def main() -> int: 24 | start_time = time.time() 25 | table = ColumnTable(NO_ROWS) 26 | end_time = time.time() 27 | 28 | print("First 3 records:") 29 | for i in range(3): 30 | print( 31 | f"Record {i + 1}: {{ id: {table.ids[i]}, " 32 | f"first_name: \"{table.first_names[i]}\", " 33 | f"last_name: \"{table.last_names[i]}\", " 34 | f"email: \"{table.emails[i]}\", " 35 | f"company: \"{table.companies[i]}\", " 36 | f"phone_number: \"{table.phone_numbers[i]}\" }}" 37 | ) 38 | 39 | print(f"Time taken to generate {NO_ROWS} people:") 40 | print(f"--- {round((end_time - start_time), 3)} seconds ---") 41 | return 0 42 | 43 | 44 | if __name__ == "__main__": 45 | sys.exit(main()) -------------------------------------------------------------------------------- /examples/pyfake-v1/pyfake/generate_row.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | import time 4 | 5 | from faker import Faker 6 | from typing import Any 7 | 8 | NO_ROWS = 10000 9 | 10 | fake = Faker() 11 | 12 | def get_person() -> dict[str, Any]: 13 | person = { 14 | "id": random.randrange(1000, 9999999999999), 15 | "first_name": fake.first_name(), 16 | "last_name": fake.last_name(), 17 | "email": fake.unique.ascii_email(), 18 | "company": fake.company(), 19 | "phone": fake.phone_number() 20 | } 21 | return person 22 | 23 | 24 | def generate_person_list(count: int) -> list[dict[str, Any]]: 25 | person_list = [get_person() for _ in range(count)] 26 | return person_list 27 | 28 | 29 | def main() -> int: 30 | start_time = time.time() 31 | person_list = generate_person_list(NO_ROWS) 32 | end_time = time.time() 33 | 34 | print("First 3 records:", person_list[:3]) 35 | print(f"Time taken to generate {NO_ROWS} people:") 36 | print(f"--- {round((end_time - start_time), 3)} seconds ---") 37 | 38 | 39 | if __name__ == "__main__": 40 | sys.exit(main()) -------------------------------------------------------------------------------- /examples/pyfake-v1/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pyfake" 3 | version = "0.0.1" 4 | description = "" 5 | authors = ["aprxi "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | faker = "^21.0.1" 11 | 12 | [tool.poetry.scripts] 13 | pyfake = "pyfake.generate:main" 14 | pyfake_row = "pyfake.generate_row:main" 15 | benchmark_row = "pyfake.benchmark:benchmark_row" 16 | benchmark_column = "pyfake.benchmark:benchmark_column" 17 | 18 | [build-system] 19 | requires = ["poetry-core"] 20 | build-backend = "poetry.core.masonry.api" 21 | -------------------------------------------------------------------------------- /examples/pyfake-v2/pyfake/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aprxi/faster-data-generation/1962cb098eb813bfdee3032a2684b4362d06ebc5/examples/pyfake-v2/pyfake/__init__.py -------------------------------------------------------------------------------- /examples/pyfake-v2/pyfake/generate.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import random 3 | import time 4 | import argparse 5 | from concurrent import futures 6 | 7 | import polars as pl 8 | from faker import Faker 9 | 10 | VERSION = "0.0.1" 11 | 12 | fake = Faker() 13 | 14 | 15 | class ColumnTable: 16 | def __init__(self, count: int): 17 | self.ids = [ 18 | random.randrange(1000, 9999999999999) for _ in range(count) 19 | ] 20 | self.first_names = [fake.first_name() for _ in range(count)] 21 | self.last_names = [fake.last_name() for _ in range(count)] 22 | self.emails = [fake.unique.ascii_email() for _ in range(count)] 23 | self.companies = [fake.company() for _ in range(count)] 24 | self.phone_numbers = [fake.phone_number() for _ in range(count)] 25 | 26 | 27 | def generate_dataframe(no_rows: int, no_threads: int) -> pl.DataFrame: 28 | rows_per_thread = no_rows // no_threads 29 | 30 | with futures.ProcessPoolExecutor(max_workers=no_threads) as executor: 31 | # Submitting tasks 32 | tasks = [executor.submit(ColumnTable, rows_per_thread) for _ in range(no_threads)] 33 | 34 | # Collecting and combining results 35 | try: 36 | combined_data = { 37 | "ids": [], 38 | "first_names": [], 39 | "last_names": [], 40 | "emails": [], 41 | "companies": [], 42 | "phone_numbers": [] 43 | } 44 | 45 | for future in futures.as_completed(tasks): 46 | result = future.result() 47 | combined_data["ids"].extend(result.ids) 48 | combined_data["first_names"].extend(result.first_names) 49 | combined_data["last_names"].extend(result.last_names) 50 | combined_data["emails"].extend(result.emails) 51 | combined_data["companies"].extend(result.companies) 52 | combined_data["phone_numbers"].extend(result.phone_numbers) 53 | 54 | return pl.DataFrame(combined_data) 55 | except Exception as exc: 56 | print(f"A task raised an exception: {exc}") 57 | return None 58 | 59 | 60 | def main() -> int: 61 | parser = argparse.ArgumentParser( 62 | description="Example script." 63 | ) 64 | 65 | parser.add_argument( 66 | "-r", 67 | "--rows", 68 | type=int, 69 | default=10000, 70 | help="Number of rows" 71 | ) 72 | parser.add_argument( 73 | "-t", 74 | "--threads", 75 | type=int, 76 | default=1, 77 | help="Number of threads" 78 | ) 79 | parser.add_argument( 80 | "-V", 81 | "--version", 82 | action="store_true", 83 | help="Print version" 84 | ) 85 | 86 | # Parse the arguments 87 | args = parser.parse_args() 88 | 89 | if args.version: 90 | print(VERSION) 91 | return 0 92 | 93 | no_rows = args.rows 94 | no_threads = args.threads 95 | 96 | start_time = time.time() 97 | 98 | df = generate_dataframe(no_rows, no_threads) 99 | end_time = time.time() 100 | print(df) 101 | print(f"Time taken to generate {no_rows} people into a dataframe:") 102 | print(f"--- {round((end_time - start_time), 3)} seconds ---") 103 | 104 | start_time = time.time() 105 | df.write_parquet("people.parquet") 106 | end_time = time.time() 107 | print(f"Time taken to write to Parquet:") 108 | print(f"--- {round((end_time - start_time), 3)} seconds ---") 109 | return 0 110 | 111 | 112 | if __name__ == "__main__": 113 | sys.exit(main()) -------------------------------------------------------------------------------- /examples/pyfake-v2/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "pyfake" 3 | version = "0.0.2" 4 | description = "" 5 | authors = ["aprxi "] 6 | 7 | [tool.poetry.dependencies] 8 | python = "^3.11" 9 | faker = "^21.0.1" 10 | polars = "^0.20.0" 11 | 12 | [tool.poetry.scripts] 13 | pyfake = "pyfake.generate:main" 14 | 15 | [build-system] 16 | requires = ["poetry-core"] 17 | build-backend = "poetry.core.masonry.api" 18 | -------------------------------------------------------------------------------- /examples/rsfake-v1/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rsfake" 3 | version = "0.0.1" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | fake = { version = "2.9", features = ["derive"] } 8 | 9 | [profile.release] 10 | opt-level = 3 11 | -------------------------------------------------------------------------------- /examples/rsfake-v1/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::time::Instant; 2 | use fake::Dummy; 3 | use fake::{Fake, Faker}; 4 | 5 | use fake::faker::name::en::*; 6 | use fake::faker::internet::en::*; 7 | use fake::faker::company::en::*; 8 | use fake::faker::phone_number::en::*; 9 | 10 | 11 | const NO_ROWS: usize = 10000; 12 | 13 | 14 | #[derive(Debug, Dummy)] 15 | struct TableColumns { 16 | #[dummy(faker = "(1000..9999999999999, NO_ROWS)")] 17 | pub ids: Vec, 18 | 19 | #[dummy(faker = "(FirstName(), NO_ROWS)")] 20 | pub first_names: Vec, 21 | 22 | #[dummy(faker = "(LastName(), NO_ROWS)")] 23 | pub last_names: Vec, 24 | 25 | #[dummy(faker = "(FreeEmail(), NO_ROWS)")] 26 | pub emails: Vec, 27 | 28 | #[dummy(faker = "(CompanyName(), NO_ROWS)")] 29 | pub companies: Vec, 30 | 31 | #[dummy(faker = "(PhoneNumber(), NO_ROWS)")] 32 | pub phone_numbers: Vec, 33 | } 34 | 35 | 36 | fn generate_table() { 37 | let start_time = Instant::now(); 38 | let table: TableColumns = Faker.fake(); 39 | let elapsed = start_time.elapsed().as_secs_f64(); 40 | 41 | println!("First 3 records:"); 42 | for i in 0..3 { 43 | println!( 44 | "Record {}: {{ id: {}, first_name: \"{}\", last_name: \"{}\",\ 45 | email: \"{}\", company: \"{}\", phone_number: \"{}\" }}", 46 | i + 1, 47 | table.ids[i], 48 | table.first_names[i], 49 | table.last_names[i], 50 | table.emails[i], 51 | table.companies[i], 52 | table.phone_numbers[i] 53 | ); 54 | } 55 | 56 | println!("Time taken to generate {NO_ROWS} people:"); 57 | println!("--- {:.3} seconds ---", elapsed); 58 | } 59 | 60 | 61 | fn main() { 62 | generate_table(); 63 | } -------------------------------------------------------------------------------- /examples/rsfake-v2/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "rsfake" 3 | version = "0.0.1" 4 | edition = "2021" 5 | 6 | [dependencies] 7 | fake = { version = "2.9", features = ["derive"] } 8 | 9 | clap = { version = "4.4" , default-features = false, features = ["std", "env", "help"]} 10 | 11 | serde = "1.0.136" 12 | serde_json = "1.0.108" 13 | rayon = "1.8" 14 | polars = { version = "0.35", features = ["parquet"] } 15 | 16 | [profile.release] 17 | opt-level = 3 18 | -------------------------------------------------------------------------------- /examples/rsfake-v2/schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "columns": [ 3 | { 4 | "name": "id", 5 | "type": "u64" 6 | }, 7 | { 8 | "name": "first_name", 9 | "type": "FirstName" 10 | }, 11 | { 12 | "name": "last_name", 13 | "type": "LastName" 14 | }, 15 | { 16 | "name": "email", 17 | "type": "FreeEmail" 18 | }, 19 | { 20 | "name": "company", 21 | "type": "CompanyName" 22 | }, 23 | { 24 | "name": "phone", 25 | "type": "PhoneNumber" 26 | } 27 | ] 28 | } -------------------------------------------------------------------------------- /examples/rsfake-v2/src/extract.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use std::fs::{self, File}; 3 | use std::path::Path; 4 | 5 | use polars::prelude::*; 6 | use std::io::BufWriter; 7 | 8 | pub fn write_dataframe_to_single_parquet( 9 | df: &mut DataFrame, 10 | file_path: &str, 11 | ) -> Result<(), Box> { 12 | let file = File::create(file_path)?; 13 | let writer = BufWriter::new(file); 14 | ParquetWriter::new(writer).finish(df)?; 15 | Ok(()) 16 | } 17 | 18 | pub fn cleanup_dataset_parquet_files(dataset_dir: &str) -> Result<(), Box> { 19 | if Path::new(&dataset_dir).exists() { 20 | for entry in fs::read_dir(&dataset_dir)? { 21 | let path = entry?.path(); 22 | if path.is_file() && path.extension().and_then(|s| s.to_str()) == Some("parquet") { 23 | fs::remove_file(path)?; 24 | } 25 | } 26 | } 27 | 28 | Ok(()) 29 | } 30 | 31 | pub fn write_dataframe_chunk_to_parquet( 32 | df_chunk: &mut DataFrame, 33 | dataset_id: &str, 34 | base_dir: &str, 35 | part_number: usize, 36 | ) -> Result<(), Box> { 37 | // Path for the dataset directory 38 | let dataset_dir = format!("{}/dataset={}", base_dir, dataset_id); 39 | 40 | // Ensure the dataset directory exists 41 | if !std::path::Path::new(&dataset_dir).exists() { 42 | fs::create_dir_all(&dataset_dir)?; 43 | } 44 | // Generate the part file name 45 | let file_path = format!("{}/part-{:05}.parquet", dataset_dir, part_number); 46 | 47 | // Write the DataFrame chunk to the Parquet file 48 | let file = File::create(&file_path)?; 49 | let writer = BufWriter::new(file); 50 | ParquetWriter::new(writer).finish(df_chunk)?; 51 | Ok(()) 52 | } 53 | 54 | pub fn write_dataframe_to_multi_parquet( 55 | df: &DataFrame, 56 | dataset_id: &str, 57 | base_dir: &str, 58 | chunk_size: usize, 59 | ) -> Result<(), Box> { 60 | // Ensure the base directory and dataset directory exist 61 | let dataset_dir = format!("{}/dataset={}", base_dir, dataset_id); 62 | 63 | // create dataset directory if not exist, else clean up 64 | if !std::path::Path::new(&dataset_dir).exists() { 65 | fs::create_dir_all(&dataset_dir)?; 66 | } else { 67 | cleanup_dataset_parquet_files(&dataset_dir)?; 68 | } 69 | 70 | let n_rows = df.height(); 71 | let mut part_number = 0; 72 | 73 | for start in (0..n_rows).step_by(chunk_size) { 74 | let end = std::cmp::min(start + chunk_size, n_rows); 75 | let chunk = df.slice(start as i64, end - start); 76 | 77 | // Convert chunk to mutable for writing 78 | let mut chunk_mut = chunk.clone(); 79 | 80 | // write the chunk 81 | write_dataframe_chunk_to_parquet(&mut chunk_mut, dataset_id, base_dir, part_number)?; 82 | part_number += 1; 83 | } 84 | Ok(()) 85 | } 86 | 87 | pub fn read_single_parquet_file(file_path: &str) -> Result> { 88 | let file = File::open(file_path)?; 89 | let df = ParquetReader::new(file).finish()?; 90 | Ok(df) 91 | } 92 | 93 | pub fn read_partitioned_parquet(base_dir: &str) -> Result> { 94 | let mut dataframes: Vec = Vec::new(); 95 | 96 | fn read_parquet_files( 97 | path: &Path, 98 | dataframes: &mut Vec, 99 | ) -> Result<(), Box> { 100 | if path.is_dir() { 101 | for entry in fs::read_dir(path)? { 102 | let entry = entry?; 103 | let path = entry.path(); 104 | if path.is_dir() { 105 | // Recursively read nested directories 106 | read_parquet_files(&path, dataframes)?; 107 | } else if path.is_file() 108 | && path.extension().and_then(|s| s.to_str()) == Some("parquet") 109 | { 110 | let df = ParquetReader::new(File::open(path)?).finish()?; 111 | dataframes.push(df); 112 | } 113 | } 114 | } 115 | Ok(()) 116 | } 117 | 118 | let base_path = Path::new(base_dir); 119 | read_parquet_files(base_path, &mut dataframes)?; 120 | 121 | // Iteratively vstack DataFrames 122 | let mut combined_df = match dataframes.get(0) { 123 | Some(df) => df.clone(), 124 | None => return Err("No dataframes found".into()), 125 | }; 126 | 127 | for df in dataframes.iter().skip(1) { 128 | combined_df = combined_df.vstack(df)?; 129 | } 130 | 131 | Ok(combined_df) 132 | } 133 | -------------------------------------------------------------------------------- /examples/rsfake-v2/src/generate.rs: -------------------------------------------------------------------------------- 1 | use fake::{Fake, Faker}; 2 | use std::fs; 3 | 4 | use serde_json::Value; 5 | 6 | use polars::prelude::*; 7 | use rayon::prelude::*; 8 | 9 | use fake::faker::address::raw::*; 10 | use fake::faker::company::raw::*; 11 | use fake::faker::internet::raw::*; 12 | use fake::faker::name::raw::*; 13 | use fake::faker::phone_number::raw::*; 14 | use fake::locales::*; 15 | 16 | pub fn load_json(json_file: &str) -> Result> { 17 | let json_str = fs::read_to_string(json_file)?; 18 | let json: Value = serde_json::from_str(&json_str)?; 19 | Ok(json) 20 | } 21 | 22 | pub fn generate_from_json( 23 | json_file: &str, 24 | no_rows: usize, 25 | ) -> Result> { 26 | let json = load_json(json_file)?; 27 | 28 | let mut columns = Vec::new(); 29 | 30 | if let Some(columns_def) = json.get("columns").and_then(|c| c.as_array()) { 31 | for col_def in columns_def { 32 | let col_name = col_def 33 | .get("name") 34 | .and_then(|n| n.as_str()) 35 | .unwrap_or_default(); 36 | let col_type = col_def 37 | .get("type") 38 | .and_then(|t| t.as_str()) 39 | .unwrap_or_default(); 40 | 41 | let series_en = create_series_from_type(col_type, col_name, no_rows, EN); 42 | columns.push(series_en); 43 | } 44 | } 45 | Ok(DataFrame::new(columns)?) 46 | } 47 | 48 | fn create_series_from_type(type_name: &str, col_name: &str, no_rows: usize, locale: L) -> Series 49 | where 50 | L: Data + Sync + Send + Copy, 51 | { 52 | match type_name { 53 | "u64" => { 54 | let data = (0..no_rows) 55 | .into_par_iter() 56 | .map(|_| Faker.fake::()) 57 | .collect::>(); 58 | Series::new(col_name, data) 59 | } 60 | "FirstName" => { 61 | let data: Vec = (0..no_rows) 62 | .into_par_iter() 63 | .map(|_| FirstName(locale).fake::()) 64 | .collect(); 65 | Series::new(col_name, data) 66 | } 67 | "LastName" => { 68 | let data: Vec = (0..no_rows) 69 | .into_par_iter() 70 | .map(|_| LastName(locale).fake::()) 71 | .collect(); 72 | Series::new(col_name, data) 73 | } 74 | "FreeEmail" => { 75 | let data: Vec = (0..no_rows) 76 | .into_par_iter() 77 | .map(|_| FreeEmail(locale).fake::()) 78 | .collect(); 79 | Series::new(col_name, data) 80 | } 81 | "CompanyName" => { 82 | let data: Vec = (0..no_rows) 83 | .into_par_iter() 84 | .map(|_| CompanyName(locale).fake::()) 85 | .collect(); 86 | Series::new(col_name, data) 87 | } 88 | "PhoneNumber" => { 89 | let data: Vec = (0..no_rows) 90 | .into_par_iter() 91 | .map(|_| PhoneNumber(locale).fake::()) 92 | .collect(); 93 | Series::new(col_name, data) 94 | } 95 | "StreetName" => { 96 | let data: Vec = (0..no_rows) 97 | .into_par_iter() 98 | .map(|_| StreetName(locale).fake::()) 99 | .collect(); 100 | Series::new(col_name, data) 101 | } 102 | _ => panic!("Unsupported type: {}", type_name), 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /examples/rsfake-v2/src/main.rs: -------------------------------------------------------------------------------- 1 | use std::env; 2 | use std::path::Path; 3 | use std::time::Instant; 4 | 5 | use clap::{Arg, Command}; 6 | use polars::frame::DataFrame; 7 | 8 | mod extract; 9 | mod generate; 10 | 11 | use extract::{ 12 | read_partitioned_parquet, read_single_parquet_file, write_dataframe_to_multi_parquet, 13 | write_dataframe_to_single_parquet, 14 | }; 15 | use generate::generate_from_json; 16 | 17 | const PROGRAM_NAME: &str = "rsfake"; 18 | const DEFAULT_SCHEMA_FILE: &str = "schema.json"; 19 | const DEFAULT_NO_ROWS: &str = "10000"; 20 | const RAYON_NUM_THREADS: &str = "1"; 21 | 22 | fn parse_cli_arguments() -> Command { 23 | Command::new(PROGRAM_NAME) 24 | .version(env!("CARGO_PKG_VERSION")) // set version from Cargo.toml 25 | .about("Generates fake data based on the provided schema file.") 26 | .long_about(format!( 27 | "This program generates fake data based on a JSON schema file. \ 28 | You can specify the number of rows, the number of threads for \ 29 | parallel processing, and the schema file to be used.\n\n\ 30 | Example usage:\n {} -s schema.json -r {} -t {}", 31 | PROGRAM_NAME, DEFAULT_NO_ROWS, RAYON_NUM_THREADS 32 | )) 33 | .arg( 34 | Arg::new("schema") 35 | .short('s') 36 | .long("schema") 37 | .env("FAKER_SCHEMA_FILE") 38 | .value_name("SCHEMA_FILE") 39 | .help("JSON file to describe column names and types") 40 | .default_value(DEFAULT_SCHEMA_FILE), 41 | ) 42 | .arg( 43 | Arg::new("rows") 44 | .short('r') 45 | .long("rows") 46 | .env("FAKER_NUM_ROWS") 47 | .value_name("NUM_ROWS") 48 | .help("Number of rows to generate") 49 | .default_value(DEFAULT_NO_ROWS), 50 | ) 51 | .arg( 52 | Arg::new("threads") 53 | .short('t') 54 | .long("threads") 55 | .env("RAYON_NUM_THREADS") 56 | .value_name("NO_THREADS") 57 | .help("Number of threads to use") 58 | .default_value(RAYON_NUM_THREADS), 59 | ) 60 | .arg( 61 | Arg::new("output") 62 | .short('o') 63 | .long("output") 64 | .env("FAKER_OUTPUT_PATH") 65 | .value_name("OUTPUT_PATH") 66 | .help("Output path to write to"), 67 | ) 68 | .arg( 69 | Arg::new("input") 70 | .short('i') 71 | .long("input") 72 | .env("FAKER_INPUT_PATH") 73 | .value_name("INPUT_PATH") 74 | .help("Input path to read from"), 75 | ) 76 | } 77 | 78 | fn main() { 79 | let args: Vec = env::args().collect(); 80 | let app = parse_cli_arguments(); 81 | let matches = app.try_get_matches_from(args).unwrap_or_else(|e| { 82 | e.exit(); 83 | }); 84 | 85 | let schema_file = matches 86 | .get_one::("schema") 87 | .expect("Failed to parse schema file"); 88 | 89 | // additional check to see if schema file exists 90 | if !std::path::Path::new(&schema_file).exists() { 91 | println!("Schema file \"{}\" does not exist", schema_file); 92 | parse_cli_arguments().print_help().unwrap(); 93 | std::process::exit(1); 94 | } 95 | 96 | let no_threads = matches 97 | .get_one::("threads") 98 | .map(|s| s.parse::().expect("Failed to parse thread count")) 99 | .expect("Failed to parse default thread count"); 100 | 101 | let no_rows = matches 102 | .get_one::("rows") 103 | .map(|s| s.parse::().expect("Failed to parse row count")) 104 | .expect("Failed to parse default row count"); 105 | 106 | let output_path = matches.get_one::("output"); 107 | let input_path = matches.get_one::("input"); 108 | 109 | // set RAYON_NUM_THREADS in env for Rayon to use 110 | env::set_var("RAYON_NUM_THREADS", no_threads.to_string()); 111 | 112 | let mut df: DataFrame; 113 | 114 | // read from parquet if input_path is specified 115 | if let Some(input_path) = input_path { 116 | let start_time = Instant::now(); 117 | let path = Path::new(input_path); 118 | 119 | df = if path.is_dir() { 120 | match read_partitioned_parquet(input_path) { 121 | Ok(data) => data, 122 | Err(e) => { 123 | println!("Error reading partitioned Parquet: {:?}", e); 124 | return; 125 | } 126 | } 127 | } else if path.is_file() { 128 | match read_single_parquet_file(input_path) { 129 | Ok(data) => data, 130 | Err(e) => { 131 | println!("Error reading single Parquet file: {:?}", e); 132 | return; 133 | } 134 | } 135 | } else { 136 | // input path is neither a file nor a directory 137 | println!( 138 | "Error: Input path \"{}\" is neither a file nor a directory", 139 | input_path 140 | ); 141 | return; 142 | }; 143 | 144 | let elapsed = start_time.elapsed().as_secs_f64(); 145 | println!("{:?}", df); 146 | println!("Time taken to read from Parquet: {:.3} seconds", elapsed); 147 | } else { 148 | let start_time = Instant::now(); 149 | df = generate_from_json(DEFAULT_SCHEMA_FILE, no_rows).unwrap(); 150 | let elapsed = start_time.elapsed().as_secs_f64(); 151 | println!("{:?}", df); 152 | println!( 153 | "Time taken to generate {no_rows} people into a dataframe using \ 154 | {no_threads} threads:" 155 | ); 156 | println!("--- {:.3} seconds ---", elapsed); 157 | } 158 | 159 | // write to parquet if output_path is specified 160 | if let Some(output_path) = output_path { 161 | let path = Path::new(output_path); 162 | let mut is_partitioned = false; 163 | 164 | // Check if the path contains a "/" indicating a multi-parquet file 165 | if path.to_str().unwrap_or("").contains("/") { 166 | is_partitioned = true; 167 | 168 | // Check if a file with the same base name already exists 169 | let base_path = Path::new(output_path.trim_end_matches('/')); 170 | if base_path.exists() && base_path.is_file() { 171 | println!( 172 | "Error: A file with the name '{}' already exists.", 173 | base_path.display() 174 | ); 175 | return; 176 | } 177 | } 178 | 179 | let start_time: Instant; 180 | let elapsed: f64; 181 | 182 | if is_partitioned { 183 | // partitioned parquet file 184 | println!( 185 | "Output directory for multi-parquet file data: {}", 186 | output_path 187 | ); 188 | let dataset_id = "0"; 189 | let chunk_size = no_rows / no_threads; 190 | start_time = Instant::now(); 191 | let _ = write_dataframe_to_multi_parquet(&mut df, dataset_id, &output_path, chunk_size) 192 | .unwrap(); 193 | elapsed = start_time.elapsed().as_secs_f64(); 194 | } else { 195 | // single parquet file 196 | println!("Output file for single-parquet file data: {}", output_path); 197 | start_time = Instant::now(); 198 | let _ = write_dataframe_to_single_parquet(&mut df, &output_path).unwrap(); 199 | elapsed = start_time.elapsed().as_secs_f64(); 200 | } 201 | println!("Time taken to write to Parquet: {:.3} seconds", elapsed); 202 | } 203 | } 204 | --------------------------------------------------------------------------------