├── submissions └── .gitkeep ├── .gitignore ├── Cargo.toml ├── README.md ├── Cargo.lock ├── gen.py └── src └── main.rs /submissions/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | random_text_file.txt 2 | 3 | 4 | # Added by cargo 5 | 6 | /target 7 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "challenge200g" 3 | version = "0.1.0" 4 | edition = "2021" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | rand = "*" 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 200GB Challenge 2 | 3 | ## Problem 4 | 5 | You are given a file of approximately 200GB, where each line is a string of characters no longer than 1KB. In this file, **exactly two** lines are identical (one string appears twice). You are provided with a computer that has 16GB of memory. Find a solution to identify the duplicated string as quickly as possible. 6 | 7 | ## Generate challenge 8 | 9 | Run the Rust generator to create the challenge file: 10 | 11 | ```bash 12 | cargo run --release <seed> <max_line> <size_in_gb> <output> 13 | # `seed` - random seed in unsigned integer for data generation 14 | # `max_line` - maximum size of each line in bytes 15 | # `size_in_gb` - total file size in GB 16 | # `output` - path to the output file 17 | 18 | # example: 19 | # seed max_line size_in_gb output 20 | cargo run --release 1 1024 1 random_text_file.txt 21 | ``` 22 | 23 | ## Submission 24 | 25 | To submit your solution, follow these steps: 26 | 27 | 1. Fork this repository to your own GitHub account 28 | 2. 
Create your submission folder: In your forked repository, create a new directory under `submissions/` 29 | 3. Create README (optional): Include a README file in your submission folder with a brief description of your approach, or instructions on how to run your solution 30 | 4. Create a Pull Request 31 | 32 | 33 | From WeBuild with 💖 34 | 35 | We look forward to your innovative solutions! Happy coding! 36 | 37 | -------------------------------------------------------------------------------- /Cargo.lock: -------------------------------------------------------------------------------- 1 | # This file is automatically @generated by Cargo. 2 | # It is not intended for manual editing. 3 | version = 3 4 | 5 | [[package]] 6 | name = "cfg-if" 7 | version = "1.0.0" 8 | source = "registry+https://github.com/rust-lang/crates.io-index" 9 | checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" 10 | 11 | [[package]] 12 | name = "challenge200g" 13 | version = "0.1.0" 14 | dependencies = [ 15 | "rand", 16 | ] 17 | 18 | [[package]] 19 | name = "getrandom" 20 | version = "0.2.15" 21 | source = "registry+https://github.com/rust-lang/crates.io-index" 22 | checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" 23 | dependencies = [ 24 | "cfg-if", 25 | "libc", 26 | "wasi", 27 | ] 28 | 29 | [[package]] 30 | name = "libc" 31 | version = "0.2.155" 32 | source = "registry+https://github.com/rust-lang/crates.io-index" 33 | checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" 34 | 35 | [[package]] 36 | name = "ppv-lite86" 37 | version = "0.2.17" 38 | source = "registry+https://github.com/rust-lang/crates.io-index" 39 | checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" 40 | 41 | [[package]] 42 | name = "rand" 43 | version = "0.8.5" 44 | source = "registry+https://github.com/rust-lang/crates.io-index" 45 | checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" 46 | dependencies 
import random
import string

# Characters a generated line may contain: ASCII letters, digits, punctuation
# and the space character. None of them is a newline, so every generated
# string occupies exactly one line of the output file.
_ALPHABET = string.ascii_letters + string.digits + string.punctuation + ' '


def generate_random_line(counter, max_size):
    """Return one random line that ends with ``counter`` and a newline.

    The line is ``random characters + str(counter) + '\\n'`` and is at most
    ``max_size`` characters long, newline included.

    Args:
        counter: Integer appended to the random characters.
        max_size: Maximum length of the returned line, newline included.

    Returns:
        The generated line, terminated by ``'\\n'``.

    Raises:
        ValueError: If ``max_size`` leaves no room for at least one random
            character next to the counter digits and the newline.
    """
    # Reserve room for the counter digits and the trailing newline.
    base_length = max_size - len(str(counter)) - 1
    if base_length < 1:
        raise ValueError(
            f'max_size={max_size} is too small to fit counter {counter}, '
            'a newline and at least one random character'
        )
    line_length = random.randint(1, base_length)
    line = ''.join(random.choices(_ALPHABET, k=line_length))
    return f'{line}{counter}\n'


def generate_text_with_offsets(seed, file_path, total_size_gb, max_line_size):
    """Write a random text file that contains exactly one repeated line.

    Two byte offsets are drawn at random. The first line whose end reaches the
    smaller offset is recorded; a copy of it is written again once the file has
    grown to the larger offset.

    Args:
        seed: Seed passed to ``random.seed``.
        file_path: Path of the file to create (overwritten if it exists).
        total_size_gb: Target size in GB (multiples of 1024**3 bytes).
            Fractional values are accepted and truncated to whole bytes.
        max_line_size: Maximum length of each line, newline included.

    Returns:
        The duplicated line with surrounding whitespace stripped.

    Raises:
        ValueError: If the target size is not positive, or ``max_line_size``
            is too small (see ``generate_random_line``).
    """
    total_size_bytes = int(total_size_gb * 1024**3)
    if total_size_bytes <= 0:
        # Without at least one generated line there is nothing to duplicate.
        raise ValueError('total_size_gb must be greater than zero')

    random.seed(seed)

    # Generate two random offsets between 0 and total_size_bytes
    offset1 = random.randint(0, total_size_bytes)
    offset2 = random.randint(0, total_size_bytes)

    # Ensure offset1 is the smaller one
    offset1, offset2 = min(offset1, offset2), max(offset1, offset2)

    generated_size = 0
    counter = 0
    recorded_line = None
    inserted = False

    # newline='\n' disables platform newline translation so the byte counts
    # below match what actually lands on disk.
    with open(file_path, 'w', encoding='utf-8', newline='\n') as f:
        while generated_size < total_size_bytes:
            # Always draw the next line first so the random stream for a given
            # seed does not depend on when the duplicate is inserted.
            line = generate_random_line(counter, max_line_size)

            # recorded_line is still None on the very first iteration when
            # both offsets are 0; the insertion then happens one line later.
            if not inserted and recorded_line is not None and generated_size >= offset2:
                f.write(recorded_line)
                generated_size += len(recorded_line.encode('utf-8'))
                inserted = True
                print(f'Inserted recorded line at offset {offset2}: {recorded_line.strip()}')

            f.write(line)
            generated_size += len(line.encode('utf-8'))  # Update size with the actual byte size of the line

            if recorded_line is None and generated_size >= offset1:
                recorded_line = line
                print(f'Recorded line at offset {offset1}: {recorded_line.strip()}')

            counter += 1

        if not inserted:
            # offset2 fell inside the last generated line, so the loop ended
            # before the copy could be written. Append it now; otherwise the
            # file would contain no duplicate at all.
            f.write(recorded_line)
            generated_size += len(recorded_line.encode('utf-8'))
            inserted = True
            print(f'Inserted recorded line at offset {offset2}: {recorded_line.strip()}')

    print(f'Generated text file: {file_path}, Size: {total_size_bytes / 1024**3} GB')
    return recorded_line.strip()


def main():
    """Prompt for the generation parameters and write ``random_text_file.txt``."""
    # User input for seed, line size, total size in GB, and file path
    seed_input = int(input("Enter the seed: "))
    line_size_input = int(input("Enter the maximum line size in bytes: "))
    total_size_gb_input = int(input("Enter the total file size in GB: "))
    file_path_input = 'random_text_file.txt'

    # Generate the text file and get the recorded line data
    recorded_line_data = generate_text_with_offsets(
        seed_input, file_path_input, total_size_gb_input, line_size_input
    )
    print(f'Recorded line data: {recorded_line_data}')


if __name__ == '__main__':
    main()
line: String = rng 11 | .sample_iter(&rand::distributions::Alphanumeric) 12 | .take(line_length) 13 | .map(char::from) 14 | .collect(); 15 | format!("{}{}\n", line, counter) 16 | } 17 | 18 | fn main() { 19 | let args: Vec = env::args().collect(); 20 | if args.len() != 5 { 21 | eprintln!("Usage: {} ", args[0]); 22 | return; 23 | } 24 | 25 | let seed: u64 = args[1].parse().expect("Invalid seed"); 26 | let max_line_size: usize = args[2].parse().expect("Invalid max line size"); 27 | let total_size_gb: usize = args[3].parse().expect("Invalid total size in GB"); 28 | let file_path = &args[4]; 29 | 30 | let total_size_bytes = total_size_gb * 1024 * 1024 * 1024; 31 | 32 | let mut rng = StdRng::seed_from_u64(seed); 33 | let offset1 = rng.gen_range(0..total_size_bytes); 34 | let offset2 = rng.gen_range(0..total_size_bytes); 35 | let (offset1, offset2) = if offset1 < offset2 { (offset1, offset2) } else { (offset2, offset1) }; 36 | 37 | let mut file = BufWriter::new(File::create(file_path).expect("Unable to create file")); 38 | let mut generated_size = 0; 39 | let mut counter = 0; 40 | let mut recorded_line: Option = None; 41 | let mut inserted = false; 42 | 43 | while generated_size < total_size_bytes { 44 | let line = generate_random_line(&mut rng, counter, max_line_size); 45 | let line_bytes = line.as_bytes(); 46 | 47 | if !inserted && generated_size >= offset2 { 48 | if let Some(ref rec_line) = recorded_line { 49 | file.write_all(rec_line.as_bytes()).expect("Unable to write to file"); 50 | generated_size += rec_line.len(); 51 | inserted = true; 52 | println!("Inserted recorded line at offset {}: {}", offset2, rec_line.trim()); 53 | } 54 | } 55 | 56 | file.write_all(line_bytes).expect("Unable to write to file"); 57 | generated_size += line_bytes.len(); 58 | 59 | if generated_size >= offset1 && recorded_line.is_none() { 60 | recorded_line = Some(line.clone()); 61 | println!("Recorded line at offset {}: {}", offset1, line.trim()); 62 | } 63 | 64 | counter += 1; 65 | } 66 | 67 
| println!("Generated text file: {}, Size: {} GB", file_path, total_size_gb); 68 | if let Some(ref rec_line) = recorded_line { 69 | println!("Recorded line data: {}", rec_line.trim()); 70 | } 71 | } 72 | --------------------------------------------------------------------------------