├── README.md
└── generate_data.py

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# de_tech_test_pyspark

Repo for the functions required to set up the PySpark DE technical test. Currently this just includes the `generate_data` function.

--------------------------------------------------------------------------------
/generate_data.py:
--------------------------------------------------------------------------------
import csv
import os
import random
import string
from datetime import datetime
from functools import partial
from itertools import product


def generate_data(size=1_000_000):
    """
    Generates csvs under data/ in the current working directory:

    data/main/<YYYY-MM-DD>/test_data.csv
    - field1, field2, field3, field4 are random ints in range(1, 20)
    - val1, val2, val3 are random floats

    data/lookup.csv
    - a lookup table keyed on pairs of field values, with a random
      letter as the lookup value

    Args:
        size (int, optional): The number of rows required in the
            main test data csv.

    Raises:
        FileExistsError: Raised if the main test data csv has already
            been generated with today's date.
    """
    def _randomly_nullify(series, n):
        """Replaces n entries in series with None."""
        # sample without replacement so exactly n entries are nullified
        indices = set(random.sample(range(size), n))
        return [None if i in indices else v for i, v in enumerate(series)]

    date = datetime.today().strftime('%Y-%m-%d')

    part_choices = partial(random.choices, range(1, 20), k=size)

    field1 = _randomly_nullify(
        part_choices(weights=[i**2 / 2 for i in range(1, 20)]), 5
    )  # weighted towards the end of the range

    field2 = _randomly_nullify(
        part_choices(weights=[(20 - i) / i for i in range(1, 20)]), 30
    )  # weighted towards the start of the range

    field3 = part_choices(
        weights=[1 / (1 + abs(i - 10)) for i in range(1, 20)]
    )  # weighted towards the middle of the range
    field4 = part_choices()  # uniform

    val1 = (random.gauss(1000, 100) for _ in range(size))  # normal random
    val2 = (random.random() * 1000 * i if i else 0 for i in field1)  # random correlated with field1 (0 where field1 is None)
    val3 = _randomly_nullify(
        [random.random() * 1000 * i for i in field4], 10
    )  # random correlated with field4

    combined = zip(field1, field2, field3, field4, val1, val2, val3)

    path = os.path.join(os.getcwd(), 'data', 'main', date, 'test_data.csv')
    os.makedirs(os.path.dirname(path), exist_ok=True)

    with open(path, 'x', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['field1', 'field2', 'field3', 'field4', 'val1', 'val2', 'val3'])
        writer.writerows(combined)

    # lookup csv: every pair of field values (10 excluded) mapped to a random letter
    field = [i for i in range(1, 20) if i != 10]
    group = product(field, field)
    lookup = [[x, y, random.choice(string.ascii_letters)] for x, y in group]
    try:
        with open('data/lookup.csv', 'x', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['field1', 'f2', 'lookup_val'])
            writer.writerows(lookup)
    except FileExistsError:
        pass

--------------------------------------------------------------------------------
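As a quick usage sketch (not part of the repo): calling `generate_data` creates the dated directory and writes both csvs. The `size` value below is illustrative, chosen small for a smoke test.

```python
from generate_data import generate_data

# Small size for a quick smoke test; the default is 1,000,000 rows.
generate_data(size=1000)

# A second run on the same day raises FileExistsError for
# data/main/<date>/test_data.csv; data/lookup.csv is simply left as-is.
```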
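Since the README frames this as setup for a PySpark technical test, here is a minimal sketch of how the generated files might be loaded. It assumes `generate_data` has been run today and that `pyspark` is installed; the session setup and `appName` are illustrative.

```python
from datetime import datetime

from pyspark.sql import SparkSession

# Illustrative session setup; any existing SparkSession would do.
spark = SparkSession.builder.appName("de-tech-test").getOrCreate()

date = datetime.today().strftime('%Y-%m-%d')

# Spark's CSV reader treats empty fields as null by default, so the
# None values written by generate_data should come through as nulls.
main = spark.read.csv(f'data/main/{date}/test_data.csv',
                      header=True, inferSchema=True)
lookup = spark.read.csv('data/lookup.csv', header=True, inferSchema=True)

main.printSchema()
main.show(5)
```

Note that the lookup header names its columns `field1`, `f2`, `lookup_val`, exactly as written by the generator.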