├── datageneration_sample_spreadsheet.xlsx
├── datageneration_sample_spreadsheet_null.xlsx
├── assemble.awk
├── datageneration_sample_spreadsheet.csv
├── test_txt.sql
├── test_csv.sql
├── test_txt_null.sql
├── datageneration_sample_spreadsheet.txt
├── datageneration_sample_spreadsheet_null.txt
├── README.md
└── snowflake_python_generator.py


/datageneration_sample_spreadsheet.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RobertFehrmann/fehrminator/HEAD/datageneration_sample_spreadsheet.xlsx


--------------------------------------------------------------------------------
/datageneration_sample_spreadsheet_null.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RobertFehrmann/fehrminator/HEAD/datageneration_sample_spreadsheet_null.xlsx


--------------------------------------------------------------------------------
/assemble.awk:
--------------------------------------------------------------------------------
 1 | { 
 2 |    if (substr($4,1,7)=="create ") {
 3 |       if (substr($5,1,4)=="from") {
 4 |          if (index($9,"::")>0) {
 5 |             print $4 "\n   " $9 "\nfrom ( \n   select\n      " $7 "\n   " $5 "); \n\n";;
 6 |          }
 7 |          else {
 8 |             print $4 "\n      " $7 "\n   " $5 "; \n\n";
 9 |          }
10 |       }
11 |       else {
12 |          create_clause=$4
13 |          outer_select=sprintf("   %s",$9);
14 |          inner_select=sprintf("      %s",$7);
15 |       }
16 |    } else if (substr($5,1,4)=="from") {
17 |       print create_clause;
18 |       if (index(outer_select,"::")>0) {
19 |          print outer_select "\n   ," $9 "\nfrom ( \n   select";
20 |          print inner_select "\n      ," $7 "\n   " $5 "); \n\n";
21 |       }
22 |       else {
23 |          print inner_select "\n      ," $7 "\n   " $5 "; \n\n";
24 |       }
25 |    } else if (index($7,"::")>0) {
26 |       outer_select=sprintf("%s\n   ,%s",outer_select,$9);
27 |       inner_select=sprintf("%s\n      ,%s",inner_select,$7)
28 |    }   
29 | }
30 | 


--------------------------------------------------------------------------------
/datageneration_sample_spreadsheet.csv:
--------------------------------------------------------------------------------
 1 | Schema Name,TableName,Column Name,Column Cardinality,Table Cardinality,Data type,Length,Precision,# Nulls,Distribution,Nullable,Dim key,Is Foreign Key
 2 | ,,,,,,,,,,,,
 3 | schema1,table1,column1,1234,10000,date,,, ,,,,
 4 | schema1,table1,column2,1234,10000,timestamp,,,  ,,,,
 5 | schema1,table1,column3,1234,10000,varchar,10,, ,,,,
 6 | schema1,table1,column4,1234,10000,char,10,, ,,,,
 7 | schema1,table1,column5,1234,10000,bigint,,, ,,,,
 8 | schema1,table1,column6,1234,10000,integer,,, ,,,,
 9 | schema1,table1,column7,1234,10000,double,,, ,,,,
10 | schema1,table1,column8,1234,10000,float,,, ,,,,
11 | schema1,table1,column9,1234,10000,number,10,2, ,,,,
12 | schema1,table1,column10,10000,10000,char,10,,,,,,
13 | schema1,table1,column11,10000,10000,bigint,,,,,,,
14 | schema1,table1,column12,2,10000,boolean,,, ,,,,
15 | schema1,table1,column13,0,10000,integer,,,,,,,
16 | schema1,table1,column101,1234,10000,date,,,3000,,,,
17 | schema1,table1,column102,1234,10000,timestamp,,,3000,,,,
18 | schema1,table1,column103,1234,10000,varchar,10,,3000,,,,
19 | schema1,table1,column104,1234,10000,char,10,,3000,,,,
20 | schema1,table1,column105,1234,10000,bigint,,,3000,,,,
21 | schema1,table1,column106,1234,10000,integer,,,3000,,,,
22 | schema1,table1,column107,1234,10000,double,,,3000,,,,
23 | schema1,table1,column108,1234,10000,float,,,3000,,,,
24 | schema1,table1,column109,1234,10000,number,10,2,3000,,,,
25 | schema1,table1,column110,10000,10000,char,10,,3000,,,,
26 | schema1,table1,column111,10000,10000,bigint,,,3000,,,,
27 | schema1,table1,column112,2,10000,boolean,,,3000,,,,
28 | schema1,table1,column113,0,10000,integer,,,3000,,,,
29 | 


--------------------------------------------------------------------------------
/test_txt.sql:
--------------------------------------------------------------------------------
 1 | create transient schema if not exists schema1 data_retention_time_in_days=0;  create or replace table schema1.table1 as select
 2 |       dateadd(day,uniform(1,1234 , random(10001)),current_date)::date as column1
 3 |       ,(date_part(epoch_second, current_date)+(uniform(1,1234, random(10002))))::timestamp as column2
 4 |       ,randstr(uniform(1,10, random(10003)),uniform(1,1234,random(10003)))::varchar(10) as column3
 5 |       ,rpad(uniform(1,1234 , random(10004))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column4
 6 |       ,uniform(1,1234 , random(10005))::bigint as column5
 7 |       ,uniform(1,1234 , random(10006))::integer as column6
 8 |       ,uniform(1,1234 , random(10007))::double as column7
 9 |       ,uniform(1,1234 , random(10008))::float as column8
10 |       ,uniform(1,1234 , random(10009))::number(10,2) as column9
11 |       ,rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column10
12 |       ,seq8()::bigint as column11
13 |       ,(uniform(1,2,random(10012))-1)::boolean as column12
14 |       ,null::integer as column13
15 |       ,dateadd(day,uniform(1,1234 , random(10014)),current_date)::date as column101
16 |       ,(date_part(epoch_second, current_date)+(uniform(1,1234, random(10015))))::timestamp as column102
17 |       ,randstr(uniform(1,10, random(10016)),uniform(1,1234,random(10016)))::varchar(10) as column103
18 |       ,rpad(uniform(1,1234 , random(10017))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column104
19 |       ,uniform(1,1234 , random(10018))::bigint as column105
20 |       ,uniform(1,1234 , random(10019))::integer as column106
21 |       ,uniform(1,1234 , random(10020))::double as column107
22 |       ,uniform(1,1234 , random(10021))::float as column108
23 |       ,uniform(1,1234 , random(10022))::number(10,2) as column109
24 |       ,rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column110
25 |       ,seq8()::bigint as column111
26 |       ,(uniform(1,2,random(10025))-1)::boolean as column112
27 |       ,null::integer as column113
28 |    from table(generator(rowcount => 10000)); 
29 | 
30 | 
31 | 


--------------------------------------------------------------------------------
/test_csv.sql:
--------------------------------------------------------------------------------
 1 | -------------------------------------
 2 | -- Generating Database: TEST
 3 | -------------------------------------
 4 | CREATE TRANSIENT SCHEMA IF NOT EXISTS schema1 DATA_RETENTION_TIME_IN_DAYS=0;
 5 | USE SCHEMA schema1;
 6 | CREATE or REPLACE TABLE table1
 7 | AS
 8 | SELECT
 9 |    dateadd(day, uniform(1, 1234, random(10002)), date_trunc(day, current_date))::date as column1,
10 |    (date_part(epoch_second, current_date) + (uniform(1, 1234, random(10003))))::timestamp as column2,
11 |    randstr(uniform(1,10, random(10004)),uniform(1,1234,random(10004)))::varchar(10) as column3,
12 |    rpad(uniform(1, 1234, random(10005))::varchar,10, 'abcdefghifklmnopqrstuvwxyz')::char(10) as column4,
13 |    uniform(1,1234 , random(10006))::bigint as column5,
14 |    uniform(1,1234 , random(10007))::integer as column6,
15 |    uniform(1,1234 , random(10008))::double as column7,
16 |    uniform(1,1234 , random(10009))::float as column8,
17 |    uniform(1,1234 , random(10010))::number(10,2) as column9,
18 |    rpad((seq8()+1)::varchar,10, 'abcdefghifklmnopqrstuvwxyz')::char(10) as column10,
19 |    (seq8()+1)::bigint as column11,
20 |    (uniform(1,2 , random(10013))-1)::boolean as column12,
21 |    null::integer as column13,
22 |    (case when uniform(1,1000,random(20015))<=300 then null else dateadd(day, uniform(1, 1234, random(10015)), date_trunc(day, current_date)) end)::date as column101,
23 |    (case when uniform(1,1000,random(20016))<=300 then null else (date_part(epoch_second, current_date) + (uniform(1, 1234, random(10016)))) end)::timestamp as column102,
24 |    (case when uniform(1,1000,random(20017))<=300 then null else randstr(uniform(1,10, random(10017)),uniform(1,1234,random(10017))) end)::varchar(10) as column103,
25 |    (case when uniform(1,1000,random(20018))<=300 then null else rpad(uniform(1, 1234, random(10018))::varchar,10, 'abcdefghifklmnopqrstuvwxyz') end)::char(10) as column104,
26 |    (case when uniform(1,1000,random(20019))<=300 then null else uniform(1,1234 , random(10019)) end)::bigint as column105,
27 |    (case when uniform(1,1000,random(20020))<=300 then null else uniform(1,1234 , random(10020)) end)::integer as column106,
28 |    (case when uniform(1,1000,random(20021))<=300 then null else uniform(1,1234 , random(10021)) end)::double as column107,
29 |    (case when uniform(1,1000,random(20022))<=300 then null else uniform(1,1234 , random(10022)) end)::float as column108,
30 |    (case when uniform(1,1000,random(20023))<=300 then null else uniform(1,1234 , random(10023)) end)::number(10,2) as column109,
31 |    (case when uniform(1,1000,random(20024))<=300 then null else rpad((seq8()+1)::varchar,10, 'abcdefghifklmnopqrstuvwxyz') end)::char(10) as column110,
32 |    (case when uniform(1,1000,random(20025))<=300 then null else (seq8()+1) end)::bigint as column111,
33 |    (case when uniform(1,1000,random(20026))<=300 then null else (uniform(1,2 , random(10026))-1) end)::boolean as column112,
34 |    null::integer as column113
35 | from table(generator(rowcount => 10000));
36 |  
37 | 


--------------------------------------------------------------------------------
/test_txt_null.sql:
--------------------------------------------------------------------------------
 1 | create transient schema if not exists schema1 data_retention_time_in_days=0;  create or replace table schema1.table1 as select
 2 |    column1::date as column1
 3 |    ,column2::timestamp as column2
 4 |    ,column3::varchar(10) as column3
 5 |    ,column4::char(10) as column4
 6 |    ,column5::bigint as column5
 7 |    ,column6::integer as column6
 8 |    ,column7::double as column7
 9 |    ,column8::float as column8
10 |    ,column9::number(10,2) as column9
11 |    ,column10::char(10) as column10
12 |    ,column11::bigint as column11
13 |    ,column12::boolean as column12
14 |    ,column13::integer as column13
15 |    ,(case when uniform(1,1000,random(20014)) > 300 then column101 else null end)::date as column101
16 |    ,(case when uniform(1,1000,random(20015)) > 300 then column102 else null end)::timestamp as column102
17 |    ,(case when uniform(1,1000,random(20016)) > 300 then column103 else null end)::varchar(10) as column103
18 |    ,(case when uniform(1,1000,random(20017)) > 300 then column104 else null end)::char(10) as column104
19 |    ,(case when uniform(1,1000,random(20018)) > 300 then column105 else null end)::bigint as column105
20 |    ,(case when uniform(1,1000,random(20019)) > 300 then column106 else null end)::integer as column106
21 |    ,(case when uniform(1,1000,random(20020)) > 300 then column107 else null end)::double as column107
22 |    ,(case when uniform(1,1000,random(20021)) > 300 then column108 else null end)::float as column108
23 |    ,(case when uniform(1,1000,random(20022)) > 300 then column109 else null end)::number(10,2) as column109
24 |    ,(case when uniform(1,1000,random(20023)) > 300 then column110 else null end)::char(10) as column110
25 |    ,(case when uniform(1,1000,random(20024)) > 300 then column111 else null end)::bigint as column111
26 |    ,(case when uniform(1,1000,random(20025)) > 300 then column112 else null end)::boolean as column112
27 |    ,(case when uniform(1,1000,random(20026)) > 300 then column113 else null end)::integer as column113
28 | from ( 
29 |    select
30 |       dateadd(day,uniform(1,1234 , random(10001)),current_date)::date as column1
31 |       ,(date_part(epoch_second, current_date)+(uniform(1,1234, random(10002))))::timestamp as column2
32 |       ,randstr(uniform(1,10, random(10003)),uniform(1,1234,random(10003)))::varchar(10) as column3
33 |       ,rpad(uniform(1,1234 , random(10004))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column4
34 |       ,uniform(1,1234 , random(10005))::bigint as column5
35 |       ,uniform(1,1234 , random(10006))::integer as column6
36 |       ,uniform(1,1234 , random(10007))::double as column7
37 |       ,uniform(1,1234 , random(10008))::float as column8
38 |       ,uniform(1,1234 , random(10009))::number(10,2) as column9
39 |       ,rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column10
40 |       ,seq8()::bigint as column11
41 |       ,(uniform(1,2,random(10012))-1)::boolean as column12
42 |       ,null::integer as column13
43 |       ,dateadd(day,uniform(1,1234 , random(10014)),current_date)::date as column101
44 |       ,(date_part(epoch_second, current_date)+(uniform(1,1234, random(10015))))::timestamp as column102
45 |       ,randstr(uniform(1,10, random(10016)),uniform(1,1234,random(10016)))::varchar(10) as column103
46 |       ,rpad(uniform(1,1234 , random(10017))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column104
47 |       ,uniform(1,1234 , random(10018))::bigint as column105
48 |       ,uniform(1,1234 , random(10019))::integer as column106
49 |       ,uniform(1,1234 , random(10020))::double as column107
50 |       ,uniform(1,1234 , random(10021))::float as column108
51 |       ,uniform(1,1234 , random(10022))::number(10,2) as column109
52 |       ,rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column110
53 |       ,seq8()::bigint as column111
54 |       ,(uniform(1,2,random(10025))-1)::boolean as column112
55 |       ,null::integer as column113
56 |    from table(generator(rowcount => 10000))); 
57 | 
58 | 
59 | 


--------------------------------------------------------------------------------
/datageneration_sample_spreadsheet.txt:
--------------------------------------------------------------------------------
 1 | id	Schema Name	Table Name	STMT: CREATE 	STMT: FROM	Column Name	STMT: Expression	ColumnCardinality			Table Cardinalyty	DataType	Length	Precision	Seed	Seed	Normal Dist	Schema Name	TableName	Column Name	Column Cardinality	Table Cardinality	Data type	Length	Precision	# Nulls	Distribution	Nullable	Dim key	Is Foreign Key
 2 | 0														20000	10000														
 3 | 1	schema1	table1	create transient schema if not exists schema1 data_retention_time_in_days=0;  create or replace table schema1.table1 as select		column1	"dateadd(day,uniform(1,1234 , random(10001)),current_date)::date as column1"	1234			10000	date			20001	10001		schema1	table1	column1	1234	10000	date			 				
 4 | 2	schema1	table1			column2	"(date_part(epoch_second, current_date)+(uniform(1,1234, random(10002))))::timestamp as column2"	1234			10000	timestamp			20002	10002		schema1	table1	column2	1234	10000	timestamp			  				
 5 | 3	schema1	table1			column3	"randstr(uniform(1,10, random(10003)),uniform(1,1234,random(10003)))::varchar(10) as column3"	1234			10000	varchar	10		20003	10003		schema1	table1	column3	1234	10000	varchar	10		 				
 6 | 4	schema1	table1			column4	"rpad(uniform(1,1234 , random(10004))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column4"	1234			10000	char	10		20004	10004		schema1	table1	column4	1234	10000	char	10		 				
 7 | 5	schema1	table1			column5	"uniform(1,1234 , random(10005))::bigint as column5"	1234			10000	bigint			20005	10005		schema1	table1	column5	1234	10000	bigint			 				
 8 | 6	schema1	table1			column6	"uniform(1,1234 , random(10006))::integer as column6"	1234			10000	integer			20006	10006		schema1	table1	column6	1234	10000	integer			 				
 9 | 7	schema1	table1			column7	"uniform(1,1234 , random(10007))::double as column7"	1234			10000	double			20007	10007		schema1	table1	column7	1234	10000	double			 				
10 | 8	schema1	table1			column8	"uniform(1,1234 , random(10008))::float as column8"	1234			10000	float			20008	10008		schema1	table1	column8	1234	10000	float			 				
11 | 9	schema1	table1			column9	"uniform(1,1234 , random(10009))::number(10,2) as column9"	1234			10000	number	10	2	20009	10009		schema1	table1	column9	1234	10000	number	10	2	 				
12 | 10	schema1	table1			column10	"rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column10"	10000			10000	char	10		20010	10010		schema1	table1	column10	10000	10000	char	10						
13 | 11	schema1	table1			column11	seq8()::bigint as column11	10000			10000	bigint			20011	10011		schema1	table1	column11	10000	10000	bigint							
14 | 11	schema1	table1			column12	"(uniform(1,2,random(10012))-1)::boolean as column12"	2			10000	boolean			20012	10012		schema1	table1	column12	2	10000	boolean			 				
15 | 12	schema1	table1			column13	null::integer as column13	0			10000	integer			20013	10013		schema1	table1	column13	0	10000	integer			 				
16 | 13	schema1	table1			column101	"dateadd(day,uniform(1,1234 , random(10014)),current_date)::date as column101"	1234			10000	date			20014	10014		schema1	table1	column101	1234	10000	date			3000				
17 | 14	schema1	table1			column102	"(date_part(epoch_second, current_date)+(uniform(1,1234, random(10015))))::timestamp as column102"	1234			10000	timestamp			20015	10015		schema1	table1	column102	1234	10000	timestamp			3000				
18 | 15	schema1	table1			column103	"randstr(uniform(1,10, random(10016)),uniform(1,1234,random(10016)))::varchar(10) as column103"	1234			10000	varchar	10		20016	10016		schema1	table1	column103	1234	10000	varchar	10		3000				
19 | 16	schema1	table1			column104	"rpad(uniform(1,1234 , random(10017))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column104"	1234			10000	char	10		20017	10017		schema1	table1	column104	1234	10000	char	10		3000				
20 | 17	schema1	table1			column105	"uniform(1,1234 , random(10018))::bigint as column105"	1234			10000	bigint			20018	10018		schema1	table1	column105	1234	10000	bigint			3000				
21 | 18	schema1	table1			column106	"uniform(1,1234 , random(10019))::integer as column106"	1234			10000	integer			20019	10019		schema1	table1	column106	1234	10000	integer			3000				
22 | 19	schema1	table1			column107	"uniform(1,1234 , random(10020))::double as column107"	1234			10000	double			20020	10020		schema1	table1	column107	1234	10000	double			3000				
23 | 20	schema1	table1			column108	"uniform(1,1234 , random(10021))::float as column108"	1234			10000	float			20021	10021		schema1	table1	column108	1234	10000	float			3000				
24 | 21	schema1	table1			column109	"uniform(1,1234 , random(10022))::number(10,2) as column109"	1234			10000	number	10	2	20022	10022		schema1	table1	column109	1234	10000	number	10	2	3000				
25 | 22	schema1	table1			column110	"rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column110"	10000			10000	char	10		20023	10023		schema1	table1	column110	10000	10000	char	10		3000				
26 | 23	schema1	table1			column111	seq8()::bigint as column111	10000			10000	bigint			20024	10024		schema1	table1	column111	10000	10000	bigint			3000				
27 | 23	schema1	table1			column112	"(uniform(1,2,random(10025))-1)::boolean as column112"	2			10000	boolean			20025	10025		schema1	table1	column112	2	10000	boolean			3000				
28 | 24	schema1	table1		from table(generator(rowcount => 10000))	column113	null::integer as column113	0			10000	integer			20026	10026		schema1	table1	column113	0	10000	integer			3000				


--------------------------------------------------------------------------------
/datageneration_sample_spreadsheet_null.txt:
--------------------------------------------------------------------------------
 1 | id	Schema Name	Table Name	STMT: CREATE 	STMT: FROM	Column Name	STMT: Expression	ColumnCardinality	STMT: Expression	Null Ratio	Table Cardinalyty	DataType	Length	Precision	Seed	Seed	Normal Dist	Schema Name	TableName	Column Name	Column Cardinality	Table Cardinality	Data type	Length	Precision	Number Null	Normal Dist	Nullable	Dim key	Is Foreign Key
 2 | 0														20000	10000														
 3 | 1	schema1	table1	create transient schema if not exists schema1 data_retention_time_in_days=0;  create or replace table schema1.table1 as select		column1	"dateadd(day,uniform(1,1234 , random(10001)),current_date)::date as column1"	1234	column1::date as column1	0	10000	date			20001	10001		schema1	table1	column1	1234	10000	date			 				
 4 | 2	schema1	table1			column2	"(date_part(epoch_second, current_date)+(uniform(1,1234, random(10002))))::timestamp as column2"	1234	column2::timestamp as column2	0	10000	timestamp			20002	10002		schema1	table1	column2	1234	10000	timestamp			  				
 5 | 3	schema1	table1			column3	"randstr(uniform(1,10, random(10003)),uniform(1,1234,random(10003)))::varchar(10) as column3"	1234	column3::varchar(10) as column3	0	10000	varchar	10		20003	10003		schema1	table1	column3	1234	10000	varchar	10		 				
 6 | 4	schema1	table1			column4	"rpad(uniform(1,1234 , random(10004))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column4"	1234	column4::char(10) as column4	0	10000	char	10		20004	10004		schema1	table1	column4	1234	10000	char	10		 				
 7 | 5	schema1	table1			column5	"uniform(1,1234 , random(10005))::bigint as column5"	1234	column5::bigint as column5	0	10000	bigint			20005	10005		schema1	table1	column5	1234	10000	bigint			 				
 8 | 6	schema1	table1			column6	"uniform(1,1234 , random(10006))::integer as column6"	1234	column6::integer as column6	0	10000	integer			20006	10006		schema1	table1	column6	1234	10000	integer			 				
 9 | 7	schema1	table1			column7	"uniform(1,1234 , random(10007))::double as column7"	1234	column7::double as column7	0	10000	double			20007	10007		schema1	table1	column7	1234	10000	double			 				
10 | 8	schema1	table1			column8	"uniform(1,1234 , random(10008))::float as column8"	1234	column8::float as column8	0	10000	float			20008	10008		schema1	table1	column8	1234	10000	float			 				
11 | 9	schema1	table1			column9	"uniform(1,1234 , random(10009))::number(10,2) as column9"	1234	"column9::number(10,2) as column9"	0	10000	number	10	2	20009	10009		schema1	table1	column9	1234	10000	number	10	2	 				
12 | 10	schema1	table1			column10	"rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column10"	10000	column10::char(10) as column10	0	10000	char	10		20010	10010		schema1	table1	column10	10000	10000	char	10						
13 | 11	schema1	table1			column11	seq8()::bigint as column11	10000	column11::bigint as column11	0	10000	bigint			20011	10011		schema1	table1	column11	10000	10000	bigint							
14 | 11	schema1	table1			column12	"(uniform(1,2,random(10012))-1)::boolean as column12"	2	column12::boolean as column12	0	10000	boolean			20012	10012		schema1	table1	column12	2	10000	boolean			 				
15 | 12	schema1	table1			column13	null::integer as column13	0	column13::integer as column13	0	10000	integer			20013	10013		schema1	table1	column13	0	10000	integer			 				
16 | 13	schema1	table1			column101	"dateadd(day,uniform(1,1234 , random(10014)),current_date)::date as column101"	1234	"(case when uniform(1,1000,random(20014)) > 300 then column101 else null end)::date as column101"	300	10000	date			20014	10014		schema1	table1	column101	1234	10000	date			3000				
17 | 14	schema1	table1			column102	"(date_part(epoch_second, current_date)+(uniform(1,1234, random(10015))))::timestamp as column102"	1234	"(case when uniform(1,1000,random(20015)) > 300 then column102 else null end)::timestamp as column102"	300	10000	timestamp			20015	10015		schema1	table1	column102	1234	10000	timestamp			3000				
18 | 15	schema1	table1			column103	"randstr(uniform(1,10, random(10016)),uniform(1,1234,random(10016)))::varchar(10) as column103"	1234	"(case when uniform(1,1000,random(20016)) > 300 then column103 else null end)::varchar(10) as column103"	300	10000	varchar	10		20016	10016		schema1	table1	column103	1234	10000	varchar	10		3000				
19 | 16	schema1	table1			column104	"rpad(uniform(1,1234 , random(10017))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column104"	1234	"(case when uniform(1,1000,random(20017)) > 300 then column104 else null end)::char(10) as column104"	300	10000	char	10		20017	10017		schema1	table1	column104	1234	10000	char	10		3000				
20 | 17	schema1	table1			column105	"uniform(1,1234 , random(10018))::bigint as column105"	1234	"(case when uniform(1,1000,random(20018)) > 300 then column105 else null end)::bigint as column105"	300	10000	bigint			20018	10018		schema1	table1	column105	1234	10000	bigint			3000				
21 | 18	schema1	table1			column106	"uniform(1,1234 , random(10019))::integer as column106"	1234	"(case when uniform(1,1000,random(20019)) > 300 then column106 else null end)::integer as column106"	300	10000	integer			20019	10019		schema1	table1	column106	1234	10000	integer			3000				
22 | 19	schema1	table1			column107	"uniform(1,1234 , random(10020))::double as column107"	1234	"(case when uniform(1,1000,random(20020)) > 300 then column107 else null end)::double as column107"	300	10000	double			20020	10020		schema1	table1	column107	1234	10000	double			3000				
23 | 20	schema1	table1			column108	"uniform(1,1234 , random(10021))::float as column108"	1234	"(case when uniform(1,1000,random(20021)) > 300 then column108 else null end)::float as column108"	300	10000	float			20021	10021		schema1	table1	column108	1234	10000	float			3000				
24 | 21	schema1	table1			column109	"uniform(1,1234 , random(10022))::number(10,2) as column109"	1234	"(case when uniform(1,1000,random(20022)) > 300 then column109 else null end)::number(10,2) as column109"	300	10000	number	10	2	20022	10022		schema1	table1	column109	1234	10000	number	10	2	3000				
25 | 22	schema1	table1			column110	"rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column110"	10000	"(case when uniform(1,1000,random(20023)) > 300 then column110 else null end)::char(10) as column110"	300	10000	char	10		20023	10023		schema1	table1	column110	10000	10000	char	10		3000				
26 | 23	schema1	table1			column111	seq8()::bigint as column111	10000	"(case when uniform(1,1000,random(20024)) > 300 then column111 else null end)::bigint as column111"	300	10000	bigint			20024	10024		schema1	table1	column111	10000	10000	bigint			3000				
27 | 23	schema1	table1			column112	"(uniform(1,2,random(10025))-1)::boolean as column112"	2	"(case when uniform(1,1000,random(20025)) > 300 then column112 else null end)::boolean as column112"	300	10000	boolean			20025	10025		schema1	table1	column112	2	10000	boolean			3000				
28 | 24	schema1	table1		from table(generator(rowcount => 10000))	column113	null::integer as column113	0	"(case when uniform(1,1000,random(20026)) > 300 then column113 else null end)::integer as column113"	300	10000	integer			20026	10026		schema1	table1	column113	0	10000	integer			3000				


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Fehrminator
 2 | 
 3 | 
 4 | Fehrminator is a data generation framework to generate random data sets based on a given schema at scale. It was mainly developed to compare performance for a target environment, for instance a new Snowflake DW, against an existing DW environment, while a real world production dataset can not be used.
 5 | 
 6 | The idea is to generate a dataset that looks very similar in terms for naming and schema, datasize (i.e. number or rows), and number of rows by column. By taking these values as input parameters, the framework generates a SQL script to generate a randomized dataset of similar size and similar distribution.
 7 | 
 8 | One word of caution. Be sure to check the length of VARCHAR columns (and review the documentation below). Generating data for long VARCHAR columns can be time consuming and costly. If you have a spec with several long varchar columns, consider using the max of the actual length of the values in those columns rather than the maximum length defined in the data dictionary.
 9 | 
10 | 
11 | ## Specification
12 | 
13 | The framework takes the following input schema
14 | 
15 | 1. Schema name
16 | 1. Table name
17 | 1. Column name
18 | 1. Column cardinality (number of distinct values in column)
19 | 1. Table cardinality (number of rows in table)
20 | 1. Data type
21 | 1. Data length
22 | 1. Data precision
23 | 
24 | The Framework supports the following datatypes
25 | 
26 | 1. DATE
27 | 1. TIMESTAMP
28 | 1. CHAR (LENGTH)
29 | 1. VARCHAR (LENGTH)
30 | 1. BOOLEAN
31 | 1. INTEGER
32 | 1. BIGINT
33 | 1. FLOAT
34 | 1. DOUBLE
35 | 1. NUMBER (LENGTH, PRECISION)
36 | 
37 | Please Note:
38 |  * The customer usually can create the required input schema easily by running a meta data query against their existing system. If this is green field development, then take the best guess based on customer requirements.
39 |  * The framework does not generate a database. It expects the database to exist. 
40 |  * The framework generates code to create a schema in case it doesn't exist. Drop the schema manually, if you want to start over completely.
41 |  * The framework generates code to override tables if they exist, i.e. the script can be re-executed against an existing test database.
42 |  * The framework generates code to create TRANSIENT tables to cut down on cost.
43 |  * The number of distinct values per column in the generated table is not an exact value. 
44 |   Example: If a table has 100 rows and a column is defined as having 100 distinct values, then there is a chance that the number if distinct values is slightly below the specified value.
45 |  * The framework primarily relies on numbers that will be randomly generated between a **min** and a **max** value. The distribution is uniform (please see below for the special case of normal distribution), i.e. for a specific column within the a specific tables we have approx. the same number of rows per column values. Example: If a table has 100 rows and a column has 10 distict values, then each vaule has appox. 10 rows (some will have 11 rows, some will have 9 rows).
46 |  * CHAR/VARCHAR will be generated with a specific length from a repeated pattern. Uniqueness is achived by pre-fixing the string with a number. Truly random strings could be used as well but generating random strings using the Snowflake RANDSTR SQL function take consideraly longer time.
47 |  * DATE columns will be generated by adding a random number (as days) to current date.
48 |  * TIMESTAMP columns will be generated by adding a random number (as seconds) to midnight of the current date. In case finer granuality is required, the generated code could be manually modified to divide the added value by i.e. 1000 (milliseconds), 1000000 (microseconds), and so on.
49 |  * Real world customer schemas usually contain multiple tables with primary/foreign key relationships. Simple cases with only single column keys (natural or surrogate keys) are completely covered within the framework, as long as data types between primary/foreign key match and as long as the number of distinct values for the primary key is at least as big as the number of distinct values for the column referencing the primary key. 
50 |  * Requesting Primary/Foreign key meta information is a good practive to validate data type consistentcy, but it's not a requirement for the tool.
51 |  * Creating large CHAR/VARCHAR columns (1k+) can consume a considerable amount of time and space. Review these instances with the customer and consider to limit the size. To be clear, Snowflake can handle bigger sizes but it can be very costly to generate the data.
52 |  * Creating a big dataset, e.g. 100 billion rows, can take a considerable amount of time and should be executed on a sufficiently big cluster. However, smaller dataset, e.g. 1 million rows do not benefit from using a large cluster. Rule of Thumb: Start with an XL or smaller for 1 billion rows and below. If you need to create bigger sets, snowflake scales almost linearly from this point forward, i.e. scaling up one level cuts the time to generate the data by half.
53 |  * When creating a large dataset which consumes a considerable amount of credits, start with a scaled down version of the spec, e.g. 1 million rows (or whatever makes sense) and review the output with the customer in terms of dats distribution and joins. Also record the time it took to create the dataset as well as the size to extrapolate total time and size.   
54 |  * Uniqueness for a column can be achieved by setting the column cardinality for that column to match the table cardinality. Only BIGINT/CHAR/VARCHAR are supported for unique columns. Uniqueness is garuanteed via function seq8().
55 |   
56 | Using the input schema, the framework generates an SQL script to generate data in Snowflake. The SQL script can be generated using the Excel workwork or the python Script.
57 | 
58 | ## Excel Code Generation
59 | 
60 | 1. Request the schema information from the customer
61 | 1. Cleanse the schema information, in particular check the data types
62 | 1. Copy the customer data onto the formula worksheet
63 | 1. Save the worksheet as txt format
64 | 1. Run the following command to generate the SQL Script (the awk script assemble.awk is part of this repo)
65 | 
66 | cat name.txt | sed "s/\\"//g" | sed "s/|/\\"/g" | awk -F"\t" -f assemble.awk
67 | 
68 | ## Python Code Generation
69 | 
70 | 1. Request the schema information from the customer
71 | 1. Cleanse the schema information, in particular check the data types
72 | 1. Save the worksheet as csv format
73 | 1. Run the following command to generate the SQL Script
74 | 
75 | python snowflake_python_generator.py name.csv
76 | 
77 | ## Advanced Use-Cases
78 | 
79 | The are 2 additional datapoints that can be requested from the customer to cover special cases in term of data distribution.
80 | 1. Number of null values per column
81 | 2. Data distribution (uniform vs. normal)
82 | 
83 |  * Uniform data distribution can be selected ( 1 in column `Normal Dist` ) for data types BIGINT, VARCHAR, CHAR. Please note that the value for column cardinality will be slightly higher, i.e. is not the exactly the value requested. 
84 | 
85 | Please note:
86 |  * In case of large dataset and sparse distribution of large string columns, it's very benefical to request the number of null values per column. When reviewing the spec, the number of null values divided by table cardinality should be a reasonable percentage valye, i.e. number between 1 and 100. The framework will generate code that will randomly choose to generate a value or null based on the specified distribution.
87 |  * NOT YET IMPLEMENTED IN PYTHON: In some cases the default data distribution, i.e. uniform, does not meet requirements and customers are looking for a more normal (i.e. bell curve) distribution of FACTS for a specific DIMENSION.  Normal distribution only makes sense for fact tables though the framework doesn't limit the usage to fact tables. If a normal distribution is being specified (Distribution=1), then the framework generates code to use a different distribution function.
88 |  
89 | 
90 | 


--------------------------------------------------------------------------------
/snowflake_python_generator.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | 
  3 | # Based on Robert Fehrmann's Data Generation process.
  4 | # Program loads the file providing the input and generates the required execution scripts.
  5 | # 2019-01-08  jfrink    Initial Creation
  6 | #
  7 | #  Version 1.0.0
  8 | #
  9 | #  -> ./snowflake_datagen.py demoxx load_wh peter -db SALES
 10 | 
 11 | import sys
 12 | import os, glob, errno
 13 | import getpass
 14 | import argparse
 15 | #import snowflake.connector
 16 | import re
 17 | import csv
 18 | 
 19 | #Verbose processing
 20 | def show_verbose(verbose, text_to_print):
 21 |   if verbose:
 22 |     print(text_to_print)
 23 |   return
 24 | 
 25 | 
 26 | #run_sql to pass result set back
 27 | def run_sql(conn, sql, fetchall):
 28 |   cur = conn.cursor()
 29 |   try:
 30 |     cur.execute(sql)
 31 |     if fetchall:
 32 |       res=cur.fetchall()
 33 |     else:
 34 |       res = cur.fetchone()
 35 | 
 36 |   except (snowflake.connector.errors.ProgrammingError) as e:
 37 |     print("Statement error: {0}".format(e.msg))
 38 |   except:
 39 |     print("Unexpected error: {0}".format(e.msg))
 40 | 
 41 |   finally:
 42 |     cur.close()
 43 |   return res
 44 | 
 45 | def load_db_file(filename,quoted_names):
 46 | 
 47 |   database_objects = {}
 48 |   seed_1 = 10000  #default seed value, increaments for each row added
 49 |   line_number=0
 50 |   if (quoted_names):
 51 |     quotes='"'
 52 |   else:
 53 |     quotes=''
 54 | 
 55 |   """try:
 56 |    fh = open(filename, 'rU')
 57 |   except IOError:
 58 |     print("Could not open file! " + filename)"""
 59 | 
 60 |   #Used to skip the header record in the file.
 61 |   with open(filename, 'rU') as fh:
 62 |     next(fh)
 63 |     line_number+=1
 64 |     for line_list in csv.reader(fh, delimiter=',', quotechar='"'):
 65 |       db="TEST"
 66 |       schema=line_list[0]
 67 |       tbl=quotes + line_list[1] + quotes
 68 |       col=quotes + line_list[2] + quotes
 69 |       cardinality=line_list[3].strip()
 70 |       tbl_cardinality=line_list[4].strip()
 71 |       datatype=line_list[5]
 72 |       datatype_length=line_list[6].strip()
 73 |       datatype_precision=line_list[7].strip()
 74 |       if(len(line_list)>8):
 75 |         null_values=line_list[8].strip()
 76 |         if (null_values==''):
 77 |            null_ratio=0
 78 |         else:
 79 |            null_ratio=(float(null_values)/float(tbl_cardinality))*1000
 80 |       else:
 81 |          null_ratio=0
 82 | 
 83 |       seed_1=seed_1+1
 84 | 
 85 |       #Reset the formula to make sure we catch all occurances
 86 |       formula = ''
 87 | 
 88 |       if datatype.upper() == 'DATE':
 89 |         fb = 'dateadd(day, uniform(1, ' + str(cardinality) + ', random('+ str(seed_1) +')), date_trunc(day, current_date))'
 90 |         fe = '::date as ' + col
 91 |       elif datatype.upper() == 'TIMESTAMP':
 92 |         fb = '(date_part(epoch_second, current_date) + (uniform(1, ' + str(cardinality) + ', random('+ str(seed_1) +'))))'
 93 |         fe = '::timestamp as ' + col
 94 |       elif datatype.upper() == 'CHAR':
 95 |         if cardinality == tbl_cardinality:
 96 |           fb = 'rpad((seq8()+1)::varchar,'+ str(datatype_length) + ", 'abcdefghifklmnopqrstuvwxyz')"
 97 |         else:
 98 |           fb = 'rpad(uniform(1, ' + str(cardinality) + ', random(' + str(seed_1) + '))::varchar,'+ str(datatype_length) + ", 'abcdefghifklmnopqrstuvwxyz')"
 99 |         fe = '::' + datatype.lower() + '(' + str(datatype_length) + ') as ' + col
100 |       elif datatype.upper() == 'VARCHAR':
101 |         if (datatype_precision==''):
102 |           datatype_precision=1
103 |         fb = 'randstr(uniform(' + str(datatype_precision) + ',' + str(datatype_length) + ', random(' + str(seed_1) + ')),uniform(1,'+ str(cardinality) + ',random(' + str(seed_1) + ')))'
104 |         fe = '::' + datatype.lower() + '(' + str(datatype_length) + ') as ' + col
105 |       elif datatype.upper() == 'BIGINT':
106 |         if cardinality == tbl_cardinality:
107 |           fb = '(seq8()+1)'
108 |         else:
109 |           fb = 'uniform(1,' + str(cardinality) + ' , random('+ str(seed_1) +'))'
110 |         fe = '::' + datatype.lower() + ' as ' + col
111 |       elif datatype.upper() == 'BOOLEAN':
112 |         if (int(cardinality)>2):
113 |           cardinality=2
114 |         fb = '(uniform(1,' + str(cardinality) + ' , random('+ str(seed_1) +'))-1)'
115 |         fe = '::' + datatype.lower() + ' as ' + col
116 |       elif datatype.upper() == 'INTEGER' or datatype.upper() == 'DOUBLE' or datatype.upper() == 'FLOAT':
117 |         fb = 'uniform(1,' + str(cardinality) + ' , random('+ str(seed_1) +'))'
118 |         fe = '::' + datatype.lower() + ' as ' + col
119 |       elif datatype.upper() == 'NUMBER':
120 |         fb = 'uniform(1,' + str(cardinality) + ' , random('+ str(seed_1) +'))'
121 |         fe = '::number(' + str(datatype_length) + ',' + str(datatype_precision) + ') as ' + col
122 |       else:
123 |         fb=''
124 |         fe=''
125 | 
126 |       if (fb==''):
127 |         if (schema!='' or datatype!=''):
128 |            print( "WARNING: Line: {0} Unknown Datatype: {1}".format(line_number,datatype ) )
129 |       else:
130 |         if (int(cardinality)<=0):
131 |           formula = 'null'+fe
132 |         elif (null_ratio<=0):
133 |           formula = fb + fe
134 |         else:
135 |           formula = '(case when uniform(1,1000,random('+str(seed_1+10000)+'))<='+str(int(null_ratio))+' then null else '+fb+' end)'+fe
136 | 
137 |         column_info={'NAME': col, 'DATA_TYPE': datatype, 'LENGTH': datatype_length, 'CARDINALITY': cardinality, 'TBL_CARDINALITY': tbl_cardinality, 'FORMULA': formula  }
138 | 
139 |         if db not in database_objects:
140 |           database_objects[db] = {}
141 | 
142 |         schema_objects=database_objects[db]
143 | 
144 |         if schema not in schema_objects:
145 |           schema_objects[schema] = {}
146 | 
147 |         tbl_objects=schema_objects[schema]
148 | 
149 |         if tbl not in tbl_objects:
150 |           tbl_objects[tbl] = []
151 | 
152 |         tbl_objects[tbl].append(column_info)
153 | 
154 |   fh.close()
155 | 
156 |   return database_objects
157 | 
158 | 
159 | 
160 | def print_ddl(database_objects, outputfile):
161 | 
162 |   if outputfile:
163 |     try:
164 |       of = open(outputfile, 'w')
165 |     except IOError:
166 |       print("Could not open file! " + outputfile)
167 |   else:
168 |     of = ''
169 | 
170 |   for db in sorted(database_objects.keys()):    #Database Name
171 |     #Now loop through the new obj_hash to print out the ddl
172 |     printer('-------------------------------------', of)
173 |     printer('-- Generating Database: ' + str(db), of)
174 |     printer('-------------------------------------', of)
175 |     schema_obj=database_objects[db]
176 |     for schema in schema_obj:
177 |       printer('CREATE TRANSIENT SCHEMA IF NOT EXISTS ' + str(schema) + ' DATA_RETENTION_TIME_IN_DAYS=0;', of)
178 |       printer('USE SCHEMA '+ str(schema) +';', of)
179 |       tbl_obj=schema_obj[schema]
180 |       for tbl in tbl_obj:
181 |         printer('CREATE or REPLACE TABLE '+ tbl, of)
182 |         printer('AS', of)
183 |         printer('SELECT', of)
184 |         col_obj=tbl_obj[tbl]
185 |         num_of_columns=len(col_obj)
186 |         counter=1
187 |         for col in col_obj:
188 |           if counter < num_of_columns:
189 |             printer('   ' + col['FORMULA'] + ',', of)
190 |             counter=counter+1
191 |           else:
192 |             printer('   ' + col['FORMULA'], of)
193 |             printer('from table(generator(rowcount => ' + str(col['TBL_CARDINALITY']) + '));' , of)
194 |             printer(' ', of)
195 | 
196 |   if of != '':
197 |     if not of.closed:
198 |       of.close()
199 | 
200 |   return
201 | 
202 | def printer(text, fh):
203 | 
204 | 
205 |   if fh =='':
206 |     print(text)
207 |   else:
208 |     if not fh.closed:
209 |       fh.write(text+"\n")
210 |   return
211 | 
212 | 
213 | def main():
214 |   ##### MAIN #####
215 |   parser = argparse.ArgumentParser(description='Snowflake Data Generation Utility.',
216 |           epilog='Example: snowflake_datagen.py source_file.csv')
217 |   parser.add_argument('filename', action='store',
218 |         help='Customer Sample file template')
219 |   parser.add_argument('-sqlfile', '--sqlfile', action='store',
220 |         help='Create an output file script containing the create role ddl and grants, to be used with -ddl switch')
221 |   parser.add_argument('-q', '--quoted_names', action='store_true',
222 |         help='Enclose name with double quotes')
223 |   parser.add_argument('-v', '--verbose', action='store_true', help='verbose')
224 | 
225 |   args=parser.parse_args();
226 |   if args.verbose:
227 |     print("filename=" + str(args.filename))       # File to be loaded
228 |     print("sqlfile=" + str(args.sqlfile))  # show debugging output
229 |     print("verbose=" + str(args.verbose))  # show debugging output
230 | 
231 |   db_objects=load_db_file(args.filename,args.quoted_names)
232 | 
233 |   print_ddl(db_objects, args.sqlfile)
234 | 
235 |   return
236 | 
237 | if __name__ == "__main__":
238 |   main()
239 | 


--------------------------------------------------------------------------------