├── datageneration_sample_spreadsheet.xlsx ├── datageneration_sample_spreadsheet_null.xlsx ├── assemble.awk ├── datageneration_sample_spreadsheet.csv ├── test_txt.sql ├── test_csv.sql ├── test_txt_null.sql ├── datageneration_sample_spreadsheet.txt ├── datageneration_sample_spreadsheet_null.txt ├── README.md └── snowflake_python_generator.py /datageneration_sample_spreadsheet.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertFehrmann/fehrminator/HEAD/datageneration_sample_spreadsheet.xlsx -------------------------------------------------------------------------------- /datageneration_sample_spreadsheet_null.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/RobertFehrmann/fehrminator/HEAD/datageneration_sample_spreadsheet_null.xlsx -------------------------------------------------------------------------------- /assemble.awk: -------------------------------------------------------------------------------- 1 | { 2 | if (substr($4,1,7)=="create ") { 3 | if (substr($5,1,4)=="from") { 4 | if (index($9,"::")>0) { 5 | print $4 "\n " $9 "\nfrom ( \n select\n " $7 "\n " $5 "); \n\n";; 6 | } 7 | else { 8 | print $4 "\n " $7 "\n " $5 "; \n\n"; 9 | } 10 | } 11 | else { 12 | create_clause=$4 13 | outer_select=sprintf(" %s",$9); 14 | inner_select=sprintf(" %s",$7); 15 | } 16 | } else if (substr($5,1,4)=="from") { 17 | print create_clause; 18 | if (index(outer_select,"::")>0) { 19 | print outer_select "\n ," $9 "\nfrom ( \n select"; 20 | print inner_select "\n ," $7 "\n " $5 "); \n\n"; 21 | } 22 | else { 23 | print inner_select "\n ," $7 "\n " $5 "; \n\n"; 24 | } 25 | } else if (index($7,"::")>0) { 26 | outer_select=sprintf("%s\n ,%s",outer_select,$9); 27 | inner_select=sprintf("%s\n ,%s",inner_select,$7) 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /datageneration_sample_spreadsheet.csv: -------------------------------------------------------------------------------- 1 | Schema Name,TableName,Column Name,Column Cardinality,Table Cardinality,Data type,Length,Precision,# Nulls,Distribution,Nullable,Dim key,Is Foreign Key 2 | ,,,,,,,,,,,, 3 | schema1,table1,column1,1234,10000,date,,, ,,,, 4 | schema1,table1,column2,1234,10000,timestamp,,, ,,,, 5 | schema1,table1,column3,1234,10000,varchar,10,, ,,,, 6 | schema1,table1,column4,1234,10000,char,10,, ,,,, 7 | schema1,table1,column5,1234,10000,bigint,,, ,,,, 8 | schema1,table1,column6,1234,10000,integer,,, ,,,, 9 | schema1,table1,column7,1234,10000,double,,, ,,,, 10 | schema1,table1,column8,1234,10000,float,,, ,,,, 11 | schema1,table1,column9,1234,10000,number,10,2, ,,,, 12 | schema1,table1,column10,10000,10000,char,10,,,,,, 13 | schema1,table1,column11,10000,10000,bigint,,,,,,, 14 | schema1,table1,column12,2,10000,boolean,,, ,,,, 15 | schema1,table1,column13,0,10000,integer,,,,,,, 16 | schema1,table1,column101,1234,10000,date,,,3000,,,, 17 | schema1,table1,column102,1234,10000,timestamp,,,3000,,,, 18 | schema1,table1,column103,1234,10000,varchar,10,,3000,,,, 19 | schema1,table1,column104,1234,10000,char,10,,3000,,,, 20 | schema1,table1,column105,1234,10000,bigint,,,3000,,,, 21 | schema1,table1,column106,1234,10000,integer,,,3000,,,, 22 | schema1,table1,column107,1234,10000,double,,,3000,,,, 23 | schema1,table1,column108,1234,10000,float,,,3000,,,, 24 | schema1,table1,column109,1234,10000,number,10,2,3000,,,, 25 | schema1,table1,column110,10000,10000,char,10,,3000,,,, 26 | schema1,table1,column111,10000,10000,bigint,,,3000,,,, 27 | schema1,table1,column112,2,10000,boolean,,,3000,,,, 28 | schema1,table1,column113,0,10000,integer,,,3000,,,, 29 | -------------------------------------------------------------------------------- /test_txt.sql: -------------------------------------------------------------------------------- 1 | create transient schema if not exists schema1 data_retention_time_in_days=0; create or replace table schema1.table1 as select 2 | dateadd(day,uniform(1,1234 , random(10001)),current_date)::date as column1 3 | ,(date_part(epoch_second, current_date)+(uniform(1,1234, random(10002))))::timestamp as column2 4 | ,randstr(uniform(1,10, random(10003)),uniform(1,1234,random(10003)))::varchar(10) as column3 5 | ,rpad(uniform(1,1234 , random(10004))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column4 6 | ,uniform(1,1234 , random(10005))::bigint as column5 7 | ,uniform(1,1234 , random(10006))::integer as column6 8 | ,uniform(1,1234 , random(10007))::double as column7 9 | ,uniform(1,1234 , random(10008))::float as column8 10 | ,uniform(1,1234 , random(10009))::number(10,2) as column9 11 | ,rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column10 12 | ,seq8()::bigint as column11 13 | ,(uniform(1,2,random(10012))-1)::boolean as column12 14 | ,null::integer as column13 15 | ,dateadd(day,uniform(1,1234 , random(10014)),current_date)::date as column101 16 | ,(date_part(epoch_second, current_date)+(uniform(1,1234, random(10015))))::timestamp as column102 17 | ,randstr(uniform(1,10, random(10016)),uniform(1,1234,random(10016)))::varchar(10) as column103 18 | ,rpad(uniform(1,1234 , random(10017))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column104 19 | ,uniform(1,1234 , random(10018))::bigint as column105 20 | ,uniform(1,1234 , random(10019))::integer as column106 21 | ,uniform(1,1234 , random(10020))::double as column107 22 | ,uniform(1,1234 , random(10021))::float as column108 23 | ,uniform(1,1234 , random(10022))::number(10,2) as column109 24 | ,rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column110 25 | ,seq8()::bigint as column111 26 | ,(uniform(1,2,random(10025))-1)::boolean as column112 27 | ,null::integer as column113 28 | from table(generator(rowcount => 10000)); 29 | 30 | 31 | -------------------------------------------------------------------------------- /test_csv.sql: -------------------------------------------------------------------------------- 1 | ------------------------------------- 2 | -- Generating Database: TEST 3 | ------------------------------------- 4 | CREATE TRANSIENT SCHEMA IF NOT EXISTS schema1 DATA_RETENTION_TIME_IN_DAYS=0; 5 | USE SCHEMA schema1; 6 | CREATE or REPLACE TABLE table1 7 | AS 8 | SELECT 9 | dateadd(day, uniform(1, 1234, random(10002)), date_trunc(day, current_date))::date as column1, 10 | (date_part(epoch_second, current_date) + (uniform(1, 1234, random(10003))))::timestamp as column2, 11 | randstr(uniform(1,10, random(10004)),uniform(1,1234,random(10004)))::varchar(10) as column3, 12 | rpad(uniform(1, 1234, random(10005))::varchar,10, 'abcdefghifklmnopqrstuvwxyz')::char(10) as column4, 13 | uniform(1,1234 , random(10006))::bigint as column5, 14 | uniform(1,1234 , random(10007))::integer as column6, 15 | uniform(1,1234 , random(10008))::double as column7, 16 | uniform(1,1234 , random(10009))::float as column8, 17 | uniform(1,1234 , random(10010))::number(10,2) as column9, 18 | rpad((seq8()+1)::varchar,10, 'abcdefghifklmnopqrstuvwxyz')::char(10) as column10, 19 | (seq8()+1)::bigint as column11, 20 | (uniform(1,2 , random(10013))-1)::boolean as column12, 21 | null::integer as column13, 22 | (case when uniform(1,1000,random(20015))<=300 then null else dateadd(day, uniform(1, 1234, random(10015)), date_trunc(day, current_date)) end)::date as column101, 23 | (case when uniform(1,1000,random(20016))<=300 then null else (date_part(epoch_second, current_date) + (uniform(1, 1234, random(10016)))) end)::timestamp as column102, 24 | (case when uniform(1,1000,random(20017))<=300 then null else randstr(uniform(1,10, random(10017)),uniform(1,1234,random(10017))) end)::varchar(10) as column103, 25 | (case when uniform(1,1000,random(20018))<=300 then null else rpad(uniform(1, 1234, random(10018))::varchar,10, 'abcdefghifklmnopqrstuvwxyz') end)::char(10) as column104, 26 | (case when uniform(1,1000,random(20019))<=300 then null else uniform(1,1234 , random(10019)) end)::bigint as column105, 27 | (case when uniform(1,1000,random(20020))<=300 then null else uniform(1,1234 , random(10020)) end)::integer as column106, 28 | (case when uniform(1,1000,random(20021))<=300 then null else uniform(1,1234 , random(10021)) end)::double as column107, 29 | (case when uniform(1,1000,random(20022))<=300 then null else uniform(1,1234 , random(10022)) end)::float as column108, 30 | (case when uniform(1,1000,random(20023))<=300 then null else uniform(1,1234 , random(10023)) end)::number(10,2) as column109, 31 | (case when uniform(1,1000,random(20024))<=300 then null else rpad((seq8()+1)::varchar,10, 'abcdefghifklmnopqrstuvwxyz') end)::char(10) as column110, 32 | (case when uniform(1,1000,random(20025))<=300 then null else (seq8()+1) end)::bigint as column111, 33 | (case when uniform(1,1000,random(20026))<=300 then null else (uniform(1,2 , random(10026))-1) end)::boolean as column112, 34 | null::integer as column113 35 | from table(generator(rowcount => 10000)); 36 | 37 | -------------------------------------------------------------------------------- /test_txt_null.sql: -------------------------------------------------------------------------------- 1 | create transient schema if not exists schema1 data_retention_time_in_days=0; create or replace table schema1.table1 as select 2 | column1::date as column1 3 | ,column2::timestamp as column2 4 | ,column3::varchar(10) as column3 5 | ,column4::char(10) as column4 6 | ,column5::bigint as column5 7 | ,column6::integer as column6 8 | ,column7::double as column7 9 | ,column8::float as column8 10 | ,column9::number(10,2) as column9 11 | ,column10::char(10) as column10 12 | ,column11::bigint as column11 13 | ,column12::boolean as column12 14 | ,column13::integer as column13 15 | ,(case when uniform(1,1000,random(20014)) > 300 then column101 else null end)::date as column101 16 | ,(case when uniform(1,1000,random(20015)) > 300 then column102 else null end)::timestamp as column102 17 | ,(case when uniform(1,1000,random(20016)) > 300 then column103 else null end)::varchar(10) as column103 18 | ,(case when uniform(1,1000,random(20017)) > 300 then column104 else null end)::char(10) as column104 19 | ,(case when uniform(1,1000,random(20018)) > 300 then column105 else null end)::bigint as column105 20 | ,(case when uniform(1,1000,random(20019)) > 300 then column106 else null end)::integer as column106 21 | ,(case when uniform(1,1000,random(20020)) > 300 then column107 else null end)::double as column107 22 | ,(case when uniform(1,1000,random(20021)) > 300 then column108 else null end)::float as column108 23 | ,(case when uniform(1,1000,random(20022)) > 300 then column109 else null end)::number(10,2) as column109 24 | ,(case when uniform(1,1000,random(20023)) > 300 then column110 else null end)::char(10) as column110 25 | ,(case when uniform(1,1000,random(20024)) > 300 then column111 else null end)::bigint as column111 26 | ,(case when uniform(1,1000,random(20025)) > 300 then column112 else null end)::boolean as column112 27 | ,(case when uniform(1,1000,random(20026)) > 300 then column113 else null end)::integer as column113 28 | from ( 29 | select 30 | dateadd(day,uniform(1,1234 , random(10001)),current_date)::date as column1 31 | ,(date_part(epoch_second, current_date)+(uniform(1,1234, random(10002))))::timestamp as column2 32 | ,randstr(uniform(1,10, random(10003)),uniform(1,1234,random(10003)))::varchar(10) as column3 33 | ,rpad(uniform(1,1234 , random(10004))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column4 34 | ,uniform(1,1234 , random(10005))::bigint as column5 35 | ,uniform(1,1234 , random(10006))::integer as column6 36 | ,uniform(1,1234 , random(10007))::double as column7 37 | ,uniform(1,1234 , random(10008))::float as column8 38 | ,uniform(1,1234 , random(10009))::number(10,2) as column9 39 | ,rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column10 40 | ,seq8()::bigint as column11 41 | ,(uniform(1,2,random(10012))-1)::boolean as column12 42 | ,null::integer as column13 43 | ,dateadd(day,uniform(1,1234 , random(10014)),current_date)::date as column101 44 | ,(date_part(epoch_second, current_date)+(uniform(1,1234, random(10015))))::timestamp as column102 45 | ,randstr(uniform(1,10, random(10016)),uniform(1,1234,random(10016)))::varchar(10) as column103 46 | ,rpad(uniform(1,1234 , random(10017))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column104 47 | ,uniform(1,1234 , random(10018))::bigint as column105 48 | ,uniform(1,1234 , random(10019))::integer as column106 49 | ,uniform(1,1234 , random(10020))::double as column107 50 | ,uniform(1,1234 , random(10021))::float as column108 51 | ,uniform(1,1234 , random(10022))::number(10,2) as column109 52 | ,rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column110 53 | ,seq8()::bigint as column111 54 | ,(uniform(1,2,random(10025))-1)::boolean as column112 55 | ,null::integer as column113 56 | from table(generator(rowcount => 10000))); 57 | 58 | 59 | -------------------------------------------------------------------------------- /datageneration_sample_spreadsheet.txt: -------------------------------------------------------------------------------- 1 | id Schema Name Table Name STMT: CREATE STMT: FROM Column Name STMT: Expression ColumnCardinality Table Cardinalyty DataType Length Precision Seed Seed Normal Dist Schema Name TableName Column Name Column Cardinality Table Cardinality Data type Length Precision # Nulls Distribution Nullable Dim key Is Foreign Key 2 | 0 20000 10000 3 | 1 schema1 table1 create transient schema if not exists schema1 data_retention_time_in_days=0; create or replace table schema1.table1 as select column1 "dateadd(day,uniform(1,1234 , random(10001)),current_date)::date as column1" 1234 10000 date 20001 10001 schema1 table1 column1 1234 10000 date 4 | 2 schema1 table1 column2 "(date_part(epoch_second, current_date)+(uniform(1,1234, random(10002))))::timestamp as column2" 1234 10000 timestamp 20002 10002 schema1 table1 column2 1234 10000 timestamp 5 | 3 schema1 table1 column3 "randstr(uniform(1,10, random(10003)),uniform(1,1234,random(10003)))::varchar(10) as column3" 1234 10000 varchar 10 20003 10003 schema1 table1 column3 1234 10000 varchar 10 6 | 4 schema1 table1 column4 "rpad(uniform(1,1234 , random(10004))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column4" 1234 10000 char 10 20004 10004 schema1 table1 column4 1234 10000 char 10 7 | 5 schema1 table1 column5 "uniform(1,1234 , random(10005))::bigint as column5" 1234 10000 bigint 20005 10005 schema1 table1 column5 1234 10000 bigint 8 | 6 schema1 table1 column6 "uniform(1,1234 , random(10006))::integer as column6" 1234 10000 integer 20006 10006 schema1 table1 column6 1234 10000 integer 9 | 7 schema1 table1 column7 "uniform(1,1234 , random(10007))::double as column7" 1234 10000 double 20007 10007 schema1 table1 column7 1234 10000 double 10 | 8 schema1 table1 column8 "uniform(1,1234 , random(10008))::float as column8" 1234 10000 float 20008 10008 schema1 table1 column8 1234 10000 float 11 | 9 schema1 table1 column9 "uniform(1,1234 , random(10009))::number(10,2) as column9" 1234 10000 number 10 2 20009 10009 schema1 table1 column9 1234 10000 number 10 2 12 | 10 schema1 table1 column10 "rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column10" 10000 10000 char 10 20010 10010 schema1 table1 column10 10000 10000 char 10 13 | 11 schema1 table1 column11 seq8()::bigint as column11 10000 10000 bigint 20011 10011 schema1 table1 column11 10000 10000 bigint 14 | 11 schema1 table1 column12 "(uniform(1,2,random(10012))-1)::boolean as column12" 2 10000 boolean 20012 10012 schema1 table1 column12 2 10000 boolean 15 | 12 schema1 table1 column13 null::integer as column13 0 10000 integer 20013 10013 schema1 table1 column13 0 10000 integer 16 | 13 schema1 table1 column101 "dateadd(day,uniform(1,1234 , random(10014)),current_date)::date as column101" 1234 10000 date 20014 10014 schema1 table1 column101 1234 10000 date 3000 17 | 14 schema1 table1 column102 "(date_part(epoch_second, current_date)+(uniform(1,1234, random(10015))))::timestamp as column102" 1234 10000 timestamp 20015 10015 schema1 table1 column102 1234 10000 timestamp 3000 18 | 15 schema1 table1 column103 "randstr(uniform(1,10, random(10016)),uniform(1,1234,random(10016)))::varchar(10) as column103" 1234 10000 varchar 10 20016 10016 schema1 table1 column103 1234 10000 varchar 10 3000 19 | 16 schema1 table1 column104 "rpad(uniform(1,1234 , random(10017))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column104" 1234 10000 char 10 20017 10017 schema1 table1 column104 1234 10000 char 10 3000 20 | 17 schema1 table1 column105 "uniform(1,1234 , random(10018))::bigint as column105" 1234 10000 bigint 20018 10018 schema1 table1 column105 1234 10000 bigint 3000 21 | 18 schema1 table1 column106 "uniform(1,1234 , random(10019))::integer as column106" 1234 10000 integer 20019 10019 schema1 table1 column106 1234 10000 integer 3000 22 | 19 schema1 table1 column107 "uniform(1,1234 , random(10020))::double as column107" 1234 10000 double 20020 10020 schema1 table1 column107 1234 10000 double 3000 23 | 20 schema1 table1 column108 "uniform(1,1234 , random(10021))::float as column108" 1234 10000 float 20021 10021 schema1 table1 column108 1234 10000 float 3000 24 | 21 schema1 table1 column109 "uniform(1,1234 , random(10022))::number(10,2) as column109" 1234 10000 number 10 2 20022 10022 schema1 table1 column109 1234 10000 number 10 2 3000 25 | 22 schema1 table1 column110 "rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column110" 10000 10000 char 10 20023 10023 schema1 table1 column110 10000 10000 char 10 3000 26 | 23 schema1 table1 column111 seq8()::bigint as column111 10000 10000 bigint 20024 10024 schema1 table1 column111 10000 10000 bigint 3000 27 | 23 schema1 table1 column112 "(uniform(1,2,random(10025))-1)::boolean as column112" 2 10000 boolean 20025 10025 schema1 table1 column112 2 10000 boolean 3000 28 | 24 schema1 table1 from table(generator(rowcount => 10000)) column113 null::integer as column113 0 10000 integer 20026 10026 schema1 table1 column113 0 10000 integer 3000 -------------------------------------------------------------------------------- /datageneration_sample_spreadsheet_null.txt: -------------------------------------------------------------------------------- 1 | id Schema Name Table Name STMT: CREATE STMT: FROM Column Name STMT: Expression ColumnCardinality STMT: Expression Null Ratio Table Cardinalyty DataType Length Precision Seed Seed Normal Dist Schema Name TableName Column Name Column Cardinality Table Cardinality Data type Length Precision Number Null Normal Dist Nullable Dim key Is Foreign Key 2 | 0 20000 10000 3 | 1 schema1 table1 create transient schema if not exists schema1 data_retention_time_in_days=0; create or replace table schema1.table1 as select column1 "dateadd(day,uniform(1,1234 , random(10001)),current_date)::date as column1" 1234 column1::date as column1 0 10000 date 20001 10001 schema1 table1 column1 1234 10000 date 4 | 2 schema1 table1 column2 "(date_part(epoch_second, current_date)+(uniform(1,1234, random(10002))))::timestamp as column2" 1234 column2::timestamp as column2 0 10000 timestamp 20002 10002 schema1 table1 column2 1234 10000 timestamp 5 | 3 schema1 table1 column3 "randstr(uniform(1,10, random(10003)),uniform(1,1234,random(10003)))::varchar(10) as column3" 1234 column3::varchar(10) as column3 0 10000 varchar 10 20003 10003 schema1 table1 column3 1234 10000 varchar 10 6 | 4 schema1 table1 column4 "rpad(uniform(1,1234 , random(10004))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column4" 1234 column4::char(10) as column4 0 10000 char 10 20004 10004 schema1 table1 column4 1234 10000 char 10 7 | 5 schema1 table1 column5 "uniform(1,1234 , random(10005))::bigint as column5" 1234 column5::bigint as column5 0 10000 bigint 20005 10005 schema1 table1 column5 1234 10000 bigint 8 | 6 schema1 table1 column6 "uniform(1,1234 , random(10006))::integer as column6" 1234 column6::integer as column6 0 10000 integer 20006 10006 schema1 table1 column6 1234 10000 integer 9 | 7 schema1 table1 column7 "uniform(1,1234 , random(10007))::double as column7" 1234 column7::double as column7 0 10000 double 20007 10007 schema1 table1 column7 1234 10000 double 10 | 8 schema1 table1 column8 "uniform(1,1234 , random(10008))::float as column8" 1234 column8::float as column8 0 10000 float 20008 10008 schema1 table1 column8 1234 10000 float 11 | 9 schema1 table1 column9 "uniform(1,1234 , random(10009))::number(10,2) as column9" 1234 "column9::number(10,2) as column9" 0 10000 number 10 2 20009 10009 schema1 table1 column9 1234 10000 number 10 2 12 | 10 schema1 table1 column10 "rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column10" 10000 column10::char(10) as column10 0 10000 char 10 20010 10010 schema1 table1 column10 10000 10000 char 10 13 | 11 schema1 table1 column11 seq8()::bigint as column11 10000 column11::bigint as column11 0 10000 bigint 20011 10011 schema1 table1 column11 10000 10000 bigint 14 | 11 schema1 table1 column12 "(uniform(1,2,random(10012))-1)::boolean as column12" 2 column12::boolean as column12 0 10000 boolean 20012 10012 schema1 table1 column12 2 10000 boolean 15 | 12 schema1 table1 column13 null::integer as column13 0 column13::integer as column13 0 10000 integer 20013 10013 schema1 table1 column13 0 10000 integer 16 | 13 schema1 table1 column101 "dateadd(day,uniform(1,1234 , random(10014)),current_date)::date as column101" 1234 "(case when uniform(1,1000,random(20014)) > 300 then column101 else null end)::date as column101" 300 10000 date 20014 10014 schema1 table1 column101 1234 10000 date 3000 17 | 14 schema1 table1 column102 "(date_part(epoch_second, current_date)+(uniform(1,1234, random(10015))))::timestamp as column102" 1234 "(case when uniform(1,1000,random(20015)) > 300 then column102 else null end)::timestamp as column102" 300 10000 timestamp 20015 10015 schema1 table1 column102 1234 10000 timestamp 3000 18 | 15 schema1 table1 column103 "randstr(uniform(1,10, random(10016)),uniform(1,1234,random(10016)))::varchar(10) as column103" 1234 "(case when uniform(1,1000,random(20016)) > 300 then column103 else null end)::varchar(10) as column103" 300 10000 varchar 10 20016 10016 schema1 table1 column103 1234 10000 varchar 10 3000 19 | 16 schema1 table1 column104 "rpad(uniform(1,1234 , random(10017))::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column104" 1234 "(case when uniform(1,1000,random(20017)) > 300 then column104 else null end)::char(10) as column104" 300 10000 char 10 20017 10017 schema1 table1 column104 1234 10000 char 10 3000 20 | 17 schema1 table1 column105 "uniform(1,1234 , random(10018))::bigint as column105" 1234 "(case when uniform(1,1000,random(20018)) > 300 then column105 else null end)::bigint as column105" 300 10000 bigint 20018 10018 schema1 table1 column105 1234 10000 bigint 3000 21 | 18 schema1 table1 column106 "uniform(1,1234 , random(10019))::integer as column106" 1234 "(case when uniform(1,1000,random(20019)) > 300 then column106 else null end)::integer as column106" 300 10000 integer 20019 10019 schema1 table1 column106 1234 10000 integer 3000 22 | 19 schema1 table1 column107 "uniform(1,1234 , random(10020))::double as column107" 1234 "(case when uniform(1,1000,random(20020)) > 300 then column107 else null end)::double as column107" 300 10000 double 20020 10020 schema1 table1 column107 1234 10000 double 3000 23 | 20 schema1 table1 column108 "uniform(1,1234 , random(10021))::float as column108" 1234 "(case when uniform(1,1000,random(20021)) > 300 then column108 else null end)::float as column108" 300 10000 float 20021 10021 schema1 table1 column108 1234 10000 float 3000 24 | 21 schema1 table1 column109 "uniform(1,1234 , random(10022))::number(10,2) as column109" 1234 "(case when uniform(1,1000,random(20022)) > 300 then column109 else null end)::number(10,2) as column109" 300 10000 number 10 2 20022 10022 schema1 table1 column109 1234 10000 number 10 2 3000 25 | 22 schema1 table1 column110 "rpad(seq8()::varchar,10,'abcdefghifklmnopqrstuvwxyz')::char(10) as column110" 10000 "(case when uniform(1,1000,random(20023)) > 300 then column110 else null end)::char(10) as column110" 300 10000 char 10 20023 10023 schema1 table1 column110 10000 10000 char 10 3000 26 | 23 schema1 table1 column111 seq8()::bigint as column111 10000 "(case when uniform(1,1000,random(20024)) > 300 then column111 else null end)::bigint as column111" 300 10000 bigint 20024 10024 schema1 table1 column111 10000 10000 bigint 3000 27 | 23 schema1 table1 column112 "(uniform(1,2,random(10025))-1)::boolean as column112" 2 "(case when uniform(1,1000,random(20025)) > 300 then column112 else null end)::boolean as column112" 300 10000 boolean 20025 10025 schema1 table1 column112 2 10000 boolean 3000 28 | 24 schema1 table1 from table(generator(rowcount => 10000)) column113 null::integer as column113 0 "(case when uniform(1,1000,random(20026)) > 300 then column113 else null end)::integer as column113" 300 10000 integer 20026 10026 schema1 table1 column113 0 10000 integer 3000 -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Fehrminator 2 | 3 | 4 | Fehrminator is a data generation framework to generate random data sets based on a given schema at scale. It was mainly developed to compare performance for a target environment, for instance a new Snowflake DW, against an existing DW environment, while a real world production dataset can not be used. 5 | 6 | The idea is to generate a dataset that looks very similar in terms for naming and schema, datasize (i.e. number or rows), and number of rows by column. By taking these values as input parameters, the framework generates a SQL script to generate a randomized dataset of similar size and similar distribution. 7 | 8 | One word of caution. Be sure to check the length of VARCHAR columns (and review the documentation below). Generating data for long VARCHAR columns can be time consuming and costly. If you have a spec with several long varchar columns, consider using the max of the actual length of the values in those columns rather than the maximum length defined in the data dictionary. 9 | 10 | 11 | ## Specification 12 | 13 | The framework takes the following input schema 14 | 15 | 1. Schema name 16 | 1. Table name 17 | 1. Column name 18 | 1. Column cardinality (number of distinct values in column) 19 | 1. Table cardinality (number of rows in table) 20 | 1. Data type 21 | 1. Data length 22 | 1. Data precision 23 | 24 | The Framework supports the following datatypes 25 | 26 | 1. DATE 27 | 1. TIMESTAMP 28 | 1. CHAR (LENGTH) 29 | 1. VARCHAR (LENGTH) 30 | 1. BOOLEAN 31 | 1. INTEGER 32 | 1. BIGINT 33 | 1. FLOAT 34 | 1. DOUBLE 35 | 1. NUMBER (LENGTH, PRECISION) 36 | 37 | Please Note: 38 | * The customer usually can create the required input schema easily by running a meta data query against their existing system. If this is green field development, then take the best guess based on customer requirements. 39 | * The framework does not generate a database. It expects the database to exist. 40 | * The framework generates code to create a schema in case it doesn't exist. Drop the schema manually, if you want to start over completely. 41 | * The framework generates code to override tables if they exist, i.e. the script can be re-executed against an existing test database. 42 | * The framework generates code to create TRANSIENT tables to cut down on cost. 43 | * The number of distinct values per column in the generated table is not an exact value. 44 | Example: If a table has 100 rows and a column is defined as having 100 distinct values, then there is a chance that the number if distinct values is slightly below the specified value. 45 | * The framework primarily relies on numbers that will be randomly generated between a **min** and a **max** value. The distribution is uniform (please see below for the special case of normal distribution), i.e. for a specific column within the a specific tables we have approx. the same number of rows per column values. Example: If a table has 100 rows and a column has 10 distict values, then each vaule has appox. 10 rows (some will have 11 rows, some will have 9 rows). 46 | * CHAR/VARCHAR will be generated with a specific length from a repeated pattern. Uniqueness is achived by pre-fixing the string with a number. Truly random strings could be used as well but generating random strings using the Snowflake RANDSTR SQL function take consideraly longer time. 47 | * DATE columns will be generated by adding a random number (as days) to current date. 48 | * TIMESTAMP columns will be generated by adding a random number (as seconds) to midnight of the current date. In case finer granuality is required, the generated code could be manually modified to divide the added value by i.e. 1000 (milliseconds), 1000000 (microseconds), and so on. 49 | * Real world customer schemas usually contain multiple tables with primary/foreign key relationships. Simple cases with only single column keys (natural or surrogate keys) are completely covered within the framework, as long as data types between primary/foreign key match and as long as the number of distinct values for the primary key is at least as big as the number of distinct values for the column referencing the primary key. 50 | * Requesting Primary/Foreign key meta information is a good practive to validate data type consistentcy, but it's not a requirement for the tool. 51 | * Creating large CHAR/VARCHAR columns (1k+) can consume a considerable amount of time and space. Review these instances with the customer and consider to limit the size. To be clear, Snowflake can handle bigger sizes but it can be very costly to generate the data. 52 | * Creating a big dataset, e.g. 100 billion rows, can take a considerable amount of time and should be executed on a sufficiently big cluster. However, smaller dataset, e.g. 1 million rows do not benefit from using a large cluster. Rule of Thumb: Start with an XL or smaller for 1 billion rows and below. If you need to create bigger sets, snowflake scales almost linearly from this point forward, i.e. scaling up one level cuts the time to generate the data by half. 53 | * When creating a large dataset which consumes a considerable amount of credits, start with a scaled down version of the spec, e.g. 1 million rows (or whatever makes sense) and review the output with the customer in terms of dats distribution and joins. Also record the time it took to create the dataset as well as the size to extrapolate total time and size. 54 | * Uniqueness for a column can be achieved by setting the column cardinality for that column to match the table cardinality. Only BIGINT/CHAR/VARCHAR are supported for unique columns. Uniqueness is garuanteed via function seq8(). 55 | 56 | Using the input schema, the framework generates an SQL script to generate data in Snowflake. The SQL script can be generated using the Excel workwork or the python Script. 57 | 58 | ## Excel Code Generation 59 | 60 | 1. Request the schema information from the customer 61 | 1. Cleanse the schema information, in particular check the data types 62 | 1. Copy the customer data onto the formula worksheet 63 | 1. Save the worksheet as txt format 64 | 1. Run the following command to generate the SQL Script (the awk script assemble.awk is part of this repo) 65 | 66 | cat name.txt | sed "s/\\"//g" | sed "s/|/\\"/g" | awk -F"\t" -f assemble.awk 67 | 68 | ## Python Code Generation 69 | 70 | 1. Request the schema information from the customer 71 | 1. Cleanse the schema information, in particular check the data types 72 | 1. Save the worksheet as csv format 73 | 1. Run the following command to generate the SQL Script 74 | 75 | python snowflake_python_generator.py name.csv 76 | 77 | ## Advanced Use-Cases 78 | 79 | The are 2 additional datapoints that can be requested from the customer to cover special cases in term of data distribution. 80 | 1. Number of null values per column 81 | 2. Data distribution (uniform vs. normal) 82 | 83 | * Uniform data distribution can be selected ( 1 in column `Normal Dist` ) for data types BIGINT, VARCHAR, CHAR. Please note that the value for column cardinality will be slightly higher, i.e. is not the exactly the value requested. 84 | 85 | Please note: 86 | * In case of large dataset and sparse distribution of large string columns, it's very benefical to request the number of null values per column. When reviewing the spec, the number of null values divided by table cardinality should be a reasonable percentage valye, i.e. number between 1 and 100. The framework will generate code that will randomly choose to generate a value or null based on the specified distribution. 87 | * NOT YET IMPLEMENTED IN PYTHON: In some cases the default data distribution, i.e. uniform, does not meet requirements and customers are looking for a more normal (i.e. bell curve) distribution of FACTS for a specific DIMENSION. Normal distribution only makes sense for fact tables though the framework doesn't limit the usage to fact tables. If a normal distribution is being specified (Distribution=1), then the framework generates code to use a different distribution function. 88 | 89 | 90 | -------------------------------------------------------------------------------- /snowflake_python_generator.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # Based on Robert Fehrmann's Data Generation process. 4 | # Program loads the file providing the input and generates the required execution scripts. 5 | # 2019-01-08 jfrink Initial Creation 6 | # 7 | # Version 1.0.0 8 | # 9 | # -> ./snowflake_datagen.py demoxx load_wh peter -db SALES 10 | 11 | import sys 12 | import os, glob, errno 13 | import getpass 14 | import argparse 15 | #import snowflake.connector 16 | import re 17 | import csv 18 | 19 | #Verbose processing 20 | def show_verbose(verbose, text_to_print): 21 | if verbose: 22 | print(text_to_print) 23 | return 24 | 25 | 26 | #run_sql to pass result set back 27 | def run_sql(conn, sql, fetchall): 28 | cur = conn.cursor() 29 | try: 30 | cur.execute(sql) 31 | if fetchall: 32 | res=cur.fetchall() 33 | else: 34 | res = cur.fetchone() 35 | 36 | except (snowflake.connector.errors.ProgrammingError) as e: 37 | print("Statement error: {0}".format(e.msg)) 38 | except: 39 | print("Unexpected error: {0}".format(e.msg)) 40 | 41 | finally: 42 | cur.close() 43 | return res 44 | 45 | def load_db_file(filename,quoted_names): 46 | 47 | database_objects = {} 48 | seed_1 = 10000 #default seed value, increaments for each row added 49 | line_number=0 50 | if (quoted_names): 51 | quotes='"' 52 | else: 53 | quotes='' 54 | 55 | """try: 56 | fh = open(filename, 'rU') 57 | except IOError: 58 | print("Could not open file! " + filename)""" 59 | 60 | #Used to skip the header record in the file. 61 | with open(filename, 'rU') as fh: 62 | next(fh) 63 | line_number+=1 64 | for line_list in csv.reader(fh, delimiter=',', quotechar='"'): 65 | db="TEST" 66 | schema=line_list[0] 67 | tbl=quotes + line_list[1] + quotes 68 | col=quotes + line_list[2] + quotes 69 | cardinality=line_list[3].strip() 70 | tbl_cardinality=line_list[4].strip() 71 | datatype=line_list[5] 72 | datatype_length=line_list[6].strip() 73 | datatype_precision=line_list[7].strip() 74 | if(len(line_list)>8): 75 | null_values=line_list[8].strip() 76 | if (null_values==''): 77 | null_ratio=0 78 | else: 79 | null_ratio=(float(null_values)/float(tbl_cardinality))*1000 80 | else: 81 | null_ratio=0 82 | 83 | seed_1=seed_1+1 84 | 85 | #Reset the formula to make sure we catch all occurances 86 | formula = '' 87 | 88 | if datatype.upper() == 'DATE': 89 | fb = 'dateadd(day, uniform(1, ' + str(cardinality) + ', random('+ str(seed_1) +')), date_trunc(day, current_date))' 90 | fe = '::date as ' + col 91 | elif datatype.upper() == 'TIMESTAMP': 92 | fb = '(date_part(epoch_second, current_date) + (uniform(1, ' + str(cardinality) + ', random('+ str(seed_1) +'))))' 93 | fe = '::timestamp as ' + col 94 | elif datatype.upper() == 'CHAR': 95 | if cardinality == tbl_cardinality: 96 | fb = 'rpad((seq8()+1)::varchar,'+ str(datatype_length) + ", 'abcdefghifklmnopqrstuvwxyz')" 97 | else: 98 | fb = 'rpad(uniform(1, ' + str(cardinality) + ', random(' + str(seed_1) + '))::varchar,'+ str(datatype_length) + ", 'abcdefghifklmnopqrstuvwxyz')" 99 | fe = '::' + datatype.lower() + '(' + str(datatype_length) + ') as ' + col 100 | elif datatype.upper() == 'VARCHAR': 101 | if (datatype_precision==''): 102 | datatype_precision=1 103 | fb = 'randstr(uniform(' + str(datatype_precision) + ',' + str(datatype_length) + ', random(' + str(seed_1) + ')),uniform(1,'+ str(cardinality) + ',random(' + str(seed_1) + ')))' 104 | fe = '::' + datatype.lower() + '(' + str(datatype_length) + ') as ' + col 105 | elif datatype.upper() == 'BIGINT': 106 | if cardinality == tbl_cardinality: 107 | fb = '(seq8()+1)' 108 | else: 109 | fb = 'uniform(1,' + str(cardinality) + ' , random('+ str(seed_1) +'))' 110 | fe = '::' + datatype.lower() + ' as ' + col 111 | elif datatype.upper() == 'BOOLEAN': 112 | if (int(cardinality)>2): 113 | cardinality=2 114 | fb = '(uniform(1,' + str(cardinality) + ' , random('+ str(seed_1) +'))-1)' 115 | fe = '::' + datatype.lower() + ' as ' + col 116 | elif datatype.upper() == 'INTEGER' or datatype.upper() == 'DOUBLE' or datatype.upper() == 'FLOAT': 117 | fb = 'uniform(1,' + str(cardinality) + ' , random('+ str(seed_1) +'))' 118 | fe = '::' + datatype.lower() + ' as ' + col 119 | elif datatype.upper() == 'NUMBER': 120 | fb = 'uniform(1,' + str(cardinality) + ' , random('+ str(seed_1) +'))' 121 | fe = '::number(' + str(datatype_length) + ',' + str(datatype_precision) + ') as ' + col 122 | else: 123 | fb='' 124 | fe='' 125 | 126 | if (fb==''): 127 | if (schema!='' or datatype!=''): 128 | print( "WARNING: Line: {0} Unknown Datatype: {1}".format(line_number,datatype ) ) 129 | else: 130 | if (int(cardinality)<=0): 131 | formula = 'null'+fe 132 | elif (null_ratio<=0): 133 | formula = fb + fe 134 | else: 135 | formula = '(case when uniform(1,1000,random('+str(seed_1+10000)+'))<='+str(int(null_ratio))+' then null else '+fb+' end)'+fe 136 | 137 | column_info={'NAME': col, 'DATA_TYPE': datatype, 'LENGTH': datatype_length, 'CARDINALITY': cardinality, 'TBL_CARDINALITY': tbl_cardinality, 'FORMULA': formula } 138 | 139 | if db not in database_objects: 140 | database_objects[db] = {} 141 | 142 | schema_objects=database_objects[db] 143 | 144 | if schema not in schema_objects: 145 | schema_objects[schema] = {} 146 | 147 | tbl_objects=schema_objects[schema] 148 | 149 | if tbl not in tbl_objects: 150 | tbl_objects[tbl] = [] 151 | 152 | tbl_objects[tbl].append(column_info) 153 | 154 | fh.close() 155 | 156 | return database_objects 157 | 158 | 159 | 160 | def print_ddl(database_objects, outputfile): 161 | 162 | if outputfile: 163 | try: 164 | of = open(outputfile, 'w') 165 | except IOError: 166 | print("Could not open file! " + outputfile) 167 | else: 168 | of = '' 169 | 170 | for db in sorted(database_objects.keys()): #Database Name 171 | #Now loop through the new obj_hash to print out the ddl 172 | printer('-------------------------------------', of) 173 | printer('-- Generating Database: ' + str(db), of) 174 | printer('-------------------------------------', of) 175 | schema_obj=database_objects[db] 176 | for schema in schema_obj: 177 | printer('CREATE TRANSIENT SCHEMA IF NOT EXISTS ' + str(schema) + ' DATA_RETENTION_TIME_IN_DAYS=0;', of) 178 | printer('USE SCHEMA '+ str(schema) +';', of) 179 | tbl_obj=schema_obj[schema] 180 | for tbl in tbl_obj: 181 | printer('CREATE or REPLACE TABLE '+ tbl, of) 182 | printer('AS', of) 183 | printer('SELECT', of) 184 | col_obj=tbl_obj[tbl] 185 | num_of_columns=len(col_obj) 186 | counter=1 187 | for col in col_obj: 188 | if counter < num_of_columns: 189 | printer(' ' + col['FORMULA'] + ',', of) 190 | counter=counter+1 191 | else: 192 | printer(' ' + col['FORMULA'], of) 193 | printer('from table(generator(rowcount => ' + str(col['TBL_CARDINALITY']) + '));' , of) 194 | printer(' ', of) 195 | 196 | if of != '': 197 | if not of.closed: 198 | of.close() 199 | 200 | return 201 | 202 | def printer(text, fh): 203 | 204 | 205 | if fh =='': 206 | print(text) 207 | else: 208 | if not fh.closed: 209 | fh.write(text+"\n") 210 | return 211 | 212 | 213 | def main(): 214 | ##### MAIN ##### 215 | parser = argparse.ArgumentParser(description='Snowflake Data Generation Utility.', 216 | epilog='Example: snowflake_datagen.py source_file.csv') 217 | parser.add_argument('filename', action='store', 218 | help='Customer Sample file template') 219 | parser.add_argument('-sqlfile', '--sqlfile', action='store', 220 | help='Create an output file script containing the create role ddl and grants, to be used with -ddl switch') 221 | parser.add_argument('-q', '--quoted_names', action='store_true', 222 | help='Enclose name with double quotes') 223 | parser.add_argument('-v', '--verbose', action='store_true', help='verbose') 224 | 225 | args=parser.parse_args(); 226 | if args.verbose: 227 | print("filename=" + str(args.filename)) # File to be loaded 228 | print("sqlfile=" + str(args.sqlfile)) # show debugging output 229 | print("verbose=" + str(args.verbose)) # show debugging output 230 | 231 | db_objects=load_db_file(args.filename,args.quoted_names) 232 | 233 | print_ddl(db_objects, args.sqlfile) 234 | 235 | return 236 | 237 | if __name__ == "__main__": 238 | main() 239 | --------------------------------------------------------------------------------