├── python-udfs ├── f_parse_xml(varchar) │ ├── output.csv │ ├── input.csv │ └── function.sql ├── f_ua_parser_family(varchar) │ ├── output.csv │ ├── requirements.txt │ ├── input.csv │ └── function.sql ├── f_cosine_similarity(varchar,varchar) │ ├── output.csv │ └── function.sql ├── f_fuzzy_string_match(varchar,varchar) │ ├── output.csv │ ├── requirements.txt │ ├── input.csv │ └── function.sql ├── f_sentiment(varchar) │ ├── requirements.txt │ ├── input.csv │ ├── output.csv │ └── function.sql ├── f_null_syns(varchar) │ ├── input.csv │ ├── output.csv │ └── function.sql ├── fn_levenshtein_distance(varchar,varchar) │ ├── output.csv │ ├── input.csv │ └── function.sql ├── f_next_business_day(date) │ ├── output.csv │ ├── input.csv │ └── function.sql ├── f_parse_url(varchar,varchar) │ ├── output.csv │ ├── input.csv │ └── function.sql ├── f_bitwise_to_string(bigint,int) │ ├── output.csv │ ├── input.csv │ └── function.sql ├── f_unixts_to_timestamp(bigint,varchar) │ ├── input.csv │ ├── output.csv │ └── function.sql ├── f_format_number(float,varchar,varchar,int,int,bool) │ ├── output.csv │ ├── input.csv │ └── function.sql ├── f_bitwise_to_delimited(bigint,int,char) │ ├── output.csv │ ├── input.csv │ └── function.sql └── f_parse_url_query_string(varchar) │ ├── output.csv │ ├── input.csv │ └── function.sql ├── lambda-udfs ├── f_lower_python(varchar) │ ├── output.csv │ ├── input.csv │ ├── function.sql │ └── lambda.yaml ├── f_upper_java(varchar) │ ├── output.csv │ ├── input.csv │ ├── function.sql │ ├── src │ │ └── main │ │ │ └── java │ │ │ └── f_upper_java_varchar │ │ │ └── Handler.java │ ├── lambda.yaml │ └── pom.xml ├── f_upper_nodejs(varchar) │ ├── output.csv │ ├── input.csv │ ├── function.sql │ └── lambda.yaml ├── f_upper_python(varchar) │ ├── output.csv │ ├── input.csv │ ├── function.sql │ └── lambda.yaml ├── fn_lambda_levenshtein_distance(varchar,varchar) │ ├── output.csv │ ├── input.csv │ ├── function.sql │ └── lambda.yaml ├── f_glue_schema_registry_avro_to_json(varchar,varchar,varchar) │ ├── requirements.txt │ ├── example.png │ ├── output.csv │ ├── input.csv │ ├── resources.yaml │ ├── function.sql │ ├── README.md │ └── lambda.yaml ├── f_kms_decrypt(varchar) │ ├── output.csv.sample │ ├── function.sql │ ├── input.csv │ └── lambda.yaml ├── f_mysql_lookup_nodejs(varchar,varchar,varchar,varchar) │ ├── package.json │ ├── output.csv │ ├── input.csv │ ├── function.sql │ ├── index.js │ ├── lambda.yaml │ └── resources.yaml ├── f_unmarshall_dynamodb_stream_data(varchar) │ ├── example.png │ ├── function.sql │ ├── output.csv │ ├── input.csv │ ├── README.md │ └── lambda.yaml ├── f_kms_encrypt(varchar,varchar) │ ├── input.csv │ ├── resources.yaml │ ├── function.sql │ ├── output.csv.sample │ └── lambda.yaml ├── f_dynamodb_lookup_python(varchar,varchar,varchar) │ ├── input.csv │ ├── output.csv │ ├── function.sql │ ├── lambda.yaml │ └── resources.yaml └── f_titan_embedding(varchar) │ ├── function.sql │ ├── input.csv │ └── lambda.yaml ├── sql-udfs ├── f_mask_bigint(bigint,varchar,varchar) │ ├── output.csv │ ├── input.csv │ └── function.sql ├── f_mask_timestamp(timestamp,varchar,varchar) │ ├── output.csv │ ├── input.csv │ └── function.sql └── f_mask_varchar(varchar,varchar,varchar) │ ├── output.csv │ ├── input.csv │ └── function.sql ├── stored-procedures ├── sp_update_permissions │ ├── test_files │ │ ├── user_access_details6.csv │ │ ├── user_access_details1.csv │ │ ├── user_access_details4.csv │ │ ├── user_access_details3.csv │ │ ├── user_access_details2.csv │ │ └── user_access_details5.csv │ ├── README.md │ ├── 
test_scenarios.sql │ └── sp_update_permissions.sql ├── README.md ├── sp_check_foreign_key │ ├── README.md │ └── sp_check_foreign_key.sql ├── sp_vector_search_all │ └── sp_vector_search_all.sql ├── sp_correlation │ └── sp_correlation.sql ├── sp_vector_search │ └── sp_vector_search.sql ├── sp_controlled_access │ └── sp_controlled_access.sql ├── sp_kmeans │ └── sp_kmeans.sql ├── sp_analyze_minimal │ └── sp_analyze_minimal.sql ├── sp_pivot_for │ └── sp_pivot_for.sql ├── sp_connect_by_prior │ └── sp_connect_by_prior.sql └── sp_split_table_by_range │ └── sp_split_table_by_range.sql ├── NOTICE.txt ├── .github └── workflows │ └── github-actions.yml ├── bin ├── layerInstaller.sh ├── libraryInstaller.sh ├── testFunction.sh └── deployFunction.sh ├── views └── v_generate_udf_ddl.sql ├── CONTRIBUTING.md └── README.md /python-udfs/f_parse_xml(varchar)/output.csv: -------------------------------------------------------------------------------- 1 | Liechtenstein:1 2 | -------------------------------------------------------------------------------- /python-udfs/f_ua_parser_family(varchar)/output.csv: -------------------------------------------------------------------------------- 1 | Chrome 2 | -------------------------------------------------------------------------------- /python-udfs/f_cosine_similarity(varchar,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | 1 2 | -------------------------------------------------------------------------------- /python-udfs/f_fuzzy_string_match(varchar,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | 97 2 | -------------------------------------------------------------------------------- /python-udfs/f_sentiment(varchar)/requirements.txt: -------------------------------------------------------------------------------- 1 | nltk==3.2.1 2 | -------------------------------------------------------------------------------- /lambda-udfs/f_lower_python(varchar)/output.csv: -------------------------------------------------------------------------------- 1 | abc 2 | test 3 | 123 4 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_java(varchar)/output.csv: -------------------------------------------------------------------------------- 1 | ABC 2 | TEST 3 | 123 4 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_nodejs(varchar)/output.csv: -------------------------------------------------------------------------------- 1 | ABC 2 | TEST 3 | 123 4 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_python(varchar)/output.csv: -------------------------------------------------------------------------------- 1 | ABC 2 | TEST 3 | 123A 4 | -------------------------------------------------------------------------------- /python-udfs/f_ua_parser_family(varchar)/requirements.txt: -------------------------------------------------------------------------------- 1 | ua_parser 2 | -------------------------------------------------------------------------------- /lambda-udfs/f_lower_python(varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'aBc' 2 | 'TesT' 3 | '123' 4 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_java(varchar)/input.csv: -------------------------------------------------------------------------------- 1 
| 'abc' 2 | 'test' 3 | '123' 4 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_nodejs(varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'abc' 2 | 'test' 3 | '123' 4 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_python(varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'abc' 2 | 'test' 3 | '123a' 4 | -------------------------------------------------------------------------------- /python-udfs/f_fuzzy_string_match(varchar,varchar)/requirements.txt: -------------------------------------------------------------------------------- 1 | thefuzz 2 | -------------------------------------------------------------------------------- /python-udfs/f_sentiment(varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'Wow, NLTK is really powerful!' 2 | -------------------------------------------------------------------------------- /python-udfs/f_null_syns(varchar)/input.csv: -------------------------------------------------------------------------------- 1 | NULL 2 | 'VOID' 3 | '' 4 | '' 5 | -------------------------------------------------------------------------------- /python-udfs/f_null_syns(varchar)/output.csv: -------------------------------------------------------------------------------- 1 | false 2 | true 3 | false 4 | true 5 | -------------------------------------------------------------------------------- /python-udfs/fn_levenshtein_distance(varchar,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 3 4 | -------------------------------------------------------------------------------- /lambda-udfs/fn_lambda_levenshtein_distance(varchar,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | 0 2 | 1 3 | 3 4 | -------------------------------------------------------------------------------- /python-udfs/f_next_business_day(date)/output.csv: -------------------------------------------------------------------------------- 1 | 2015-09-08 2 | 2015-09-08 3 | 2015-09-09 4 | -------------------------------------------------------------------------------- /python-udfs/f_fuzzy_string_match(varchar,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'this is a test','this is a test!' 
2 | -------------------------------------------------------------------------------- /python-udfs/f_next_business_day(date)/input.csv: -------------------------------------------------------------------------------- 1 | '2015-09-04' 2 | '2015-09-05' 3 | '2015-09-08' 4 | -------------------------------------------------------------------------------- /lambda-udfs/f_glue_schema_registry_avro_to_json(varchar,varchar,varchar)/requirements.txt: -------------------------------------------------------------------------------- 1 | avro==1.11.2 2 | -------------------------------------------------------------------------------- /python-udfs/f_parse_url(varchar,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | https 2 | example.com 3 | /issues/23 4 | hello=world 5 | -------------------------------------------------------------------------------- /python-udfs/f_sentiment(varchar)/output.csv: -------------------------------------------------------------------------------- 1 | {'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012} 2 | -------------------------------------------------------------------------------- /python-udfs/f_bitwise_to_string(bigint,int)/output.csv: -------------------------------------------------------------------------------- 1 | 100011001 2 | 000011010 3 | 100011101 4 | 000110001 5 | -------------------------------------------------------------------------------- /python-udfs/fn_levenshtein_distance(varchar,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'cat','cat' 2 | 'cat','cats' 3 | 'cat','dog' 4 | -------------------------------------------------------------------------------- /lambda-udfs/fn_lambda_levenshtein_distance(varchar,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'cat','cat' 2 | 'cat','cats' 3 | 'cat','dog' 4 | -------------------------------------------------------------------------------- /sql-udfs/f_mask_bigint(bigint,varchar,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | 123 2 | 1044974888152125746 3 | 1234567890 4 | null 5 | 890 6 | -------------------------------------------------------------------------------- /python-udfs/f_unixts_to_timestamp(bigint,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 1349720105,'s' 2 | 1349720105123,'ms' 3 | 1349720105123123,'us' 4 | -------------------------------------------------------------------------------- /python-udfs/f_format_number(float,varchar,varchar,int,int,bool)/output.csv: -------------------------------------------------------------------------------- 1 | -123,456,789.00 2 | 123 456 789 3 | +123,456,789.00 4 | -------------------------------------------------------------------------------- /lambda-udfs/f_kms_decrypt(varchar)/output.csv.sample: -------------------------------------------------------------------------------- 1 | jdoe@org.com 2 | John Doe 3 | ejones@org.com 4 | Mary Contrary 5 | jhndoe@org.com 6 | -------------------------------------------------------------------------------- /python-udfs/f_unixts_to_timestamp(bigint,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | 2012-10-08 18:15:05 2 | 2012-10-08 18:15:05.123 3 | 2012-10-08 18:15:05.123123 4 | -------------------------------------------------------------------------------- 
/stored-procedures/sp_update_permissions/test_files/user_access_details6.csv: -------------------------------------------------------------------------------- 1 | 1|attach|user1|user|table|test_user_management.sales|only_self_sales -------------------------------------------------------------------------------- /lambda-udfs/f_mysql_lookup_nodejs(varchar,varchar,varchar,varchar)/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "mysql2": "^2.3.0" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /python-udfs/f_bitwise_to_delimited(bigint,int,char)/output.csv: -------------------------------------------------------------------------------- 1 | 1,0,0,0,1,1,0,0,1 2 | 0,0,0,0,1,1,0,1,0 3 | 1,0,0,0,1,1,1,0,1 4 | 0,0,0,1,1,0,0,0,1 5 | -------------------------------------------------------------------------------- /python-udfs/f_bitwise_to_string(bigint,int)/input.csv: -------------------------------------------------------------------------------- 1 | B'100011001'::integer,9 2 | B'000011010'::integer,9 3 | B'100011101'::integer,9 4 | B'000110001'::integer,9 5 | -------------------------------------------------------------------------------- /sql-udfs/f_mask_timestamp(timestamp,varchar,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | 1900-09-01 00:00:00 2 | 2020-11-23 00:00:00 3 | 2021-09-01 00:00:00 4 | null 5 | 2021-01-01 00:00:00 6 | -------------------------------------------------------------------------------- /python-udfs/f_format_number(float,varchar,varchar,int,int,bool)/input.csv: -------------------------------------------------------------------------------- 1 | -123456789, ',', '.', 3, 2, FALSE 2 | 123456789, ' ', '.', 3, 0, 0 3 | 123456789, ',', '.', 3, 2, 1 4 | -------------------------------------------------------------------------------- /python-udfs/f_parse_url_query_string(varchar)/output.csv: -------------------------------------------------------------------------------- 1 | {"name": "ferret"} 2 | {"Status": "Elite"} 3 | {"status": "Currently"} 4 | {"utf8": "\u2713", "query": "redshift"} 5 | -------------------------------------------------------------------------------- /python-udfs/f_ua_parser_family(varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.104 Safari/537.36' 2 | -------------------------------------------------------------------------------- /python-udfs/f_bitwise_to_delimited(bigint,int,char)/input.csv: -------------------------------------------------------------------------------- 1 | B'100011001'::integer,9,',' 2 | B'000011010'::integer,9,',' 3 | B'100011101'::integer,9,',' 4 | B'000110001'::integer,9,',' 5 | -------------------------------------------------------------------------------- /stored-procedures/sp_update_permissions/test_files/user_access_details1.csv: -------------------------------------------------------------------------------- 1 | 1|grant|user1|user|schema|test_user_management|usage 2 | 2|grant|user1|user|table|test_user_management.sales|select -------------------------------------------------------------------------------- /sql-udfs/f_mask_bigint(bigint,varchar,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | '1234567890', NULL, 'P' 2 | '1234567890', 
NULL, 'F' 3 | '1234567890', NULL, 'N' 4 | '1234567890', NULL, NULL 5 | '1234567890', 'cc', 'P' 6 | -------------------------------------------------------------------------------- /sql-udfs/f_mask_timestamp(timestamp,varchar,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | '9/1/2021', NULL, 'P' 2 | '9/1/2021', NULL, 'F' 3 | '9/1/2021', NULL, 'N' 4 | '9/1/2021', NULL, NULL 5 | '9/1/2021', 'dob', 'P' 6 | -------------------------------------------------------------------------------- /lambda-udfs/f_unmarshall_dynamodb_stream_data(varchar)/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-udfs/HEAD/lambda-udfs/f_unmarshall_dynamodb_stream_data(varchar)/example.png -------------------------------------------------------------------------------- /stored-procedures/sp_update_permissions/test_files/user_access_details4.csv: -------------------------------------------------------------------------------- 1 | 1|revoke|user1|user|function|f_sql_greater (float, float)|execute 2 | 2|grant|group1|group|function|f_sql_greater (float, float)|execute -------------------------------------------------------------------------------- /stored-procedures/sp_update_permissions/test_files/user_access_details3.csv: -------------------------------------------------------------------------------- 1 | 1|grant|user1|user|table|test_user_management.sales|SELECT(buyer_id, price_paid) 2 | 2|grant|user1|user|function|f_sql_greater (float, float)|execute -------------------------------------------------------------------------------- /lambda-udfs/f_glue_schema_registry_avro_to_json(varchar,varchar,varchar)/example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-udfs/HEAD/lambda-udfs/f_glue_schema_registry_avro_to_json(varchar,varchar,varchar)/example.png -------------------------------------------------------------------------------- /python-udfs/f_parse_xml(varchar)/input.csv: -------------------------------------------------------------------------------- 1 | '<data><country name="Liechtenstein"><rank>1</rank><year>2008</year><gdppc>141100</gdppc></country></data>' 2 | -------------------------------------------------------------------------------- /lambda-udfs/f_kms_encrypt(varchar,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'alias/f-kms-encrypt', 'jdoe@org.com' 2 | 'alias/f-kms-encrypt', 'John Doe' 3 | 'alias/f-kms-encrypt', 'ejones@org.com' 4 | 'alias/f-kms-encrypt', 'Mary Contrary' 5 | 'alias/f-kms-encrypt', 'jhndoe@org.com' 6 | -------------------------------------------------------------------------------- /lambda-udfs/f_dynamodb_lookup_python(varchar,varchar,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'CustomerLambdaUDF','id','0' 2 | 'CustomerLambdaUDF','id','1' 3 | 'CustomerLambdaUDF','id','2' 4 | 'CustomerLambdaUDF','id','3' 5 | 'CustomerLambdaUDF','id','4' 6 | 'CustomerLambdaUDF','id','12' 7 | -------------------------------------------------------------------------------- /lambda-udfs/fn_lambda_levenshtein_distance(varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | create or replace external function fn_lambda_levenshtein_distance(a varchar, b varchar) returns int 2 | lambda 'fn_lambda_levenshtein_distance' iam_role ':RedshiftRole' immutable retry_timeout 0; 3 |
-------------------------------------------------------------------------------- /python-udfs/f_parse_url(varchar,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'https://example.com/issues/23?hello=world', 'scheme' 2 | 'https://example.com/issues/23?hello=world', 'hostname' 3 | 'https://example.com/issues/23?hello=world', 'path' 4 | 'https://example.com/issues/23?hello=world', 'query' 5 | -------------------------------------------------------------------------------- /python-udfs/f_parse_url_query_string(varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'http://example.com/over/there?name=ferret' 2 | 'http://example.com/Sales/DeptData/Elites.aspx?Status=Elite' 3 | 'http://example.com/home?status=Currently' 4 | 'https://example.com/ops/search?utf8=%E2%9C%93&query=redshift' 5 | -------------------------------------------------------------------------------- /sql-udfs/f_mask_varchar(varchar,varchar,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | Janxxxxx 2 | 1c272047233576d77a9b9a1acfdf741c 3 | Jane Doe 4 | null 5 | jdoxxxx@org.com 6 | c51f82d521e9a9a847cc035e6e92d8b4 7 | jdoe@org.com 8 | 123-45-xxxx 9 | 1e87489a7ea3c3492ddaeee9c13f386d 10 | 123-45-6789 11 | -------------------------------------------------------------------------------- /lambda-udfs/f_mysql_lookup_nodejs(varchar,varchar,varchar,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | {"id":null,"fname":null,"lname":null} 2 | {"id":"2","fname":"Jane","lname":"Doe"} 3 | {"id":"3","fname":"Carlos","lname":"Salazar"} 4 | {"id":"4","fname":"Diego","lname":"Ramirez"} 5 | {"id":null,"fname":null,"lname":null} 6 | -------------------------------------------------------------------------------- /stored-procedures/sp_update_permissions/test_files/user_access_details2.csv: -------------------------------------------------------------------------------- 1 | 1|grant|user1|user|schema|test_user_management|usage 2 | 2|revoke|user1|user|table|test_user_management.sales|select 3 | 3|grant|group1|group|schema|test_user_management|usage 4 | 4|grant|group1|group|table|test_user_management.sales|select -------------------------------------------------------------------------------- /lambda-udfs/f_mysql_lookup_nodejs(varchar,varchar,varchar,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'public.customers', 'id', 'lookupnodejs','0' 2 | 'public.customers', 'id', 'lookupnodejs','2' 3 | 'public.customers', 'id', 'lookupnodejs','3' 4 | 'public.customers', 'id', 'lookupnodejs','4' 5 | 'public.customers', 'id', 'lookupnodejs','12' 6 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_java(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to create/use lambda UDFs in java 4 | 5 | 2021-08-01: written by rjvgupta 6 | */ 7 | CREATE OR REPLACE EXTERNAL FUNCTION f_upper_java(varchar) RETURNS varchar IMMUTABLE 8 | LAMBDA 'f-upper-java-varchar' IAM_ROLE ':RedshiftRole'; 9 | -------------------------------------------------------------------------------- /lambda-udfs/f_lower_python(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample 
function demonstrates how to create/use lambda UDFs in python 4 | 2021-08-01: written by rjvgupta 5 | */ 6 | CREATE OR REPLACE EXTERNAL FUNCTION f_lower_python(varchar) RETURNS varchar IMMUTABLE 7 | LAMBDA 'f-lower-python-varchar' IAM_ROLE ':RedshiftRole'; 8 | -------------------------------------------------------------------------------- /lambda-udfs/f_dynamodb_lookup_python(varchar,varchar,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | null 2 | {"size": "2", "lname": "Doe", "id": "1", "fname": "John"} 3 | {"size": "3", "lname": "Doe", "id": "2", "fname": "Jane"} 4 | {"size": "5", "lname": "Salazar", "id": "3", "fname": "Carlos"} 5 | {"size": "7", "lname": "Ramirez", "id": "4", "fname": "Diego"} 6 | null 7 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_nodejs(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to create/use lambda UDFs in nodejs 4 | 5 | 2021-08-01: written by rjvgupta 6 | */ 7 | CREATE OR REPLACE EXTERNAL FUNCTION f_upper_nodejs(varchar) RETURNS varchar IMMUTABLE 8 | LAMBDA 'f-upper-nodejs-varchar' IAM_ROLE ':RedshiftRole'; 9 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_python(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to create/use lambda UDFs in python 4 | 5 | 2021-08-01: written by rjvgupta 6 | */ 7 | CREATE OR REPLACE EXTERNAL FUNCTION f_upper_python(varchar) RETURNS varchar IMMUTABLE 8 | LAMBDA 'f-upper-python-varchar' IAM_ROLE ':RedshiftRole'; 9 | -------------------------------------------------------------------------------- /sql-udfs/f_mask_varchar(varchar,varchar,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'Jane Doe', NULL, 'P' 2 | 'Jane Doe', NULL, 'F' 3 | 'Jane Doe', NULL, 'N' 4 | 'Jane Doe', NULL, NULL 5 | 'jdoe@org.com', 'email', 'P' 6 | 'jdoe@org.com', 'email', 'F' 7 | 'jdoe@org.com', 'email', 'N' 8 | '123-45-6789', 'ssn', 'P' 9 | '123-45-6789', 'ssn', 'F' 10 | '123-45-6789', 'ssn', 'N' 11 | -------------------------------------------------------------------------------- /stored-procedures/sp_update_permissions/test_files/user_access_details5.csv: -------------------------------------------------------------------------------- 1 | 1|revoke|user1|user|table|test_user_management.sales|SELECT(buyer_id, price_paid) 2 | 2|revoke|group1|group|table|test_user_management.sales|SELECT 3 | 1|grant|group1|group|table|ALL TABLES IN SCHEMA test_user_management|SELECT 4 | 2|grant|role1|role|table|test_user_management.sales|SELECT -------------------------------------------------------------------------------- /lambda-udfs/f_glue_schema_registry_avro_to_json(varchar,varchar,varchar)/output.csv: -------------------------------------------------------------------------------- 1 | {"name": "Moiraine", "favorite_number": 4, "favorite_color": "Blue"} 2 | {"name": "Elaida", "favorite_number": 1, "favorite_color": "Red"} 3 | {"name": "Cadsuane", "favorite_number": 2, "favorite_color": "Green"} 4 | {"name": "Alviarin", "favorite_number": 0, "favorite_color": "Black"} 5 | --------------------------------------------------------------------------------
/lambda-udfs/f_glue_schema_registry_avro_to_json(varchar,varchar,varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'AvroLambdaUDF','lambda_udf_stream','104d6f697261696e6500080008426c7565' 2 | 'AvroLambdaUDF','lambda_udf_stream','0c456c6169646100020006526564' 3 | 'AvroLambdaUDF','lambda_udf_stream','104361647375616e650004000a477265656e' 4 | 'AvroLambdaUDF','lambda_udf_stream','10416c76696172696e0000000a426c61636b' 5 | -------------------------------------------------------------------------------- /lambda-udfs/f_titan_embedding(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to use lambda to call the titan embedding model to convert your text to an embedding. 4 | 5 | 2024-07-11: written by rjvgupta 6 | */ 7 | CREATE OR REPLACE EXTERNAL FUNCTION f_titan_embedding (varchar) RETURNS varchar(max) STABLE 8 | LAMBDA 'f-titan-embedding-varchar' IAM_ROLE ':RedshiftRole'; 9 | -------------------------------------------------------------------------------- /python-udfs/f_unixts_to_timestamp(bigint,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* Purpose: Converts a UNIX timestamp to a UTC datetime with up to microseconds granularity. 2 | 3 | 2015-09-10: written by chriz@ 4 | */ 5 | 6 | CREATE OR REPLACE FUNCTION f_unixts_to_timestamp(ts BIGINT, units varchar) 7 | RETURNS timestamp 8 | STABLE 9 | AS $$ 10 | import pandas 11 | return pandas.to_datetime(ts, unit=units.rstrip()) 12 | $$ LANGUAGE plpythonu; 13 | -------------------------------------------------------------------------------- /python-udfs/f_fuzzy_string_match(varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: This function uses Levenshtein Distance to calculate the differences between sequences. 3 | 4 | 2022-08-14: written by saeedsb 5 | */ 6 | -- 7 | CREATE OR REPLACE FUNCTION f_fuzzy_string_match (string_a VARCHAR,string_b VARCHAR) 8 | RETURNS FLOAT IMMUTABLE AS $$ 9 | from thefuzz import fuzz 10 | 11 | return fuzz.ratio (string_a,string_b) 12 | $$ LANGUAGE plpythonu; -------------------------------------------------------------------------------- /python-udfs/f_sentiment(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: This function will return the sentiment of a text field using the nltk library. 3 | 4 | 2023-09-29: written by rjvgupta 5 | */ 6 | -- 7 | CREATE OR REPLACE FUNCTION f_sentiment (text VARCHAR) 8 | RETURNS VARCHAR IMMUTABLE AS $$ 9 | from nltk.sentiment import SentimentIntensityAnalyzer 10 | sia = SentimentIntensityAnalyzer() 11 | return sia.polarity_scores(text) 12 | $$ LANGUAGE plpythonu; 13 | -------------------------------------------------------------------------------- /python-udfs/f_ua_parser_family(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: This function extracts the user agent family from the user_agent string. This function demos packaging of an external library.
3 | 4 | 2021-08-08: written by rjvgupta 5 | */ 6 | -- 7 | CREATE OR REPLACE FUNCTION f_ua_parser_family (ua VARCHAR) 8 | RETURNS VARCHAR IMMUTABLE AS $$ 9 | from ua_parser import user_agent_parser 10 | return user_agent_parser.ParseUserAgent(ua)['family'] 11 | $$ LANGUAGE plpythonu; 12 | -------------------------------------------------------------------------------- /python-udfs/f_cosine_similarity(varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: This function uses numpy to determine the similarity between two vectors. 3 | 4 | 2021-08-08: written by rjvgupta 5 | */ 6 | -- 7 | CREATE OR REPLACE FUNCTION f_cosine_similarity (v1 VARCHAR(MAX), v2 VARCHAR(MAX)) 8 | RETURNS FLOAT8 IMMUTABLE AS $$ 9 | import numpy,json 10 | from numpy.linalg import norm 11 | A = numpy.array(json.loads(v1)) 12 | B = numpy.array(json.loads(v2)) 13 | return numpy.dot(A,B)/(norm(A)*norm(B)) 14 | $$ LANGUAGE plpythonu; 15 | -------------------------------------------------------------------------------- /python-udfs/f_parse_url_query_string(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* Purpose: This UDF takes a URL as an argument, and parses out the field-value pairs. 2 | Returns pairs in JSON for further parsing if needed. 3 | 4 | 2015-09-10: written by chriz@ 5 | */ 6 | 7 | CREATE OR REPLACE FUNCTION f_parse_url_query_string(url VARCHAR(MAX)) 8 | RETURNS varchar(max) 9 | STABLE 10 | AS $$ 11 | from urlparse import urlparse, parse_qsl 12 | import json 13 | return json.dumps(dict(parse_qsl(urlparse(url)[4]))) 14 | $$ LANGUAGE plpythonu; 15 | -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | Copyright 2015-2015 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the License. A copy of the License is located at 4 | 5 | http://aws.amazon.com/apache2.0/ 6 | 7 | or in the "license" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 8 | -------------------------------------------------------------------------------- /python-udfs/f_parse_xml(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* UDF: 2 | 3 | Purpose: This function showcases how parsing XML is possible with UDFs. 4 | 5 | 2015-09-10: written by chriz@ 6 | */ 7 | 8 | CREATE OR REPLACE FUNCTION f_parse_xml(xml VARCHAR(MAX)) 9 | RETURNS varchar(max) 10 | STABLE 11 | AS $$ 12 | import xml.etree.ElementTree as ET 13 | root = ET.fromstring(xml) 14 | for country in root.findall('country'): 15 | rank = country.find('rank').text 16 | name = country.get('name') 17 | return name + ':' + rank 18 | $$ LANGUAGE plpythonu; 19 | -------------------------------------------------------------------------------- /lambda-udfs/f_dynamodb_lookup_python(varchar,varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to create/use lambda UDFs in python to use external services like dynamoDB.
4 | https://aws.amazon.com/blogs/big-data/accessing-external-components-using-amazon-redshift-lambda-udfs/ 5 | 6 | 2021-08-01: written by rjvgupta 7 | */ 8 | CREATE OR REPLACE EXTERNAL FUNCTION f_dynamodb_lookup_python (varchar, varchar, varchar) RETURNS varchar STABLE 9 | LAMBDA 'f-dynamodb-lookup-python-varchar-varchar-varchar' IAM_ROLE ':RedshiftRole'; 10 | -------------------------------------------------------------------------------- /lambda-udfs/f_kms_decrypt(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to decrypt data which was encrypted using a KMS key. This function can be used in conjunction with f_kms_encrypt. 4 | 5 | Note: the test input/output is for illustration and would need to be modified to use data that was encrypted with YOUR kms key. 6 | 7 | 2021-09-08: written by rjvgupta 8 | */ 9 | 10 | CREATE OR REPLACE EXTERNAL FUNCTION f_kms_decrypt (value varchar(max)) 11 | RETURNS varchar STABLE 12 | LAMBDA 'f-kms-decrypt-varchar' 13 | IAM_ROLE ':RedshiftRole'; 14 | -------------------------------------------------------------------------------- /lambda-udfs/f_unmarshall_dynamodb_stream_data(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to convert DynamoDB marshalled data in a Kinesis stream 4 | from DynamoDB streams into unmarshalled (normal JSON) format for usage in Redshift materialized views. 5 | TODO: Blog link when published 6 | 7 | 2022-08-30: written by mmehrten 8 | */ 9 | CREATE OR REPLACE EXTERNAL FUNCTION f_unmarshall_dynamodb_stream_data(VARCHAR(MAX)) RETURNS VARCHAR(MAX) IMMUTABLE 10 | LAMBDA 'f-unmarshall-dynamodb-stream-data-varchar' IAM_ROLE ':RedshiftRole'; 11 | -------------------------------------------------------------------------------- /lambda-udfs/f_kms_encrypt(varchar,varchar)/resources.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | KMSKey: 4 | Type: "AWS::KMS::Key" 5 | Properties: 6 | KeyPolicy: 7 | Version: '2012-10-17' 8 | Statement: 9 | - Effect: Allow 10 | Principal: 11 | AWS: !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:root' 12 | Action: 'kms:*' 13 | Resource: '*' 14 | KMSKeyAlias: 15 | Type: AWS::KMS::Alias 16 | Properties: 17 | AliasName: alias/f-kms-encrypt 18 | TargetKeyId: !Ref KMSKey 19 | -------------------------------------------------------------------------------- /lambda-udfs/f_kms_encrypt(varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to encrypt data using a KMS key. 4 | This function can be used in conjunction with f_kms_decrypt.
5 | 6 | Note: the test input/output is for illustration and would need to be modified to use data that was encrypted with YOUR kms key 7 | 8 | 2021-09-08: written by rjvgupta 9 | */ 10 | CREATE OR REPLACE EXTERNAL FUNCTION f_kms_encrypt (key varchar, value varchar) 11 | RETURNS varchar(max) STABLE 12 | LAMBDA 'f-kms-encrypt-varchar-varchar' 13 | IAM_ROLE ':RedshiftRole'; 14 | -------------------------------------------------------------------------------- /python-udfs/f_next_business_day(date)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: Returns the next business day with respect to US Federal Holidays and a M-F work week. 3 | Arguments: 4 | • `dt` - date to be shifted 5 | 6 | 2015-09-10: written by chriz@ 7 | */ 8 | 9 | CREATE OR REPLACE FUNCTION f_next_business_day(dt DATE) 10 | RETURNS date 11 | STABLE 12 | AS $$ 13 | import pandas 14 | from pandas.tseries.offsets import CustomBusinessDay 15 | from pandas.tseries.holiday import USFederalHolidayCalendar 16 | 17 | bday_us = CustomBusinessDay(calendar=USFederalHolidayCalendar()) 18 | return dt + bday_us 19 | $$ LANGUAGE plpythonu; 20 | -------------------------------------------------------------------------------- /python-udfs/f_null_syns(varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* UDF: 2 | 3 | Purpose: This function showcases python SET and BOOLEAN support as well as how an argument can be matched against synonyms, 4 | similar to a SQL IN condition. You might use it as follows: 5 | 6 | UPDATE null_tbl SET a = NULL WHERE f_null_syns(a) = TRUE; 7 | 8 | 2015-09-10: written by chriz@ 9 | */ 10 | 11 | CREATE OR REPLACE FUNCTION f_null_syns(a VARCHAR) 12 | RETURNS boolean 13 | STABLE 14 | AS $$ 15 | s = {"null", "invalid", "unknown", "n/a", "not applicable", "void", "nothing", "nonexistent", "null and void", ""} 16 | b = str(a).lower() 17 | return b in s 18 | $$ LANGUAGE plpythonu; 19 | -------------------------------------------------------------------------------- /python-udfs/f_parse_url(varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This UDF takes a URL and attribute name as arguments, 4 | and return the specified attribute of the given URL. 5 | Attributes definition: https://docs.python.org/2.7/library/urlparse.html#module-urlparse 6 | 7 | 2016-12-29: written by inohiro 8 | */ 9 | 10 | CREATE OR REPLACE FUNCTION f_parse_url(url VARCHAR(MAX), part VARCHAR(20)) 11 | RETURNS VARCHAR(MAX) 12 | IMMUTABLE 13 | AS $$ 14 | if url is None: 15 | return None 16 | else: 17 | import urlparse 18 | parsed = urlparse.urlparse(url) 19 | return getattr(parsed, part) 20 | $$ LANGUAGE plpythonu 21 | ; 22 | -------------------------------------------------------------------------------- /lambda-udfs/f_titan_embedding(varchar)/input.csv: -------------------------------------------------------------------------------- 1 | 'With ComfortCut blades, you get a clean shave that''s comfortable on your skin. Rounded caps shield 27 self-sharpening blades to gently cut hair just above skin level and help the shaver glide smoothly over your skin.\n4D Flex Heads move independently in 4 directions to automatically adjust to the curves of your face, neck and jaw line.\nPop-up trimmer for mustache and sideburns Finish your look with the built-in trimmer. 
It’s ideal for maintaining your mustache and trimming your sideburns.\nAt the touch of a button, you can pop the heads open for an easy and thorough clean under the tap.\nUp to 40 minutes of cordless runtime - that''s about 13 shaves. Or plug it in for instant, continuous power.' 2 | -------------------------------------------------------------------------------- /lambda-udfs/f_mysql_lookup_nodejs(varchar,varchar,varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to create/use lambda UDFs in nodejs to use external services like mySQL. 4 | https://aws.amazon.com/blogs/big-data/accessing-external-components-using-amazon-redshift-lambda-udfs/ 5 | 6 | Usage: p1 - schema.table 7 | p2 - join column 8 | p3 - secretArn containing host, user, password 9 | p4 - join value 10 | 11 | 2021-08-01: written by rjvgupta 12 | */ 13 | CREATE OR REPLACE EXTERNAL FUNCTION f_mysql_lookup_nodejs (varchar, varchar, varchar, varchar) RETURNS varchar STABLE 14 | LAMBDA 'f-mysql-lookup-nodejs-varchar-varchar-varchar-varchar' IAM_ROLE ':RedshiftRole'; 15 | -------------------------------------------------------------------------------- /lambda-udfs/f_glue_schema_registry_avro_to_json(varchar,varchar,varchar)/resources.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | GlueSchemaRegistry: 4 | Type: AWS::Glue::Registry 5 | Properties: 6 | Name: AvroLambdaUDF 7 | AvroSchema: 8 | Type: AWS::Glue::Schema 9 | Properties: 10 | Compatibility: BACKWARD 11 | DataFormat: AVRO 12 | Name: lambda_udf_stream 13 | Registry: 14 | Arn: 15 | !Ref GlueSchemaRegistry 16 | SchemaDefinition: | 17 | { 18 | "type": "record", 19 | "name": "User", 20 | "namespace": "example.avro", 21 | "fields": [ 22 | {"type": "string", "name": "name"}, 23 | {"type": ["int", "null"], "name": "favorite_number"}, 24 | {"type": ["string", "null"], "name": "favorite_color"} 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /lambda-udfs/f_glue_schema_registry_avro_to_json(varchar,varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to create/use lambda UDFs in python to convert Avro encoded data into JSON for use in 4 | Redshift SUPER data type using the Glue Schema Registry. 5 | TODO: Link to blog 6 | 7 | Arguments: 8 | :param registry_name: The Glue Schema Registry name for the schema 9 | :param schema_name: The schema name for the data to retrieve from the registry 10 | :param data: The Hex-encoded Avro binary data 11 | 12 | Returns: 13 | The data encoded as JSON. 14 | 15 | 2023-08-25: written by mmehrten 16 | */ 17 | CREATE OR REPLACE EXTERNAL FUNCTION f_glue_schema_registry_avro_to_json (varchar, varchar, varchar) 18 | RETURNS varchar 19 | IMMUTABLE 20 | LAMBDA 'f-glue-schema-registry-avro-to-json-varchar-varchar-varchar' 21 | IAM_ROLE ':RedshiftRole'; 22 | -------------------------------------------------------------------------------- /python-udfs/f_bitwise_to_string(bigint,int)/function.sql: -------------------------------------------------------------------------------- 1 | /* f_bitwise_to_string.sql 2 | 3 | Purpose: Bitwise operations are very fast in Redshift and are invaluable when dealing 4 | with many thousands of BOOLEAN columns. 
This function, most useful for reporting, 5 | creates a VARCHAR representation of an INT column containing bit-wise encoded 6 | BOOLEAN values, e.g. 281 => '100011001' 7 | 8 | Arguments: 9 | • `bitwise_column` - column containing bit-wise encoded BOOLEAN values 10 | • `bits_in_column` - number of bits encoded in the column 11 | 12 | 2015-10-15: created by Joe Harris (https://github.com/joeharris76) 13 | */ 14 | CREATE OR REPLACE FUNCTION f_bitwise_to_string(bitwise_column BIGINT, bits_in_column INT) 15 | RETURNS VARCHAR(255) 16 | STABLE 17 | AS $$ 18 | # Convert column to binary, strip "0b" prefix, pad out with zeroes 19 | b = bin(bitwise_column)[2:].zfill(bits_in_column) 20 | return b 21 | $$ LANGUAGE plpythonu; 22 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_java(varchar)/src/main/java/f_upper_java_varchar/Handler.java: -------------------------------------------------------------------------------- 1 | package f_upper_java_varchar; 2 | 3 | import com.amazonaws.services.lambda.runtime.Context; 4 | import com.amazonaws.services.lambda.runtime.RequestHandler; 5 | 6 | import java.util.HashMap; 7 | import org.json.*; 8 | 9 | public class Handler implements RequestHandler, String> { 10 | 11 | @Override 12 | public String handleRequest(HashMap input, Context context) { 13 | // TODO Auto-generated method stub 14 | context.getLogger().log("Input: " + input); 15 | 16 | JSONObject inputJSON = new JSONObject(input); 17 | JSONArray rows = inputJSON.getJSONArray("arguments"); 18 | 19 | JSONArray res = new JSONArray(); 20 | for (int i = 0; i '1,0,0,0,1,1,0,0,1' 6 | 7 | Arguments: 8 | • `bitwise_column` - column containing bit-wise encoded BOOLEAN values 9 | • `bits_in_column` - number of bits encoded in the column 10 | • `delimiter` - character that will delimit the output 11 | test 12 | 2015-10-15: created by Joe Harris (https://github.com/joeharris76) 13 | */ 14 | CREATE OR REPLACE FUNCTION f_bitwise_to_delimited(bitwise_column BIGINT, bits_in_column INT, delimter CHAR(1)) 15 | RETURNS VARCHAR(512) 16 | STABLE 17 | AS $$ 18 | # Convert column to binary, strip "0b" prefix, pad out with zeroes 19 | b = bin(bitwise_column)[2:].zfill(bits_in_column) 20 | # Convert each character to a member of an array, join array into string using delimiter 21 | o = delimter.join([b[i:i+1] for i in range(0, len(b), 1)]) 22 | return o 23 | $$ LANGUAGE plpythonu; 24 | -------------------------------------------------------------------------------- /python-udfs/fn_levenshtein_distance(varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | create or replace function fn_levenshtein_distance(a varchar, b varchar) returns integer as 2 | $$ 3 | 4 | def levenshtein_distance(a, len_a, b, len_b): 5 | d = [[0] * (len_b + 1) for i in range(len_a + 1)] 6 | 7 | for i in range(1, len_a + 1): 8 | d[i][0] = i 9 | 10 | for j in range(1, len_b + 1): 11 | d[0][j] = j 12 | 13 | for j in range(1, len_b + 1): 14 | for i in range(1, len_a + 1): 15 | if a[i - 1] == b[j - 1]: 16 | cost = 0 17 | else: 18 | cost = 1 19 | d[i][j] = min(d[i - 1][j] + 1, # deletion 20 | d[i][j - 1] + 1, # insertion 21 | d[i - 1][j - 1] + cost) # substitution 22 | 23 | return d[len_a][len_b] 24 | 25 | def distance(a, b): 26 | if a is None: 27 | len_a = 0 28 | else: 29 | len_a = len(a) 30 | if b is None: 31 | len_b = 0 32 | else: 33 | len_b = len(b) 34 | if len_a == 0: 35 | return len_b 36 | elif len_b == 0: 37 | return len_a 38 | else: 39 | return 
levenshtein_distance(a, len_a, b, len_b) 40 | 41 | return distance(a, b) 42 | $$ 43 | language plpythonu immutable; 44 | -------------------------------------------------------------------------------- /sql-udfs/f_mask_bigint(bigint,varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to use a UDF for dynamic masking. This function can be particularly 4 | useful when used in a security view and the parameter for the "priv" is dynamically populated via a control table. 5 | See the following for more details: 6 | https://github.com/aws-samples/amazon-redshift-dynamic-data-masking 7 | 8 | inputs: 9 | src - the table column which needs to be masked/unmasked 10 | class - the classification of data, i.e. different class values may have different partial or full masking rules. 11 | priv - the level of privilege allowed for this user. e.g. if 12 | not supplied/null, function should return null 13 | if 'N' - no masking, will return source value 14 | if 'F' - the data should be fully masked 15 | if 'P' - the data should be partially masked 16 | 17 | 2021-09-03: written by rjvgupta 18 | */ 19 | create or replace function f_mask_bigint (src bigint, class varchar, priv varchar) 20 | returns bigint 21 | immutable 22 | as $$ 23 | select case 24 | when $3 = 'N' then $1 25 | when $3 = 'F' then strtol(substring(md5($1), 1,15), 16) 26 | when $3 = 'P' then case $2 27 | when 'cc' then mod($1, 1000) 28 | else substring($1::varchar, 1, 3)::int end 29 | else null 30 | end 31 | $$ language sql; 32 | -------------------------------------------------------------------------------- /sql-udfs/f_mask_varchar(varchar,varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to use a UDF for dynamic masking. This function can be particularly 4 | useful when used in a security view and the parameter for the "priv" is dynamically populated via a control table. 5 | See the following for more details: 6 | https://github.com/aws-samples/amazon-redshift-dynamic-data-masking 7 | 8 | inputs: 9 | src - the table column which needs to be masked/unmasked 10 | class - the classification of data, i.e. different class values may have different partial or full masking rules. 11 | priv - the level of privilege allowed for this user. e.g.
if 12 | not supplied/null, function should return null 13 | if 'N' - no masking, will return source value 14 | if 'F' - the data should be fully masked 15 | if 'P' - the data should be partially masked 16 | 17 | 2021-09-03: written by rjvgupta 18 | */ 19 | create or replace function f_mask_varchar (varchar, varchar, varchar) 20 | returns varchar 21 | immutable 22 | as $$ 23 | select case 24 | when $3 = 'N' then $1 25 | when $3 = 'F' then md5($1) 26 | when $3 = 'P' then case $2 27 | when 'ssn' then substring($1, 1, 7)||'xxxx' 28 | when 'email' then substring(SPLIT_PART($1, '@', 1), 1, 3) || 'xxxx@' || SPLIT_PART($1, '@', 2) 29 | else substring($1, 1, 3)||'xxxxx' end 30 | else null 31 | end 32 | $$ language sql; 33 | -------------------------------------------------------------------------------- /lambda-udfs/f_unmarshall_dynamodb_stream_data(varchar)/output.csv: -------------------------------------------------------------------------------- 1 | {"awsRegion": "us-gov-west-1", "eventID": "fed80872-9f53-416d-b998-20f8d1e33eec", "eventName": "INSERT", "userIdentity": null, "recordFormat": "application/json", "tableName": "streamer", "dynamodb": {"ApproximateCreationDateTime": 1684189640771, "Keys": {"column1": "3120"}, "NewImage": {"column1": "3120", "column2": [{"column5": {"column6": "4395"}, "column4": "4395", "column3": "4395"}, {"column5": {"column6": "4395"}, "column4": "670D0F1A-0BCF-4E53-BA9F-1CFD674EA21B", "column3": "4395"}]}, "SizeBytes": 163, "OldImage": {}}, "eventSource": "aws:dynamodb"} 2 | {"awsRegion": "us-gov-west-1", "eventID": "671fe3ec-f180-49fb-99c6-7c5ed0718e24", "eventName": "MODIFY", "userIdentity": null, "recordFormat": "application/json", "tableName": "streamer", "dynamodb": {"ApproximateCreationDateTime": 1684189987356, "Keys": {"column1": "3120"}, "NewImage": {"column1": "3120", "column2": "Updated!"}, "OldImage": {"column1": "3120", "column2": [{"column5": {"column6": "4395"}, "column4": "4395", "column3": "4395"}, {"column5": {"column6": "4395"}, "column4": "670D0F1A-0BCF-4E53-BA9F-1CFD674EA21B", "column3": "4395"}]}, "SizeBytes": 189}, "eventSource": "aws:dynamodb"} 3 | {"awsRegion": "us-gov-west-1", "eventID": "492da627-7d69-4168-a6f0-534674ea5eba", "eventName": "REMOVE", "userIdentity": null, "recordFormat": "application/json", "tableName": "streamer", "dynamodb": {"ApproximateCreationDateTime": 1684190049329, "Keys": {"column1": "3120"}, "OldImage": {"column1": "3120", "column2": "Updated!"}, "SizeBytes": 37, "NewImage": {}}, "eventSource": "aws:dynamodb"} 4 | -------------------------------------------------------------------------------- /sql-udfs/f_mask_timestamp(timestamp,varchar,varchar)/function.sql: -------------------------------------------------------------------------------- 1 | /* 2 | Purpose: 3 | This sample function demonstrates how to use a UDF for dynamic masking. This function can be particularly 4 | useful when used in a security view and the parameter for the "priv" is dynamically populated via a control table. 5 | See the following for more details: 6 | https://github.com/aws-samples/amazon-redshift-dynamic-data-masking 7 | 8 | inputs: 9 | src - the table column which needs to be masked/unmasked 10 | class - the classification of data, i.e. different class values may have different partial or full masking rules. 11 | priv - the level of privilege allowed for this user. e.g.
if 12 | not supplied/null, function should return null 13 | if 'N' - no masking, will return source value 14 | if 'F' - the data should be fully masked 15 | if 'P' - the data should be partially masked 16 | 17 | note: 18 | this function is volitile and will fail on the test for full masking because it uses a RANDOM function, this is by design. 19 | 20 | 2021-09-03: written by rjvgupta 21 | */ 22 | create or replace function f_mask_timestamp (timestamp, varchar, varchar) 23 | returns timestamp 24 | volatile 25 | as $$ 26 | select case 27 | when $3 = 'N' then $1 28 | when $3 = 'F' then dateadd(day, (random() * 100)::int-50, '1/1/2021'::date) 29 | when $3 = 'P' then case $2 30 | when 'dob' then date_trunc('year',$1) 31 | else dateadd(year, -1*date_part('year', $1)::int+1900,$1) end 32 | else null 33 | end 34 | $$ language sql; 35 | -------------------------------------------------------------------------------- /lambda-udfs/f_unmarshall_dynamodb_stream_data(varchar)/input.csv: -------------------------------------------------------------------------------- 1 | '{"awsRegion": "us-gov-west-1", "eventID": "fed80872-9f53-416d-b998-20f8d1e33eec", "eventName": "INSERT", "userIdentity": null, "recordFormat": "application/json", "tableName": "streamer", "dynamodb": {"ApproximateCreationDateTime": 1684189640771, "Keys": {"column1": {"S": "3120"}}, "NewImage": {"column1": {"S": "3120"}, "column2": {"L": [{"M": {"column5": {"M": {"column6": {"N": "4395"}}}, "column4": {"N": "4395"}, "column3": {"S": "4395"}}}, {"M": {"column5": {"M": {"column6": {"N": "4395"}}}, "column4": {"S": "670D0F1A-0BCF-4E53-BA9F-1CFD674EA21B"}, "column3": {"S": "4395"}}}]}}, "SizeBytes": 163}, "eventSource": "aws:dynamodb"}' 2 | '{"awsRegion": "us-gov-west-1", "eventID": "671fe3ec-f180-49fb-99c6-7c5ed0718e24", "eventName": "MODIFY", "userIdentity": null, "recordFormat": "application/json", "tableName": "streamer", "dynamodb": {"ApproximateCreationDateTime": 1684189987356, "Keys": {"column1": {"S": "3120"}}, "NewImage": {"column1": {"S": "3120"}, "column2": {"S": "Updated!"}}, "OldImage": {"column1": {"S": "3120"}, "column2": {"L": [{"M": {"column5": {"M": {"column6": {"N": "4395"}}}, "column4": {"N": "4395"}, "column3": {"S": "4395"}}}, {"M": {"column5": {"M": {"column6": {"N": "4395"}}}, "column4": {"S": "670D0F1A-0BCF-4E53-BA9F-1CFD674EA21B"}, "column3": {"S": "4395"}}}]}}, "SizeBytes": 189}, "eventSource": "aws:dynamodb"}' 3 | '{"awsRegion": "us-gov-west-1", "eventID": "492da627-7d69-4168-a6f0-534674ea5eba", "eventName": "REMOVE", "userIdentity": null, "recordFormat": "application/json", "tableName": "streamer", "dynamodb": {"ApproximateCreationDateTime": 1684190049329, "Keys": {"column1": {"S": "3120"}}, "OldImage": {"column1": {"S": "3120"}, "column2": {"S": "Updated!"}}, "SizeBytes": 37}, "eventSource": "aws:dynamodb"}' 4 | -------------------------------------------------------------------------------- /lambda-udfs/f_mysql_lookup_nodejs(varchar,varchar,varchar,varchar)/index.js: -------------------------------------------------------------------------------- 1 | var AWS = require('aws-sdk'); 2 | const mysql = require('mysql2'); 3 | 4 | exports.handler = async (event, context) => { 5 | console.log(event); 6 | var secretsManager = new AWS.SecretsManager(); 7 | var secretId = event.arguments[0][2]; 8 | const secret = await secretsManager.getSecretValue({ 9 | SecretId: secretId 10 | }).promise(); 11 | 12 | var secretJson = JSON.parse(secret.SecretString); 13 | 14 | var host = secretJson.host; 15 | var user = 
secretJson.username; 16 | var password = secretJson.password; 17 | 18 | let connectionConfig = { 19 | host: host, 20 | user: user, 21 | password: password, 22 | connectTimeout: 60000 23 | }; 24 | 25 | var pool = await mysql.createPool(connectionConfig); 26 | var conn = pool.promise(); 27 | 28 | var table = event.arguments[0][0]; 29 | var columnName = event.arguments[0][1]; 30 | 31 | var createStmt = 'create temporary table ' + table + '_jointemp (temp_seq int, '+ columnName + ' varchar(100)); '; 32 | await conn.query(createStmt); 33 | 34 | 35 | var values = event.arguments.map((x, i) => "("+i+",'"+x[3]+"')"); 36 | var insertStmt = 'insert into ' + table + '_jointemp(temp_seq, '+ columnName +') values ' + values.join(',') + ';'; 37 | await conn.query(insertStmt); 38 | 39 | var selectStmt = 'select t2.* FROM ' + table + '_jointemp t1 LEFT OUTER JOIN ' + table + ' t2 using ('+ columnName +') order by temp_seq;' 40 | const [results, fields] = await conn.execute(selectStmt); 41 | 42 | var res = {}; 43 | if(results.length > 0){ 44 | res = results.map((row) => JSON.stringify(row)); 45 | } 46 | var response = JSON.stringify({"results": res}); 47 | conn.end(); 48 | return response; 49 | }; 50 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_java(varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Parameters: 3 | S3Bucket: 4 | Description: Bucket containing the java packages 5 | Type: String 6 | S3Key: 7 | Description: Key containing the java packages 8 | Type: String 9 | Resources: 10 | LambdaRole: 11 | Type: AWS::IAM::Role 12 | Properties: 13 | AssumeRolePolicyDocument: 14 | Version: 2012-10-17 15 | Statement: 16 | - 17 | Effect: Allow 18 | Principal: 19 | Service: 20 | - lambda.amazonaws.com 21 | Action: 22 | - sts:AssumeRole 23 | Path: / 24 | Policies: 25 | - 26 | PolicyName: CloudwatchLogs 27 | PolicyDocument: 28 | Version: 2012-10-17 29 | Statement: 30 | - 31 | Effect: Allow 32 | Action: 33 | - logs:CreateLogGroup 34 | Resource: 35 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 36 | - 37 | Effect: Allow 38 | Action: 39 | - logs:CreateLogStream 40 | - logs:PutLogEvents 41 | Resource: 42 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 43 | LambdaUDFFunction: 44 | Type: "AWS::Lambda::Function" 45 | Properties: 46 | FunctionName: f-upper-java-varchar 47 | Role: !GetAtt 'LambdaRole.Arn' 48 | Timeout: 100 49 | Code: 50 | S3Bucket: !Ref S3Bucket 51 | S3Key: !Ref S3Key 52 | Handler: f_upper_java_varchar.Handler::handleRequest 53 | Runtime: java11 54 | -------------------------------------------------------------------------------- /lambda-udfs/f_unmarshall_dynamodb_stream_data(varchar)/README.md: -------------------------------------------------------------------------------- 1 | # f_unmarshall_dynamodb_stream_data(varchar) 2 | 3 | This sample function demonstrates how to convert DynamoDB marshalled data in a Kinesis stream 4 | from DynamoDB streams into unmarshalled (normal JSON) format for usage in Redshift materialized views. 5 | Blog link TODO. 6 | 7 | ![Example Architecture](https://github.com/aws-samples/amazon-redshift-udfs/blob/master/lambda-udfs/f_unmarshall_dynamodb_stream_data(varchar)/example.png) 8 | 9 | ## Arguments: 10 | 1. 
`payload`: The data from DynamoDB streams 11 | 12 | ## Returns: 13 | The same payload, with DynamoDB data encoded as normal unmarshalled JSON 14 | 15 | ## Example usage: 16 | This example demonstrates creating a materialized view with SUPER data where data is in normal JSON 17 | unmarshalled format from DynamoDB streams. [blog post](link_todo): 18 | 19 | ``` 20 | -- Step 1 21 | CREATE EXTERNAL SCHEMA kds FROM KINESIS 22 | 23 | -- Step 2 24 | CREATE MATERIALIZED VIEW {name} AUTO REFRESH YES AS 25 | SELECT 26 | t.kinesis_data AS binary_avro, 27 | t.sequence_number, 28 | t.refresh_time, 29 | t.approximate_arrival_timestamp, 30 | t.shard_id, 31 | f_unmarshall_dynamodb_stream_data(payload) AS json_string, 32 | JSON_PARSE(json_string) AS super_data, 33 | super_data."awsRegion" AS region, 34 | super_data."eventID" AS event_id, 35 | super_data."eventName" AS event_name, 36 | super_data."tableName" AS table_name, 37 | super_data."dynamodb"."ApproximateCreationDateTime" AS approximate_creation_date_time, 38 | super_data."dynamodb"."Keys" AS keys, 39 | super_data."dynamodb"."NewImage" AS new_image, 40 | super_data."dynamodb"."OldImage" AS old_image, 41 | super_data."dynamodb"."SizeBytes" AS size_bytes, 42 | super_data."eventSource" AS event_source 43 | FROM kds.{stream_name} AS t 44 | ``` -------------------------------------------------------------------------------- /lambda-udfs/f_upper_nodejs(varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | LambdaRole: 4 | Type: AWS::IAM::Role 5 | Properties: 6 | AssumeRolePolicyDocument: 7 | Version: 2012-10-17 8 | Statement: 9 | - 10 | Effect: Allow 11 | Principal: 12 | Service: 13 | - lambda.amazonaws.com 14 | Action: 15 | - sts:AssumeRole 16 | Path: / 17 | Policies: 18 | - 19 | PolicyName: CloudwatchLogs 20 | PolicyDocument: 21 | Version: 2012-10-17 22 | Statement: 23 | - 24 | Effect: Allow 25 | Action: 26 | - logs:CreateLogGroup 27 | Resource: 28 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 29 | - 30 | Effect: Allow 31 | Action: 32 | - logs:CreateLogStream 33 | - logs:PutLogEvents 34 | Resource: 35 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 36 | LambdaUDFFunction: 37 | Type: "AWS::Lambda::Function" 38 | Properties: 39 | FunctionName: f-upper-javascript-varchar 40 | Role: !GetAtt 'LambdaRole.Arn' 41 | Timeout: 300 42 | Code: 43 | ZipFile: | 44 | exports.handler = async (event) => { 45 | console.log('Received event:', JSON.stringify(event, null, 2)); 46 | var res = event.arguments.map((x) => x[0].toUpperCase()); 47 | var ret_json = JSON.stringify({"results": res}); 48 | return ret_json 49 | }; 50 | Handler: index.handler 51 | Runtime: nodejs14.x 52 | -------------------------------------------------------------------------------- /lambda-udfs/f_lower_python(varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | LambdaRole: 4 | Type: AWS::IAM::Role 5 | Properties: 6 | AssumeRolePolicyDocument: 7 | Version: 2012-10-17 8 | Statement: 9 | - 10 | Effect: Allow 11 | Principal: 12 | Service: 13 | - lambda.amazonaws.com 14 | Action: 15 | - sts:AssumeRole 16 | Path: / 17 | Policies: 18 | - 19 | PolicyName: CloudwatchLogs 20 | PolicyDocument: 21 | Version: 2012-10-17 22 | Statement: 23 | - 24 | Effect: Allow 25 | Action: 26 | - logs:CreateLogGroup 27 | Resource: 28 | - 
!Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 29 | - 30 | Effect: Allow 31 | Action: 32 | - logs:CreateLogStream 33 | - logs:PutLogEvents 34 | Resource: 35 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 36 | LambdaUDFFunction: 37 | Type: "AWS::Lambda::Function" 38 | Properties: 39 | FunctionName: f-lower-python-varchar 40 | Role: !GetAtt 'LambdaRole.Arn' 41 | Timeout: 300 42 | Code: 43 | ZipFile: | 44 | import json 45 | def handler(event, context): 46 | print("Received event: " + json.dumps(event, indent=2)) 47 | print(json.dumps(event)) 48 | res = [x[0].lower() for x in event['arguments']] 49 | ret_json = json.dumps({"results": res}) 50 | return ret_json 51 | Handler: index.handler 52 | Runtime: python3.7 53 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_python(varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | LambdaRole: 4 | Type: AWS::IAM::Role 5 | Properties: 6 | AssumeRolePolicyDocument: 7 | Version: 2012-10-17 8 | Statement: 9 | - 10 | Effect: Allow 11 | Principal: 12 | Service: 13 | - lambda.amazonaws.com 14 | Action: 15 | - sts:AssumeRole 16 | Path: / 17 | Policies: 18 | - 19 | PolicyName: CloudwatchLogs 20 | PolicyDocument: 21 | Version: 2012-10-17 22 | Statement: 23 | - 24 | Effect: Allow 25 | Action: 26 | - logs:CreateLogGroup 27 | Resource: 28 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 29 | - 30 | Effect: Allow 31 | Action: 32 | - logs:CreateLogStream 33 | - logs:PutLogEvents 34 | Resource: 35 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 36 | LambdaUDFFunction: 37 | Type: "AWS::Lambda::Function" 38 | Properties: 39 | FunctionName: f-upper-python-varchar 40 | Role: !GetAtt 'LambdaRole.Arn' 41 | Timeout: 300 42 | Code: 43 | ZipFile: | 44 | import json 45 | 46 | def handler(event, context): 47 | print("Received event: " + json.dumps(event, indent=2)) 48 | print(json.dumps(event)) 49 | res = [x[0].upper() for x in event['arguments']] 50 | ret_json = json.dumps({"results": res}) 51 | return ret_json 52 | Handler: index.handler 53 | Runtime: python3.7 54 | -------------------------------------------------------------------------------- /stored-procedures/README.md: -------------------------------------------------------------------------------- 1 | # Redshift Stored Procedures 2 | Sample and useful Stored Procedures 3 | 4 | | Procedure | Description | 5 | | ---------------------------------------------------------| --------------------------------------------------------------------------------------| 6 | | [`sp_analyze_minimal.sql`](./sp_analyze_minimal) | Analyze **one** column of a table. 
To be used on a staging table right after loading | 7 | | [`sp_check_primary_key.sql`](./sp_check_primary_key) | Check the integrity of the PRIMARY KEY declared on a table | 8 | | [`sp_check_foreign_key.sql`](./sp_check_foreign_key) | Check the integrity of the FOREIGN KEY declared on a table and column | 9 | | [`sp_connect_by_prior.sql`](./sp_connect_by_prior) | Calculate levels in a nested hierarchy tree | 10 | | [`sp_controlled_access.sql`](./sp_controlled_access) | Provide controlled access to data without granting permission on the table/view | 11 | | [`sp_pivot_for.sql`](./sp_pivot_for) | Transpose row values into columns | 12 | | [`sp_split_table_by_range.sql`](./sp_split_table_by_range) | Split a large table into parts using a numeric column | 13 | | [`sp_sync_get_new_rows.sql`](./sp_sync_get_new_rows) | Sync new rows from a source table and insert them into a target table | 14 | | [`sp_sync_merge_changes.sql`](./sp_sync_merge_changes) | Sync new and changed rows from a source table and merge them into a target table | 15 | | [`sp_update_permissions.sql`](./sp_update_permissions) | Reads user, group and role permission matrix from S3 and updates authorization in Redshift| 16 | -------------------------------------------------------------------------------- /lambda-udfs/f_upper_java(varchar)/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | 5 | lambda 6 | f_upper_java_varchar 7 | 1.0.0 8 | jar 9 | 10 | 11 | 12 | 13 | org.apache.maven.plugins 14 | maven-compiler-plugin 15 | 3.6.0 16 | 17 | 1.8 18 | 1.8 19 | UTF-8 20 | true 21 | 22 | 23 | 24 | org.apache.maven.plugins 25 | maven-shade-plugin 26 | 3.0.0 27 | 28 | 29 | package 30 | 31 | shade 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | com.amazonaws 44 | aws-lambda-java-core 45 | 1.1.0 46 | 47 | 48 | org.apache.maven.plugins 49 | maven-shade-plugin 50 | 3.2.2 51 | 52 | 53 | org.json 54 | json 55 | 20230227 56 | 57 | 58 | 59 | -------------------------------------------------------------------------------- /lambda-udfs/f_glue_schema_registry_avro_to_json(varchar,varchar,varchar)/README.md: -------------------------------------------------------------------------------- 1 | # f_glue_schema_registry_avro_to_json(varchar,varchar,varchar) 2 | 3 | This sample function demonstrates how to create/use lambda UDFs in Python to convert Avro encoded data into JSON for use in Redshift SUPER data type using the Glue Schema Registry (refer to [Non-JSON ingestion using Amazon Kinesis Data Streams, Amazon MSK, and Amazon Redshift Streaming Ingestion](https://aws.amazon.com/blogs/big-data/non-json-ingestion-using-amazon-kinesis-data-streams-amazon-msk-and-amazon-redshift-streaming-ingestion/). 4 | 5 | ![Example Architecture](https://github.com/aws-samples/amazon-redshift-udfs/blob/master/lambda-udfs/f_glue_schema_registry_avro_to_json(varchar%2Cvarchar%2Cvarchar)/example.png) 6 | 7 | ## Arguments: 8 | 1. `registry_name`: The Glue Schema Registry name for the schema 9 | 2. `schema_name`: The schema name for the data to retrieve from the registry 10 | 3. `data`: The Hex-encoded Avro binary data 11 | 12 | ## Returns: 13 | The data encoded as JSON 14 | 15 | ## Example usage: 16 | This example demonstrates creating a materialized view with SUPER data converted from Avro that's 17 | published to a Kinesis stream. 
This uses the "Assumed schema" approach, as described in the 18 | [blog post](https://aws.amazon.com/blogs/big-data/non-json-ingestion-using-amazon-kinesis-data-streams-amazon-msk-and-amazon-redshift-streaming-ingestion/): 19 | 20 | ``` 21 | -- Step 1 22 | CREATE EXTERNAL SCHEMA kds FROM KINESIS 23 | 24 | -- Step 2 25 | CREATE MATERIALIZED VIEW {name} AUTO REFRESH YES AS 26 | SELECT 27 | -- Step 3 28 | t.kinesis_data AS binary_avro, 29 | to_hex(binary_avro) AS hex_avro, 30 | -- Step 5 31 | f_glue_schema_registry_avro_to_json('{registry-name}', '{stream-name}', hex_avro) AS json_string, 32 | -- Step 6 JSON_PARSE(json_string) AS super_data, 33 | t.sequence_number, 34 | t.refresh_time, 35 | t.approximate_arrival_timestamp, 36 | t.shard_id 37 | FROM kds.{stream_name} AS t 38 | ``` 39 | -------------------------------------------------------------------------------- /.github/workflows/github-actions.yml: -------------------------------------------------------------------------------- 1 | name: GitHub Actions Demo 2 | on: [push] 3 | jobs: 4 | DeployCode: 5 | runs-on: ubuntu-latest 6 | steps: 7 | - name: Check out repository code 8 | uses: actions/checkout@v2 9 | - name: Set up JDK 11 10 | uses: actions/setup-java@v2 11 | with: 12 | java-version: '11' 13 | distribution: 'adopt' 14 | - name: Configure AWS credentials 15 | uses: aws-actions/configure-aws-credentials@v1 16 | with: 17 | aws-access-key-id: ${{ secrets.AWS_ID }} 18 | aws-secret-access-key: ${{ secrets.AWS_KEY }} 19 | aws-region: ${{ vars.REGION }} 20 | - id: files 21 | uses: jitterbit/get-changed-files@v1 22 | with: 23 | format: 'json' 24 | - run: | 25 | set -e 26 | folders="" 27 | files=`jq -r '.[]' <<<'${{ steps.files.outputs.added_modified }}'` 28 | 29 | for file in ${files[@]}; do 30 | folders="$folders $(dirname $file)" 31 | done 32 | folders="$folders" | tr " " "\n" | sort | uniq | tr "\n" " " 33 | cd bin 34 | type="" 35 | function="" 36 | for folder in $folders; do 37 | IFS="/" read -ra PARTS <<< "$folder" 38 | if [[ $folder == *-udfs/* ]] && [[ $function != ${PARTS[1]} ]]; then 39 | type=${PARTS[0]} 40 | function=${PARTS[1]} 41 | key=${{ vars.S3_KEY }} 42 | if ! [[ -z "${key// }" ]]; then 43 | k="-k ${{ vars.S3_KEY }}" 44 | fi 45 | ./deployFunction.sh -t $type -f "$function" -s ${{ vars.S3_BUCKET }} $k -r ${{ vars.IAM_ROLE }} -c ${{ vars.CLUSTER }} -d ${{ vars.DB }} -u ${{ vars.USER }} -n ${{ vars.SCHEMA }} -g ${{ vars.SECURITY_GROUP }} -x ${{ vars.SUBNET }} 46 | ./testFunction.sh -t $type -f "$function" -c ${{ vars.CLUSTER }} -d ${{ vars.DB }} -u ${{ vars.USER }} -n ${{ vars.SCHEMA }} 47 | else 48 | echo Ignoring: $folder 49 | fi 50 | done 51 | -------------------------------------------------------------------------------- /stored-procedures/sp_check_foreign_key/README.md: -------------------------------------------------------------------------------- 1 | This stored procedure is designed to check the integrity of the FOREIGN KEY declared on a table and a column. If the foreign key doesn't have a primary key, it will raise an info. 2 | 3 | ## Prerequisites 4 | 5 | The log table passed to this procedure must follow the below structure: 6 | 7 | 8 | ``` 9 | CREATE TABLE $(log_table) ( 10 | batch_time TIMESTAMP, 11 | check_table VARCHAR, 12 | check_column VARCHAR, 13 | check_time TIMESTAMP, 14 | check_status VARCHAR, 15 | error_count INT); 16 | ``` 17 | 18 | ## Parameters 19 | 20 | The stored procedure takes the following parameters: 21 | 22 | - `batch_time`: Timestamp for this batch. Can be used to group multiple fixes. 
23 | - `check_table`: Schema qualified name of table to be queried. 24 | - `check_column`: Name of the column we want to check the integrity, must be a column in check_table. 25 | - `log_table`: Schema qualified table where actions are to be logged. 26 | 27 | ## Example usage 28 | 29 | First, ensure your log table is set up with the correct structure: 30 | 31 | ``` 32 | DROP TABLE IF EXISTS tmp_fk_log; 33 | CREATE TABLE tmp_fk_log( 34 | batch_time TIMESTAMP 35 | , check_table VARCHAR 36 | , check_column VARCHAR 37 | , check_time TIMESTAMP 38 | , check_status VARCHAR 39 | , error_count INT); 40 | ``` 41 | 42 | Create the data tables. In this example, we create a customers table and an orders table: 43 | 44 | ``` 45 | DROP TABLE IF EXISTS customers CASCADE; 46 | CREATE TABLE customers ( 47 | customer_id INTEGER IDENTITY(1,1) PRIMARY KEY, 48 | customer_name VARCHAR(50) NOT NULL 49 | ); 50 | DROP TABLE IF EXISTS orders; 51 | CREATE TABLE orders ( 52 | order_id INTEGER IDENTITY(1,1) PRIMARY KEY, 53 | customer_id INTEGER NOT NULL REFERENCES customers(customer_id), 54 | order_date DATE NOT NULL 55 | ); 56 | ``` 57 | 58 | Populate the tables: 59 | 60 | ``` 61 | INSERT INTO customers (customer_name) 62 | VALUES ('Alice'), ('Bob'), ('Charlie'); 63 | INSERT INTO orders (customer_id, order_date) 64 | VALUES (1, '2023-01-01'), 65 | (2, '2023-01-02'), 66 | (3, '2023-01-03'), 67 | (4, '2023-01-04'); -- Inconsistency: customer_id 4 does not exist 68 | ``` 69 | 70 | Call the procedure: 71 | 72 | ``` 73 | CALL sp_check_foreign_key(SYSDATE,'orders', 'customer_id', 'tmp_fk_log'); 74 | ``` 75 | 76 | In the above example, an inconsistency is deliberately introduced (customer_id 4 does not exist in customers table), so the procedure will log the error count and status in the tmp_fk_log table. 77 | -------------------------------------------------------------------------------- /stored-procedures/sp_vector_search_all/sp_vector_search_all.sql: -------------------------------------------------------------------------------- 1 | /********************************************************************************************** 2 | Purpose: Execute vector search leveraging vector indexes 3 | Notes: 4 | This procedure is used to execute a vectorized searh of your query string. The procedure will create 5 | and on-the-fly embedding using the LambdaUDF f_titan_embeding(varchar) and compare the result to all 6 | data in your $(tablename)_embeddings table. See the following article for more info: 7 | https://repost.aws/articles/ARPoweQIN2ROOXZiJAtSQvkQ/vector-search-with-amazon-redshift 8 | 9 | Parameters: 10 | tablename : The table which was the source of the data which contains the batch embeddings and K-Means clusters. 11 | search : The texst you want to search 12 | cnt : The number of results you want to return 13 | tmp_name : The name of the temp table that will be created to return your search results. 
14 | 15 | Requirements: 16 | expects a table with the following structure to exist and be populated: 17 | CREATE TABLE $(tablename)_embeddings 18 | ( "recordId" VARCHAR(15), 19 | "modelOutput" SUPER ) DISTKEY (recordid); 20 | History: 21 | 2024-10-25 - rjvgupta - Created 22 | **********************************************************************************************/ 23 | SET enable_case_sensitive_identifier TO true; 24 | 25 | CREATE OR REPLACE PROCEDURE sp_vector_search_all (tablename IN varchar, search IN varchar, cnt IN int, tmp_name IN varchar) AS $$ 26 | BEGIN 27 | EXECUTE 'drop table if exists #'||tmp_name; 28 | EXECUTE 'create table #'||tmp_name ||' ("recordId" varchar(100), similarity float)'; 29 | EXECUTE 'insert into #'||tmp_name ||' 30 | select re."recordId", sum(rv::float*qv::float)/SQRT(sum(rv::float*rv::float)*sum(qv::float*qv::float)) esimilarity 31 | from (select JSON_PARSE(f_titan_embedding('''||search||''')) as q) q, q.q qv at qvi, 32 | '||tablename||'_embeddings re, re."modelOutput".embedding rv at rvi 33 | where rvi = qvi 34 | group by 1 35 | qualify rank() over (order by esimilarity desc) <= '||cnt; 36 | END $$ LANGUAGE plpgsql; 37 | 38 | /* Usage Example: 39 | SET enable_case_sensitive_identifier TO true; 40 | call sp_vector_search_all('reviews', 'bad broken unreliable slow', 100, 'searchresults'); 41 | 42 | select review_id, product_title, review_title, review_desc, similarity 43 | from #searchresults 44 | join reviews on review_id = "recordId" 45 | order by similarity desc; 46 | */ 47 | -------------------------------------------------------------------------------- /lambda-udfs/f_mysql_lookup_nodejs(varchar,varchar,varchar,varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Parameters: 3 | S3Bucket: 4 | Description: Bucket containing the node.js packages 5 | Type: String 6 | S3Key: 7 | Description: Key containing the node.js packages 8 | Type: String 9 | SecurityGroupId: 10 | Description: the SG this function should run in to have connectivity to MySQL 11 | Type: String 12 | SubnetId: 13 | Description: the Subnet this function should run in to have connectivity to MySQL 14 | Type: String 15 | Resources: 16 | LambdaRole: 17 | Type: AWS::IAM::Role 18 | Properties: 19 | AssumeRolePolicyDocument: 20 | Version: 2012-10-17 21 | Statement: 22 | - 23 | Effect: Allow 24 | Principal: 25 | Service: 26 | - lambda.amazonaws.com 27 | Action: 28 | - sts:AssumeRole 29 | Path: / 30 | Policies: 31 | - 32 | PolicyName: CloudwatchLogs 33 | PolicyDocument: 34 | Version: 2012-10-17 35 | Statement: 36 | - 37 | Effect: Allow 38 | Action: 39 | - logs:CreateLogGroup 40 | Resource: 41 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 42 | - 43 | Effect: Allow 44 | Action: 45 | - logs:CreateLogStream 46 | - logs:PutLogEvents 47 | Resource: 48 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 49 | - 50 | Effect: Allow 51 | Action: 52 | - secretsmanager:GetSecretValue 53 | Resource: 54 | - !Sub "arn:${AWS::Partition}:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:*" 55 | ManagedPolicyArns: 56 | - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" 57 | LambdaUDFFunction: 58 | Type: "AWS::Lambda::Function" 59 | Properties: 60 | FunctionName: f-mysql-lookup-nodejs-varchar-varchar-varchar-varchar 61 | Role: !GetAtt 'LambdaRole.Arn' 62 | Handler: index.handler 63 | Runtime: nodejs14.x 64 |
VpcConfig: 65 | SecurityGroupIds: 66 | - !Ref SecurityGroupId 67 | SubnetIds: 68 | - !Ref SubnetId 69 | Timeout: 300 70 | Code: 71 | S3Bucket: !Ref S3Bucket 72 | S3Key: !Ref S3Key 73 | -------------------------------------------------------------------------------- /python-udfs/f_format_number(float,varchar,varchar,int,int,bool)/function.sql: -------------------------------------------------------------------------------- 1 | /* UDF: f_format_number.sql 2 | 3 | Purpose: Provides a simple, non-locale aware way to format a number with user defined thousands and decimal separator. 4 | 5 | 2015-11-9: written by sdia 6 | 7 | */ 8 | 9 | 10 | CREATE OR REPLACE FUNCTION f_format_number(value FLOAT, group_sep VARCHAR, decimal_sep VARCHAR, 11 | group_length INT, prec INT, sign BOOL) 12 | 13 | RETURNS VARCHAR IMMUTABLE 14 | AS $$ 15 | from collections import deque 16 | from math import modf 17 | 18 | def int_grouper_as_string(int_value, group_sep, group_length): 19 | d = deque(str(int(int_value))) 20 | grouped = [] 21 | c = 0 22 | while True: 23 | try: 24 | x = d.pop() 25 | except IndexError: 26 | break 27 | if c and not c % group_length: 28 | grouped.append(group_sep) 29 | c += 1 30 | grouped.append(x) 31 | grouped.reverse() 32 | return ''.join(grouped) 33 | 34 | 35 | def get_decimal_as_string(fract_value, precision): 36 | if precision > 0: 37 | prec_string = '{{0:.{precision}f}}'.format(precision=precision) 38 | fract_string = prec_string.format(fract_value) 39 | return fract_string.split('.')[-1] 40 | else: 41 | return '' 42 | 43 | 44 | def f_format_number(value, group_sep=',', decimal_sep='.', group_length=3, 45 | precision=2, sign=False): 46 | if value is None: 47 | return None 48 | try: 49 | value_float = float(value) 50 | except ValueError, e: 51 | print('A problem occured with formatting, numeric value was expected.') 52 | raise(e) 53 | try: 54 | assert decimal_sep != group_sep 55 | except AssertionError, e: 56 | print('A problem occured with formatting, group and decimal separators should not be equal!') 57 | raise(e) 58 | 59 | if value < 0: 60 | sign_symbol = '-' 61 | elif sign: 62 | sign_symbol = '+' 63 | else: 64 | sign_symbol = '' 65 | 66 | fract_part, int_part = modf(abs(value_float)) 67 | int_group = int_grouper_as_string(int_part, group_sep, group_length) 68 | dec = get_decimal_as_string(fract_part, prec) 69 | 70 | res = sign_symbol+int_group 71 | 72 | if dec != '': 73 | res += decimal_sep+dec 74 | 75 | return res 76 | 77 | 78 | return f_format_number(value, group_sep, decimal_sep, group_length, 79 | prec, sign) 80 | 81 | $$ LANGUAGE plpythonu; 82 | -------------------------------------------------------------------------------- /stored-procedures/sp_correlation/sp_correlation.sql: -------------------------------------------------------------------------------- 1 | 2 | CREATE or replace PROCEDURE sp_correlation(source_schema_name in varchar(255), source_table_name in varchar(255), target_column_name in varchar(255), output_temp_table_name inout varchar(255)) AS $$ 3 | DECLARE 4 | v_sql varchar(max); 5 | v_generated_sql varchar(max); 6 | v_source_schema_name varchar(255)=lower(source_schema_name); 7 | v_source_table_name varchar(255)=lower(source_table_name); 8 | v_target_column_name varchar(255)=lower(target_column_name); 9 | BEGIN 10 | EXECUTE 'drop table if exists ' || output_temp_table_name; 11 | v_sql = ' 12 | select 13 | ''create temp table '|| output_temp_table_name||' as select ''|| outer_calculation|| 14 | '' from (select count(1) number_of_items, 
sum('||v_target_column_name||') sum_target, sum(pow('||v_target_column_name||',2)) sum_square_target, pow(sum('||v_target_column_name||'),2) square_sum_target,''|| 15 | inner_calculation|| 16 | '' from (select ''|| 17 | column_name|| 18 | '' from '||v_source_table_name||'))'' 19 | from 20 | ( 21 | select 22 | distinct 23 | listagg(outer_calculation,'','') over () outer_calculation 24 | ,listagg(inner_calculation,'','') over () inner_calculation 25 | ,listagg(column_name,'','') over () column_name 26 | from 27 | ( 28 | select 29 | case when atttypid=16 then ''decode(''||column_name||'',true,1,0)'' else column_name end column_name 30 | ,atttypid 31 | ,''cast(decode(number_of_items * sum_square_''||rn||'' - square_sum_''||rn||'',0,null,(number_of_items*sum_target_''||rn||'' - sum_target * sum_''||rn|| 32 | '')/sqrt((number_of_items * sum_square_target - square_sum_target) * (number_of_items * sum_square_''||rn|| 33 | '' - square_sum_''||rn||''))) as numeric(5,2)) ''||column_name outer_calculation 34 | ,''sum(''||column_name||'') sum_''||rn||'',''|| 35 | ''sum(trip_count*''||column_name||'') sum_target_''||rn||'',''|| 36 | ''sum(pow(''||column_name||'',2)) sum_square_''||rn||'',''|| 37 | ''pow(sum(''||column_name||''),2) square_sum_''||rn inner_calculation 38 | from 39 | ( 40 | select 41 | row_number() over (order by a.attnum) rn 42 | ,a.attname::VARCHAR column_name 43 | ,a.atttypid 44 | FROM pg_namespace AS n 45 | INNER JOIN pg_class AS c ON n.oid = c.relnamespace 46 | INNER JOIN pg_attribute AS a ON c.oid = a.attrelid 47 | where a.attnum > 0 48 | and n.nspname = '''||v_source_schema_name||''' 49 | and c.relname = '''||v_source_table_name||''' 50 | and a.atttypid in (16,20,21,23,700,701,1700) 51 | ) 52 | ) 53 | )'; 54 | execute v_sql into v_generated_sql; 55 | execute v_generated_sql; 56 | END; 57 | $$ LANGUAGE plpgsql; 58 | -------------------------------------------------------------------------------- /lambda-udfs/f_kms_decrypt(varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | LambdaRole: 4 | Type: AWS::IAM::Role 5 | Properties: 6 | AssumeRolePolicyDocument: 7 | Version: 2012-10-17 8 | Statement: 9 | - 10 | Effect: Allow 11 | Principal: 12 | Service: 13 | - lambda.amazonaws.com 14 | Action: 15 | - sts:AssumeRole 16 | Path: / 17 | Policies: 18 | - 19 | PolicyName: CloudwatchLogs 20 | PolicyDocument: 21 | Version: 2012-10-17 22 | Statement: 23 | - 24 | Effect: Allow 25 | Action: 26 | - logs:CreateLogGroup 27 | Resource: 28 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 29 | - 30 | Effect: Allow 31 | Action: 32 | - logs:CreateLogStream 33 | - logs:PutLogEvents 34 | Resource: 35 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 36 | - 37 | Effect: Allow 38 | Action: 39 | - kms:Decrypt 40 | Resource: 41 | - !Sub "arn:${AWS::Partition}:kms:${AWS::Region}:${AWS::AccountId}:key/*" 42 | ManagedPolicyArns: 43 | - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" 44 | LambdaUDFFunction: 45 | Type: "AWS::Lambda::Function" 46 | Properties: 47 | FunctionName: f-kms-decrypt-varchar 48 | Role: !GetAtt 'LambdaRole.Arn' 49 | Timeout: 300 50 | Code: 51 | ZipFile: | 52 | import json, boto3, os, base64 53 | kms = boto3.client('kms') 54 | 55 | def handler(event, context): 56 | ret = dict() 57 | res = [] 58 | for argument in event['arguments']: 59 | try: 60 | columnValue = argument[0] 61 | 
if (columnValue == None): 62 | response = None 63 | else: 64 | plaintext = kms.decrypt(CiphertextBlob=base64.b64decode(columnValue)) 65 | response = plaintext["Plaintext"].decode('utf-8') 66 | res.append(response) 67 | except Exception as e: 68 | print (str(e)) 69 | res.append(None) 70 | ret['success'] = True 71 | ret['results'] = res 72 | return json.dumps(ret) 73 | 74 | Handler: index.handler 75 | Runtime: python3.7 76 | DependsOn: 77 | - LambdaRole 78 | -------------------------------------------------------------------------------- /lambda-udfs/f_kms_encrypt(varchar,varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | LambdaRole: 4 | Type: AWS::IAM::Role 5 | Properties: 6 | AssumeRolePolicyDocument: 7 | Version: 2012-10-17 8 | Statement: 9 | - 10 | Effect: Allow 11 | Principal: 12 | Service: 13 | - lambda.amazonaws.com 14 | Action: 15 | - sts:AssumeRole 16 | Path: / 17 | Policies: 18 | - 19 | PolicyName: CloudwatchLogs 20 | PolicyDocument: 21 | Version: 2012-10-17 22 | Statement: 23 | - 24 | Effect: Allow 25 | Action: 26 | - logs:CreateLogGroup 27 | Resource: 28 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 29 | - 30 | Effect: Allow 31 | Action: 32 | - logs:CreateLogStream 33 | - logs:PutLogEvents 34 | Resource: 35 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 36 | - 37 | Effect: Allow 38 | Action: 39 | - kms:Encrypt 40 | Resource: 41 | - !Sub "arn:${AWS::Partition}:kms:${AWS::Region}:${AWS::AccountId}:key/*" 42 | ManagedPolicyArns: 43 | - !Sub "arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" 44 | LambdaUDFFunction: 45 | Type: "AWS::Lambda::Function" 46 | Properties: 47 | FunctionName: f-kms-encrypt-varchar-varchar 48 | Role: !GetAtt 'LambdaRole.Arn' 49 | Timeout: 300 50 | Code: 51 | ZipFile: | 52 | import json, boto3, os, base64 53 | kms = boto3.client('kms') 54 | 55 | def handler(event, context): 56 | ret = dict() 57 | res = [] 58 | for argument in event['arguments']: 59 | try: 60 | kmskeyid = argument[0] 61 | columnValue = argument[1] 62 | if (columnValue == None): 63 | response = None 64 | else: 65 | ciphertext = kms.encrypt(KeyId=kmskeyid, Plaintext=columnValue) 66 | response = base64.b64encode(ciphertext["CiphertextBlob"]).decode('utf-8') 67 | res.append(response) 68 | except Exception as e: 69 | print (str(e)) 70 | res.append(None) 71 | ret['success'] = True 72 | ret['results'] = res 73 | return json.dumps(ret) 74 | Handler: index.handler 75 | Runtime: python3.9 76 | DependsOn: 77 | - LambdaRole 78 | -------------------------------------------------------------------------------- /bin/layerInstaller.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | # Install Pip Module as Redshift Library 4 | 5 | function usage { 6 | echo "./layerInstaller.sh -s -r -f -p " 7 | echo 8 | echo "where is the prefix of the location on S3 to upload the artifact to. Must be in format s3://bucket/prefix" 9 | echo " is the dependency requirement (e.g. boto3>=1.29.0)" 10 | echo " is the name of the UDF" 11 | echo " is Python version to build for (defaults to 3.9)" 12 | exit 0; 13 | } 14 | 15 | 16 | function notNull { 17 | if [ "$1x" == "x" ]; then 18 | echo $2 19 | exit -1 20 | fi 21 | } 22 | 23 | # make sure we have Docker installed 24 | docker images &> /dev/null 25 | if [ $? 
!= 0 ]; then 26 | echo "docker not found or not running. Please install docker to continue" 27 | exit -1 28 | fi 29 | 30 | # look up runtime arguments of the module name and the destination S3 Prefix 31 | while getopts "r:s:f:h:p" opt; do 32 | case $opt in 33 | r) requirement="$OPTARG";; 34 | s) s3Path="$OPTARG";; 35 | f) function="$OPTARG";; 36 | p) python_version="$OPTARG";; 37 | h) usage;; 38 | \?) echo "Invalid option: -"$OPTARG"" >&2 39 | exit 1;; 40 | :) usage;; 41 | esac 42 | done 43 | 44 | # validate arguments 45 | notNull "$requirement" "Please provide the dependency requirement (e.g. boto3>=1.29.0) with -r" 46 | notNull "$s3Path" "Please provide an S3 key to store the library in using -s" 47 | notNull "$function" "Please provide the function name using -f" 48 | 49 | if [ -z "${python_version}" ]; then 50 | python_version="3.9" 51 | fi 52 | dependencyName=$(echo "${requirement}" | sed 's/[<=>]/ /g' | awk '{print $1}') 53 | dependencyVersion=$(echo "${requirement}" | sed 's/[<=>]/ /g' | awk '{print $2}') 54 | notNull "${dependencyName}" "Invalid requirement: ${requirement}. Expected format: 'NAME[>=<]*[0-9\.]*'" 55 | archiveName="${s3Path}/${dependencyName}.zip" 56 | if [ ! -z "${dependencyVersion}" ]; then 57 | archiveName="${s3Path}/${dependencyName}_${dependencyVersion}.zip" 58 | fi 59 | 60 | TMPDIR=.tmp 61 | if [ ! -d "$TMPDIR" ]; then 62 | mkdir $TMPDIR 63 | chmod 2755 $TMPDIR 64 | fi 65 | cd "${TMPDIR}" 66 | 67 | echo "Building Lambda layer inside Docker container..." 68 | mkdir -p "python/lib/python${python_version}/site-packages" 69 | echo "${requirement}" > requirements.txt 70 | export UID 71 | docker run \ 72 | -v "$PWD":/var/task \ 73 | --user $UID:$UID \ 74 | "public.ecr.aws/sam/build-python${python_version}" \ 75 | /bin/sh \ 76 | -c "pip install --upgrade pip" \ 77 | -c "pip install -r requirements.txt -t python/lib/python${python_version}/site-packages/; exit" 78 | 79 | echo "Built, zipping layer contents..." 80 | zip -r "${function}.zip" python > /dev/null 81 | cd - 82 | 83 | echo "Zipped, publishing to ${archiveName}..." 84 | aws s3 cp "${TMPDIR}/${function}.zip" "${archiveName}" 85 | 86 | echo "Published, cleaning up..." 87 | rm -rf "${TMPDIR}" 88 | 89 | echo "Complete." -------------------------------------------------------------------------------- /views/v_generate_udf_ddl.sql: -------------------------------------------------------------------------------- 1 | --DROP VIEW admin.v_generate_udf_ddl; 2 | /********************************************************************************************** 3 | Purpose: View to get the DDL for a UDF. 
4 | History: 5 | 2016-04-20 chriz-bigdata Created 6 | **********************************************************************************************/ 7 | CREATE OR REPLACE VIEW admin.v_generate_udf_ddl 8 | AS 9 | WITH arguments AS (SELECT oid, i, arg_name[i] as argument_name, arg_types[i-1] argument_type 10 | FROM ( 11 | SELECT generate_series(1, array_upper(arg_name, 1)) AS i, arg_name, arg_types,oid 12 | FROM (SELECT oid, proargnames arg_name, proargtypes arg_types from pg_proc where proowner != 1) t 13 | ) t) 14 | SELECT 15 | schemaname, 16 | udfname, 17 | seq, 18 | trim(ddl) ddl FROM ( 19 | SELECT 20 | n.nspname AS schemaname, 21 | p.proname AS udfname, 22 | p.oid AS udfoid, 23 | 1000 as seq, ('CREATE FUNCTION ' || p.proname || ' \(')::varchar(max) as ddl 24 | FROM pg_proc p 25 | LEFT JOIN pg_namespace n on n.oid = p.pronamespace 26 | WHERE p.proowner != 1 27 | UNION ALL 28 | SELECT 29 | n.nspname AS schemaname, 30 | p.proname AS udfname, 31 | p.oid AS udfoid, 32 | 2000+nvl(i,0) as seq, case when i = 1 then argument_name || ' ' || format_type(argument_type,null) else ',' || argument_name || ' ' || format_type(argument_type,null) end as ddl 33 | FROM pg_proc p 34 | LEFT JOIN pg_namespace n on n.oid = p.pronamespace 35 | LEFT JOIN arguments a on a.oid = p.oid 36 | WHERE p.proowner != 1 37 | UNION ALL 38 | SELECT 39 | n.nspname AS schemaname, 40 | p.proname AS udfname, 41 | p.oid AS udfoid, 42 | 3000 as seq, '\)' as ddl 43 | FROM pg_proc p 44 | LEFT JOIN pg_namespace n on n.oid = p.pronamespace 45 | WHERE p.proowner != 1 46 | UNION ALL 47 | SELECT 48 | n.nspname AS schemaname, 49 | p.proname AS udfname, 50 | p.oid AS udfoid, 51 | 4000 as seq, ' RETURNS ' || pg_catalog.format_type(p.prorettype, NULL) as ddl 52 | FROM pg_proc p 53 | LEFT JOIN pg_namespace n on n.oid = p.pronamespace 54 | WHERE p.proowner != 1 55 | UNION ALL 56 | SELECT 57 | n.nspname AS schemaname, 58 | p.proname AS udfname, 59 | p.oid AS udfoid, 60 | 5000 AS seq, CASE WHEN p.provolatile = 'v' THEN 'VOLATILE' WHEN p.provolatile = 's' THEN 'STABLE' WHEN p.provolatile = 'i' THEN 'IMMUTABLE' ELSE '' END as ddl 61 | FROM pg_proc p 62 | LEFT JOIN pg_namespace n on n.oid = p.pronamespace 63 | WHERE p.proowner != 1 64 | UNION ALL 65 | SELECT 66 | n.nspname AS schemaname, 67 | p.proname AS udfname, 68 | p.oid AS udfoid, 69 | 6000 AS seq, 'AS $$' as ddl 70 | FROM pg_proc p 71 | LEFT JOIN pg_namespace n on n.oid = p.pronamespace 72 | WHERE p.proowner != 1 73 | UNION ALL 74 | SELECT 75 | n.nspname AS schemaname, 76 | p.proname AS udfname, 77 | p.oid AS udfoid, 78 | 7000 AS seq, p.prosrc as DDL 79 | FROM pg_proc p 80 | LEFT JOIN pg_namespace n on n.oid = p.pronamespace 81 | WHERE p.proowner != 1 82 | UNION ALL 83 | SELECT 84 | n.nspname AS schemaname, 85 | p.proname AS udfname, 86 | p.oid AS udfoid, 87 | 8000 as seq, '$$ LANGUAGE plpythonu;' as ddl 88 | FROM pg_proc p 89 | LEFT JOIN pg_namespace n on n.oid = p.pronamespace 90 | WHERE p.proowner != 1 91 | ) 92 | ORDER BY udfoid,seq; 93 | -------------------------------------------------------------------------------- /lambda-udfs/fn_lambda_levenshtein_distance(varchar,varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | LambdaRole: 4 | Type: AWS::IAM::Role 5 | Properties: 6 | AssumeRolePolicyDocument: 7 | Version: 2012-10-17 8 | Statement: 9 | - 10 | Effect: Allow 11 | Principal: 12 | Service: 13 | - lambda.amazonaws.com 14 | Action: 15 | - sts:AssumeRole 16 | Path: / 17 | 
Policies: 18 | - 19 | PolicyName: CloudwatchLogs 20 | PolicyDocument: 21 | Version: 2012-10-17 22 | Statement: 23 | - 24 | Effect: Allow 25 | Action: 26 | - logs:CreateLogGroup 27 | Resource: 28 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 29 | - 30 | Effect: Allow 31 | Action: 32 | - logs:CreateLogStream 33 | - logs:PutLogEvents 34 | Resource: 35 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 36 | LambdaLevenshteinDistance: 37 | Type: "AWS::Lambda::Function" 38 | Properties: 39 | FunctionName: fn_lambda_levenshtein_distance 40 | Role: !GetAtt 'LambdaRole.Arn' 41 | Timeout: 600 42 | Code: 43 | ZipFile: | 44 | import json 45 | def levenshtein_distance(a, len_a, b, len_b): 46 | d = [[0] * (len_b + 1) for i in range(len_a + 1)] 47 | for i in range(1, len_a + 1): 48 | d[i][0] = i 49 | for j in range(1, len_b + 1): 50 | d[0][j] = j 51 | for j in range(1, len_b + 1): 52 | for i in range(1, len_a + 1): 53 | if a[i - 1] == b[j - 1]: 54 | cost = 0 55 | else: 56 | cost = 1 57 | d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost) 58 | return d[len_a][len_b] 59 | def distance(a, b): 60 | if a is None: 61 | len_a = 0 62 | else: 63 | len_a = len(a) 64 | if b is None: 65 | len_b = 0 66 | else: 67 | len_b = len(b) 68 | if len_a == 0: 69 | return len_b 70 | elif len_b == 0: 71 | return len_a 72 | else: 73 | return levenshtein_distance(a, len_a, b, len_b) 74 | def lambda_handler(event, context): 75 | ret = dict() 76 | res = [] 77 | for argument in event['arguments']: 78 | try: 79 | s1=str(argument[0]) 80 | s2=str(argument[1]) 81 | for argument in event['arguments']: 82 | s1=str(argument[0]) 83 | s2=str(argument[1]) 84 | res.append(distance(s1, s2)) 85 | except Exception as e: 86 | print (str(e)) 87 | ret['error_msg'] = str(e) 88 | ret['sucess'] = False 89 | ret['sucess'] = True 90 | ret['results'] = res 91 | return json.dumps(ret) 92 | Handler: index.lambda_handler 93 | Runtime: python3.7 94 | -------------------------------------------------------------------------------- /lambda-udfs/f_dynamodb_lookup_python(varchar,varchar,varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | LambdaRole: 4 | Type: AWS::IAM::Role 5 | Properties: 6 | AssumeRolePolicyDocument: 7 | Version: 2012-10-17 8 | Statement: 9 | - 10 | Effect: Allow 11 | Principal: 12 | Service: 13 | - lambda.amazonaws.com 14 | Action: 15 | - sts:AssumeRole 16 | Path: / 17 | Policies: 18 | - 19 | PolicyName: CloudwatchLogs 20 | PolicyDocument: 21 | Version: 2012-10-17 22 | Statement: 23 | - 24 | Effect: Allow 25 | Action: 26 | - logs:CreateLogGroup 27 | Resource: 28 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 29 | - 30 | Effect: Allow 31 | Action: 32 | - logs:CreateLogStream 33 | - logs:PutLogEvents 34 | Resource: 35 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 36 | - 37 | Effect: Allow 38 | Action: 39 | - dynamodb:DescribeTable 40 | - dynamodb:GetItem 41 | Resource: 42 | - !Sub "arn:${AWS::Partition}:dynamodb:${AWS::Region}:${AWS::AccountId}:table/*" 43 | LambdaUDFFunction: 44 | Type: "AWS::Lambda::Function" 45 | Properties: 46 | FunctionName: f-dynamodb-lookup-python-varchar-varchar-varchar 47 | Role: !GetAtt 'LambdaRole.Arn' 48 | Handler: index.handler 49 | Runtime: python3.9 50 | Timeout: 300 51 | Code: 52 | ZipFile: | 53 | import decimal 54 | import json 55 | 56 | 
import boto3 57 | 58 | dynamodb = boto3.resource("dynamodb") 59 | 60 | 61 | class DecimalEncoder(json.JSONEncoder): 62 | """Handle JSON encoding of Decimal data (necessary because TypeDeserializer defaults to Decimal for floating point values).""" 63 | 64 | def default(self, o): 65 | if isinstance(o, decimal.Decimal): 66 | return str(o) 67 | return super(DecimalEncoder, self).default(o) 68 | 69 | 70 | def handler(event, context): 71 | redshift_response = {"success": False, "num_records": event["num_records"]} 72 | try: 73 | result = [] 74 | for table_name, column_name, column_value in event["arguments"]: 75 | table = dynamodb.Table(table_name) 76 | try: 77 | ddb_response = table.get_item(Key={column_name: column_value}) 78 | result.append(json.dumps(ddb_response["Item"], cls=DecimalEncoder)) 79 | except Exception as e: 80 | print(f"Error retrieving from DDB: {e}") 81 | result.append(None) 82 | redshift_response["success"] = True 83 | redshift_response["results"] = result 84 | except Exception as e: 85 | redshift_response["error_msg"] = str(e) 86 | return json.dumps(redshift_response) -------------------------------------------------------------------------------- /stored-procedures/sp_vector_search/sp_vector_search.sql: -------------------------------------------------------------------------------- 1 | /********************************************************************************************** 2 | Purpose: Execute vector search leveraging vector indexes 3 | Notes: 4 | This procedure is used to execute a vectorized searh of your query string. The procedure will create 5 | and on-the-fly embedding using the LambdaUDF f_titan_embeding(varchar) and compare the result to your 6 | K-Means clusters create using the stored procedure sp_vector_search. See the following article for more info: 7 | https://repost.aws/articles/ARPoweQIN2ROOXZiJAtSQvkQ/vector-search-with-amazon-redshift 8 | 9 | Parameters: 10 | tablename : The table which was the source of the data which contains the batch embeddings and K-Means clusters. 11 | search : The texst you want to search 12 | cnt : The number of results you want to return 13 | tmp_name : The name of the temp table that will be created to return your search results. 14 | 15 | Requirements: 16 | expects a table with the following tables to exist and be populated: 17 | CREATE TABLE $(tablename)_embeddings 18 | ( "recordId" VARCHAR(15), 19 | "modelOutput" SUPER ) DISTKEY (recordid); 20 | CREATE TABLE $(tablename)_kmeans 21 | ( cluster int, 22 | centroid SUPER, 23 | startts timestamp, 24 | endts timestamp, 25 | interations int) DISTSTYLE ALL; 26 | CREATE TABLE $(tablename)_kmeans_clusters 27 | ( cluster int, 28 | "recordId" VARCHAR(15), 29 | similarity float, 30 | rnk int) DISTKEY (recordid); 31 | 32 | History: 33 | 2024-07-19 - rjvgupta - Created 34 | 2024-10-25 - rjvgupta - Updated for case-sensitivity. 
35 | **********************************************************************************************/ 36 | 37 | SET enable_case_sensitive_identifier TO true; 38 | 39 | CREATE OR REPLACE PROCEDURE sp_vector_search (tablename IN varchar, search IN varchar, cnt IN int, tmp_name IN varchar) AS $$ 40 | BEGIN 41 | EXECUTE 'drop table if exists #'||tmp_name; 42 | EXECUTE 'create table #'||tmp_name ||' ("recordId" varchar(100), similarity float)'; 43 | EXECUTE 'insert into #'||tmp_name ||' 44 | select re."recordId", sum(rv::float*qv::float)/SQRT(sum(rv::float*rv::float)*sum(qv::float*qv::float)) esimilarity 45 | from ( 46 | select k.cluster, q.q, sum(kv::float*qv::float)/SQRT(sum(kv::float*kv::float)*sum(qv::float*qv::float)) csimilarity 47 | from '||tablename||'_kmeans k, k.centroid kv at kvi, 48 | (select JSON_PARSE(f_titan_embedding('''||search||''')) as q) q, q.q qv at qvi 49 | where kvi = qvi 50 | group by 1,2 51 | qualify rank() over (order by csimilarity desc) = 1 52 | ) q, '||tablename||'_kmeans_clusters c, '||tablename||'_embeddings re, q.q qv at qvi, re."modelOutput".embedding rv at rvi 53 | where rvi = qvi and c.cluster = q.cluster and c."recordId" = re."recordId" 54 | group by 1 55 | qualify rank() over (order by esimilarity desc) <= '||cnt; 56 | END $$ LANGUAGE plpgsql; 57 | 58 | /* Usage Example: 59 | SET enable_case_sensitive_identifier TO true; 60 | call sp_vector_search('reviews', 'bad broken unreliable slow', 100, 'searchresults'); 61 | 62 | select review_id, product_title, review_title, review_desc, similarity 63 | from #searchresults 64 | join reviews on review_id = "recordId" 65 | order by similarity desc; 66 | */ 67 | -------------------------------------------------------------------------------- /bin/libraryInstaller.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | # Install Pip Module as Redshift Library 4 | 5 | function usage { 6 | echo "./libraryInstaller.sh -m <module> -s <s3 prefix> -r <s3 role> -c <cluster> -d <db> -u <db user>" 7 | echo 8 | echo "where <module> is the name of the Pip module to be installed" 9 | echo " <s3 prefix> is the location on S3 to upload the artifact to. Must be in format s3://bucket/prefix" 10 | echo " <s3 role> is the role which is attached to the Redshift cluster and has access to read from the s3 upload location" 11 | echo " <cluster> is the Redshift cluster you will deploy the function to" 12 | echo " <db> is the database you will deploy the function to" 13 | echo " <db user> is the db user who will create the function" 14 | 15 | exit 0; 16 | } 17 | 18 | 19 | function notNull { 20 | if [ "$1x" == "x" ]; then 21 | echo $2 22 | exit -1 23 | fi 24 | } 25 | 26 | # make sure we have wheel installed into pip 27 | pip3 show wheel &> /dev/null 28 | if [ $? != 0 ]; then 29 | echo "pip3 wheel not found. Please install with 'sudo pip install wheel'" 30 | exit -1 31 | fi 32 | 33 | # look up runtime arguments of the module name and the destination S3 Prefix 34 | while getopts "m:s:r:c:d:u:h" opt; do 35 | case $opt in 36 | m) module="$OPTARG";; 37 | s) s3Prefix="$OPTARG";; 38 | r) s3Role="$OPTARG";; 39 | c) cluster="$OPTARG";; 40 | d) db="$OPTARG";; 41 | u) user="$OPTARG";; 42 | h) usage;; 43 | \?) echo "Invalid option: -"$OPTARG"" >&2 44 | exit 1;; 45 | :) usage;; 46 | esac 47 | done 48 | 49 | # validate arguments 50 | notNull "$module" "Please provide the pip module name using -m" 51 | notNull "$s3Prefix" "Please provide an S3 Prefix to store the library in using -s" 52 | 53 | # check that the s3 prefix is in the right format 54 | # starts with 's3://' 55 | 56 | if !
[[ $s3Prefix == s3:\/\/* ]]; then 57 | echo "S3 Prefix must start with 's3://'" 58 | echo 59 | usage 60 | fi 61 | 62 | # found the module - install to a local hidden directory 63 | echo "Installing $module with pip and uploading to $s3Prefix" 64 | 65 | TMPDIR=.tmp 66 | if [ ! -d "$TMPDIR" ]; then 67 | mkdir $TMPDIR 68 | fi 69 | 70 | rm -Rf "$TMPDIR/.$module" &> /dev/null 71 | 72 | mkdir "$TMPDIR/.$module" 73 | 74 | pip3 wheel $module --wheel-dir "$TMPDIR/.$module" 75 | if [ $? != 0 ]; then 76 | echo "Unable to find module $module in pip." 77 | rm -Rf "$TMPDIR/.$module" 78 | exit $? 79 | fi 80 | 81 | 82 | execQuery() 83 | { 84 | echo $4 85 | output=`aws redshift-data execute-statement --cluster-identifier $1 --database $2 --db-user $3 --sql "$4"` 86 | id=`echo $output | jq -r .Id` 87 | notNull "$id" "Error running execute-statement" 88 | 89 | status="SUBMITTED" 90 | while [ "$status" != "FINISHED" ] && [ "$status" != "FAILED" ] 91 | do 92 | sleep 1 93 | status=`aws redshift-data describe-statement --id $id | jq -r .Status` 94 | notNull "$status" "Error running describe-statement" 95 | done 96 | if [ "$status" == "FAILED" ]; then 97 | aws redshift-data describe-statement --id $id 98 | return 1 99 | else 100 | echo $id:$status 101 | fi 102 | } 103 | 104 | files=`ls ${TMPDIR}/.${module}/*.whl` 105 | for depname in `basename -s .whl $files` 106 | do 107 | #depname=${file%.*} 108 | #depname=`basename -s .whl $file` 109 | echo $depname 110 | aws s3 cp "$TMPDIR/.$module/$depname.whl" "$s3Prefix/$depname.zip" 111 | sql="CREATE OR REPLACE LIBRARY ${depname%%-*} LANGUAGE plpythonu FROM '$s3Prefix/$depname.zip' WITH CREDENTIALS AS 'aws_iam_role=$s3Role'; " 112 | execQuery $cluster $db $user "$sql" 113 | if [ $? != 0 ]; then 114 | rm -Rf "$TMPDIR/.$module" 115 | exit $? 
116 | fi 117 | done 118 | 119 | rm -Rf "$TMPDIR/.$module" 120 | cd - &> /dev/null 121 | 122 | exit 0 123 | -------------------------------------------------------------------------------- /lambda-udfs/f_titan_embedding(varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | LambdaRole: 4 | Type: AWS::IAM::Role 5 | Properties: 6 | AssumeRolePolicyDocument: 7 | Version: 2012-10-17 8 | Statement: 9 | - 10 | Effect: Allow 11 | Principal: 12 | Service: 13 | - lambda.amazonaws.com 14 | Action: 15 | - sts:AssumeRole 16 | Path: / 17 | Policies: 18 | - 19 | PolicyName: CloudwatchLogs 20 | PolicyDocument: 21 | Version: 2012-10-17 22 | Statement: 23 | - 24 | Effect: Allow 25 | Action: 26 | - logs:CreateLogGroup 27 | Resource: 28 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 29 | - 30 | Effect: Allow 31 | Action: 32 | - logs:CreateLogStream 33 | - logs:PutLogEvents 34 | Resource: 35 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 36 | - 37 | Effect: Allow 38 | Action: 39 | - bedrock:InvokeModel 40 | Resource: 41 | - !Sub "arn:${AWS::Partition}:bedrock:${AWS::Region}::foundation-model/amazon.titan-embed-text-v1" 42 | LambdaUDFFunction: 43 | Type: "AWS::Lambda::Function" 44 | Properties: 45 | FunctionName: f-titan-embedding-varchar 46 | Role: !GetAtt 'LambdaRole.Arn' 47 | Handler: index.handler 48 | Runtime: python3.9 49 | Timeout: 300 50 | Code: 51 | ZipFile: | 52 | import boto3, json, sys 53 | bedrock_runtime = boto3.client(service_name="bedrock-runtime") 54 | 55 | def generate_embeddings(text=None): 56 | try: 57 | input_data = {"inputText": text, "dimensions": 256} 58 | response = bedrock_runtime.invoke_model( 59 | body=json.dumps(input_data), 60 | modelId="amazon.titan-embed-text-v1", 61 | accept="application/json", 62 | contentType="application/json" 63 | ) 64 | response = response.get("body").read().decode('utf-8') 65 | response_json = json.loads(response) 66 | return response_json.get("embedding") 67 | 68 | except Exception as e: 69 | exc_type, exc_obj, exc_tb = sys.exc_info() 70 | print(exc_type, exc_tb.tb_lineno) 71 | print('Error: '+ str(e)) 72 | return '' 73 | 74 | def handler(event, context): 75 | print(event) 76 | redshift_response = {"success": False, "num_records": event["num_records"]} 77 | try: 78 | result = [] 79 | for row in event["arguments"]: 80 | try: 81 | embedding = generate_embeddings(row[0]) 82 | result.append(json.dumps(embedding)) 83 | except Exception as e: 84 | print(f"Error: {e}") 85 | result.append(None) 86 | redshift_response["success"] = True 87 | redshift_response["results"] = result 88 | except Exception as e: 89 | redshift_response["error_msg"] = str(e) 90 | print('Error: '+ str(e)) 91 | exc_type, exc_obj, exc_tb = sys.exc_info() 92 | print(exc_type, exc_tb.tb_lineno) 93 | return json.dumps(redshift_response) 94 | -------------------------------------------------------------------------------- /lambda-udfs/f_unmarshall_dynamodb_stream_data(varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | LambdaRole: 4 | Type: AWS::IAM::Role 5 | Properties: 6 | AssumeRolePolicyDocument: 7 | Version: 2012-10-17 8 | Statement: 9 | - 10 | Effect: Allow 11 | Principal: 12 | Service: 13 | - lambda.amazonaws.com 14 | Action: 15 | - sts:AssumeRole 16 | Path: / 17 | Policies: 18 | - 19 | PolicyName: 
CloudwatchLogs 20 | PolicyDocument: 21 | Version: 2012-10-17 22 | Statement: 23 | - 24 | Effect: Allow 25 | Action: 26 | - logs:CreateLogGroup 27 | Resource: 28 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 29 | - 30 | Effect: Allow 31 | Action: 32 | - logs:CreateLogStream 33 | - logs:PutLogEvents 34 | Resource: 35 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 36 | LambdaUDFFunction: 37 | Type: "AWS::Lambda::Function" 38 | Properties: 39 | FunctionName: f-unmarshall-dynamodb-stream-data-varchar 40 | Role: !GetAtt 'LambdaRole.Arn' 41 | Handler: index.handler 42 | Runtime: python3.9 43 | Timeout: 300 44 | Code: 45 | ZipFile: | 46 | import decimal 47 | import json 48 | from typing import Dict 49 | 50 | from boto3.dynamodb.types import TypeDeserializer 51 | 52 | deserializer = TypeDeserializer() 53 | 54 | 55 | class DecimalEncoder(json.JSONEncoder): 56 | """Handle JSON encoding of Decimal data (necessary because TypeDeserializer defaults to Decimal for floating point values).""" 57 | 58 | def default(self, o): 59 | if isinstance(o, decimal.Decimal): 60 | return str(o) 61 | return super(DecimalEncoder, self).default(o) 62 | 63 | 64 | def _ddb_to_json(data: Dict, prop: str) -> Dict: 65 | """Convert DynamoDB encoded data into normal JSON. 66 | 67 | :param data: A mapping of {"key": {dynamo db encoded data}} 68 | :param prop: The key to convert from the input data (e.g. Keys or NewImage from DynamoDB Streams) 69 | """ 70 | if prop not in data: 71 | return {} 72 | return deserializer.deserialize({"M": data[prop]}) 73 | 74 | 75 | def parse_dynamodb(dynamodb_json_string: str) -> str: 76 | """Parse the "dynamodb" key from a DynamoDB Streams message into a Spark struct with JSON encoded keys / image. 77 | 78 | Converts from DynamoDB record encoding to normal JSON encoding. 79 | """ 80 | data = json.loads(dynamodb_json_string) 81 | data["dynamodb"]["Keys"] = _ddb_to_json(data["dynamodb"], "Keys") 82 | data["dynamodb"]["NewImage"] = _ddb_to_json(data["dynamodb"], "NewImage") 83 | data["dynamodb"]["OldImage"] = _ddb_to_json(data["dynamodb"], "OldImage") 84 | return json.dumps(data, cls=DecimalEncoder) 85 | 86 | 87 | def handler(event, context): 88 | try: 89 | return json.dumps({ 90 | "success": True, 91 | "num_records": event["num_records"], 92 | "results": [parse_dynamodb(record[0]) for record in event["arguments"]], 93 | }) 94 | except Exception as e: 95 | return json.dumps({ 96 | "success": False, 97 | "error_msg": f"Error processing Lambda event. Error: {e}", 98 | }) 99 | -------------------------------------------------------------------------------- /bin/testFunction.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -eu 3 | 4 | category=$1 5 | function=$2 6 | cluster=$3 7 | db=$4 8 | user=$5 9 | schema=$6 10 | 11 | function usage { 12 | echo "./deployFunction.sh -t -f -c -d -u " 13 | echo 14 | echo "where is the type of function to be installed. e.g. python-udfs, lambda-udfs, sql-udfs" 15 | echo " is the name of the function, including the parameters and enclosed in quotes e.g. 
\"f_bitwise_to_string(bigint,int)\"" 16 | echo " is the Redshift cluster you will deploy the function to" 17 | echo " is the database you will deploy the function to" 18 | echo " is the db user who will create the function" 19 | echo " is the db schema where the function will be created" 20 | 21 | exit 0; 22 | } 23 | 24 | execQuery() { 25 | output=`aws redshift-data batch-execute-statement --cluster-identifier $1 --database $2 --db-user $3 --sql "set search_path to $4; $5" "$6"` 26 | id=`echo $output | jq -r .Id` 27 | notNull "$id" "Error running batch-execute-statement" 28 | status="SUBMITTED" 29 | while [ "$status" != "FINISHED" ] && [ "$status" != "FAILED" ] 30 | do 31 | sleep 1 32 | status=`aws redshift-data describe-statement --id $id | jq -r .Status` 33 | done 34 | if [ "$status" == "FAILED" ]; then 35 | aws redshift-data describe-statement --id $id 36 | exit 1 37 | else 38 | aws redshift-data get-statement-result --id $id:2 39 | fi 40 | } 41 | 42 | function checkDep { 43 | which $1 >> /dev/null 44 | if [ $? -ne 0 ]; then 45 | echo "Unable to find required dependency $1" 46 | exit -1 47 | fi 48 | } 49 | 50 | function notNull { 51 | if [ "$1x" == "x" ]; then 52 | echo $2 53 | exit -1 54 | fi 55 | } 56 | 57 | # make sure we have pip and the aws cli installed 58 | checkDep "aws" 59 | 60 | # look up runtime arguments of the module name and the destination S3 Prefix 61 | while getopts "t:f:s:l:r:c:d:u:n:h" opt; do 62 | case $opt in 63 | t) type="$OPTARG";; 64 | f) function="$OPTARG";; 65 | c) cluster="$OPTARG";; 66 | d) db="$OPTARG";; 67 | u) user="$OPTARG";; 68 | n) schema="$OPTARG";; 69 | h) usage;; 70 | \?) echo "Invalid option: -"$OPTARG"" >&2 71 | exit 1;; 72 | :) usage;; 73 | esac 74 | done 75 | 76 | # validate required arguments 77 | notNull "$type" "Please provide the function type -t" 78 | notNull "$function" "Please provide the function name -f" 79 | notNull "$cluster" "Please provide the Redshift cluster name -c" 80 | notNull "$db" "Please provide the Redshift cluster db name -d" 81 | notNull "$user" "Please provide the Redshift cluster user name -u" 82 | notNull "$schema" "Please provide the Redshift cluster namespace (schema) -n" 83 | 84 | 85 | if test -f "../$type/$function/resources.yaml"; then 86 | template=$(<"../$type/$function/resources.yaml") 87 | stackname=${function//(/-} 88 | stackname=${stackname//)/} 89 | stackname=${stackname//_/-} 90 | stackname=${stackname//,/-} 91 | if ! 
aws cloudformation deploy --template-file ../${type}/${function}/resources.yaml --stack-name ${stackname}-resources --no-fail-on-empty-changeset --capabilities CAPABILITY_IAM; then 92 | aws cloudformation delete-stack --stack-name ${stackname}-resources 93 | exit 1 94 | fi 95 | fi 96 | 97 | arrIN=(${function//(/ }) 98 | name=${arrIN[0]} 99 | args=${arrIN[1]} 100 | args=${args%?} 101 | i=0 102 | fields= 103 | params= 104 | OIFS=$IFS 105 | IFS=',' 106 | for arg in $args 107 | do 108 | if [ "${arg}" == "varchar" ]; then arg="varchar(max)"; fi 109 | fields="$fields,p$i $arg" 110 | params="$params,p$i" 111 | ((i=i+1)) 112 | done 113 | IFS=$OIFS 114 | 115 | sql="drop table if exists #$name" 116 | sql="$sql;create table #$name (seq int$fields)"; 117 | 118 | rows="" 119 | sql="$sql;insert into #$name values " 120 | 121 | i=0 122 | while read row; do 123 | rows="$rows ($i,$row)," 124 | ((i=i+1)) 125 | done <"../$type/$function/input.csv" 126 | 127 | sql="$sql${rows%?}" 128 | sql1="select $name(${params:1})::varchar(max) from #$name order by seq;" 129 | echo "$sql;$sql1" 130 | output=`execQuery $cluster $db $user $schema "$sql" "$sql1"` 131 | echo $output | jq -r '.Records | .[] | [.[0].stringValue] | .[]' > output.csv 132 | if test -f "../$type/$function/output.csv"; then 133 | diff output.csv "../$type/$function/output.csv" 134 | echo "Test passed. Result from Redshift: " 135 | cat output.csv 136 | rm output.csv 137 | else 138 | echo "Test skipped because output may vary based on your deployment." 139 | fi 140 | -------------------------------------------------------------------------------- /stored-procedures/sp_update_permissions/README.md: -------------------------------------------------------------------------------- 1 | # sp_update_permissions 2 | 3 | Stored procedure that reads user, group and role permission matrix from Amazon S3 and updates authorisation in Redshift accordingly. It helps startups and small to medium organisations that haven't integrated Amazon Redshift with an identity provider to streamline security measures and acceess control for their data warehouse built with Amazon Redshift. This SP can be used for bulk update of permissions for principals mentioned above, at schema, table, column and row level. 4 | 5 | It expects the file in delimited text format with following schema and "|" as delimiter: 6 |      id|operation|principal|principal_type|object_type|object_name|access_option 7 | 8 | For example: 9 | 10 |      1|grant|group_1|group|schema|schema_1|usage 11 | 12 |      2|revoke|user_1|user|table|table_1|select 13 | 14 | ## Pre-requisites 15 | 16 | 1- Create an S3 bucket: The first step before starting to use the stored procedure is to load the first version of the file into an Amazon S3 bucket. Take a note of the S3 bucket name. 17 | 18 | 2- Create IAM role for Redshift to access external data: Since the stored procedure reads data stored in S3 using external schema, you need a Redshift role with relevant permissions. Please make sure you have created the role by following the steps [here](https://docs.aws.amazon.com/redshift/latest/dg/c-getting-started-using-spectrum-create-role.html). After the role is created, attach it to your Redshift cluster in the Properties tab of Amazon Redshift console page. 19 | 20 | 3- AWS Lake Formation access for Redshift external role: If you have Lake Formation enabled for your account and region, you need to add the IAM role created to access external data as admin in AWS Lake Formation. 
Follow the steps below: 21 | 22 | - Navigate to AWS Lake Formation console 23 | - From the left pane, choose "Administrative roles and tasks" under Permissions 24 | - In the Data lake administrators section, click on Choose administrators 25 | - Search for and select your role from the drop-down list 26 | - Click on Save 27 | 28 | Note that this step is needed only if you have enabled AWS Lake Formation for the account and region where your Amazon Redshift cluster resides. 29 | 30 | The rest of the steps from here should be executed from Amazon Redshift Query Editor Version 2 (QEv2) as superuser. 31 | 32 | 33 | ## Install 34 | 1- Create external schema: Run the following command to create the external schema, replacing the IAM Role ARN with the ARN of your Spectrum role: 35 | 36 | ```sql 37 | create external schema access_management 38 | from data catalog database redshift 39 | iam_role '[Spectrum_Role_ARN]' 40 | create external database if not exists; 41 | 42 | ``` 43 | Feel free to change the name of the schema. Take a note of it if you prefer a different name. 44 | 45 | 2- Create external table: Run the following command to create the external table: 46 | ```sql 47 | create external table access_management.redshift_access_details( 48 | id integer, 49 | operation varchar(50), 50 | principal varchar(50), 51 | principal_type varchar(50), 52 | object_type varchar(50), 53 | object_name varchar(50), 54 | access_option varchar(50)) 55 | row format delimited 56 | fields terminated by '|' 57 | stored as textfile 58 | location 's3://[path_to_location_your_file_is_stored_in_s3]' 59 | ; 60 | 61 | ``` 62 | As in the previous step, feel free to change the name of the table and take a note of it. 63 | 64 | 3- Test external table: Copy the first user access details file, "user_access_details1.csv", into your S3 bucket and run the following command to make sure you can query the file you uploaded to S3: 65 | 66 | ```sql 67 | SELECT * FROM access_management.redshift_access_details; 68 | 69 | ``` 70 | You are ready to move to the next steps if your query returns the rows from the file. 71 | 72 | 73 | 4- Log in to Redshift Query Editor Version 2 (QEv2) as superuser, copy and paste the code in "sp_update_permissions.sql" into QEv2 and execute it. 74 | 75 | 76 | ## Usage 77 | To execute the stored procedure, replace the input parameters in the command below with the names you used in the pre-requisite steps: 78 | 79 | ```sql 80 | CALL sp_update_permissions('[external_schema_name]', '[external_table_name]'); 81 | 82 | ``` 83 | If you didn't change the schema and table names, the command should be: 84 | ```sql 85 | CALL sp_update_permissions('access_management', 'redshift_access_details'); 86 | ``` 87 | 88 | ## Test 89 | 90 | A set of test scenarios is covered in "test_scenarios.sql". 
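
After running a scenario, you can also verify the resulting privileges directly from the system catalog instead of switching sessions. The following is a minimal sketch that assumes the user1/user2 users and the test_user_management.sales table created in "test_scenarios.sql":

```sql
-- Run as superuser; each column returns true or false
SELECT has_schema_privilege('user1', 'test_user_management', 'usage')       AS user1_schema_usage,
       has_table_privilege('user1', 'test_user_management.sales', 'select') AS user1_select,
       has_table_privilege('user2', 'test_user_management.sales', 'select') AS user2_select;
```

Checking privileges this way lets you confirm the effect of each scenario with a single query, without re-authenticating as each test user.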
91 | -------------------------------------------------------------------------------- /lambda-udfs/f_dynamodb_lookup_python(varchar,varchar,varchar)/resources.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | DynamoDBTable: 4 | Type: AWS::DynamoDB::Table 5 | Properties: 6 | TableName: CustomerLambdaUDF 7 | AttributeDefinitions: 8 | - 9 | AttributeName: "id" 10 | AttributeType: "S" 11 | KeySchema: 12 | - 13 | AttributeName: "id" 14 | KeyType: "HASH" 15 | BillingMode: PAY_PER_REQUEST 16 | InitRole: 17 | Type: AWS::IAM::Role 18 | Properties: 19 | AssumeRolePolicyDocument: 20 | Version: 2012-10-17 21 | Statement: 22 | - 23 | Effect: Allow 24 | Principal: 25 | Service: 26 | - lambda.amazonaws.com 27 | Action: 28 | - sts:AssumeRole 29 | Path: / 30 | Policies: 31 | - 32 | PolicyName: CloudwatchLogs 33 | PolicyDocument: 34 | Version: 2012-10-17 35 | Statement: 36 | - 37 | Effect: Allow 38 | Action: 39 | - logs:CreateLogGroup 40 | Resource: 41 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 42 | - 43 | Effect: Allow 44 | Action: 45 | - logs:CreateLogStream 46 | - logs:PutLogEvents 47 | Resource: 48 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 49 | - 50 | Effect: Allow 51 | Action: 52 | - dynamodb:PutItem 53 | Resource: 54 | - !Sub "arn:${AWS::Partition}:dynamodb:${AWS::Region}:${AWS::AccountId}:table/*" 55 | InitFunction: 56 | Type: AWS::Lambda::Function 57 | Properties: 58 | Role: !GetAtt 'InitRole.Arn' 59 | Handler: index.handler 60 | Runtime: python3.9 61 | Timeout: 30 62 | Code: 63 | ZipFile: | 64 | import boto3, json, urllib3 65 | 66 | http = urllib3.PoolManager() 67 | dynamodb = boto3.resource('dynamodb') 68 | 69 | def send(event, context, responseStatus, responseData): 70 | responseUrl = event['ResponseURL'] 71 | responseBody = { 72 | 'Status' : responseStatus, 73 | 'Reason' : "See the details in CloudWatch Log Stream: {}".format(context.log_stream_name), 74 | 'PhysicalResourceId' : context.log_stream_name, 75 | 'StackId' : event['StackId'], 76 | 'RequestId' : event['RequestId'], 77 | 'LogicalResourceId' : event['LogicalResourceId'], 78 | 'NoEcho' : False, 79 | 'Data' : responseData 80 | } 81 | json_responseBody = json.dumps(responseBody) 82 | headers = { 83 | 'content-type' : '', 84 | 'content-length' : str(len(json_responseBody)) 85 | } 86 | try: 87 | response = http.request('PUT', responseUrl, headers=headers, body=json_responseBody) 88 | except Exception as e: 89 | print("send(..) 
failed executing http.request(..):", e) 90 | 91 | 92 | def handler(event, context): 93 | print("Received event: " + json.dumps(event, indent=2)) 94 | 95 | if event['RequestType'] == 'Delete': 96 | send(event, context, 'SUCCESS', {'Data': 'Delete complete'}) 97 | else: 98 | try: 99 | table = dynamodb.Table(event['ResourceProperties']['DynamoTableName']) 100 | table.put_item(Item={'id':'1','fname':'John','lname':'Doe','size':2}) 101 | table.put_item(Item={'id':'2','fname':'Jane','lname':'Doe','size':3}) 102 | table.put_item(Item={'id':'3','fname':'Carlos','lname':'Salazar','size':5}) 103 | table.put_item(Item={'id':'4','fname':'Diego','lname':'Ramirez','size':7}) 104 | table.put_item(Item={'id':'5','fname':'Mary','lname':'Major','size':11}) 105 | table.put_item(Item={'id':'6','fname':'Richard','lname':'Roe','size':13}) 106 | send(event, context, 'SUCCESS', {'Data': 'Insert complete'}) 107 | except Exception as e: 108 | print(e) 109 | send(event, context, 'FAILED', {'Data': 'Insert failed'}) 110 | InitializeDynamoDB: 111 | Type: Custom::InitFunction 112 | DeletionPolicy: Retain 113 | DependsOn: DynamoDBTable 114 | Properties: 115 | ServiceToken: 116 | Fn::GetAtt: [ InitFunction , "Arn" ] 117 | DynamoTableName: CustomerLambdaUDF 118 | -------------------------------------------------------------------------------- /stored-procedures/sp_update_permissions/test_scenarios.sql: -------------------------------------------------------------------------------- 1 | -- Create schema, table, and insert few records 2 | CREATE SCHEMA test_user_management; 3 | create table test_user_management.sales( 4 | sales_id integer, 5 | seller_id integer, 6 | buyer_id integer , 7 | qty_sold smallint , 8 | price_paid decimal(8,2)); 9 | 10 | INSERT INTO test_user_management.sales(sales_id ,seller_id ,buyer_id ,qty_sold ,price_paid) 11 | VALUES (11,11,111,30,13.31), 12 | (12,11,111,33,13.31), 13 | (13,10,100,1,1.01), 14 | (14,10,100,2,11.11) 15 | 16 | SELECT * 17 | FROM test_user_management.sales; 18 | 19 | -- Create users and groups 20 | create user user1 password 'testAccess8'; 21 | create user user2 password 'testAccess8'; 22 | create group group1 with user user2; 23 | 24 | REVOKE USAGE ON SCHEMA test_user_management FROM public; 25 | REVOKE SELECT ON dms_sample_dbo.mlb_data from PUBLIC; 26 | 27 | /*--- Test 1: GRANT access to user ---*/ 28 | -- Load user_access_details1.csv into S3 bucket and execute SP (as superuser) 29 | -- Expected result: user1 should be able to query the table 30 | SET SESSION AUTHORIZATION 'user1'; 31 | SELECT * 32 | FROM test_user_management.sales; 33 | 34 | -- Query should fail for user2 35 | SET SESSION AUTHORIZATION 'user2'; 36 | SELECT * 37 | FROM test_user_management.sales; 38 | 39 | 40 | /*--- Test 2: REVOKE access from user1 & GRANT access to group1 ---*/ 41 | -- Load user_access_details2.csv into S3 bucket, delete user_access_details1.csv and execute SP (as superuser) 42 | -- Expected result: user1 should NOT be able to query the table now, since his access is revoked. 
user2 should be able to query the table, since he is part of the group 43 | 44 | SET SESSION AUTHORIZATION 'user1'; 45 | SELECT * 46 | FROM test_user_management.sales; 47 | 48 | -- Query should fail for user2 49 | SET SESSION AUTHORIZATION 'user2'; 50 | SELECT * 51 | FROM test_user_management.sales; 52 | 53 | 54 | /*--- Test 3: GRANT column-level access to user1 & GRANT execute function permission to user1 ---*/ 55 | -- First create the function 56 | SET SESSION AUTHORIZATION '[superuser]'; 57 | CREATE function f_sql_greater (float, float) 58 | returns float 59 | stable 60 | as $$ 61 | select case when $1 > $2 then $1 62 | else $2 63 | end 64 | $$ language sql; 65 | 66 | REVOKE EXECUTE ON FUNCTION f_sql_greater (float, float) FROM PUBLIC; 67 | 68 | -- Load user_access_details3.csv into S3 bucket, delete user_access_details2.csv and execute SP (as superuser) 69 | -- Expected result: user1 should only be able to query the columns he has access to. He should also be able to execute the SQL function 70 | 71 | SET SESSION AUTHORIZATION 'user1'; 72 | SELECT buyer_id, price_paid 73 | FROM test_user_management.sales; 74 | 75 | 76 | SELECT f_sql_greater (10,20); 77 | 78 | /*--- Test 4: REVOKE execute function permission from user1 & GRANT execute function permission to group1. Note that the same mechanism works for stored procedures as well. ---*/ 79 | -- Load user_access_details4.csv into S3 bucket, delete user_access_details3.csv and execute SP (as superuser) 80 | -- Expected result: user1 should NOT be able to execute the function. user2 should be able to execute the function, since he is a member of group1 81 | 82 | SET SESSION AUTHORIZATION 'user1'; 83 | SELECT f_sql_greater (10,20); 84 | 85 | SET SESSION AUTHORIZATION 'user2'; 86 | SELECT f_sql_greater (10,20); 87 | 88 | /*--- Test 5: GRANT access to all tables in schema & grant access with RBAC ---*/ 89 | -- First create the role 90 | SET SESSION AUTHORIZATION '[superuser]'; 91 | CREATE ROLE role1; 92 | GRANT ROLE role1 to user1; 93 | 94 | -- Load user_access_details5.csv into S3 bucket, delete user_access_details4.csv and execute SP (as superuser) 95 | -- Expected result: user1 should be able to query the table and see all columns, since he is assigned role1. 
user2 should be able to query the table as well, since the group he is a member of is given access to all tables in the schema 96 | SET SESSION AUTHORIZATION 'user1'; 97 | SELECT * 98 | FROM test_user_management.sales; 99 | 100 | SET SESSION AUTHORIZATION 'user2'; 101 | SELECT * 102 | FROM test_user_management.sales; 103 | 104 | 105 | /*--- Test 6: RLS ---*/ 106 | -- First alter table and add a new column 107 | SET SESSION AUTHORIZATION '[superuser]'; 108 | alter table test_user_management.sales 109 | add column seller_name varchar(50) 110 | default 'user2'; 111 | 112 | UPDATE test_user_management.sales 113 | SET seller_name = 'user1' 114 | WHERE seller_id = 10 115 | 116 | SELECT * 117 | FROM test_user_management.sales; 118 | 119 | -- Then create RLS policy and attach it to the table 120 | SET SESSION AUTHORIZATION '[superuser]'; 121 | CREATE RLS POLICY only_self_sales WITH ( seller_name varchar(50) ) USING (seller_name = current_user); 122 | ALTER TABLE test_user_management.sales row level security ON; 123 | 124 | -- Load user_access_details6.csv into S3 bucket, delete user_access_details5.csv and execute SP (as superuser) 125 | -- Expected result: user1 should only see the 2 records related to his sales when querying the table 126 | 127 | SET SESSION AUTHORIZATION 'user1'; 128 | SELECT * 129 | FROM test_user_management.sales; 130 | 131 | -------------------------------------------------------------------------------- /stored-procedures/sp_controlled_access/sp_controlled_access.sql: -------------------------------------------------------------------------------- 1 | /********************************************************************************************** 2 | Purpose: Provide controlled access to data without granting permission on the table/view 3 | Notes: 4 | Must set the SECURITY attribute set as DEFINER 5 | An unprivileged user can run the procedure when granted the EXECUTE permission 6 | 7 | Parameters: 8 | select_cols : Text list of columns to be added to the SELECT clause 9 | query_from : Schema qualified name of table/view to be queried 10 | where_clause: Text of predicate criteria to be added to the WHERE clause 11 | max_level : Maximum number of rows to be returned 12 | result_set : Name of cursor used for output 13 | 14 | Requirements: 15 | Must have a table containing user authorizations with the following DDL: 16 | CREATE TABLE $(auth_table) 17 | ( query_from VARCHAR, 18 | , user_name VARCHAR, 19 | , valid_until TIMESTAMP, 20 | , max_rows INT ); 21 | 22 | History: 23 | 2019-11-09 - joeharris76 - Created 24 | **********************************************************************************************/ 25 | 26 | -- DROP PROCEDURE sp_controlled_access(VARCHAR,VARCHAR,VARCHAR,INTEGER,REFCURSOR); 27 | CREATE OR REPLACE PROCEDURE sp_controlled_access ( 28 | select_cols IN VARCHAR(256) 29 | , query_from IN VARCHAR(128) 30 | , where_clause IN VARCHAR(256) 31 | , max_rows IN INTEGER 32 | , result_set INOUT REFCURSOR ) 33 | AS $$ 34 | DECLARE 35 | user_name VARCHAR(128); 36 | sql VARCHAR(MAX) := ''; 37 | auth_table VARCHAR(256) := 'hr.access_authority'; 38 | rows_limit INTEGER := 0; 39 | BEGIN 40 | user_name = session_user; 41 | -- Check the user is authorized for this query 42 | sql := 'SELECT NVL(max_rows,1000) rows_limit FROM '||auth_table|| 43 | ' WHERE query_from = '''||query_from||''' '|| 44 | ' AND user_name = '''||user_name||''' '|| 45 | ' AND (valid_until IS NULL OR valid_until >= '''||SYSDATE||''') ;'; 46 | EXECUTE sql INTO rows_limit; 47 | IF NVL(rows_limit,0) = 0 
THEN 48 | RAISE EXCEPTION 'ERROR: Query Is Not Authorized'; 49 | ELSE 50 | -- Compose the user query 51 | IF select_cols <> '' THEN 52 | sql := 'SELECT '||select_cols; 53 | ELSE 54 | sql := 'SELECT * '; 55 | END IF; 56 | sql := sql||' FROM '||query_from; 57 | IF where_clause <> '' THEN 58 | sql := sql||' WHERE '||where_clause||' '; 59 | END IF; 60 | IF rows_limit > max_rows THEN 61 | rows_limit := max_rows; 62 | END IF; 63 | sql := sql||' LIMIT '||rows_limit||' ;'; 64 | -- Open the cursor and execute the SQL 65 | RAISE INFO 'SQL: %', sql; 66 | OPEN result_set FOR EXECUTE sql; 67 | RAISE INFO 'AUTHORIZED: Query on `%` completed',query_from; 68 | END IF; 69 | END; 70 | $$ LANGUAGE plpgsql 71 | SECURITY DEFINER; 72 | 73 | 74 | /* Usage Example: 75 | 76 | SELECT current_user; 77 | CREATE SCHEMA hr; 78 | CREATE TABLE hr.employee ( title VARCHAR, emp INT, mgr INT, dept VARCHAR); 79 | INSERT INTO hr.employee VALUES 80 | ('Chairman', 100, NULL, 'Board' ) 81 | , ('CEO' , 101, 100 , 'Board' ) 82 | , ('CTO' , 102, 101 , 'IT' ) 83 | , ('CMO' , 103, 101 , 'Sales/Mkt'); 84 | CREATE USER user_no_priv WITH PASSWORD DISABLE; 85 | CREATE TABLE hr.access_authority ( query_from VARCHAR, user_name VARCHAR, valid_until TIMESTAMP, max_rows INT ); 86 | INSERT INTO hr.access_authority VALUES ('hr.employee','user_no_priv','2019-12-31',99); 87 | GRANT ALL ON SCHEMA public TO user_no_priv; 88 | GRANT EXECUTE ON PROCEDURE sp_controlled_access(VARCHAR,VARCHAR,VARCHAR,INTEGER,REFCURSOR) TO user_no_priv; 89 | 90 | -- Change session to the new user 91 | SET SESSION AUTHORIZATION user_no_priv; 92 | SELECT current_user; -- user_no_priv 93 | SELECT * FROM hr.employee; -- ERROR: permission denied 94 | BEGIN; -- Call the stored procedure 95 | CALL sp_controlled_access ( 96 | 'title,emp,mgr,dept'::VARCHAR -- select_cols 97 | , 'hr.employee'::VARCHAR -- query_from 98 | , 'dept = ''Board'''::VARCHAR -- where_clause 99 | , 10 -- max_rows 100 | , 'employee_data' ); -- result_set 101 | -- INFO: AUTHORIZED: Query on `hr.employee` completed 102 | -- result_set 103 | -- --------------- 104 | -- employee_data 105 | FETCH ALL FROM employee_data; 106 | -- title | emp | mgr | dept 107 | -- ----------+-----+-----+------- 108 | -- Chairman | 100 | | Board 109 | -- CEO | 101 | 100 | Board 110 | END; 111 | 112 | -- RESET SESSION AUTHORIZATION; 113 | -- DROP TABLE hr.employee; 114 | -- DROP TABLE hr.access_authority; 115 | -- DROP SCHEMA hr; 116 | -- REVOKE EXECUTE ON PROCEDURE sp_controlled_access(VARCHAR,VARCHAR,VARCHAR,INTEGER,REFCURSOR) FROM user_no_priv; 117 | -- REVOKE ALL ON SCHEMA public FROM user_no_priv; 118 | -- DROP USER user_no_priv; 119 | 120 | */ 121 | -------------------------------------------------------------------------------- /lambda-udfs/f_glue_schema_registry_avro_to_json(varchar,varchar,varchar)/lambda.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Parameters: 3 | S3Bucket: 4 | Description: the S3 Bucket containing the Lambda layer 5 | Type: String 6 | S3Key: 7 | Description: the S3 Key containing the Lambda layer 8 | Type: String 9 | Conditions: 10 | NoS3Prefix: !Equals [!Ref S3Key, ''] 11 | Resources: 12 | LambdaRole: 13 | Type: AWS::IAM::Role 14 | Properties: 15 | AssumeRolePolicyDocument: 16 | Version: 2012-10-17 17 | Statement: 18 | - 19 | Effect: Allow 20 | Principal: 21 | Service: 22 | - lambda.amazonaws.com 23 | Action: 24 | - sts:AssumeRole 25 | Path: / 26 | ManagedPolicyArns: 27 | - !Sub 
"arn:${AWS::Partition}:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole" 28 | Policies: 29 | - 30 | PolicyName: CloudwatchLogs 31 | PolicyDocument: 32 | Version: 2012-10-17 33 | Statement: 34 | - 35 | Effect: Allow 36 | Action: 37 | - logs:CreateLogGroup 38 | Resource: 39 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 40 | - 41 | Effect: Allow 42 | Action: 43 | - logs:CreateLogStream 44 | - logs:PutLogEvents 45 | Resource: 46 | - !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 47 | - 48 | Effect: Allow 49 | Action: 50 | - glue:GetSchemaVersion 51 | Resource: 52 | - !Sub "arn:${AWS::Partition}:glue:${AWS::Region}:${AWS::AccountId}:registry/*" 53 | - !Sub "arn:${AWS::Partition}:glue:${AWS::Region}:${AWS::AccountId}:schema/*/*" 54 | AvroLayer: 55 | Type: AWS::Lambda::LayerVersion 56 | Properties: 57 | CompatibleRuntimes: 58 | - python3.9 59 | Content: 60 | S3Bucket: !Ref S3Bucket 61 | S3Key: !If ["NoS3Prefix", "avro_1.11.2.zip", !Sub "${S3Key}/avro_1.11.2.zip"] 62 | Description: Python 3.9 Avro library 63 | LayerName: avro 64 | LambdaUDFFunction: 65 | Type: "AWS::Lambda::Function" 66 | Properties: 67 | FunctionName: f-glue-schema-registry-avro-to-json-varchar-varchar-varchar 68 | Role: !GetAtt 'LambdaRole.Arn' 69 | Handler: index.handler 70 | Runtime: python3.9 71 | Timeout: 300 72 | Layers: 73 | - !Ref AvroLayer 74 | Code: 75 | ZipFile: | 76 | import functools 77 | import io 78 | import json 79 | import os 80 | 81 | import avro.io 82 | import avro.schema 83 | import boto3 84 | 85 | glue = boto3.client("glue") 86 | 87 | 88 | @functools.lru_cache(maxsize=32) 89 | def _get_schema(registry_name: str, stream_name: str) -> avro.schema.Schema: 90 | """Get an Avro schema from the registry by stream name. 91 | 92 | :param stream_name: The stream name for the schema to request 93 | :returns: The Avro schema object 94 | """ 95 | schema_resp = glue.get_schema_version( 96 | SchemaId={"RegistryName": registry_name, "SchemaName": stream_name}, 97 | SchemaVersionNumber={"LatestVersion": True}, 98 | ) 99 | schema = schema_resp["SchemaDefinition"] 100 | return avro.schema.parse(schema) 101 | 102 | 103 | def _avro_to_json(registry_name: str, stream_name: str, data: str) -> str: 104 | """Decode a single Hex-encoded Avro datum using the schema associated with the stream name. 105 | 106 | :param registry_name: The Glue Schema Registry name for the schema 107 | :param stream_name: The stream name for the data 108 | :param data: The Hex-encoded Avro binary data 109 | :returns: A JSON encoded version of the data 110 | """ 111 | schema = _get_schema(registry_name, stream_name) 112 | data_bytes = io.BytesIO(bytes.fromhex(data)) 113 | decoder = avro.io.BinaryDecoder(data_bytes) 114 | reader = avro.io.DatumReader(schema) 115 | decoded = reader.read(decoder) 116 | return json.dumps(decoded) 117 | 118 | 119 | def handler(event, context): 120 | try: 121 | results = [] 122 | for registry_name, stream_name, data in event["arguments"]: 123 | results.append(_avro_to_json(registry_name, stream_name, data)) 124 | return json.dumps( 125 | { 126 | "success": True, 127 | "num_records": event["num_records"], 128 | "results": results, 129 | } 130 | ) 131 | except Exception as e: 132 | return json.dumps( 133 | { 134 | "success": False, 135 | "error_msg": f"Error processing Lambda event. Error: {e}. 
Event: {event}", 136 | } 137 | ) 138 | -------------------------------------------------------------------------------- /stored-procedures/sp_kmeans/sp_kmeans.sql: -------------------------------------------------------------------------------- 1 | /********************************************************************************************** 2 | Purpose: Generate K-Means clusters from vector embeddings. 3 | Notes: 4 | This procedure is used to support vector search capabilities by creating K-Means clusters 5 | generated and loaded into Redshift from embeddings; typically created by foundational models. 6 | The ouput of this procedure will be the _kmeans table containing the cluster & centroid 7 | and a _kmeans_clusters table contain the member records of the cluster. 8 | See the following article for more info: 9 | https://repost.aws/articles/ARPoweQIN2ROOXZiJAtSQvkQ/vector-search-with-amazon-redshift 10 | 11 | Parameters: 12 | tablename : The table which was the source of the data which contains the batch embeddings. 13 | clusters : The number of K-Means clusters to create 14 | 15 | Requirements: 16 | expects a table with the following tables to exist and be populated: 17 | CREATE TABLE $(tablename)_embeddings 18 | ( recordid VARCHAR, 19 | modeloutput SUPER ); 20 | 21 | History: 22 | 2024-07-19 - rjvgupta - Created 23 | **********************************************************************************************/ 24 | 25 | CREATE OR REPLACE PROCEDURE sp_kmeans (tablename IN varchar, clusters IN int) AS $$ 26 | DECLARE 27 | cluster_size int; 28 | cluster int := 1; 29 | similarity float; 30 | i int; 31 | BEGIN 32 | --will error if table doesn't exist 33 | EXECUTE 'select * from '||tablename||'_embeddings limit 1'; 34 | 35 | EXECUTE 'SELECT CEIL(count(1)/'||clusters ||') 36 | from ' || tablename||'_embeddings' INTO cluster_size; 37 | 38 | -- create kmeans tables and choose random starting centroids 39 | EXECUTE 'CREATE TABLE IF NOT EXISTS ' || tablename || '_kmeans ( 40 | cluster int, centroid SUPER, startts timestamp, endts timestamp, 41 | interations int) DISTSTYLE ALL'; 42 | EXECUTE 'TRUNCATE TABLE ' || tablename || '_kmeans'; 43 | 44 | EXECUTE 'CREATE TABLE IF NOT EXISTS ' || tablename || '_kmeans_clusters ( 45 | cluster int, recordid varchar(15), similarity float, rnk int) 46 | DISTKEY(recordid)'; 47 | EXECUTE 'TRUNCATE TABLE ' || tablename || '_kmeans_clusters'; 48 | 49 | WHILE cluster <= clusters LOOP 50 | --choose a random starting centroid from the remaining embeddings 51 | EXECUTE 'INSERT INTO ' || tablename || '_kmeans 52 | SELECT '||cluster||', modeloutput.embedding, 53 | CURRENT_TIMESTAMP, NULL, NULL 54 | FROM ' || tablename || '_embeddings 55 | WHERE modeloutput.embedding is not null 56 | AND recordid not in ( 57 | select recordid from ' || tablename || '_kmeans_clusters) LIMIT 1'; 58 | COMMIT; 59 | i := 1; 60 | similarity := 0; 61 | WHILE similarity < .999 LOOP 62 | --get embeddings closest to centroid 63 | EXECUTE 'DELETE FROM ' || tablename || '_kmeans_clusters 64 | where cluster = '||cluster; 65 | EXECUTE 'INSERT INTO ' || tablename || '_kmeans_clusters 66 | select * from (select *, rank() over (partition by k.cluster order by k.similarity desc) rnk from ( 67 | select cluster, e.recordid, sum(kv::float*ev::float)/SQRT(sum(kv::float*kv::float)*sum(ev::float*ev::float)) similarity 68 | from ' || tablename || '_kmeans k, k.centroid kv at kvi, 69 | ' || tablename || '_embeddings e, e.modeloutput.embedding ev at evi 70 | where kvi = evi and k.cluster = '||cluster||' 71 | AND 
e.recordid not in ( 72 | select recordid from ' || tablename || '_kmeans_clusters) 73 | group by 1,2) k 74 | ) r where r.rnk <= ' || cluster_size; 75 | COMMIT; 76 | -- determine new center 77 | EXECUTE 'DROP TABLE IF EXISTS #centroid'; 78 | EXECUTE 'CREATE TABLE #centroid as 79 | SELECT JSON_PARSE(''['' || listagg(po::varchar, '','') within group (order by poi) || '']'') centroid 80 | FROM ( 81 | select poi, avg(po::float) po 82 | from ' || tablename || '_kmeans_clusters as nn, ' || tablename || '_embeddings re, re.modeloutput.embedding as po at poi 83 | where nn.recordid = re.recordid and nn.cluster = ' || cluster || ' 84 | group by poi) as c'; 85 | COMMIT; 86 | -- determine distance from new center to old center 87 | EXECUTE 'SELECT sum(kv::float*mv::float)/SQRT(sum(kv::float*kv::float)*sum(mv::float*mv::float)) 88 | from #centroid k, k.centroid kv at kvi, 89 | ' || tablename || '_kmeans m, m.centroid mv at mvi 90 | where m.cluster = '|| cluster ||' and kvi = mvi' INTO similarity; 91 | COMMIT; 92 | EXECUTE 'UPDATE ' || tablename || '_kmeans SET centroid = (select centroid from #centroid), endts = CURRENT_TIMESTAMP, interations = '|| i ||' where cluster = ' || cluster; 93 | COMMIT; 94 | i := i+1; 95 | COMMIT; 96 | END LOOP; 97 | cluster := cluster+1; 98 | END LOOP; 99 | END 100 | $$ LANGUAGE plpgsql; 101 | 102 | /* Usage Example: 103 | call sp_kmeans('reviews', 10) 104 | */ 105 | -------------------------------------------------------------------------------- /lambda-udfs/f_mysql_lookup_nodejs(varchar,varchar,varchar,varchar)/resources.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: '2010-09-09' 2 | Resources: 3 | MySQLDatabase: 4 | Type: AWS::RDS::DBCluster 5 | Properties: 6 | DatabaseName: lookupnodejs 7 | EngineMode: serverless 8 | Engine: aurora-mysql 9 | EnableHttpEndpoint: TRUE 10 | MasterUserPassword: mysqlpassword 11 | MasterUsername: mysqluser 12 | ScalingConfiguration: 13 | MaxCapacity : 1 14 | MinCapacity : 1 15 | MySQLSecret: 16 | Type: AWS::SecretsManager::Secret 17 | Properties: 18 | Name: lookupnodejs 19 | Description: f-mysql-lookup-nodejs mysql secret 20 | SecretString: !Sub 21 | - '{ "username": "mysqluser", "password": "mysqlpassword", "host": "${Host}" }' 22 | - { Host: !GetAtt [ MySQLDatabase, Endpoint.Address ] } 23 | InitRole: 24 | Type: AWS::IAM::Role 25 | Properties: 26 | AssumeRolePolicyDocument: 27 | Version: 2012-10-17 28 | Statement: 29 | - 30 | Effect: Allow 31 | Principal: 32 | Service: 33 | - lambda.amazonaws.com 34 | Action: 35 | - sts:AssumeRole 36 | Path: / 37 | Policies: 38 | - 39 | PolicyName: CloudwatchLogs 40 | PolicyDocument: 41 | Version: 2012-10-17 42 | Statement: 43 | - 44 | Effect: Allow 45 | Action: logs:CreateLogGroup 46 | Resource: !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:*" 47 | - 48 | Effect: Allow 49 | Action: 50 | - logs:CreateLogStream 51 | - logs:PutLogEvents 52 | Resource: !Sub "arn:${AWS::Partition}:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 53 | - 54 | Effect: Allow 55 | Action: rds-data:ExecuteStatement 56 | Resource: !Sub "arn:${AWS::Partition}:rds:${AWS::Region}:${AWS::AccountId}:cluster:*" 57 | - 58 | Effect: Allow 59 | Action: secretsmanager:GetSecretValue 60 | Resource: !Sub "arn:${AWS::Partition}:secretsmanager:${AWS::Region}:${AWS::AccountId}:secret:*" 61 | InitFunction: 62 | Type: AWS::Lambda::Function 63 | Properties: 64 | Role: !GetAtt 'InitRole.Arn' 65 | Handler: index.handler 66 | Runtime: python3.9 67 | 
Timeout: 30 68 | Code: 69 | ZipFile: | 70 | import boto3, json, urllib3 71 | 72 | http = urllib3.PoolManager() 73 | rdsdata = boto3.client('rds-data') 74 | 75 | def send(event, context, responseStatus, responseData): 76 | responseUrl = event['ResponseURL'] 77 | responseBody = { 78 | 'Status' : responseStatus, 79 | 'Reason' : "See the details in CloudWatch Log Stream: {}".format(context.log_stream_name), 80 | 'PhysicalResourceId' : context.log_stream_name, 81 | 'StackId' : event['StackId'], 82 | 'RequestId' : event['RequestId'], 83 | 'LogicalResourceId' : event['LogicalResourceId'], 84 | 'NoEcho' : False, 85 | 'Data' : responseData 86 | } 87 | json_responseBody = json.dumps(responseBody) 88 | headers = { 89 | 'content-type' : '', 90 | 'content-length' : str(len(json_responseBody)) 91 | } 92 | try: 93 | response = http.request('PUT', responseUrl, headers=headers, body=json_responseBody) 94 | except Exception as e: 95 | print("send(..) failed executing http.request(..):", e) 96 | 97 | 98 | def handler(event, context): 99 | print("Received event: " + json.dumps(event, indent=2)) 100 | 101 | if event['RequestType'] == 'Delete': 102 | send(event, context, 'SUCCESS', {'Data': 'Delete complete'}) 103 | else: 104 | try: 105 | secretArn = event['ResourceProperties']['SecretArn'] 106 | resourceArn = event['ResourceProperties']['ResourceArn'] 107 | sql="drop table if exists customers;" 108 | rdsdata.execute_statement(database='lookupnodejs', sql=sql, resourceArn=resourceArn, secretArn=secretArn); 109 | sql="create table customers(id varchar(10), fname varchar(100), lname varchar(100));" 110 | rdsdata.execute_statement(database='lookupnodejs', sql=sql, resourceArn=resourceArn, secretArn=secretArn); 111 | sql="insert into customers values ('1','John','Doe'),('2','Jane','Doe'),('3','Carlos','Salazar'),('4','Diego','Ramirez'),('5','Mary','Major'),('6','Richard','Roe');" 112 | rdsdata.execute_statement(database='lookupnodejs', sql=sql, resourceArn=resourceArn, secretArn=secretArn); 113 | send(event, context, 'SUCCESS', {'Data': 'Insert complete'}) 114 | except Exception as e: 115 | print(e) 116 | send(event, context, 'FAILED', {'Data': 'Insert failed'}) 117 | 118 | InitializeMySQLDB: 119 | Type: Custom::InitFunction 120 | DeletionPolicy: Retain 121 | DependsOn: MySQLDatabase 122 | Properties: 123 | ServiceToken: 124 | Fn::GetAtt: [ InitFunction , "Arn" ] 125 | SecretArn: !Ref MySQLSecret 126 | ResourceArn: !Sub 127 | - 'arn:${AWS::Partition}:rds:${AWS::Region}:${AWS::AccountId}:cluster:${db}' 128 | - { db: !Ref MySQLDatabase} 129 | -------------------------------------------------------------------------------- /stored-procedures/sp_analyze_minimal/sp_analyze_minimal.sql: -------------------------------------------------------------------------------- 1 | /********************************************************************************************** 2 | Purpose: Analyze **ONE** column of a table. 
To be used on a staging table right after loading 3 | 4 | Notes: It will analyze the first column of the SK, the DK or the first column of the table 5 | 6 | Parameters: 7 | schema_name: Schema 8 | table_name: Table 9 | analyze_percent: Percent Threshold for analyze 10 | wait_for_lock : Wait for table locks 11 | 12 | Usage: 13 | CALL sp_analyze_minimal('public','mytable'); 14 | CALL sp_analyze_minimal('public','mytable', 1, True); 15 | 16 | History: 17 | 2019-06-12 ericfe Created as "MinAnalyze" 18 | 2019-11-12 joeharris76 Renamed to "sp_analyze_minimal" and revised to standard style and format 19 | **********************************************************************************************/ 20 | 21 | -- DROP PROCEDURE sp_analyze_minimal(VARCHAR, VARCHAR, INT, BOOLEAN); 22 | CREATE OR REPLACE PROCEDURE sp_analyze_minimal ( 23 | schema_name VARCHAR 24 | , table_name VARCHAR 25 | , analyze_percent INT 26 | , wait_for_lock BOOLEAN ) 27 | AS $$ 28 | DECLARE 29 | schema_n VARCHAR; 30 | anlyz_pct INT; 31 | tbl_locks RECORD; 32 | anlyz_set VARCHAR(MAX); 33 | anlyz_sql VARCHAR(MAX); 34 | BEGIN 35 | -- Default to public schema 36 | schema_n := NVL(schema_name,'public'); 37 | -- Default to 1 percent 38 | anlyz_pct := NVL(analyze_percent,1); 39 | -- Generagte ANALYZE SQL 40 | anlyz_set := 'SET ANALYZE_THRESHOLD_PERCENT TO '||anlyz_pct::varchar||'; '; 41 | SELECT INTO anlyz_sql 'ANALYZE '||n.nspname||'.'||c.relname||' ('||NVL(NVL(srtk.attname,dstk.attname),cols.attname)||');' AS sql 42 | FROM pg_namespace n 43 | JOIN pg_class c ON n.oid = c.relnamespace 44 | JOIN pg_attribute cols ON cols.attrelid = c.oid AND cols.attnum = 1 45 | LEFT JOIN pg_attribute srtk ON srtk.attrelid = c.oid AND srtk.attsortkeyord = 1 46 | LEFT JOIN pg_attribute dstk ON dstk.attrelid = c.oid AND dstk.attisdistkey = 't' 47 | WHERE c.relname = LOWER(table_name)::CHAR(128) 48 | AND n.nspname = LOWER(schema_n)::CHAR(128) 49 | AND c.relkind = 'r'; 50 | IF FOUND then 51 | -- BODY 52 | SELECT INTO tbl_locks svv.xid, l.pid, svv.txn_owner username, l.mode, l.granted 53 | FROM pg_locks l 54 | JOIN svv_transactions svv ON l.pid = svv.pid 55 | AND l.relation = svv.relation 56 | AND svv.lockable_object_type IS NOT NULL 57 | LEFT JOIN pg_class c ON c.oid = svv.relation 58 | LEFT JOIN pg_namespace n ON n.oid = c.relnamespace 59 | LEFT JOIN pg_database d ON d.oid = l.database 60 | LEFT JOIN stv_recents rct ON rct.pid = l.pid 61 | WHERE l.pid <> PG_BACKEND_PID() 62 | AND l.granted = true 63 | AND n.nspname = schema_name::CHAR(128) 64 | AND c.relname = table_name::CHAR(128); 65 | IF FOUND THEN 66 | IF wait_for_lock THEN 67 | RAISE NOTICE 'User % has table locked in % mode. ANALYZE will wait until the lock is released.',tbl_locks.username,tbl_locks.mode; 68 | RAISE NOTICE 'Use ''SELECT PG_TERMINATE_BACKEND(%);'' on the session holding the lock if needed.',tbl_locks.pid; 69 | EXECUTE anlyz_set; EXECUTE anlyz_sql; 70 | ELSE 71 | RAISE NOTICE 'User % has table locked in % mode. 
Try again to see if it has been released.',tbl_locks.username,tbl_locks.mode; 72 | RAISE EXCEPTION 'Use ''SELECT PG_TERMINATE_BACKEND(%);'' to kill session holding the lock if needed.', tbl_locks.pid; 73 | END IF; 74 | ELSE 75 | RAISE INFO 'Running: %', anlyz_sql; 76 | EXECUTE anlyz_set; EXECUTE anlyz_sql; 77 | END IF; 78 | ELSE 79 | RAISE EXCEPTION 'No table found'; 80 | END IF; 81 | END; 82 | $$ LANGUAGE plpgsql; 83 | 84 | CREATE OR REPLACE PROCEDURE sp_analyze_minimal ( 85 | schema_name VARCHAR 86 | , table_name VARCHAR ) 87 | AS $$ 88 | BEGIN 89 | -- Will wait by default 90 | CALL sp_analyze_minimal(schema_name, table_name, 0, TRUE); 91 | END; 92 | $$ LANGUAGE plpgsql; 93 | 94 | 95 | /* Usage Example: 96 | 97 | DROP TABLE IF EXISTS public.tmp_analyze; 98 | CREATE TABLE public.tmp_analyze ( 99 | pkey_col INTEGER PRIMARY KEY 100 | , second_col INTEGER); 101 | INSERT INTO public.tmp_analyze VALUES 102 | (100,7001),(101,20225),(102,22772),(103,4577); 103 | -- Call the stored proc 104 | CALL sp_analyze_minimal('public','tmp_analyze', 0, TRUE); 105 | -- INFO: Running: ANALYZE public.tmp_analyze (sortkey_col); 106 | 107 | DROP TABLE IF EXISTS public.tmp_analyze; 108 | CREATE TABLE public.tmp_analyze ( 109 | distkey_col INTEGER DISTKEY 110 | , second_col INTEGER); 111 | INSERT INTO public.tmp_analyze VALUES 112 | (100,7001),(101,20225),(102,22772),(103,4577); 113 | -- Call the stored proc 114 | CALL sp_analyze_minimal('public','tmp_analyze'); 115 | -- INFO: Running: ANALYZE public.tmp_analyze (sortkey_col); 116 | 117 | DROP TABLE IF EXISTS public.tmp_analyze; 118 | CREATE TABLE public.tmp_analyze ( 119 | sortkey_col INTEGER SORTKEY 120 | , second_col INTEGER); 121 | INSERT INTO public.tmp_analyze VALUES 122 | (100,7001),(101,20225),(102,22772),(103,4577); 123 | -- Call the stored proc 124 | CALL sp_analyze_minimal('public','tmp_analyze', 1, True); 125 | -- INFO: Running: ANALYZE public.tmp_analyze (sortkey_col); 126 | 127 | DROP TABLE IF EXISTS public.tmp_analyze; 128 | 129 | */ 130 | -------------------------------------------------------------------------------- /stored-procedures/sp_pivot_for/sp_pivot_for.sql: -------------------------------------------------------------------------------- 1 | /********************************************************************************************** 2 | Purpose: Transpose row values into columns 3 | 4 | Notes: Emulates result of PIVOT aggregate() FOR syntax 5 | 6 | Syntax: 7 | SELECT [ $(select_cols) , ] $(generated PIVOT columns) 8 | FROM $(table_name) 9 | PIVOT ( $(agg_func) ( $(metric_col) ) 10 | FOR $(pivot_src) IN ( SELECT DISTINCT $(pivot_src) FROM $(table_name) ) ) pvt 11 | [ WHERE $(where_clause) ] 12 | [ GROUP BY $(select_cols) ] ;' 13 | 14 | Parameters: 15 | select_cols : Text list of columns to be added to the SELECT clause 16 | table_name : Schema qualified name of table to be queried 17 | pivot_src : Name of the column whose value will be pivoted 18 | agg_func : Name of the aggregate function to apply 19 | metric_col : Name of the column to be aggregated 20 | where_clause: Text of predicate criteria to be added to the WHERE clause 21 | result_set : Name of cursor used for output 22 | 23 | History: 24 | 2019-05-25 - joeharris76 - Created 25 | 2019-11-04 - joeharris76 - Revised for publication 26 | **********************************************************************************************/ 27 | 28 | -- DROP PROCEDURE sp_pivot(VARCHAR,VARCHAR,VARCHAR,VARCHAR,VARCHAR,VARCHAR,VARCHAR,REFCURSOR); 29 | CREATE OR REPLACE PROCEDURE sp_pivot_for 
( 30 | select_cols IN VARCHAR(512) 31 | , table_name IN VARCHAR(256) 32 | , pivot_src IN VARCHAR(128) 33 | , agg_func IN VARCHAR(32) 34 | , metric_col IN VARCHAR(128) 35 | , where_clause IN VARCHAR(512) 36 | , result_set INOUT REFCURSOR ) 37 | AS $$ 38 | DECLARE 39 | pivot_col RECORD; 40 | final_sql VARCHAR(MAX) := ''; 41 | pivot_sql VARCHAR(MAX) := ''; 42 | val_cnt INTEGER := 0; 43 | col_cnt SMALLINT := 0; 44 | BEGIN 45 | -- Check the number of values being pivoted. Placeholders used for NULL and zero length values. 46 | pivot_sql := 'SELECT COUNT(DISTINCT CASE WHEN '||quote_ident(pivot_src)||' IS NULL THEN '''''|| 47 | ' WHEN '||quote_ident(pivot_src)||' = '''' THEN '''''|| 48 | ' ELSE '||quote_ident(pivot_src)||' END ) AS val_cnt '|| 49 | 'FROM '||table_name||' ;'; 50 | EXECUTE pivot_sql INTO val_cnt; 51 | IF val_cnt > 256 THEN -- Limit columns to prevent excessive width 52 | RAISE EXCEPTION 'Too many values to pivot. Found % but limit is 256.', val_cnt; 53 | END IF; 54 | IF select_cols <> '' THEN 55 | final_sql := 'SELECT '||select_cols||', '; 56 | ELSE 57 | final_sql := 'SELECT '; 58 | END IF; 59 | -- Query to get the row values being pivoted 60 | pivot_sql := 'SELECT DISTINCT CASE WHEN '||quote_ident(pivot_src)||' IS NULL THEN '''''|| 61 | ' WHEN '||quote_ident(pivot_src)||' = '''' THEN '''''|| 62 | ' ELSE '||quote_ident(pivot_src)||' END AS col_n '|| 63 | 'FROM '||table_name||CASE WHEN where_clause <> '' THEN ' WHERE '||where_clause ELSE '' END||' ORDER BY 1;'; 64 | -- Iterate over the row values 65 | FOR pivot_col IN EXECUTE pivot_sql 66 | LOOP 67 | col_cnt := col_cnt + 1; 68 | IF col_cnt > 1 THEN 69 | final_sql := final_sql||', '; 70 | END IF; 71 | -- Add the CASE statement for each column to the SQL 72 | IF pivot_col.col_n = '' THEN 73 | final_sql := final_sql||agg_func||'(CASE WHEN '||quote_ident(pivot_src)||' IS NULL THEN '||quote_ident(metric_col) 74 | ||' ELSE NULL END) AS '||quote_ident(pivot_col.col_n); 75 | ELSIF pivot_col.col_n = '' THEN 76 | final_sql := final_sql||agg_func||'(CASE WHEN '||quote_ident(pivot_src)||' = '''' THEN '||quote_ident(metric_col) 77 | ||' ELSE NULL END) AS '||quote_ident(pivot_col.col_n); 78 | ELSE 79 | final_sql := final_sql||agg_func||'(CASE WHEN '||quote_ident(pivot_src)||' = '||quote_literal(pivot_col.col_n) 80 | ||' THEN '||quote_ident(metric_col)||' ELSE NULL END) AS '||quote_ident(pivot_col.col_n); 81 | END IF; 82 | END LOOP; 83 | final_sql := final_sql||' FROM '||table_name; 84 | IF where_clause <> '' THEN 85 | final_sql := final_sql||' WHERE '||where_clause||' '; 86 | END IF; 87 | IF select_cols <> '' THEN 88 | final_sql := final_sql||' GROUP BY '||select_cols||' ;'; 89 | ELSE 90 | final_sql := final_sql||' ;'; 91 | END IF; 92 | -- Open the cursor and execute the SQL 93 | OPEN result_set FOR EXECUTE final_sql; 94 | END; 95 | $$ LANGUAGE plpgsql; 96 | 97 | 98 | /* Usage Example: 99 | 100 | BEGIN; 101 | 102 | DROP TABLE IF EXISTS tmp_state_metrics; 103 | CREATE TEMP TABLE tmp_state_metrics ( country VARCHAR, state VARCHAR, entries INTEGER); 104 | INSERT INTO tmp_state_metrics VALUES 105 | ('USA', NULL, 15177), ('USA', NULL, 1) 106 | , ('USA', 'AK', 7001), ('USA', 'AK', 1) 107 | , ('USA', 'AL', 20225), ('USA', 'AL', 1) 108 | , ('USA', 'AR', 22772), ('USA', 'AR', 1) 109 | , ('USA', 'AZ', 4577), ('USA', 'AZ', 1) 110 | , ('USA', 'CA', 17877), ('USA', 'CA', 1) 111 | , ('USA', 'CO', 19503), ('USA', 'CO', 1); 112 | 113 | CALL sp_pivot_for ('country','tmp_state_metrics','state','SUM','entries','','pivot_result'); 114 | FETCH ALL FROM pivot_result; 
CLOSE pivot_result; 115 | -- country | | ak | al | ar | az | ca | co 116 | -- USA | 15178 | 7002 | 20226 | 22773 | 4578 | 17878 | 19504 117 | 118 | CALL sp_pivot_for ('country','tmp_state_metrics','state','SUM','entries','state IS NOT NULL','pivot_result'); 119 | FETCH ALL FROM pivot_result; 120 | -- country | ak | al | ar | az | ca | co 121 | -- USA | 7002 | 20226 | 22773 | 4578 | 17878 | 19504 122 | 123 | END; 124 | */ 125 | -------------------------------------------------------------------------------- /stored-procedures/sp_connect_by_prior/sp_connect_by_prior.sql: -------------------------------------------------------------------------------- 1 | /********************************************************************************************** 2 | Purpose: Calculate levels in a nested hierarchy tree 3 | 4 | Notes: Emulates result of CONNECT BY PRIOR … START WITH syntax 5 | 6 | Syntax: 7 | SELECT $(select_cols) 8 | , LEVEL 9 | , $(child_col) , $(parent_col) 10 | FROM $(table_name) 11 | CONNECT BY PRIOR $(child_col) = $(parent_col) 12 | START WITH $(start_with) 13 | WHERE LEVEL <= $(max_level) 14 | AND $(where_clause) ; 15 | Parameters: 16 | select_cols : Text list of columns to be added to the SELECT clause 17 | table_name : Schema qualified name of table to be queried 18 | child_col : Child key column, e.g., employee_id 19 | parent_col : Parent key column, e.g., ,manager_id 20 | start_with : Criteria for the start of the hierarchy 21 | where_clause: Text of predicate criteria to be added to the WHERE clause 22 | max_level : Maximum hierarchy depth to be returned 23 | temp_tbl_n : Name of temp table to be used for output 24 | 25 | History: 26 | 2019-10-25 - joeharris76 - Created 27 | **********************************************************************************************/ 28 | 29 | -- DROP PROCEDURE sp_connect_by_prior(VARCHAR,VARCHAR,VARCHAR,VARCHAR,VARCHAR,VARCHAR,INTEGER,VARCHAR); 30 | CREATE OR REPLACE PROCEDURE sp_connect_by_prior ( 31 | select_cols IN VARCHAR(256) 32 | , table_name IN VARCHAR(128) 33 | , child_col IN VARCHAR(128) 34 | , parent_col IN VARCHAR(128) 35 | , start_with IN VARCHAR(128) 36 | , where_clause IN VARCHAR(256) 37 | , max_level IN INTEGER 38 | , temp_tbl_n IN VARCHAR(128) 39 | ) AS $$ 40 | DECLARE 41 | rows INTEGER; 42 | level INTEGER := 1; 43 | sql VARCHAR(MAX) := ''; 44 | BEGIN 45 | --Check that required parameters are not empty 46 | IF table_name='' OR parent_col='' OR child_col='' THEN 47 | RAISE EXCEPTION 'Parameters cannot be empty: schema_n, table_n, parent_col, child_col'; 48 | END IF; 49 | --Retrieve the starting point of the hierarchy 50 | sql := ' SELECT '||select_cols||', '||level||' AS level'|| 51 | ', '||quote_ident(child_col)||', '||quote_ident(parent_col)|| 52 | ' FROM '||table_name; 53 | --Start with parent_col IS NULL if start_with is empty 54 | IF start_with = '' THEN 55 | sql := sql||' WHERE '||quote_ident(parent_col)||' IS NULL '; 56 | ELSE 57 | sql := sql||' WHERE '||start_with; 58 | END IF; 59 | IF where_clause <> '' THEN 60 | sql := sql||' AND '||where_clause; 61 | END IF; 62 | --Create temp table to hold results 63 | -- /*DEBUG:*/ RAISE INFO 'SQL: %', sql; 64 | EXECUTE 'DROP TABLE IF EXISTS '||temp_tbl_n||';'; 65 | EXECUTE 'CREATE TEMPORARY TABLE '||temp_tbl_n||' AS '||sql||';'; 66 | --Print row count for the first level 67 | EXECUTE 'SELECT COUNT(*) FROM '||temp_tbl_n||' ;' INTO rows ; 68 | RAISE INFO 'Level %: Rows found = %', level,rows; 69 | -- Loop over the hierarchy until max_level is reached 70 | <> 71 | WHILE level < 
max_level LOOP 72 | level := (level + 1); 73 | --Join the temp table to the source to find next level 74 | sql := 'INSERT INTO '||quote_ident(temp_tbl_n)|| 75 | ' SELECT '||select_cols||', '||level||' AS level'|| 76 | ', '||quote_ident(child_col)||', '||quote_ident(parent_col)|| 77 | ' FROM '||table_name|| 78 | ' WHERE '||quote_ident(parent_col)|| 79 | ' IN (SELECT '||quote_ident(child_col)|| 80 | ' FROM '||quote_ident(temp_tbl_n)|| 81 | ' WHERE level = '||(level-1)||')'; 82 | IF where_clause <> '' THEN 83 | sql := sql||' AND '||where_clause; 84 | END IF; 85 | -- /*DEBUG:*/ RAISE INFO 'SQL: %', sql; 86 | EXECUTE sql||';'; 87 | --Print row count for the first level 88 | GET DIAGNOSTICS rows := ROW_COUNT; 89 | RAISE INFO 'Level %: Rows found = %', level, rows; 90 | IF rows = 0 THEN 91 | EXIT get_children; 92 | END IF; 93 | END LOOP; 94 | END 95 | $$ LANGUAGE plpgsql; 96 | 97 | /* Usage Example: 98 | 99 | CREATE TEMP TABLE tmp_employee ( emp_title VARCHAR, emp_id INT, mgr_id INT, dept VARCHAR); 100 | INSERT INTO tmp_employee VALUES 101 | ('Chairman' , 100, NULL, 'Board' ) 102 | , ('CEO' , 101, 100 , 'Board' ) 103 | , ('CTO' , 102, 101 , 'IT' ) 104 | , ('CMO' , 103, 101 , 'Sales/Mkt') 105 | , ('VP Analytics' , 104, 102 , 'IT' ) 106 | , ('VP Engineering' , 105, 102 , 'IT' ) 107 | , ('Sales Director' , 106, 103 , 'Sales/Mkt') 108 | , ('Sales Mgr West' , 107, 106 , 'Sales/Mkt') 109 | , ('Sales Mgr East' , 108, 106 , 'Sales/Mkt') 110 | , ('Sales Mgr South', 109, 106 , 'Sales/Mkt'); 111 | CALL sp_connect_by_prior( 112 | 'dept,emp_title','tmp_employee','emp_id','mgr_id','mgr_id IS NULL','',3,'tmp_result' 113 | ); 114 | SELECT * FROM connect_by_result ORDER BY level, mgr_id, emp_id; 115 | -- dept | emp_title | level | emp_id | mgr_id 116 | -- Board | Chairman | 1 | 100 | 117 | -- Board | CEO | 2 | 101 | 100 118 | -- IT | CTO | 3 | 102 | 101 119 | -- Sales/Mkt | CMO | 3 | 103 | 101 120 | CALL sp_connect_by_prior( 121 | 'dept,emp_title','tmp_employee','emp_id','mgr_id','emp_id = 101','dept=\'Sales/Mkt\'',3,'tmp_result' 122 | ); 123 | SELECT * FROM connect_by_result ORDER BY level, mgr_id, emp_id; 124 | -- dept | emp_title | level | emp_id | mgr_id 125 | -- Sales/Mkt | CMO | 1 | 103 | 101 126 | -- Sales/Mkt | Sales Director | 2 | 106 | 103 127 | -- Sales/Mkt | Sales Mgr West | 3 | 107 | 106 128 | -- Sales/Mkt | Sales Mgr East | 3 | 108 | 106 129 | -- Sales/Mkt | Sales Mgr South | 3 | 109 | 106 130 | */ 131 | 132 | -------------------------------------------------------------------------------- /stored-procedures/sp_check_foreign_key/sp_check_foreign_key.sql: -------------------------------------------------------------------------------- 1 | /********************************************************************************************** 2 | Purpose: Check the integrity of the FOREIGN KEY declared on a table and a column 3 | 4 | Notes: If the foreign key dosen't have a primary key, it will raise a info. 5 | 6 | Parameters: 7 | batch_time : Timestamp for this batch. 
Can be used to group multiple fixes 8 | check_table : Schema qualified name of table to be queried 9 | check_column : Name of the column we want to check the integrity, must be a column in check_table 10 | log_table : Schema qualified table where actions are to be logged 11 | DDL structure must be as follows: 12 | CREATE TABLE $(log_table) ( 13 | batch_time TIMESTAMP, 14 | check_table VARCHAR, 15 | check_column VARCHAR, 16 | check_time TIMESTAMP, 17 | check_status VARCHAR, 18 | error_count INT); 19 | History: 20 | 2023-04-20 - bgmello - Created 21 | **********************************************************************************************/ 22 | 23 | CREATE PROCEDURE sp_check_foreign_key(batch_time timestamp without time zone, check_table character varying, 24 | check_column character varying, log_table character varying) 25 | LANGUAGE plpgsql 26 | AS 27 | $$ 28 | DECLARE 29 | sql VARCHAR(MAX) := ''; 30 | record RECORD; 31 | mixed_case_count INT; 32 | pk_table VARCHAR(256); 33 | pk_column VARCHAR(256); 34 | inconsistency_count INTEGER := 0; 35 | dot_position INTEGER; 36 | BEGIN 37 | IF check_table = '' OR log_table = '' OR check_column = '' THEN 38 | RAISE EXCEPTION 'Parameters `check_table`, `log_table`, `check_column` cannot be empty.'; 39 | END IF; 40 | 41 | sql := 'SELECT COUNT(*) ' || 42 | 'FROM information_schema.columns ' || 43 | 'WHERE table_name = ''' || LOWER(check_table) || ''' AND column_name != ''' || LOWER(check_column) || 44 | ''' AND LOWER(column_name) = ''' || LOWER(check_column) || ''';'; 45 | EXECUTE sql INTO mixed_case_count; 46 | IF mixed_case_count > 0 THEN 47 | RAISE EXCEPTION 'There exists another column with the same identifier in different case. The procedure cannot run.'; 48 | END IF; 49 | 50 | -- Retrieve the primary key column and table for that foreign key for the table 51 | sql := 'SELECT rel_kcu.table_schema || ''.'' || rel_kcu.table_name AS pk_table, ' || 52 | ' rel_kcu.column_name AS pk_column ' || 53 | 'FROM information_schema.table_constraints tco ' || 54 | 'LEFT JOIN information_schema.key_column_usage kcu ' || 55 | ' ON tco.constraint_schema = kcu.constraint_schema ' || 56 | ' AND tco.constraint_name = kcu.constraint_name ' || 57 | 'LEFT JOIN information_schema.referential_constraints rco ' || 58 | ' ON tco.constraint_schema = rco.constraint_schema ' || 59 | ' AND tco.constraint_name = rco.constraint_name ' || 60 | 'LEFT JOIN information_schema.key_column_usage rel_kcu ' || 61 | ' ON rco.unique_constraint_schema = rel_kcu.constraint_schema ' || 62 | ' AND rco.unique_constraint_name = rel_kcu.constraint_name ' || 63 | ' AND kcu.ordinal_position = rel_kcu.ordinal_position ' || 64 | 'WHERE tco.constraint_type = ''FOREIGN KEY'''; 65 | dot_position := POSITION('.' 
IN check_table); 66 | IF dot_position > 0 THEN 67 | sql := sql || ' AND kcu.table_schema = ''' || LOWER(SUBSTRING(check_table FROM 1 FOR dot_position - 1)) || '''' 68 | || ' AND kcu.table_name = ''' || LOWER(SUBSTRING(check_table FROM dot_position + 1)) || ''''; 69 | ELSE 70 | sql := sql || ' AND kcu.table_name = ''' || LOWER(check_table) || ''''; 71 | END IF; 72 | 73 | sql := sql || ' AND kcu.column_name = ''' || LOWER(check_column) || ''''; 74 | EXECUTE sql INTO record; 75 | pk_table := record.pk_table; 76 | pk_column := record.pk_column; 77 | RAISE INFO '%', sql; 78 | -- Count the number of foreign key inconsistencies 79 | IF pk_table IS NULL OR pk_column IS NULL THEN 80 | RAISE INFO 'Primary table or column is null'; 81 | ELSE 82 | sql := ' SELECT COUNT(*) FROM ' || check_table || 83 | ' WHERE ' || check_column || ' NOT IN (SELECT ' || 84 | pk_column || ' FROM ' || pk_table || ');'; 85 | 86 | EXECUTE sql INTO inconsistency_count; 87 | IF inconsistency_count = 0 THEN 88 | EXECUTE 'INSERT INTO ' || log_table || 89 | ' (batch_time, check_table, check_column, check_time, check_status, error_count) VALUES (''' || batch_time || 90 | ''',''' || check_table || ''',''' || check_column || ''', current_timestamp,''OK - No foreign key inconsistencies found'',0);'; 91 | RAISE INFO 'OK - No foreign key inconsistencies found'; 92 | ELSE 93 | EXECUTE 'INSERT INTO ' || log_table || 94 | ' (batch_time, check_table, check_column, check_time, check_status, error_count) VALUES (''' || batch_time || 95 | ''',''' || check_table || ''',''' || check_column || 96 | ''', current_timestamp,''ERROR - ' || inconsistency_count || 97 | ' Foreign key inconsistencies found'', ' || inconsistency_count || 98 | ');'; 99 | RAISE INFO 'ERROR - % Foreign key inconsistencies found', inconsistency_count; 100 | END IF; 101 | END IF; 102 | END 103 | $$; 104 | 105 | /* Usage Example: 106 | 107 | DROP TABLE IF EXISTS tmp_fk_log; 108 | CREATE TABLE tmp_fk_log( 109 | batch_time TIMESTAMP 110 | , check_table VARCHAR 111 | , check_column VARCHAR 112 | , check_time TIMESTAMP 113 | , check_status VARCHAR 114 | , error_count INT); 115 | DROP TABLE IF EXISTS customers; 116 | CREATE TABLE customers ( 117 | customer_id INTEGER IDENTITY(1,1) PRIMARY KEY, 118 | customer_name VARCHAR(50) NOT NULL 119 | ); 120 | DROP TABLE IF EXISTS orders; 121 | CREATE TABLE orders ( 122 | order_id INTEGER IDENTITY(1,1) PRIMARY KEY, 123 | customer_id INTEGER NOT NULL REFERENCES customers(customer_id), 124 | order_date DATE NOT NULL 125 | ); 126 | INSERT INTO customers (customer_name) 127 | VALUES ('Alice'), ('Bob'), ('Charlie'); 128 | INSERT INTO orders (customer_id, order_date) 129 | VALUES (1, '2023-01-01'), 130 | (2, '2023-01-02'), 131 | (3, '2023-01-03'), 132 | (4, '2023-01-04'); -- Inconsistency: customer_id 4 does not exist 133 | CALL sp_check_foreign_key(SYSDATE,'orders', 'customer_id', 'tmp_fk_log'); 134 | */ 135 | 136 | -------------------------------------------------------------------------------- /stored-procedures/sp_update_permissions/sp_update_permissions.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE PROCEDURE public.sp_update_permissions(external_schema_name varchar, external_table_name varchar) 2 | AS $$ 3 | DECLARE 4 | get_authorization_rows_query VARCHAR(2000); 5 | authorization_query VARCHAR(2000); 6 | rows RECORD; 7 | 8 | BEGIN 9 | -- Get authorisations query. 
This is the Spectrum table reading data from the latest file stored in S3 10 | get_authorization_rows_query = 'SELECT * FROM ' || external_schema_name || '.' || external_table_name || ';'; 11 | 12 | -- Loop through rows in the file 13 | FOR rows IN EXECUTE get_authorization_rows_query LOOP 14 | 15 | -- Users 16 | IF lower(rows.principal_type) = 'user' THEN 17 | IF lower(rows.operation) = 'grant' THEN 18 | IF lower(rows.object_name) LIKE 'all tables in schema%' THEN 19 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_name || ' TO ' || rows.principal || ';'; 20 | RAISE INFO '%', authorization_query; 21 | 22 | EXECUTE authorization_query; 23 | ELSE 24 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_type || ' ' || rows.object_name || ' TO ' || rows.principal || ';'; 25 | RAISE INFO '%', authorization_query; 26 | EXECUTE authorization_query; 27 | END IF; 28 | 29 | ELSIF lower(rows.operation) = 'revoke' THEN 30 | IF lower(rows.object_name) LIKE 'all tables in schema%' THEN 31 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_name || ' FROM ' || rows.principal || ';'; 32 | RAISE INFO '%', authorization_query; 33 | EXECUTE authorization_query; 34 | 35 | ELSE 36 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_type || ' ' || rows.object_name || ' FROM ' || rows.principal || ';'; 37 | RAISE INFO '%', authorization_query; 38 | EXECUTE authorization_query; 39 | END IF; 40 | 41 | ELSIF lower(rows.operation) = 'attach' THEN 42 | authorization_query = rows.operation || ' RLS POLICY ' || rows.access_option || ' ON ' || rows.object_name || ' TO ' || rows.principal || ';'; 43 | RAISE INFO '%', authorization_query; 44 | EXECUTE authorization_query; 45 | 46 | ELSIF lower(rows.operation) = 'detach' THEN 47 | authorization_query = rows.operation || ' RLS POLICY ' || rows.access_option || ' ON ' || rows.object_name || ' FROM ' || rows.principal || ';'; 48 | RAISE INFO '%', authorization_query; 49 | EXECUTE authorization_query; 50 | END IF; 51 | -- Groups 52 | ELSIF lower(rows.principal_type) = 'group' THEN 53 | IF lower(rows.operation) = 'grant' THEN 54 | IF lower(rows.object_name) LIKE 'all tables in schema%' THEN 55 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_name || ' TO GROUP ' || rows.principal || ';'; 56 | RAISE INFO '%', authorization_query; 57 | EXECUTE authorization_query; 58 | ELSE 59 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_type || ' ' || rows.object_name || ' TO GROUP ' || rows.principal || ';'; 60 | RAISE INFO '%', authorization_query; 61 | EXECUTE authorization_query; 62 | END IF; 63 | ELSIF lower(rows.operation) = 'revoke' THEN 64 | IF lower(rows.object_name) LIKE 'all tables in schema%' THEN 65 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_name || ' FROM GROUP ' || rows.principal || ';'; 66 | RAISE INFO '%', authorization_query; 67 | EXECUTE authorization_query; 68 | ELSE 69 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_type || ' ' || rows.object_name || ' FROM GROUP ' || rows.principal || ';'; 70 | RAISE INFO '%', authorization_query; 71 | EXECUTE authorization_query; 72 | END IF; 73 | END IF; 74 | 75 | ELSIF lower(rows.principal_type) = 'role' THEN 76 | IF lower(rows.operation) = 'grant' THEN 77 | IF lower(rows.object_name) LIKE 
'all tables in schema%' THEN 78 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_name || ' TO ROLE ' || rows.principal || ';'; 79 | RAISE INFO '%', authorization_query; 80 | EXECUTE authorization_query; 81 | ELSE 82 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_type || ' ' || rows.object_name || ' TO ROLE ' || rows.principal || ';'; 83 | RAISE INFO '%', authorization_query; 84 | EXECUTE authorization_query; 85 | END IF; 86 | ELSIF lower(rows.operation) = 'revoke' THEN 87 | IF lower(rows.object_name) LIKE 'all tables in schema%' THEN 88 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_name || ' FROM ROLE ' || rows.principal || ';'; 89 | --authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_name || ' TO ' || rows.principal || ';'; 90 | RAISE INFO '%', authorization_query; 91 | EXECUTE authorization_query; 92 | ELSE 93 | authorization_query = rows.operation || ' ' || rows.access_option || ' ON ' || rows.object_type || ' ' || rows.object_name || ' FROM ROLE ' || rows.principal || ';'; 94 | RAISE INFO '%', authorization_query; 95 | EXECUTE authorization_query; 96 | END IF; 97 | ELSIF lower(rows.operation) = 'attach' THEN 98 | authorization_query = rows.operation || ' RLS POLICY ' || rows.access_option || ' ON ' || rows.object_name || ' TO ROLE ' || rows.principal || ';'; 99 | RAISE INFO '%', authorization_query; 100 | EXECUTE authorization_query; 101 | 102 | ELSIF lower(rows.operation) = 'detach' THEN 103 | authorization_query = rows.operation || ' RLS POLICY ' || rows.access_option || ' ON ' || rows.object_name || ' FROM ROLE ' || rows.principal || ';'; 104 | RAISE INFO '%', authorization_query; 105 | EXECUTE authorization_query; 106 | 107 | END IF; 108 | 109 | END IF; 110 | 111 | END LOOP; 112 | END; 113 | 114 | $$ LANGUAGE plpgsql; -------------------------------------------------------------------------------- /bin/deployFunction.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | #to-do: create dependent services, set outputs to env 5 | #to-do: if lambda cfn, deploy, pass in parameters from env 6 | 7 | function usage { 8 | echo "./deployFunction.sh -t <type> -f <function> -s <s3 bucket> -k <s3 key> -r <redshift role> -c <cluster> -d <db> -u <user> -n <schema> -g <security group> -x <subnet>" 9 | echo 10 | echo "where <type> is the type of function to be installed. e.g. python-udfs, lambda-udfs, sql-udfs" 11 | echo "<function> is the name of the function, including the parameters and enclosed in quotes e.g. \"f_bitwise_to_string(bigint,int)\"" 12 | echo "<cluster> is the Redshift cluster you will deploy the function to" 13 | echo "<db> is the database you will deploy the function to" 14 | echo "<user> is the db user who will create the function" 15 | echo "<schema> is the db namespace (schema) where the function will be created" 16 | echo "<s3 bucket> (optional) is the bucket in S3 to upload the artifact to." 17 | echo "<s3 key> (optional) is the key in S3 to upload the artifact to."
18 | echo "<redshift role> (optional) is the role which is attached to the Redshift cluster and has access to read from the s3 upload location (for python libs) and/or lambda execute permissions (for lambda fns)" 19 | echo "<security group> (optional) is the security group the lambda function will run in" 20 | echo "<subnet> (optional) is the subnet the lambda function will run in" 21 | exit 0; 22 | } 23 | 24 | execQuery() 25 | { 26 | output=`aws redshift-data execute-statement --cluster-identifier $cluster --database $db --db-user $user --sql "set search_path to $schema; $1"` 27 | id=`echo $output | jq -r .Id` 28 | 29 | status="SUBMITTED" 30 | while [ "$status" != "FINISHED" ] && [ "$status" != "FAILED" ] 31 | do 32 | sleep 1 33 | status=`aws redshift-data describe-statement --id $id | jq -r .Status` 34 | notNull "$status" "Error running describe-statement" 35 | done 36 | if [ "$status" == "FAILED" ]; then 37 | aws redshift-data describe-statement --id $id 38 | exit 1 39 | else 40 | echo $id:$status 41 | fi 42 | } 43 | 44 | function checkDep { 45 | which $1 >> /dev/null 46 | if [ $? -ne 0 ]; then 47 | echo "Unable to find required dependency $1" 48 | exit -1 49 | fi 50 | } 51 | 52 | function notNull { 53 | if [ "$1x" == "x" ]; then 54 | echo $2 55 | exit -1 56 | fi 57 | } 58 | 59 | # make sure we have the aws cli and jq installed 60 | checkDep "aws" 61 | checkDep "jq" 62 | 63 | # parse the runtime arguments for the function, cluster, and deployment options 64 | while getopts "t:f:s:k:l:r:c:d:u:n:g:x:h" opt; do 65 | case $opt in 66 | t) type="$OPTARG";; 67 | f) function="$OPTARG";; 68 | s) s3Bucket="$OPTARG";; 69 | k) s3Key="$OPTARG";; 70 | r) redshiftRole="$OPTARG";; 71 | c) cluster="$OPTARG";; 72 | d) db="$OPTARG";; 73 | u) user="$OPTARG";; 74 | n) schema="$OPTARG";; 75 | g) securityGroup="$OPTARG";; 76 | x) subnet="$OPTARG";; 77 | h) usage;; 78 | \?)
echo "Invalid option: -"$OPTARG"" >&2 79 | exit 1;; 80 | :) usage;; 81 | esac 82 | done 83 | 84 | # validate required arguments 85 | notNull "$type" "Please provide the function type -t" 86 | notNull "$function" "Please provide the function name -f" 87 | notNull "$cluster" "Please provide the Redshift cluster name -c" 88 | notNull "$db" "Please provide the Redshift cluster db name -d" 89 | notNull "$user" "Please provide the Redshift cluster user name -u" 90 | notNull "$schema" "Please provide the Redshift cluster namespace (schema) -n" 91 | 92 | paramsBuckets="" 93 | 94 | if test -z "$subnet"; then 95 | paramsVPC="" 96 | else 97 | paramsVPC="SecurityGroupId=${securityGroup} SubnetId=${subnet}" 98 | fi 99 | 100 | if test -f "../$type/$function/package.json"; then 101 | notNull "$s3Bucket" "Please provide the S3 Bucket to store the library package -s" 102 | s3Loc="s3://$s3Bucket/$s3Key" 103 | cd ../$type/$function 104 | npm install 105 | zip -r $function.zip index.js node_modules 106 | aws s3 cp $function.zip $s3Loc 107 | rm $function.zip 108 | rm package-lock.json 109 | rm -rf node_modules 110 | cd ../../bin 111 | paramsBuckets="S3Bucket=$s3Bucket S3Key=$s3Key$function.zip" 112 | fi 113 | 114 | if test -f "../$type/$function/requirements.txt"; then 115 | # check that the s3 prefix is in the right format 116 | # starts with 's3://' 117 | notNull "$s3Bucket" "Please provide the S3 Bucket to store the library package -s" 118 | notNull "$redshiftRole" "Please provide the Redshift role which is attached to the Redshift cluster and has access to read from the s3 upload location -r" 119 | 120 | if [ -z "$s3Key" ]; then 121 | s3Loc="s3://$s3Bucket" 122 | else 123 | s3Loc="s3://$s3Bucket/$s3Key" 124 | fi 125 | 126 | if [ "${type}" == "lambda-udfs" ]; then 127 | echo "Building layer" 128 | cat ../$type/$function/requirements.txt | while read dep; do 129 | ./layerInstaller.sh -s "${s3Loc}" -r "${dep}" -f "${function}" 130 | done 131 | paramsBuckets="S3Bucket=$s3Bucket S3Key=$s3Key" 132 | else 133 | checkDep "pip3" 134 | while read dep; do 135 | echo Installing: $dep 136 | ./libraryInstaller.sh -m $dep -s $s3Loc -r $redshiftRole -c $cluster -d $db -u $user 137 | done < ../$type/$function/requirements.txt 138 | paramsBuckets="S3Bucket=$s3Bucket S3Key=$s3Key$function.zip" 139 | fi 140 | fi 141 | 142 | if test -f "../$type/$function/pom.xml"; then 143 | # check that the s3 prefix is in the right format 144 | # starts with 's3://' 145 | notNull "$s3Bucket" "Please provide the S3 Bucket to store the library package -s" 146 | notNull "$redshiftRole" "Please provide the Redshift role which is attached to the Redshift cluster and has access to read from the s3 upload location -r" 147 | s3Loc="s3://$s3Bucket/$s3Key" 148 | checkDep "mvn" 149 | cd ../$type/$function 150 | #mvn --batch-mode --update-snapshots verify 151 | #rm -rf target 152 | mvn package 153 | packagename=${function//(/_} 154 | packagename=${packagename//)/} 155 | aws s3 cp "target/$packagename-1.0.0.jar" "$s3Loc$packagename-1.0.0.jar" 156 | rm -rf target 157 | rm dependency-reduced-pom.xml 158 | cd ../../bin 159 | paramsBuckets="S3Bucket=$s3Bucket S3Key=$s3Key$packagename-1.0.0.jar" 160 | fi 161 | 162 | if test -f "../$type/$function/lambda.yaml"; then 163 | template=$(<"../$type/$function/lambda.yaml") 164 | stackname=${function//(/-} 165 | stackname=${stackname//)/} 166 | stackname=${stackname//_/-} 167 | stackname=${stackname//,/-} 168 | if test -z "$paramsVPC" && test -z "$paramsBuckets"; then 169 | paramOverrides="" 170 | else 171 | 
paramOverrides="--parameter-overrides $paramsVPC $paramsBuckets" 172 | fi 173 | echo aws cloudformation deploy --template-file ../${type}/${function}/lambda.yaml ${paramOverrides} --stack-name ${stackname} --no-fail-on-empty-changeset --capabilities CAPABILITY_IAM 174 | if ! aws cloudformation deploy --template-file ../${type}/${function}/lambda.yaml ${paramOverrides} --stack-name ${stackname} --no-fail-on-empty-changeset --capabilities CAPABILITY_IAM; then 175 | aws cloudformation delete-stack --stack-name ${stackname} 176 | exit 1 177 | fi 178 | fi 179 | 180 | 181 | sql=$(<"../$type/$function/function.sql") 182 | echo execQuery "${sql//:RedshiftRole/$redshiftRole}" 183 | execQuery "${sql//:RedshiftRole/$redshiftRole}" 184 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | ## Testing via GitHub Actions 10 | You can configure GitHub Actions in your fork by enabling Actions and setting up the corresponding secrets. The required secrets are: 11 | 12 | * `AWS_ID` - The AWS Access Key ID for the GitHub runner to use 13 | * `AWS_KEY` - The AWS Secret Access Key for the GitHub runner to use 14 | * `REGION` - The region to run in 15 | * `CLUSTER` - The Redshift cluster name 16 | * `DB` - The Redshift cluster DB 17 | * `SCHEMA` - The Redshift schema to use 18 | * `USER` - The Redshift username to use 19 | * `IAM_ROLE` - The Redshift IAM role name 20 | * `S3_BUCKET` - The S3 bucket to use for artifacts 21 | * `SECURITY_GROUP` - The Security Group to deploy into, if deploying Lambda in-VPC 22 | * `SUBNET` - The Subnet to deploy into, if deploying Lambda in-VPC 23 | 24 | The GitHub runner assumes you have the Redshift cluster, S3 bucket, IAM role, etc. already configured in your AWS account. 25 | 26 | The GitHub runner needs an IAM user with permissions to provision Lambda functions and execute tests.
A minimal IAM policy for this GitHub user would be as follows - additional policies may be needed for your Lambda depending on the infrastructure you create in your CloudFormation templates: 27 | 28 | ``` 29 | { 30 | "Version": "2012-10-17", 31 | "Statement": [ 32 | { 33 | "Effect": "Allow", 34 | "Action": [ 35 | "redshift-data:BatchExecuteStatement", 36 | "redshift-data:ExecuteStatement" 37 | ], 38 | "Resource": [ 39 | "arn:aws-us-gov:redshift:*:*:cluster:{redshift-cluster}" 40 | ] 41 | }, 42 | { 43 | "Effect": "Allow", 44 | "Action": [ 45 | "redshift-data:GetStatementResult", 46 | "redshift-data:DescribeStatement" 47 | ], 48 | "Resource": [ 49 | "*" 50 | ], 51 | "Condition": { 52 | "StringEquals": { 53 | "redshift-data:statement-owner-iam-userid": "${aws:userid}" 54 | } 55 | } 56 | }, 57 | { 58 | "Effect": "Allow", 59 | "Action": [ 60 | "cloudformation:CreateChangeSet", 61 | "cloudformation:DescribeStacks", 62 | "cloudformation:ExecuteChangeSet", 63 | "cloudformation:DescribeChangeSet", 64 | "cloudformation:DeleteStack", 65 | "cloudformation:GetTemplateSummary" 66 | ], 67 | "Resource": [ 68 | "arn:aws-us-gov:cloudformation:*:*:stack/f-*" 69 | ] 70 | }, 71 | { 72 | "Effect": "Allow", 73 | "Action": [ 74 | "iam:DeleteRolePolicy", 75 | "iam:PutRolePolicy", 76 | "iam:DetachRolePolicy", 77 | "iam:GetRolePolicy", 78 | "iam:CreateRole", 79 | "iam:GetRole", 80 | "iam:DeleteRole", 81 | "iam:AttachRolePolicy", 82 | "iam:PassRole" 83 | ], 84 | "Resource": [ 85 | "arn:aws-us-gov:iam::*:role/f-*" 86 | ] 87 | }, 88 | { 89 | "Effect": "Allow", 90 | "Action": [ 91 | "lambda:CreateFunction", 92 | "lambda:DeleteFunction", 93 | "lambda:GetFunction", 94 | "lambda:PublishLayerVersion", 95 | "lambda:UpdateFunctionConfiguration", 96 | "lambda:DeleteLayerVersion", 97 | "lambda:GetLayerVersion" 98 | ], 99 | "Resource": [ 100 | "arn:aws-us-gov:lambda:*:*:function:f-*", 101 | "arn:aws-us-gov:lambda:*:*:layer:*", 102 | "arn:aws-us-gov:lambda:*:*:layer:*:*" 103 | ] 104 | }, 105 | { 106 | "Effect": "Allow", 107 | "Action": [ 108 | "sts:GetCallerIdentity" 109 | ], 110 | "Resource": [ 111 | "*" 112 | ] 113 | }, 114 | { 115 | "Effect": "Allow", 116 | "Action": [ 117 | "kms:Decrypt", 118 | "kms:GenerateDataKey" 119 | ], 120 | "Resource": [ 121 | "arn:aws-us-gov:kms:*:*:key/*" 122 | ] 123 | }, 124 | { 125 | "Effect": "Allow", 126 | "Action": [ 127 | "s3:GetObject", 128 | "s3:PutObject" 129 | ], 130 | "Resource": [ 131 | "arn:aws-us-gov:s3:::{s3-bucket}/*" 132 | ] 133 | }, 134 | { 135 | "Effect": "Allow", 136 | "Action": [ 137 | "redshift:GetClusterCredentials" 138 | ], 139 | "Resource": [ 140 | "arn:aws-us-gov:redshift:*:*:dbuser:{redshift-cluster}/{user}", 141 | "arn:aws-us-gov:redshift:*:*:dbname:{redshift-cluster}/{database}" 142 | ] 143 | } 144 | ] 145 | } 146 | ``` 147 | 148 | 149 | ## Reporting Bugs/Feature Requests 150 | 151 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 152 | 153 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 154 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 155 | 156 | * A reproducible test case or series of steps 157 | * The version of our code being used 158 | * Any modifications you've made relevant to the bug 159 | * Anything unusual about your environment or deployment 160 | 161 | 162 | ## Contributing via Pull Requests 163 | Contributions via pull requests are much appreciated. 
Before sending us a pull request, please ensure that: 164 | 165 | 1. You are working against the latest source on the *main* branch. 166 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 167 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 168 | 169 | To send us a pull request, please: 170 | 171 | 1. Fork the repository. 172 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 173 | 3. Ensure local tests pass (see the example at the end of this guide). 174 | 4. Commit to your fork using clear commit messages. 175 | 5. Send us a pull request, answering any default questions in the pull request interface. 176 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 177 | 178 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 179 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 180 | 181 | 182 | ## Finding contributions to work on 183 | Looking at the existing issues is a great way to find something to contribute to. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 184 | 185 | 186 | ## Code of Conduct 187 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 188 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 189 | opensource-codeofconduct@amazon.com with any additional questions or comments. 190 | 191 | 192 | ## Security issue notifications 193 | If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 194 | 195 | 196 | ## Licensing 197 | 198 | See the [LICENSE](LICENSE.txt) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
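To help with the "ensure local tests pass" step above, the sketch below shows one way to deploy and test a single UDF locally with the repository's `bin` scripts before opening a pull request. The cluster, database, user, schema, and role values are placeholders for your own environment; see the README's Deployment & Testing section for the full list of flags.

```bash
# Run from the repository's bin directory so the scripts can resolve ../<type>/<function>.
cd bin

# Deploy the UDF you changed (placeholder values shown; -r and -s are only
# needed for Lambda/Python UDFs that require an IAM role or an S3 bucket).
./deployFunction.sh -t lambda-udfs -f "f_upper_python(varchar)" \
  -c my-cluster -d dev -u awsuser -n public -r myRedshiftRole

# Verify the function returns the expected results from input.csv/output.csv.
./testFunction.sh -t lambda-udfs -f "f_upper_python(varchar)" \
  -c my-cluster -d dev -u awsuser -n public
```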
199 | -------------------------------------------------------------------------------- /stored-procedures/sp_split_table_by_range/sp_split_table_by_range.sql: -------------------------------------------------------------------------------- 1 | /********************************************************************************************** 2 | Purpose: Split a large table into parts using a numeric column 3 | 4 | Notes: Splits the provided MIN/MAX values into equal ranges 5 | Strongly recommend splitting on the sort key column if possible 6 | Logic: 7 | FOR split = 2 LOOP 8 | CREATE TABLE IF NOT EXISTS $(schema).$(table)_002 (LIKE $(schema).$(table)); 9 | INSERT INTO $(schema).$(table)_002 SELECT * FROM $(schema).$(table) LIMIT 100; 10 | DELETE FROM $(schema).$(table)_002; 11 | INSERT INTO $(schema).$(table)_002 SELECT * FROM $(schema).$(table) WHERE $(split_column) >= $(min_value) AND $(split_column) < $(max_value); 12 | END LOOP; 13 | {CHECK SUM(COUNT(*)) FROM ALL SPLITS == COUNT(*) FROM SOURCE TABLE} 14 | Input: 15 | Requires the following temp table (see Testing at bottom): 16 | CREATE TEMP TABLE tmp_split_tables ( 17 | schema_name VARCHAR, 18 | table_name VARCHAR, 19 | split_key_col VARCHAR, 20 | split_count INTEGER, 21 | min_sk_val BIGINT, 22 | max_sk_val BIGINT); 23 | 24 | History: 25 | 2019-10-29 - joeharris76 - Created 26 | **********************************************************************************************/ 27 | 28 | -- DROP PROCEDURE sp_split_table_by_range( ); 29 | CREATE OR REPLACE PROCEDURE sp_split_table_by_range () 30 | AS $$ 31 | DECLARE 32 | split_config RECORD; 33 | source_tbl_n VARCHAR := ''; 34 | target_tbl_n VARCHAR := ''; 35 | source_row_cnt BIGINT := 0; 36 | target_row_cnt BIGINT := 0; 37 | range_start BIGINT := 0; 38 | range_end BIGINT := 0; 39 | range_interval BIGINT := 0; 40 | total_range BIGINT := 0; 41 | tbl_cnt SMALLINT := 0; 42 | split_cnt SMALLINT := 0; 43 | split_cnt_val VARCHAR(10) := ''; 44 | split_sql VARCHAR(MAX) := ''; 45 | check_sql VARCHAR(MAX) := ''; 46 | rows_affected BIGINT; 47 | BEGIN 48 | -- Iterate over the list of tables being split 49 | FOR split_config IN SELECT * FROM tmp_split_tables 50 | LOOP 51 | tbl_cnt := tbl_cnt + 1; 52 | split_cnt := 0; 53 | check_sql := ''; 54 | source_tbl_n := quote_ident(split_config.schema_name)||'.'||quote_ident(split_config.table_name); 55 | RAISE INFO 'Starting split #%, splitting `%`', tbl_cnt, source_tbl_n; 56 | total_range := split_config.max_sk_val - split_config.min_sk_val; 57 | range_interval := CEIL(total_range::FLOAT/split_config.split_count)::BIGINT; 58 | RAISE INFO ' >> Total splits = %, interval = %, total range = %', split_config.split_count, range_interval, total_range; 59 | -- Iterate over the number of splits 60 | WHILE split_cnt <= split_config.split_count 61 | LOOP 62 | split_cnt_val := LPAD(split_cnt::VARCHAR,3,'0'); 63 | IF split_cnt > 0 THEN 64 | range_start := split_config.min_sk_val+(range_interval*(split_cnt-1)); 65 | IF split_cnt = split_config.split_count THEN 66 | range_end := split_config.max_sk_val+1; 67 | ELSE 68 | range_end := split_config.min_sk_val+(range_interval*split_cnt); 69 | END IF; 70 | END IF; 71 | target_tbl_n := quote_ident(split_config.schema_name)||'.'||quote_ident(split_config.table_name||'_'||split_cnt_val); 72 | RAISE INFO ' > Split part _%: Starting',split_cnt_val; 73 | EXECUTE 'DROP TABLE IF EXISTS '||target_tbl_n||' ;'; 74 | EXECUTE 'CREATE TABLE IF NOT EXISTS '||target_tbl_n||' (LIKE '||source_tbl_n||');'; 75 | EXECUTE 'INSERT INTO '||target_tbl_n||' 
SELECT * FROM '||source_tbl_n||' LIMIT 100;'; 76 | EXECUTE 'DELETE FROM '||target_tbl_n||';'; 77 | split_sql := 'INSERT INTO '||target_tbl_n||' SELECT * FROM '||source_tbl_n; 78 | IF split_cnt = 0 THEN -- Zero split for rows with NULL in the split column 79 | split_sql := split_sql||' WHERE '||quote_ident(split_config.split_key_col)||' IS NULL;'; 80 | ELSE 81 | split_sql := split_sql||' WHERE '||quote_ident(split_config.split_key_col)||' >= '||range_start 82 | ||' AND '||quote_ident(split_config.split_key_col)||' < '||range_end||';'; 83 | END IF; 84 | EXECUTE split_sql; 85 | GET DIAGNOSTICS rows_affected := ROW_COUNT; 86 | RAISE INFO ' - Split part _%: Rows inserted = %', split_cnt_val, rows_affected; 87 | COMMIT; -- Commit to avoid rework if we encounter an issue 88 | IF check_sql <> '' THEN 89 | check_sql := check_sql||' UNION ALL '; 90 | END IF; 91 | check_sql := check_sql||' SELECT COUNT(*) row_cnt FROM '||target_tbl_n ; 92 | split_cnt := split_cnt + 1; 93 | END LOOP; 94 | split_sql := 'SELECT COUNT(*) FROM '||source_tbl_n||';'; 95 | EXECUTE split_sql INTO source_row_cnt ; 96 | split_sql := 'SELECT SUM(row_cnt) FROM ('||check_sql||');'; 97 | EXECUTE split_sql INTO target_row_cnt ; 98 | RAISE INFO ' -- Complete: Split rows: % / Source rows: %',target_row_cnt, source_row_cnt; 99 | --Check that the row counts match 100 | IF source_row_cnt <> target_row_cnt THEN 101 | RAISE EXCEPTION 'ERROR: Split failed for %: Total split table rows does not match origin row count.', source_tbl_n; 102 | ELSE 103 | RAISE INFO 'SUCCESS: Split complete for %',source_tbl_n ; 104 | -- NOTE: Add logic here to auto-delete the original table 105 | END IF; 106 | END LOOP; 107 | END; 108 | $$ LANGUAGE plpgsql; 109 | 110 | 111 | /* Usage Example: 112 | 113 | -- Find the min.max sortk keys for the tables to be split 114 | -- SELECT 'public', 'store_sales', 'ss_sold_date_sk', MIN(ss_sold_date_sk ), MAX(ss_sold_date_sk ) FROM public.store_sales UNION ALL 115 | -- SELECT 'public', 'catalog_sales', 'cs_sold_date_sk', MIN(cs_sold_date_sk ), MAX(cs_sold_date_sk ) FROM public.catalog_sales UNION ALL 116 | -- SELECT 'public', 'web_sales', 'ws_sold_date_sk', MIN(ws_sold_date_sk ), MAX(ws_sold_date_sk ) FROM public.web_sales UNION ALL 117 | -- SELECT 'public', 'store_returns', 'sr_returned_date_sk', MIN(sr_returned_date_sk), MAX(sr_returned_date_sk) FROM public.store_returns UNION ALL 118 | -- SELECT 'public', 'catalog_returns', 'cr_returned_date_sk', MIN(cr_returned_date_sk), MAX(cr_returned_date_sk) FROM public.catalog_returns UNION ALL 119 | -- SELECT 'public', 'web_returns', 'wr_returned_date_sk', MIN(wr_returned_date_sk), MAX(wr_returned_date_sk) FROM public.web_returns ; 120 | 121 | -- Config table declares the sort key range and how many parts to split into 122 | CREATE TEMP TABLE tmp_split_tables ( schema_name VARCHAR, table_name VARCHAR, split_key_col VARCHAR, split_count INTEGER, min_sk_val BIGINT, max_sk_val BIGINT); 123 | INSERT INTO tmp_split_tables VALUES 124 | ('public', 'store_sales', 'ss_sold_date_sk', 25, 2450816, 2452642) 125 | , ('public', 'catalog_sales', 'cs_sold_date_sk', 20, 2450821, 2452924) 126 | , ('public', 'web_sales', 'ws_sold_date_sk', 10, 2450816, 2452642) 127 | , ('public', 'store_returns', 'sr_returned_date_sk', 6, 2450820, 2452822) 128 | , ('public', 'catalog_returns', 'cr_returned_date_sk', 4, 2450815, 2452654) 129 | , ('public', 'web_returns', 'wr_returned_date_sk', 2, 2450819, 2453002); 130 | 131 | -- Call the stored procedure 132 | CALL sp_split_table_by_range(); 133 | -- INFO: Starting 
split #1, splitting `public.catalog_sales` 134 | -- INFO: >> Total splits = 20, interval = 106, total range = 2103 135 | -- INFO: > Split part _001: Starting 136 | -- INFO: - Split part _001: Rows inserted = 14288879 137 | -- INFO: > Split part _002: Starting 138 | -- INFO: - Split part _002: Rows inserted = 15294501 139 | -- INFO: > Split part _003: Starting 140 | -- INFO: - Split part _003: Rows inserted = 35862731 141 | -- INFO: > Split part _004: Starting 142 | -- INFO: - Split part _004: Rows inserted = 27950288 143 | -- INFO: > Split part _005: Starting 144 | -- INFO: - Split part _005: Rows inserted = 14426566 145 | -- INFO: > Split part _006: Starting 146 | -- INFO: - Split part _006: Rows inserted = 25537838 147 | -- INFO: > Split part _007: Starting 148 | -- INFO: - Split part _007: Rows inserted = 39275422 149 | -- INFO: > Split part ... 150 | 151 | */ 152 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon Redshift UDFs 2 | A collection of stored procedures and user-defined functions (UDFs) for Amazon Redshift. The intent of this collection is to provide examples for defining useful functions which extend Amazon Redshift capabilities and support migrations from legacy DB platforms. 3 | 4 | ## Stored Procedures 5 | Each procedure is allocated a folder. At a minimum, each procedure will have a .sql file which you may use to deploy the procedure. Optionally, it may contain a README.md file for instructions on how to use the procedure and additional files used to test the procedure. 6 | 7 | ## UDFs 8 | Each function is allocated a folder. At a minimum, each function will have the following files, which will be used by the [deployFunction.sh](#deployfunctionsh) and [testFunction.sh](#testfunctionsh) scripts: 9 | 10 | - **function.sql** - the SQL script to be run in the Redshift DB which creates the UDF. If a Lambda function, use the string `:RedshiftRole` for the IAM role to be passed in by the deployment script. 11 | - **input.csv** - a list of sample input parameters to the function, delimited by commas (,), with strings denoted by single quotes. *Note: input.csv file MUST have a trailing newline character.* 12 | - **output.csv** - a list of expected output values from the function. *Note: The output.csv file can be excluded when the UDF will not return a consistent output between deployments or runs, but an output.csv.sample file should be created for users to understand what to expect. See [f_kms_decrypt](lambda-udfs/f_kms_decrypt\(varchar\)) & [f_kms_encrypt](lambda-udfs/f_kms_encrypt\(varchar,varchar\)) for an example.* 13 | 14 | ### python-udfs 15 | 16 | [Python UDFs](https://docs.aws.amazon.com/redshift/latest/dg/udf-creating-a-scalar-udf.html) may include the following additional file: 17 | 18 | - **requirements.txt** - If your function requires modules not already available in Redshift, a list of those modules. The modules will be packaged, uploaded to S3, and mapped to a [library](https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_LIBRARY.html) in Redshift. *Note: requirements.txt file MUST have a trailing newline.* 19 | 20 | ### lambda-udfs 21 | 22 | [Lambda UDFs](https://docs.aws.amazon.com/redshift/latest/dg/udf-creating-a-lambda-sql-udf.html) may include the following additional files: 23 | 24 | - **lambda.yaml** - [REQUIRED] a CFN template containing the Lambda function.
The Lambda function name should match the Redshift function name with '_' replaced with '-', e.g. (f-upper-python-varchar). The template may contain additional AWS services required by the Lambda function and should contain an IAM Role which can be assumed by the Lambda service and which grants access to those additional services (if applicable). These samples will use "*" for IAM resource policies. In a production deployment, modify IAM Role policies to scope down access. 25 | 26 | - **resources.yaml** - a CFN template containing external resources which may be referenced by the Lambda function. These resources are for testing only. 27 | 28 | - **requirements.txt** - (Python Only) If your function requires modules not already available in the Lambda Python container, a list of those modules. These modules will be packaged and uploaded to S3. You will need to include a [AWS::Lambda::LayerVersion](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/aws-resource-lambda-layerversion.html) resource for each module in your lambda.yaml file. It will need to reference the module and version number. See [f_glue_schema_registry_avro_to_json](lambda-udfs/f_glue_schema_registry_avro_to_json\(varchar,varchar,varchar\)/lambda.yaml) as an example. *Note: requirements.txt file MUST have a trailing newline.* 29 | 30 | - **package.json** - (NodeJS Only) If your function requires modules not already available in Lambda, a list of modules. The modules will be packaged, uploaded to S3, and mapped to your Lambda function. See [f_mysql_lookup_nodejs](lambda-udfs/f_mysql_lookup_nodejs\(varchar,varchar,varchar,varchar\)) for an example. 31 | 32 | - **index.js** - (NodeJS Only) Your JavaScript handler code. See [f_upper_nodejs](lambda-udfs/f_upper_nodejs\(varchar\)) for an example. 33 | 34 | - **pom.xml** - (Java Only) Lists out dependencies as well as the name of your handler function. See [f_upper_java](lambda-udfs/f_upper_java\(varchar\)) for an example. See [Maven Documentation](https://maven.apache.org/guides/introduction/introduction-to-the-pom.html) for more details on writing a pom.xml file. 35 | 36 | - **src/main/java/`function`/Handler.java** - (Java Only) Your Java handler code. See [f_upper_java](lambda-udfs/f_upper_java\(varchar\)) for an example. 37 | 38 | ### sql-udfs 39 | [SQL UDFs](https://docs.aws.amazon.com/redshift/latest/dg/udf-creating-a-scalar-sql-udf.html) do not require any additional files. 40 | 41 | ## Networking Considerations 42 | Lambda functions can be deployed in a VPC if they need access to resources 43 | within the private network. In this case, you may need to add network 44 | connectivity (e.g. VPC Endpoints) to allow communication to AWS services. 45 | If your Lambda function is deployed in a VPC and uses AWS services (e.g. S3, 46 | DDB, Glue, etc.) in its code, please define the VPC Endpoint resources in 47 | your `resources.yaml` file. 48 | 49 | ## Deployment & Testing 50 | Located in the `bin` directory are tools to deploy and test your UDF functions. 51 | 52 | ### deployFunction.sh 53 | This script will orchestrate the deployment of the UDF to your AWS environment. This includes: 54 | 1. Looping through modules in a `requirements.txt` file (if present) 55 | * For Python UDFs, installs dependencies using the `libraryInstaller.sh` script by uploading the packages to the `$S3_LOC` and creating the library in Redshift using the `$REDSHIFT_ROLE`.
56 | * For Lambda UDFs, installs dependencies as Lambda layers which are referenced in the Lambda CloudFormation using the S3Bucket and S3Key parameters (-s and -k, respectively). 57 | 2. If deploying a NodeJS Lambda UDF, using `package.json` to run `npm install`, packaging the code, and uploading the `zip` file to the `$S3_LOC`. 58 | 3. If deploying a Java Lambda UDF, using `pom.xml` to run `mvn package`, packaging the code, and uploading the `jar` to the `$S3_LOC`. 59 | 4. If deploying a Lambda UDF, using `lambda.yaml` to run `aws cloudformation deploy` and build the needed resources. 60 | 5. Creating the UDF function in Redshift by executing the `function.sql` SQL script. If deploying a Lambda UDF, replacing the `:RedshiftRole` parameter. 61 | 62 | ``` 63 | ./deployFunction.sh -t lambda-udfs -f "f_upper_python(varchar)" -c $CLUSTER -d $DB -u $USER -n $SCHEMA -r $REDSHIFT_ROLE 64 | 65 | ./deployFunction.sh -t python-udfs -f "f_ua_parser_family(varchar)" -c $CLUSTER -d $DB -u $USER -n $SCHEMA -r $REDSHIFT_ROLE -s $S3_LOC 66 | 67 | ./deployFunction.sh -t sql-udfs -f "f_mask_varchar(varchar,varchar,varchar)" -c $CLUSTER -d $DB -u $USER -n $SCHEMA 68 | ``` 69 | 70 | ### testFunction.sh 71 | This script will test the UDF by: 72 | 1. Creating a temporary table containing the input parameters of the function. 73 | 2. Loading sample input data of the function using the `input.csv` file. 74 | 3. Running the function leveraging the sample data and comparing the output to the `output.csv` file. 75 | 76 | ``` 77 | ./testFunction.sh -t lambda-udfs -f "f_upper_python(varchar)" -c $CLUSTER -d $DB -u $USER -n $SCHEMA 78 | 79 | ./testFunction.sh -t python-udfs -f "f_ua_parser_family(varchar)" -c $CLUSTER -d $DB -u $USER -n $SCHEMA 80 | 81 | ./testFunction.sh -t sql-udfs -f "f_mask_varchar(varchar,varchar,varchar)" -c $CLUSTER -d $DB -u $USER -n $SCHEMA 82 | ``` 83 | 84 | ## Contributing / Pull Requests 85 | 86 | We would love your contributions. See the [contributing](CONTRIBUTING.md) page for more details on creating a fork of the project and a pull request of your contribution. 87 | 88 | > Pull requests will be tested using a GitHub workflow which leverages the above testing scripts. Please execute these scripts prior to submitting a pull request to ensure the request is approved quickly. When executed in the test environment, the [RedshiftRole](#redshift-role) will be defined as follows. You can create a similar role in your local environment for testing. 89 | 90 | ## Appendix 91 | 92 | ### Redshift Role 93 | These privileges ensure UDFs can invoke the Lambda function (for Lambda UDFs) as well as access the uploaded `*.whl` files located in S3 (for Python UDFs). 94 | ```json 95 | { 96 | "Version": "2012-10-17", 97 | "Statement": [ 98 | { 99 | "Effect": "Allow", 100 | "Action": [ 101 | "s3:GetObject", 102 | "lambda:InvokeFunction" 103 | ], 104 | "Resource": [ 105 | "arn:aws:lambda:*:*:function:f-*", 106 | "arn:aws:s3:::/*" 107 | ] 108 | } 109 | ] 110 | } 111 | ``` 112 | --------------------------------------------------------------------------------
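For local testing, the sketch below is one possible way to create such a role with the AWS CLI and attach it to your cluster. It assumes you have saved the policy above (with your bucket name filled in) as `redshift-udf-policy.json`; the role name, policy name, and cluster identifier are placeholders.

```bash
# Placeholder names; adjust to your environment.
ROLE_NAME=my-redshift-udf-role
CLUSTER_ID=my-cluster

# Trust policy allowing Amazon Redshift to assume the role.
cat > redshift-trust.json <<'EOF'
{
  "Version": "2012-10-17",
  "Statement": [
    { "Effect": "Allow", "Principal": { "Service": "redshift.amazonaws.com" }, "Action": "sts:AssumeRole" }
  ]
}
EOF

# Create the role and attach the sample policy from the appendix above.
aws iam create-role --role-name "$ROLE_NAME" --assume-role-policy-document file://redshift-trust.json
aws iam put-role-policy --role-name "$ROLE_NAME" --policy-name redshift-udf-access --policy-document file://redshift-udf-policy.json

# Associate the role with your cluster, then pass it to deployFunction.sh via -r.
ROLE_ARN=$(aws iam get-role --role-name "$ROLE_NAME" --query Role.Arn --output text)
aws redshift modify-cluster-iam-roles --cluster-identifier "$CLUSTER_ID" --add-iam-roles "$ROLE_ARN"
```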